Index creation does not cause the cluster health to go RED

Ali Beyad · Ali Beyad · commit 417bd0cd63b9 · 2016-07-11T15:30:47.000-04:00
Previously, index creation would momentarily cause the cluster health to go RED, because the primaries were still being assigned and activated. This commit ensures that an inactive primary shard reports YELLOW instead of RED in two cases: when its index has just been created, and when the index is being recovered during cluster recovery and the shard does not have any active allocation ids. Relates elastic#9126
diff --git a/core/src/main/java/org/elasticsearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java b/core/src/main/java/org/elasticsearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java
@@ -28,6 +28,7 @@
 import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.cluster.health.ClusterShardHealth;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
@@ -93,12 +94,16 @@ protected void masterOperation(IndicesShardStoresRequest request, ClusterState s
         logger.trace("using cluster state version [{}] to determine shards", state.version());
         // collect relevant shard ids of the requested indices for fetching store infos
         for (String index : concreteIndices) {
+            IndexMetaData indexMetaData = state.metaData().index(index);
             IndexRoutingTable indexShardRoutingTables = routingTables.index(index);
             if (indexShardRoutingTables == null) {
                 continue;
             }
             for (IndexShardRoutingTable routing : indexShardRoutingTables) {
-                ClusterShardHealth shardHealth = new ClusterShardHealth(routing.shardId().id(), routing);
+                final int shardId = routing.shardId().id();
+                ClusterShardHealth shardHealth = new ClusterShardHealth(shardId,
+                                                                        routing,
+                                                                        indexMetaData.activeAllocationIds(shardId).isEmpty());
                 if (request.shardStatuses().contains(shardHealth.getStatus())) {
                     shardIdsToFetch.add(routing.shardId());
                 }
diff --git a/core/src/main/java/org/elasticsearch/cluster/health/ClusterIndexHealth.java b/core/src/main/java/org/elasticsearch/cluster/health/ClusterIndexHealth.java
@@ -54,7 +54,7 @@ public ClusterIndexHealth(final IndexMetaData indexMetaData, final IndexRoutingT
 
         for (IndexShardRoutingTable shardRoutingTable : indexRoutingTable) {
             int shardId = shardRoutingTable.shardId().id();
-            shards.put(shardId, new ClusterShardHealth(shardId, shardRoutingTable));
+            shards.put(shardId, new ClusterShardHealth(shardId, shardRoutingTable, indexMetaData.activeAllocationIds(shardId).isEmpty()));
         }
 
         // update the index status
diff --git a/core/src/main/java/org/elasticsearch/cluster/health/ClusterShardHealth.java b/core/src/main/java/org/elasticsearch/cluster/health/ClusterShardHealth.java
@@ -21,6 +21,7 @@
 
 import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
 import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
@@ -37,45 +38,42 @@ public final class ClusterShardHealth implements Writeable {
     private final int unassignedShards;
     private final boolean primaryActive;
 
-    public ClusterShardHealth(final int shardId, final IndexShardRoutingTable shardRoutingTable) {
+    public ClusterShardHealth(final int shardId, final IndexShardRoutingTable shardRoutingTable, final boolean noActiveAllocationIds) {
         this.shardId = shardId;
         int computeActiveShards = 0;
         int computeRelocatingShards = 0;
         int computeInitializingShards = 0;
         int computeUnassignedShards = 0;
-        boolean computePrimaryActive = false;
         for (ShardRouting shardRouting : shardRoutingTable) {
             if (shardRouting.active()) {
                 computeActiveShards++;
                 if (shardRouting.relocating()) {
                     // the shard is relocating, the one it is relocating to will be in initializing state, so we don't count it
                     computeRelocatingShards++;
                 }
-                if (shardRouting.primary()) {
-                    computePrimaryActive = true;
-                }
             } else if (shardRouting.initializing()) {
                 computeInitializingShards++;
             } else if (shardRouting.unassigned()) {
                 computeUnassignedShards++;
             }
         }
         ClusterHealthStatus computeStatus;
-        if (computePrimaryActive) {
+        final ShardRouting primaryRouting = shardRoutingTable.primaryShard();
+        if (primaryRouting.active()) {
             if (computeActiveShards == shardRoutingTable.size()) {
                 computeStatus = ClusterHealthStatus.GREEN;
             } else {
                 computeStatus = ClusterHealthStatus.YELLOW;
             }
         } else {
-            computeStatus = ClusterHealthStatus.RED;
+            computeStatus = UnassignedInfo.unassignedPrimaryShardHealth(primaryRouting.unassignedInfo(), noActiveAllocationIds);
         }
         this.status = computeStatus;
         this.activeShards = computeActiveShards;
         this.relocatingShards = computeRelocatingShards;
         this.initializingShards = computeInitializingShards;
         this.unassignedShards = computeUnassignedShards;
-        this.primaryActive = computePrimaryActive;
+        this.primaryActive = primaryRouting.active();
     }
 
     public ClusterShardHealth(final StreamInput in) throws IOException {
@@ -126,4 +124,5 @@ public void writeTo(final StreamOutput out) throws IOException {
         out.writeVInt(unassignedShards);
         out.writeBoolean(primaryActive);
     }
+
 }
diff --git a/core/src/main/java/org/elasticsearch/cluster/routing/UnassignedInfo.java b/core/src/main/java/org/elasticsearch/cluster/routing/UnassignedInfo.java
@@ -21,6 +21,7 @@
 
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.cluster.metadata.MetaData;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -289,6 +290,35 @@ public static long findNextDelayedAllocation(long currentNanoTime, ClusterState
         return nextDelayNanos == Long.MAX_VALUE ? -1L : nextDelayNanos;
     }
 
+    /**
+     * Returns the appropriate {@link ClusterHealthStatus} for an unassigned primary shard
+     * based on its unassigned information and allocation id history.
+     */
+    public static ClusterHealthStatus unassignedPrimaryShardHealth(final UnassignedInfo unassignedInfo,
+                                                                   final boolean noActiveAllocationIds) {
+        assert unassignedInfo != null;
+        final UnassignedInfo.Reason reason = unassignedInfo.getReason();
+        /**
+         * Normally, an inactive primary shard in an index should cause the cluster health to be RED.  However,
+         * there are exceptions where a health status of RED is inappropriate, namely in these scenarios:
+         *   1. Index Creation.  When an index is first created, the primary shards are in the initializing state, so
+         *      there is a small window where the cluster health is RED due to the primaries not being activated yet.
+         *      However, this leads to a false sense that the cluster is in an unhealthy state, when in reality, it's
+         *      simply a case of needing to wait for the primaries to initialize.
+         *   2. When a cluster is in the recovery state, and the shard never had any allocation ids assigned to it,
+         *      which indicates that the index was created and the cluster was restarted before the primary for
+         *      this shard was ever allocated.
+         *
+         * Here, we check for these scenarios and set the cluster health to YELLOW if any are applicable.
+         */
+        if (reason == UnassignedInfo.Reason.INDEX_CREATED
+                || (reason == UnassignedInfo.Reason.CLUSTER_RECOVERED && noActiveAllocationIds)) {
+            return ClusterHealthStatus.YELLOW;
+        } else {
+            return ClusterHealthStatus.RED;
+        }
+    }
+
     public String shortSummary() {
         StringBuilder sb = new StringBuilder();
         sb.append("[reason=").append(reason).append("]");
@@ -366,4 +396,5 @@ public int hashCode() {
         result = 31 * result + (failure != null ? failure.hashCode() : 0);
         return result;
     }
+
 }
diff --git a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java
@@ -59,7 +59,7 @@ public class ThrottlingAllocationDecider extends AllocationDecider {
             Integer.toString(DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES),
             (s) -> Setting.parseInt(s, 0, "cluster.routing.allocation.node_concurrent_recoveries"),
             Property.Dynamic, Property.NodeScope);
-    public static final Setting<Integer> CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING =
+    public static final Setting<Integer>  CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING =
         Setting.intSetting("cluster.routing.allocation.node_initial_primaries_recoveries",
             DEFAULT_CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES, 0,
             Property.Dynamic, Property.NodeScope);
diff --git a/core/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java b/core/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java
@@ -20,28 +20,63 @@
 package org.elasticsearch.cluster;
 
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
+import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
+import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
 import org.elasticsearch.cluster.health.ClusterHealthStatus;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider;
+import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import java.util.concurrent.TimeUnit;
 
 import static org.hamcrest.Matchers.equalTo;
 
 public class ClusterHealthIT extends ESIntegTestCase {
+
+    private static ThreadPool threadPool;
+
+    @BeforeClass
+    public static void customBeforeClass() throws Exception {
+        threadPool = new ThreadPool("ClusterHealthIT");
+    }
+
+    @AfterClass
+    public static void customAfterClass() throws Exception {
+        ThreadPool.terminate(threadPool, 10L, TimeUnit.SECONDS);
+        threadPool = null;
+    }
+
     public void testSimpleLocalHealth() {
         createIndex("test");
         ensureGreen(); // master should thing it's green now.
 
         for (String node : internalCluster().getNodeNames()) {
             // a very high time out, which should never fire due to the local flag
-            ClusterHealthResponse health = client(node).admin().cluster().prepareHealth().setLocal(true).setWaitForEvents(Priority.LANGUID).setTimeout("30s").get("10s");
+            ClusterHealthResponse health = client(node).admin().cluster().prepareHealth()
+                                                                         .setLocal(true)
+                                                                         .setWaitForEvents(Priority.LANGUID)
+                                                                         .setTimeout("30s")
+                                                                         .get("10s");
             assertThat(health.getStatus(), equalTo(ClusterHealthStatus.GREEN));
             assertThat(health.isTimedOut(), equalTo(false));
         }
     }
 
     public void testHealth() {
         logger.info("--> running cluster health on an index that does not exists");
-        ClusterHealthResponse healthResponse = client().admin().cluster().prepareHealth("test1").setWaitForYellowStatus().setTimeout("1s").execute().actionGet();
+        ClusterHealthResponse healthResponse = client().admin().cluster().prepareHealth("test1")
+                                                                         .setWaitForYellowStatus()
+                                                                         .setTimeout("1s")
+                                                                         .execute()
+                                                                         .actionGet();
         assertThat(healthResponse.isTimedOut(), equalTo(true));
         assertThat(healthResponse.getStatus(), equalTo(ClusterHealthStatus.RED));
         assertThat(healthResponse.getIndices().isEmpty(), equalTo(true));
@@ -62,10 +97,91 @@ public void testHealth() {
         assertThat(healthResponse.getIndices().get("test1").getStatus(), equalTo(ClusterHealthStatus.GREEN));
 
         logger.info("--> running cluster health on an index that does exists and an index that doesn't exists");
-        healthResponse = client().admin().cluster().prepareHealth("test1", "test2").setWaitForYellowStatus().setTimeout("1s").execute().actionGet();
+        healthResponse = client().admin().cluster().prepareHealth("test1", "test2")
+                                                   .setWaitForYellowStatus()
+                                                   .setTimeout("1s")
+                                                   .execute()
+                                                   .actionGet();
         assertThat(healthResponse.isTimedOut(), equalTo(true));
         assertThat(healthResponse.getStatus(), equalTo(ClusterHealthStatus.RED));
         assertThat(healthResponse.getIndices().get("test1").getStatus(), equalTo(ClusterHealthStatus.GREEN));
         assertThat(healthResponse.getIndices().size(), equalTo(1));
     }
-}
+
+    public void testHealthOnIndexCreation() throws Exception {
+        final int numNodes = randomIntBetween(2, 5);
+        logger.info("--> starting {} nodes", numNodes);
+        final Settings nodeSettings = Settings.builder().put(
+            ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), numNodes
+        ).build();
+        internalCluster().ensureAtLeastNumDataNodes(numNodes, nodeSettings);
+
+        ClusterHealthResponse healthResponse = client().admin().cluster()
+                                                               .prepareHealth()
+                                                               .setWaitForGreenStatus()
+                                                               .setTimeout("10s")
+                                                               .execute()
+                                                               .actionGet();
+        assertThat(healthResponse.isTimedOut(), equalTo(false));
+        assertThat(healthResponse.getStatus(), equalTo(ClusterHealthStatus.GREEN));
+
+        // first, register a cluster state observer that checks cluster health
+        // upon index creation in the cluster state
+        final String masterNode = internalCluster().getMasterName();
+        final String indexName = "test-idx";
+        final ClusterService clusterService = internalCluster().clusterService(masterNode);
+        final ClusterStateObserver observer = new ClusterStateObserver(clusterService, logger, threadPool.getThreadContext());
+        final ClusterStateObserver.ChangePredicate validationPredicate = new ClusterStateObserver.ValidationPredicate() {
+            @Override
+            protected boolean validate(ClusterState newState) {
+                return newState.status() == ClusterState.ClusterStateStatus.APPLIED
+                           && newState.metaData().hasIndex(indexName);
+            }
+        };
+
+        final ClusterStateObserver.Listener stateListener = new ClusterStateObserver.Listener() {
+            @Override
+            public void onNewClusterState(ClusterState clusterState) {
+                // make sure we have inactive primaries
+                // see if we can terminate observing on the cluster state
+                final ClusterStateResponse csResponse = client().admin().cluster().prepareState().execute().actionGet();
+                boolean inactivePrimaries = false;
+                for (ShardRouting shardRouting : csResponse.getState().routingTable().allShards(indexName)) {
+                    if (shardRouting.primary() == false) {
+                        continue;
+                    }
+                    if (shardRouting.active() == false) {
+                        inactivePrimaries = true;
+                        break;
+                    }
+                }
+                assertTrue(inactivePrimaries);
+                // verify cluster health is YELLOW (even though primaries are still being allocated)
+                final ClusterHealthResponse response = client().admin().cluster().prepareHealth(indexName).get();
+                assertThat(response.getStatus(), equalTo(ClusterHealthStatus.YELLOW));
+            }
+            @Override
+            public void onClusterServiceClose() {
+                fail("cluster service should not have closed");
+            }
+            @Override
+            public void onTimeout(TimeValue timeout) {
+                fail("timeout on cluster state observer");
+            }
+        };
+        observer.waitForNextChange(stateListener, validationPredicate, TimeValue.timeValueSeconds(30L));
+        final Settings settings = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, numNodes)
+                                                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, numNodes - 1)
+                                                    .build();
+        CreateIndexResponse response = client().admin().indices().prepareCreate(indexName)
+                                                                 .setSettings(settings)
+                                                                 .execute()
+                                                                 .actionGet();
+        assertTrue(response.isAcknowledged());
+
+        // now, make sure we eventually get to the green state,
+        // we have at least two nodes so this should happen
+        ensureGreen(indexName);
+    }
+
+}
diff --git a/core/src/test/java/org/elasticsearch/cluster/health/ClusterStateHealthTests.java b/core/src/test/java/org/elasticsearch/cluster/health/ClusterStateHealthTests.java
diff --git a/core/src/test/java/org/elasticsearch/cluster/routing/RoutingTableGenerator.java b/core/src/test/java/org/elasticsearch/cluster/routing/RoutingTableGenerator.java
diff --git a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,7 @@ public ClusterIndexHealth(final IndexMetaData indexMetaData, final IndexRoutingT`
`54`	`54`
`55`	`55`	`for (IndexShardRoutingTable shardRoutingTable : indexRoutingTable) {`
`56`	`56`	`int shardId = shardRoutingTable.shardId().id();`
`57`		`- shards.put(shardId, new ClusterShardHealth(shardId, shardRoutingTable));`
	`57`	`+ shards.put(shardId, new ClusterShardHealth(shardId, shardRoutingTable, indexMetaData.activeAllocationIds(shardId).isEmpty()));`
`58`	`58`	`}`
`59`	`59`
`60`	`60`	`// update the index status`