Commit 7d8ce43

Avoid parallel reroutes in DiskThresholdMonitor
Today the `DiskThresholdMonitor` limits the frequency with which it submits reroute tasks, but it might still submit these tasks faster than the master can process them if, for instance, each reroute takes over 60 seconds. This causes a problem since the reroute task runs with priority `IMMEDIATE` and is always scheduled when there is a node over the high watermark, so this can starve any other pending tasks on the master.

This change avoids further updates from the monitor while its last task(s) are still in progress, and it measures the time of each update from the completion time of the reroute task rather than its start time, to allow a larger window for other tasks to run.

Fixes elastic#40174
1 parent 99495aa commit 7d8ce43
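In outline, the fix combines two things: a `checkInProgress` flag that rejects a new check while the previous one is still in flight, and a throttle timestamp that is only advanced when the asynchronous reroute completes rather than when it starts. Below is a minimal, self-contained sketch of that pattern, assuming a plain `CompletableFuture`-based stand-in for the reroute call; the class and method names are illustrative, not the Elasticsearch code.

```java
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.LongSupplier;

// Sketch only: a monitor that refuses to start a new check while the previous one is
// still in flight, and that measures the reroute interval from the *completion* time
// of the last reroute rather than its start time.
class ThrottledMonitor {
    private final AtomicBoolean checkInProgress = new AtomicBoolean();
    private final LongSupplier currentTimeMillis;   // injected clock, e.g. System::currentTimeMillis
    private final long rerouteIntervalMillis;
    private volatile long lastRunTimeMillis = Long.MIN_VALUE;

    ThrottledMonitor(LongSupplier currentTimeMillis, long rerouteIntervalMillis) {
        this.currentTimeMillis = currentTimeMillis;
        this.rerouteIntervalMillis = rerouteIntervalMillis;
    }

    void onNewInfo(boolean overHighWatermark) {
        // Only one check may run at a time; later notifications are dropped, not queued.
        if (checkInProgress.compareAndSet(false, true) == false) {
            return;
        }
        long now = currentTimeMillis.getAsLong();
        if (overHighWatermark && lastRunTimeMillis < now - rerouteIntervalMillis) {
            reroute().whenComplete((r, e) -> {
                // The interval is measured from completion, widening the window
                // in which other master tasks can run.
                lastRunTimeMillis = currentTimeMillis.getAsLong();
                checkInProgress.set(false);
            });
        } else {
            checkInProgress.set(false);
        }
    }

    // Stand-in for the asynchronous cluster reroute call.
    protected CompletableFuture<Void> reroute() {
        return CompletableFuture.completedFuture(null);
    }
}
```

Because the timestamp is only updated in the completion callback, a reroute that takes longer than the configured interval still leaves a full interval of quiet time for other master tasks before the next one is submitted.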


6 files changed: +189 -92 lines changed

server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java

Lines changed: 5 additions & 5 deletions
@@ -131,13 +131,13 @@ public void onMaster() {
             logger.trace("I have been elected master, scheduling a ClusterInfoUpdateJob");
         }
 
-        // Submit a job that will start after DEFAULT_STARTING_INTERVAL, and reschedule itself after running
+        // Submit a job that will reschedule itself after running
         threadPool.scheduleUnlessShuttingDown(updateFrequency, executorName(), new SubmitReschedulingClusterInfoUpdatedJob());
 
         try {
             if (clusterService.state().getNodes().getDataNodes().size() > 1) {
                 // Submit an info update job to be run immediately
-                threadPool.executor(executorName()).execute(() -> maybeRefresh());
+                threadPool.executor(executorName()).execute(this::maybeRefresh);
             }
         } catch (EsRejectedExecutionException ex) {
             logger.debug("Couldn't schedule cluster info update task - node might be shutting down", ex);
@@ -173,7 +173,7 @@ public void clusterChanged(ClusterChangedEvent event) {
             if (logger.isDebugEnabled()) {
                 logger.debug("data node was added, retrieving new cluster info");
             }
-            threadPool.executor(executorName()).execute(() -> maybeRefresh());
+            threadPool.executor(executorName()).execute(this::maybeRefresh);
         }
 
         if (this.isMaster && event.nodesRemoved()) {
@@ -316,7 +316,7 @@ public void onResponse(IndicesStatsResponse indicesStatsResponse) {
                 ShardStats[] stats = indicesStatsResponse.getShards();
                 ImmutableOpenMap.Builder<String, Long> newShardSizes = ImmutableOpenMap.builder();
                 ImmutableOpenMap.Builder<ShardRouting, String> newShardRoutingToDataPath = ImmutableOpenMap.builder();
-                buildShardLevelInfo(logger, stats, newShardSizes, newShardRoutingToDataPath, clusterService.state());
+                buildShardLevelInfo(logger, stats, newShardSizes, newShardRoutingToDataPath);
                 shardSizes = newShardSizes.build();
                 shardRoutingToDataPath = newShardRoutingToDataPath.build();
             }
@@ -365,7 +365,7 @@ public void onFailure(Exception e) {
     }
 
     static void buildShardLevelInfo(Logger logger, ShardStats[] stats, ImmutableOpenMap.Builder<String, Long> newShardSizes,
-                                    ImmutableOpenMap.Builder<ShardRouting, String> newShardRoutingToDataPath, ClusterState state) {
+                                    ImmutableOpenMap.Builder<ShardRouting, String> newShardRoutingToDataPath) {
         for (ShardStats s : stats) {
             newShardRoutingToDataPath.put(s.getShardRouting(), s.getDataPath());
             long size = s.getStats().getStore().sizeInBytes();

server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java

Lines changed: 98 additions & 67 deletions
@@ -21,12 +21,16 @@
 
 import java.util.HashSet;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.LongSupplier;
 import java.util.function.Supplier;
 
 import com.carrotsearch.hppc.ObjectLookupContainer;
 import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.support.GroupedActionListener;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterInfo;
 import org.elasticsearch.cluster.ClusterState;
@@ -54,11 +58,14 @@ public class DiskThresholdMonitor {
     private final Client client;
     private final Set<String> nodeHasPassedWatermark = Sets.newConcurrentHashSet();
     private final Supplier<ClusterState> clusterStateSupplier;
-    private long lastRunNS;
+    private final LongSupplier currentTimeMillisSupplier;
+    private long lastRunTimeMillis = Long.MIN_VALUE;
+    private final AtomicBoolean checkInProgress = new AtomicBoolean();
 
     public DiskThresholdMonitor(Settings settings, Supplier<ClusterState> clusterStateSupplier, ClusterSettings clusterSettings,
-                                Client client) {
+                                Client client, LongSupplier currentTimeMillisSupplier) {
         this.clusterStateSupplier = clusterStateSupplier;
+        this.currentTimeMillisSupplier = currentTimeMillisSupplier;
         this.diskThresholdSettings = new DiskThresholdSettings(settings, clusterSettings);
         this.client = client;
     }
@@ -92,88 +99,112 @@ private void warnAboutDiskIfNeeded(DiskUsage usage) {
         }
     }
 
+    private void checkFinished() {
+        final boolean checkFinished = checkInProgress.compareAndSet(true, false);
+        assert checkFinished;
+    }
 
     public void onNewInfo(ClusterInfo info) {
-        ImmutableOpenMap<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
-        if (usages != null) {
-            boolean reroute = false;
-            String explanation = "";
-
-            // Garbage collect nodes that have been removed from the cluster
-            // from the map that tracks watermark crossing
-            ObjectLookupContainer<String> nodes = usages.keys();
-            for (String node : nodeHasPassedWatermark) {
-                if (nodes.contains(node) == false) {
-                    nodeHasPassedWatermark.remove(node);
-                }
+
+        if (checkInProgress.compareAndSet(false, true) == false) {
+            logger.info("skipping monitor as a check is already in progress");
+            return;
+        }
+
+        final ImmutableOpenMap<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
+        if (usages == null) {
+            checkFinished();
+            return;
+        }
+
+        boolean reroute = false;
+        String explanation = "";
+        final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
+
+        // Garbage collect nodes that have been removed from the cluster
+        // from the map that tracks watermark crossing
+        final ObjectLookupContainer<String> nodes = usages.keys();
+        for (String node : nodeHasPassedWatermark) {
+            if (nodes.contains(node) == false) {
+                nodeHasPassedWatermark.remove(node);
            }
-            ClusterState state = clusterStateSupplier.get();
-            Set<String> indicesToMarkReadOnly = new HashSet<>();
-            for (ObjectObjectCursor<String, DiskUsage> entry : usages) {
-                String node = entry.key;
-                DiskUsage usage = entry.value;
-                warnAboutDiskIfNeeded(usage);
-                if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes() ||
-                    usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
-                    RoutingNode routingNode = state.getRoutingNodes().node(node);
-                    if (routingNode != null) { // this might happen if we haven't got the full cluster-state yet?!
-                        for (ShardRouting routing : routingNode) {
-                            indicesToMarkReadOnly.add(routing.index().getName());
-                        }
+        }
+        final ClusterState state = clusterStateSupplier.get();
+        final Set<String> indicesToMarkReadOnly = new HashSet<>();
+
+        for (final ObjectObjectCursor<String, DiskUsage> entry : usages) {
+            final String node = entry.key;
+            final DiskUsage usage = entry.value;
+            warnAboutDiskIfNeeded(usage);
+            if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes() ||
+                usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
+                final RoutingNode routingNode = state.getRoutingNodes().node(node);
+                if (routingNode != null) { // this might happen if we haven't got the full cluster-state yet?!
+                    for (ShardRouting routing : routingNode) {
+                        indicesToMarkReadOnly.add(routing.index().getName());
                    }
-                } else if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() ||
-                    usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
-                    if ((System.nanoTime() - lastRunNS) > diskThresholdSettings.getRerouteInterval().nanos()) {
-                        lastRunNS = System.nanoTime();
+                }
+            } else if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() ||
+                usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
+                if (lastRunTimeMillis < currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
+                    reroute = true;
+                    explanation = "high disk watermark exceeded on one or more nodes";
+                } else {
+                    logger.debug("high disk watermark exceeded on {} but an automatic reroute has occurred " +
+                        "in the last [{}], skipping reroute",
+                        node, diskThresholdSettings.getRerouteInterval());
+                }
+                nodeHasPassedWatermark.add(node);
+            } else if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes() ||
+                usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) {
+                nodeHasPassedWatermark.add(node);
+            } else {
+                if (nodeHasPassedWatermark.contains(node)) {
+                    // The node has previously been over the high or
+                    // low watermark, but is no longer, so we should
+                    // reroute so any unassigned shards can be allocated
+                    // if they are able to be
+                    if (lastRunTimeMillis < currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
                        reroute = true;
-                        explanation = "high disk watermark exceeded on one or more nodes";
+                        explanation = "one or more nodes has gone under the high or low watermark";
+                        nodeHasPassedWatermark.remove(node);
                    } else {
-                        logger.debug("high disk watermark exceeded on {} but an automatic reroute has occurred " +
+                        logger.debug("{} has gone below a disk threshold, but an automatic reroute has occurred " +
                            "in the last [{}], skipping reroute",
                            node, diskThresholdSettings.getRerouteInterval());
                    }
-                    nodeHasPassedWatermark.add(node);
-                } else if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes() ||
-                    usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) {
-                    nodeHasPassedWatermark.add(node);
-                } else {
-                    if (nodeHasPassedWatermark.contains(node)) {
-                        // The node has previously been over the high or
-                        // low watermark, but is no longer, so we should
-                        // reroute so any unassigned shards can be allocated
-                        // if they are able to be
-                        if ((System.nanoTime() - lastRunNS) > diskThresholdSettings.getRerouteInterval().nanos()) {
-                            lastRunNS = System.nanoTime();
-                            reroute = true;
-                            explanation = "one or more nodes has gone under the high or low watermark";
-                            nodeHasPassedWatermark.remove(node);
-                        } else {
-                            logger.debug("{} has gone below a disk threshold, but an automatic reroute has occurred " +
-                                "in the last [{}], skipping reroute",
-                                node, diskThresholdSettings.getRerouteInterval());
-                        }
-                    }
                }
            }
-            if (reroute) {
-                logger.info("rerouting shards: [{}]", explanation);
-                reroute();
-            }
-            indicesToMarkReadOnly.removeIf(index -> state.getBlocks().indexBlocked(ClusterBlockLevel.WRITE, index));
-            if (indicesToMarkReadOnly.isEmpty() == false) {
-                markIndicesReadOnly(indicesToMarkReadOnly);
-            }
+        }
+
+        final ActionListener<Void> listener = new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 2);
+
+        if (reroute) {
+            logger.info("rerouting shards: [{}]", explanation);
+            reroute(ActionListener.wrap(r -> {
+                lastRunTimeMillis = currentTimeMillisSupplier.getAsLong();
+                listener.onResponse(r);
+            }, listener::onFailure));
+        } else {
+            listener.onResponse(null);
+        }
+        indicesToMarkReadOnly.removeIf(index -> state.getBlocks().indexBlocked(ClusterBlockLevel.WRITE, index));
+        if (indicesToMarkReadOnly.isEmpty() == false) {
+            markIndicesReadOnly(indicesToMarkReadOnly, listener);
+        } else {
+            listener.onResponse(null);
        }
     }
 
-    protected void markIndicesReadOnly(Set<String> indicesToMarkReadOnly) {
+    protected void markIndicesReadOnly(Set<String> indicesToMarkReadOnly, ActionListener<Void> listener) {
         // set read-only block but don't block on the response
-        client.admin().indices().prepareUpdateSettings(indicesToMarkReadOnly.toArray(Strings.EMPTY_ARRAY)).
-            setSettings(Settings.builder().put(IndexMetaData.SETTING_READ_ONLY_ALLOW_DELETE, true).build()).execute();
+        client.admin().indices().prepareUpdateSettings(indicesToMarkReadOnly.toArray(Strings.EMPTY_ARRAY))
+            .setSettings(Settings.builder().put(IndexMetaData.SETTING_READ_ONLY_ALLOW_DELETE, true).build())
+            .execute(ActionListener.map(listener, r -> null));
     }
 
-    protected void reroute() {
+    protected void reroute(ActionListener<Void> listener) {
         // Execute an empty reroute, but don't block on the response
-        client.admin().cluster().prepareReroute().execute();
+        client.admin().cluster().prepareReroute().execute(ActionListener.map(listener, r -> null));
     }
 }
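The grouped listener at the end of `onNewInfo` is what ties the two asynchronous branches back to the `checkInProgress` flag: `checkFinished()` only runs after both the reroute branch and the read-only-marking branch have reported in, and a branch that has nothing to do completes its slot immediately with `listener.onResponse(null)`. A minimal sketch of that counting idea in plain Java follows; it is illustrative only, since the real change uses Elasticsearch's `GroupedActionListener` as shown above.

```java
import java.util.concurrent.atomic.AtomicInteger;

// Sketch of the grouping idea: the in-progress flag is only released once BOTH
// asynchronous branches have reported back.
class CheckCompletion {
    private final AtomicInteger pending = new AtomicInteger(2); // reroute + read-only marking
    private final Runnable onAllFinished;

    CheckCompletion(Runnable onAllFinished) {
        this.onAllFinished = onAllFinished;
    }

    // Each branch calls this exactly once, whether it did real work or was a no-op.
    void branchFinished() {
        if (pending.decrementAndGet() == 0) {
            onAllFinished.run(); // e.g. checkInProgress.set(false)
        }
    }
}
```

In the diff above the same role is played by `new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 2)`: the reroute branch completes one slot from its response handler (after stamping `lastRunTimeMillis`), the read-only branch completes the other via `ActionListener.map(listener, r -> null)`, and a branch with nothing to do calls `listener.onResponse(null)` straight away.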

server/src/main/java/org/elasticsearch/node/Node.java

Lines changed: 1 addition & 1 deletion
@@ -368,7 +368,7 @@ protected Node(
             final IngestService ingestService = new IngestService(clusterService, threadPool, this.environment,
                 scriptModule.getScriptService(), analysisModule.getAnalysisRegistry(), pluginsService.filterPlugins(IngestPlugin.class));
             final DiskThresholdMonitor listener = new DiskThresholdMonitor(settings, clusterService::state,
-                clusterService.getClusterSettings(), client);
+                clusterService.getClusterSettings(), client, threadPool::relativeTimeInMillis);
             final ClusterInfoService clusterInfoService = newClusterInfoService(settings, clusterService, threadPool, client,
                 listener::onNewInfo);
             final UsageService usageService = new UsageService();
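Wiring the clock in as `threadPool::relativeTimeInMillis` (a `LongSupplier`) rather than reading `System.nanoTime()` inside the monitor makes the throttling deterministic to exercise. A hypothetical driver for the `ThrottledMonitor` sketch above, using a mutable fake clock (illustrative only, not part of this commit):

```java
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicLong;

// Drives the ThrottledMonitor sketch with a fake clock and counts reroutes: a second
// over-watermark notification inside the interval should be skipped.
class ThrottledMonitorExample {
    public static void main(String[] args) {
        AtomicLong fakeClock = new AtomicLong(0);
        AtomicLong rerouteCount = new AtomicLong(0);

        ThrottledMonitor monitor = new ThrottledMonitor(fakeClock::get, 60_000) {
            @Override
            protected CompletableFuture<Void> reroute() {
                rerouteCount.incrementAndGet();
                return CompletableFuture.completedFuture(null);
            }
        };

        monitor.onNewInfo(true);        // first check reroutes
        fakeClock.addAndGet(30_000);
        monitor.onNewInfo(true);        // within the interval: skipped
        fakeClock.addAndGet(60_001);
        monitor.onNewInfo(true);        // interval elapsed: reroutes again

        System.out.println("reroutes = " + rerouteCount.get()); // expect 2
    }
}
```

With the interval set to 60,000 ms, the notification at t=30 s falls inside the quiet window and is dropped, so only the first and third calls trigger a reroute.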

server/src/test/java/org/elasticsearch/cluster/DiskUsageTests.java

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ public void testFillShardLevelInfo() {
         ImmutableOpenMap.Builder<String, Long> shardSizes = ImmutableOpenMap.builder();
         ImmutableOpenMap.Builder<ShardRouting, String> routingToPath = ImmutableOpenMap.builder();
         ClusterState state = ClusterState.builder(new ClusterName("blarg")).version(0).build();
-        InternalClusterInfoService.buildShardLevelInfo(logger, stats, shardSizes, routingToPath, state);
+        InternalClusterInfoService.buildShardLevelInfo(logger, stats, shardSizes, routingToPath);
         assertEquals(2, shardSizes.size());
         assertTrue(shardSizes.containsKey(ClusterInfo.shardIdentifierFromRouting(test_0)));
         assertTrue(shardSizes.containsKey(ClusterInfo.shardIdentifierFromRouting(test_1)));
