Rework shard snapshot workers #88209

Merged
merged 74 commits on Sep 8, 2022

Commits
031b2af
Fork off shard snapshot calls in parallel
pxsalehi Jun 30, 2022
7f40dc5
Update docs/changelog/88209.yaml
pxsalehi Jun 30, 2022
0184d38
temp
pxsalehi Jul 14, 2022
72f8624
Revert "temp"
pxsalehi Jul 14, 2022
4bfdfc7
Use queues and workers to handle shard snapshot work
pxsalehi Jul 15, 2022
b4fb408
cleanup, minor corrections
pxsalehi Jul 19, 2022
691de91
remove test logging
pxsalehi Jul 19, 2022
8fea5bf
Create only as many workers as needed
pxsalehi Jul 19, 2022
2213fa2
add test, minor renames
pxsalehi Jul 19, 2022
ea68db7
Add some asserts, simplify test
pxsalehi Jul 20, 2022
9dd7197
Edit changelog, remove redundant check
pxsalehi Jul 21, 2022
1b1e283
Merge branch 'master' into ps-fork-off-snapshot-calls
elasticmachine Jul 21, 2022
a44e47d
Move out cosmetic changes
pxsalehi Jul 22, 2022
35e039a
Merge branch 'ps-fork-off-snapshot-calls' of github.com:pxsalehi/elas…
pxsalehi Jul 22, 2022
21d4f92
Move out unrelated test change
pxsalehi Jul 22, 2022
6947ea1
Undo cosmetic changes to SnapshotShardsService
pxsalehi Jul 22, 2022
b073662
Merge remote-tracking branch 'upstream/master' into ps-fork-off-snaps…
pxsalehi Jul 22, 2022
05ed16c
Use a priority queue for both tasks
pxsalehi Jul 25, 2022
cfb47c3
Adapt tests
pxsalehi Jul 25, 2022
7710c89
Add test
pxsalehi Jul 25, 2022
4bbde07
Simplify mock
pxsalehi Jul 25, 2022
0c3a7c1
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Jul 25, 2022
fdb9cf3
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Jul 26, 2022
d67b2f9
Minor cleanup and rename
pxsalehi Jul 27, 2022
55bd708
One more assert
pxsalehi Jul 27, 2022
b291617
Always increment finishedShardSnapshotTasks
pxsalehi Jul 27, 2022
00f14a4
Merge branch 'main' into ps-fork-off-snapshot-calls
elasticmachine Jul 29, 2022
bd3b8b5
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Aug 2, 2022
f220c1d
Package private size()
pxsalehi Aug 2, 2022
2e28c9b
Non-volatile worker count
pxsalehi Aug 2, 2022
ff90c49
Abstract class for snapshot tasks
pxsalehi Aug 2, 2022
5901020
Add to queue outside mutex
pxsalehi Aug 2, 2022
79acfd6
Simpler ensureEnoughWorkers
pxsalehi Aug 2, 2022
7ce5c41
Create workers in the constructor
pxsalehi Aug 2, 2022
5397f52
Minor renames
pxsalehi Aug 3, 2022
569913e
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Aug 3, 2022
32a44f9
Do not use looping workers
pxsalehi Aug 3, 2022
a33aaeb
Use threadpool instead of mock in EncryptedRepositoryTests
pxsalehi Aug 3, 2022
12a408f
Undo changes in RepositoriesServiceTests
pxsalehi Aug 3, 2022
cd70f35
Use threadpool instead of mock in RepositoriesServiceTests
pxsalehi Aug 3, 2022
b7394e7
temp
pxsalehi Aug 4, 2022
9e003df
Revert "Use threadpool instead of mock in RepositoriesServiceTests"
pxsalehi Aug 4, 2022
8534cb2
Revert "Use threadpool instead of mock in EncryptedRepositoryTests"
pxsalehi Aug 4, 2022
1654166
add an executor to threadpool mock
pxsalehi Aug 4, 2022
54fbf89
Revert "temp"
pxsalehi Aug 4, 2022
e87b466
no executor, just info
pxsalehi Aug 4, 2022
3649729
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Aug 4, 2022
8770046
Use lock-free instead of mutex
pxsalehi Aug 4, 2022
bbb65cd
Use startTime to order tasks
pxsalehi Aug 4, 2022
36f22e6
Cleanup
pxsalehi Aug 4, 2022
f6db134
More cleanup
pxsalehi Aug 4, 2022
2a16d07
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Aug 8, 2022
a5784cc
Update changelog
pxsalehi Aug 8, 2022
0d0d70c
use random time in tests
pxsalehi Aug 11, 2022
fb4b217
Separate TaskRunner, address review
pxsalehi Aug 12, 2022
3ba20a3
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Aug 25, 2022
a683832
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Aug 26, 2022
613bcfb
minor comment changes
pxsalehi Aug 26, 2022
4f14c3e
Merge branch 'main' into ps-fork-off-snapshot-calls
elasticmachine Aug 29, 2022
a3bcbf0
Merge branch 'main' into ps-fork-off-snapshot-calls
elasticmachine Aug 29, 2022
ab7603f
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Aug 31, 2022
3a4a28e
Rename to PrioritizedThrottledTaskRunner
pxsalehi Aug 31, 2022
a4ef329
Comment methods only for testing
pxsalehi Aug 31, 2022
3c323f8
Use a name in the logger
pxsalehi Aug 31, 2022
09d10df
Use generic pool in test
pxsalehi Aug 31, 2022
865ceca
Rename test
pxsalehi Aug 31, 2022
ba0a411
Use threads not threadpool for test
pxsalehi Aug 31, 2022
56619c0
Add a priority to the TestTask
pxsalehi Aug 31, 2022
79331f8
Add more test and assertion
pxsalehi Sep 1, 2022
a752881
Sync threads in test
pxsalehi Sep 1, 2022
ed7ef03
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Sep 1, 2022
fc8f8df
Merge remote-tracking branch 'upstream/main' into ps-fork-off-snapsho…
pxsalehi Sep 6, 2022
4b6d83f
do not use Integer.compare
pxsalehi Sep 6, 2022
ad896fe
Address review comments
pxsalehi Sep 6, 2022
6 changes: 6 additions & 0 deletions docs/changelog/88209.yaml
@@ -0,0 +1,6 @@
pr: 88209
summary: Prioritize shard snapshot tasks over file snapshot tasks and limit the number of concurrently running snapshot tasks
area: Snapshot/Restore
type: enhancement
issues:
- 83408
@@ -0,0 +1,107 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.common.util.concurrent;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

/**
* {@link PrioritizedThrottledTaskRunner} performs the enqueued tasks in the order dictated by the
* natural ordering of the tasks, limiting the maximum number of concurrently running tasks. Each task
* that is dequeued to run is forked off to the given executor.
*/
public class PrioritizedThrottledTaskRunner<T extends Comparable<T> & Runnable> {
private static final Logger logger = LogManager.getLogger(PrioritizedThrottledTaskRunner.class);

private final String taskRunnerName;
// The max number of tasks that this runner will schedule to concurrently run on the executor.
private final int maxRunningTasks;
// As we fork off dequeued tasks to the given executor, technically the following counter represents
// the number of concurrent pollAndSpawn calls currently checking the queue for a task to run. This
// doesn't necessarily correspond to currently running tasks, since a pollAndSpawn could return without
// actually running a task when the queue is empty.
private final AtomicInteger runningTasks = new AtomicInteger();
private final BlockingQueue<T> tasks = new PriorityBlockingQueue<>();
private final Executor executor;

public PrioritizedThrottledTaskRunner(final String name, final int maxRunningTasks, final Executor executor) {
assert maxRunningTasks > 0;
this.taskRunnerName = name;
this.maxRunningTasks = maxRunningTasks;
this.executor = executor;
}

public void enqueueTask(final T task) {
logger.trace("[{}] enqueuing task {}", taskRunnerName, task);
tasks.add(task);
// Try to run a task since there is now at least one in the queue. If maxRunningTasks has already
// been reached, the task simply stays in the queue until a running task finishes.
pollAndSpawn();
}

// visible for testing
protected void pollAndSpawn() {
Contributor

Can we add some commentary on how this loop works? In particular, on why we need to peek the queue. This is somewhat hard to follow for me and will be even harder to follow for future readers of this code.

Member Author

Good point!

// A pollAndSpawn attempts to run a new task. There could be many concurrent pollAndSpawn calls competing
// to get a "free slot", since we attempt to run a new task on every enqueueTask call and every time an
// existing task is finished.
while (incrementRunningTasks()) {
T task = tasks.poll();
if (task == null) {
logger.trace("[{}] task queue is empty", taskRunnerName);
// We have taken up a "free slot", but there are no tasks in the queue! This could happen each time a worker
// sees an empty queue after running a task. Decrement to give competing pollAndSpawn calls a chance!
int decremented = runningTasks.decrementAndGet();
assert decremented >= 0;
// We might have blocked all competing pollAndSpawn calls. This could happen for example when
// maxRunningTasks=1 and a task got enqueued just after checking the queue but before decrementing.
// To be sure, return only if the queue is still empty. If the queue is not empty, this might be the
// only pollAndSpawn call in progress, and returning without peeking would risk ending up with a
// non-empty queue and no workers!
if (tasks.peek() == null) break;
} else {
executor.execute(() -> runTask(task));
}
}
}

// Each worker thread that runs a task first needs to get a "free slot" in order to respect maxRunningTasks.
private boolean incrementRunningTasks() {
int preUpdateValue = runningTasks.getAndUpdate(v -> v < maxRunningTasks ? v + 1 : v);
assert preUpdateValue <= maxRunningTasks;
return preUpdateValue < maxRunningTasks;
}

// Only use for testing
public int runningTasks() {
return runningTasks.get();
}

// Only use for testing
public int queueSize() {
return tasks.size();
}

private void runTask(final T task) {
try {
logger.trace("[{}] running task {}", taskRunnerName, task);
task.run();
} finally {
// To avoid missing tasks that are enqueued and waiting, we check the queue again once a task
// has finished running.
int decremented = runningTasks.decrementAndGet();
assert decremented >= 0;
pollAndSpawn();
}
}
}
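
A minimal usage sketch of the new runner follows. The task type, names, pool size, and priority values below are illustrative assumptions, not code from this PR; the point is that tasks are dequeued in their natural ordering and at most maxRunningTasks of them are forked to the executor at any time.

import org.elasticsearch.common.util.concurrent.PrioritizedThrottledTaskRunner;

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PrioritizedThrottledTaskRunnerExample {

    // Hypothetical task type: the runner's priority queue uses the natural ordering,
    // so the smallest element (here: the lowest priority value) is dequeued first.
    static class ExampleTask implements Comparable<ExampleTask>, Runnable {
        final int priority;
        final String name;
        final CountDownLatch done;

        ExampleTask(int priority, String name, CountDownLatch done) {
            this.priority = priority;
            this.name = name;
            this.done = done;
        }

        @Override
        public int compareTo(ExampleTask other) {
            return Integer.compare(priority, other.priority);
        }

        @Override
        public void run() {
            System.out.println("running " + name);
            done.countDown();
        }
    }

    public static void main(String[] args) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        CountDownLatch done = new CountDownLatch(3);
        // Allow only one task on the executor at a time; the others wait in the priority queue.
        PrioritizedThrottledTaskRunner<ExampleTask> runner = new PrioritizedThrottledTaskRunner<>("example", 1, pool);
        runner.enqueueTask(new ExampleTask(5, "file-upload-a", done));
        runner.enqueueTask(new ExampleTask(5, "file-upload-b", done));
        runner.enqueueTask(new ExampleTask(1, "shard-snapshot", done));
        // Only the first task starts immediately; once it finishes, "shard-snapshot" is
        // dequeued before "file-upload-b" because it compares as smaller.
        done.await();
        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.SECONDS);
    }
}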
@@ -37,6 +37,7 @@ public final class SnapshotShardContext extends ActionListener.Delegating<ShardS
private final IndexShardSnapshotStatus snapshotStatus;
private final Version repositoryMetaVersion;
private final Map<String, Object> userMetadata;
private final long snapshotStartTime;

/**
* @param store store to be snapshotted
@@ -51,6 +52,8 @@ public final class SnapshotShardContext extends ActionListener.Delegating<ShardS
* @param repositoryMetaVersion version of the updated repository metadata to write
* @param userMetadata user metadata of the snapshot found in
* {@link org.elasticsearch.cluster.SnapshotsInProgress.Entry#userMetadata()}
* @param snapshotStartTime start time of the snapshot found in
* {@link org.elasticsearch.cluster.SnapshotsInProgress.Entry#startTime()}
* @param listener listener invoked on completion
*/
public SnapshotShardContext(
@@ -63,6 +66,7 @@ public SnapshotShardContext(
IndexShardSnapshotStatus snapshotStatus,
Version repositoryMetaVersion,
Map<String, Object> userMetadata,
final long snapshotStartTime,
ActionListener<ShardSnapshotResult> listener
) {
super(ActionListener.runBefore(listener, commitRef::close));
@@ -75,6 +79,7 @@ public SnapshotShardContext(
this.snapshotStatus = snapshotStatus;
this.repositoryMetaVersion = repositoryMetaVersion;
this.userMetadata = userMetadata;
this.snapshotStartTime = snapshotStartTime;
}

public Store store() {
@@ -114,6 +119,10 @@ public Map<String, Object> userMetadata() {
return userMetadata;
}

public long snapshotStartTime() {
return snapshotStartTime;
}

@Override
public void onResponse(ShardSnapshotResult result) {
delegate.onResponse(result);
@@ -150,6 +150,7 @@
import java.util.stream.Stream;

import static org.elasticsearch.core.Strings.format;
import static org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo;
import static org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo.canonicalName;

/**
@@ -376,6 +377,8 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
*/
private final int maxSnapshotCount;

private final ShardSnapshotTaskRunner shardSnapshotTaskRunner;

/**
* Constructs new BlobStoreRepository
* @param metadata The metadata for this repository including name and settings
@@ -405,6 +408,12 @@ protected BlobStoreRepository(
this.basePath = basePath;
this.maxSnapshotCount = MAX_SNAPSHOTS_SETTING.get(metadata.settings());
this.repoDataDeduplicator = new ResultDeduplicator<>(threadPool.getThreadContext());
shardSnapshotTaskRunner = new ShardSnapshotTaskRunner(
threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(),
threadPool.executor(ThreadPool.Names.SNAPSHOT),
this::doSnapshotShard,
this::snapshotFile
);
}

@Override
@@ -2629,6 +2638,10 @@ private void writeAtomic(

@Override
public void snapshotShard(SnapshotShardContext context) {
shardSnapshotTaskRunner.enqueueShardSnapshot(context);
}

private void doSnapshotShard(SnapshotShardContext context) {
if (isReadOnly()) {
context.onFailure(new RepositoryException(metadata.name(), "cannot snapshot shard on a readonly repository"));
return;
@@ -2889,45 +2902,19 @@ public void snapshotShard(SnapshotShardContext context) {
snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis(), shardSnapshotResult);
context.onResponse(shardSnapshotResult);
}, context::onFailure);
if (indexIncrementalFileCount == 0) {
if (indexIncrementalFileCount == 0 || filesToSnapshot.isEmpty()) {
Member Author

Turns out sometimes indexIncrementalFileCount can be non-zero while filesToSnapshot is still empty. Checking for this wasn't necessary before this change, since an empty queue would just lead to some idle workers that exit immediately. With this change, however, we wait for the actual file uploads to finish rather than for the file-upload workers, so the empty case has to be handled explicitly.

Contributor

Makes sense, neat fix saving some redundant tasks :)

Member Author

Well, with the current PR this was necessary, as otherwise, we'd be creating a GroupedActionListener of size 0.

allFilesUploadedListener.onResponse(Collections.emptyList());
return;
}
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
// Start as many workers as fit into the snapshot pool at once at the most
final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(), indexIncrementalFileCount);
final ActionListener<Void> filesListener = fileQueueListener(filesToSnapshot, workers, allFilesUploadedListener);
for (int i = 0; i < workers; ++i) {
executeOneFileSnapshot(store, snapshotId, context.indexId(), snapshotStatus, filesToSnapshot, executor, filesListener);
final ActionListener<Void> filesListener = fileQueueListener(filesToSnapshot, filesToSnapshot.size(), allFilesUploadedListener);
for (FileInfo fileInfo : filesToSnapshot) {
shardSnapshotTaskRunner.enqueueFileSnapshot(context, fileInfo, filesListener);
}
} catch (Exception e) {
context.onFailure(e);
}
}

private void executeOneFileSnapshot(
Store store,
SnapshotId snapshotId,
IndexId indexId,
IndexShardSnapshotStatus snapshotStatus,
BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> filesToSnapshot,
Executor executor,
ActionListener<Void> listener
) throws InterruptedException {
final ShardId shardId = store.shardId();
final BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS);
if (snapshotFileInfo == null) {
listener.onResponse(null);
} else {
executor.execute(ActionRunnable.wrap(listener, l -> {
try (Releasable ignored = incrementStoreRef(store, snapshotStatus, shardId)) {
snapshotFile(snapshotFileInfo, indexId, shardId, snapshotId, snapshotStatus, store);
executeOneFileSnapshot(store, snapshotId, indexId, snapshotStatus, filesToSnapshot, executor, l);
}
}));
}
}

private static Releasable incrementStoreRef(Store store, IndexShardSnapshotStatus snapshotStatus, ShardId shardId) {
if (store.tryIncRef() == false) {
if (snapshotStatus.isAborted()) {
@@ -3116,10 +3103,10 @@ void ensureNotClosing(final Store store) throws AlreadyClosedException {

private static ActionListener<Void> fileQueueListener(
BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> files,
int workers,
int numberOfFiles,
ActionListener<Collection<Void>> listener
) {
return new GroupedActionListener<>(listener, workers).delegateResponse((l, e) -> {
return new GroupedActionListener<>(listener, numberOfFiles).delegateResponse((l, e) -> {
files.clear(); // Stop uploading the remaining files if we run into any exception
l.onFailure(e);
});
@@ -3426,19 +3413,20 @@ private Tuple<BlobStoreIndexShardSnapshots, Long> buildBlobStoreIndexShardSnapsh

/**
* Snapshot individual file
* @param fileInfo file to be snapshotted
* @param fileInfo file to snapshot
*/
private void snapshotFile(
BlobStoreIndexShardSnapshot.FileInfo fileInfo,
IndexId indexId,
ShardId shardId,
SnapshotId snapshotId,
IndexShardSnapshotStatus snapshotStatus,
Store store
) throws IOException {
private void snapshotFile(SnapshotShardContext context, FileInfo fileInfo) throws IOException {
final IndexId indexId = context.indexId();
final Store store = context.store();
final ShardId shardId = store.shardId();
final IndexShardSnapshotStatus snapshotStatus = context.status();
final SnapshotId snapshotId = context.snapshotId();
final BlobContainer shardContainer = shardContainer(indexId, shardId);
final String file = fileInfo.physicalName();
try (IndexInput indexInput = store.openVerifyingInput(file, IOContext.READONCE, fileInfo.metadata())) {
try (
Releasable ignored = BlobStoreRepository.incrementStoreRef(store, snapshotStatus, store.shardId());
IndexInput indexInput = store.openVerifyingInput(file, IOContext.READONCE, fileInfo.metadata())
) {
for (int i = 0; i < fileInfo.numberOfParts(); i++) {
final long partBytes = fileInfo.partBytes(i);

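
Finally, to make the prioritization described in the changelog entry concrete (shard snapshot tasks run before file snapshot tasks, and older snapshots go first), here is a rough sketch of what comparable snapshot tasks could look like. The actual ShardSnapshotTaskRunner added by this PR is not part of the diff shown above, so the class names, priority values, and the work field below are illustrative assumptions only.

// Illustrative sketch only; not the ShardSnapshotTaskRunner implementation from this PR.
abstract class SnapshotTask implements Comparable<SnapshotTask>, Runnable {
    final long snapshotStartTime;
    final Runnable work;

    SnapshotTask(long snapshotStartTime, Runnable work) {
        this.snapshotStartTime = snapshotStartTime;
        this.work = work;
    }

    // Smaller value = higher priority; shard-level tasks are dequeued before file-level tasks.
    abstract int priority();

    @Override
    public int compareTo(SnapshotTask other) {
        int byPriority = Integer.compare(priority(), other.priority());
        if (byPriority != 0) {
            return byPriority;
        }
        // Within the same kind of task, snapshots that started earlier go first.
        return Long.compare(snapshotStartTime, other.snapshotStartTime);
    }

    @Override
    public void run() {
        work.run();
    }
}

class ShardSnapshotTask extends SnapshotTask {
    ShardSnapshotTask(long snapshotStartTime, Runnable work) {
        super(snapshotStartTime, work);
    }

    @Override
    int priority() {
        return 0; // start shard snapshots as soon as possible
    }
}

class FileSnapshotTask extends SnapshotTask {
    FileSnapshotTask(long snapshotStartTime, Runnable work) {
        super(snapshotStartTime, work);
    }

    @Override
    int priority() {
        return 1; // file uploads only run when no shard-level task is waiting
    }
}

Tasks of this shape can be fed to the PrioritizedThrottledTaskRunner shown earlier; the BlobStoreRepository constructor above passes the SNAPSHOT pool's maximum size, the SNAPSHOT executor, and the two task callbacks to the new ShardSnapshotTaskRunner in the same spirit.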