Commit 14fde1c

Base initial GCP on the cloned retention lease
1 parent 8e00661 commit 14fde1c


2 files changed: 63 additions & 42 deletions


server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java

Lines changed: 56 additions & 39 deletions
@@ -78,6 +78,7 @@
 import java.util.Comparator;
 import java.util.List;
 import java.util.Locale;
+import java.util.OptionalLong;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -196,7 +197,6 @@ && isTargetSameHistory()
         }
 
         final StepListener<SendFileResult> sendFileStep = new StepListener<>();
-        final StepListener<ReplicationResponse> establishRetentionLeaseStep = new StepListener<>();
         final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
         final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
         final StepListener<Void> finalizeStep = new StepListener<>();
@@ -264,7 +264,16 @@ && isTargetSameHistory()
 
             deleteRetentionLeaseStep.whenComplete(ignored -> {
                 assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase1]");
-                phase1(safeCommitRef.getIndexCommit(), shard.getLastKnownGlobalCheckpoint(), () -> estimateNumOps, sendFileStep);
+
+                final Consumer<ActionListener<Long>> getGlobalCheckpoint;
+                if (useRetentionLeases) {
+                    getGlobalCheckpoint = l -> createRetentionLease(startingSeqNo, l);
+                } else {
+                    final long globalCheckpoint = shard.getLastKnownGlobalCheckpoint();
+                    getGlobalCheckpoint = l -> l.onResponse(globalCheckpoint);
+                }
+
+                phase1(safeCommitRef.getIndexCommit(), getGlobalCheckpoint, () -> estimateNumOps, sendFileStep);
             }, onFailure);
 
         } catch (final Exception e) {
@@ -274,41 +283,6 @@ && isTargetSameHistory()
         assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
 
         sendFileStep.whenComplete(r -> {
-            if (useRetentionLeases && isSequenceNumberBasedRecovery == false) {
-                // We can in general use retention leases for peer recovery, but there is no lease for the target node right now.
-                runUnderPrimaryPermit(() -> {
-                    // Clone the peer recovery retention lease belonging to the source shard. We are retaining history between the
-                    // the local checkpoint of the safe commit we're creating and this lease's retained seqno with the retention
-                    // lock, and by cloning an existing lease we (approximately) know that all our peers are also retaining history
-                    // as requested by the cloned lease. If the recovery now fails before copying enough history over then a
-                    // subsequent attempt will find this lease, determine it is not enough, and fall back to a file-based recovery.
-                    //
-                    // (approximately) because we do not guarantee to be able to satisfy every lease on every peer.
-                    logger.trace("cloning primary's retention lease");
-                    try {
-                        final RetentionLease clonedLease = shard.cloneLocalPeerRecoveryRetentionLease(request.targetNode().getId(),
-                            new ThreadedActionListener<>(logger, shard.getThreadPool(),
-                                ThreadPool.Names.GENERIC, establishRetentionLeaseStep, false));
-                        logger.trace("cloned primary's retention lease as [{}]", clonedLease);
-                    } catch (RetentionLeaseNotFoundException e) {
-                        // it's possible that the primary has no retention lease yet if we are doing a rolling upgrade from a
-                        // version before 7.4, and in that case we just create a lease using the local checkpoint of the safe commit
-                        // which we're using for recovery as a conservative estimate for the global checkpoint.
-                        assert shard.indexSettings().getIndexVersionCreated().before(Version.V_7_4_0);
-                        final long estimatedGlobalCheckpoint = startingSeqNo - 1;
-                        shard.addPeerRecoveryRetentionLease(request.targetNode().getId(),
-                            estimatedGlobalCheckpoint, new ThreadedActionListener<>(logger, shard.getThreadPool(),
-                                ThreadPool.Names.GENERIC, establishRetentionLeaseStep, false));
-                        logger.trace("created retention lease with estimated checkpoint of [{}]", estimatedGlobalCheckpoint);
-                    }
-                }, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]",
-                    shard, cancellableThreads, logger);
-            } else {
-                establishRetentionLeaseStep.onResponse(null);
-            }
-        }, onFailure);
-
-        establishRetentionLeaseStep.whenComplete(r -> {
             assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[prepareTargetForTranslog]");
             // For a sequence based recovery, the target can keep its local translog
             prepareTargetForTranslog(isSequenceNumberBasedRecovery == false,
@@ -455,7 +429,8 @@ static final class SendFileResult {
      * segments that are missing. Only segments that have the same size and
      * checksum can be reused
      */
-    void phase1(IndexCommit snapshot, long globalCheckpoint, IntSupplier translogOps, ActionListener<SendFileResult> listener) {
+    void phase1(IndexCommit snapshot, Consumer<ActionListener<Long>> getGlobalCheckpoint,
+                IntSupplier translogOps, ActionListener<SendFileResult> listener) {
         cancellableThreads.checkForCancel();
         // Total size of segment files that are recovered
         long totalSizeInBytes = 0;
@@ -518,6 +493,7 @@ void phase1(IndexCommit snapshot, long globalCheckpoint, IntSupplier translogOps
                 phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSizeInBytes));
             final StepListener<Void> sendFileInfoStep = new StepListener<>();
             final StepListener<Void> sendFilesStep = new StepListener<>();
+            final StepListener<Long> getGlobalCheckpointStep = new StepListener<>();
             final StepListener<Void> cleanFilesStep = new StepListener<>();
             cancellableThreads.execute(() ->
                 recoveryTarget.receiveFileInfo(phase1FileNames, phase1FileSizes, phase1ExistingFileNames,
@@ -526,7 +502,9 @@ void phase1(IndexCommit snapshot, long globalCheckpoint, IntSupplier translogOps
             sendFileInfoStep.whenComplete(r ->
                 sendFiles(store, phase1Files.toArray(new StoreFileMetaData[0]), translogOps, sendFilesStep), listener::onFailure);
 
-            sendFilesStep.whenComplete(r ->
+            sendFilesStep.whenComplete(r -> getGlobalCheckpoint.accept(getGlobalCheckpointStep), listener::onFailure);
+
+            getGlobalCheckpointStep.whenComplete(globalCheckpoint ->
                 cleanFiles(store, recoverySourceMetadata, translogOps, globalCheckpoint, cleanFilesStep), listener::onFailure);
 
             final long totalSize = totalSizeInBytes;
@@ -550,6 +528,45 @@ void phase1(IndexCommit snapshot, long globalCheckpoint, IntSupplier translogOps
             }
         }
 
+    private void createRetentionLease(final long startingSeqNo, ActionListener<Long> listener) {
+        runUnderPrimaryPermit(() -> {
+            // Clone the peer recovery retention lease belonging to the source shard. We are retaining history between the the local
+            // checkpoint of the safe commit we're creating and this lease's retained seqno with the retention lock, and by cloning an
+            // existing lease we (approximately) know that all our peers are also retaining history as requested by the cloned lease. If
+            // the recovery now fails before copying enough history over then a subsequent attempt will find this lease, determine it is
+            // not enough, and fall back to a file-based recovery.
+            //
+            // (approximately) because we do not guarantee to be able to satisfy every lease on every peer.
+            logger.trace("cloning primary's retention lease");
+            try {
+                final StepListener<ReplicationResponse> cloneRetentionLeaseStep = new StepListener<>();
+                final RetentionLease clonedLease
+                    = shard.cloneLocalPeerRecoveryRetentionLease(request.targetNode().getId(),
+                        new ThreadedActionListener<>(logger, shard.getThreadPool(),
+                            ThreadPool.Names.GENERIC, cloneRetentionLeaseStep, false));
+                logger.trace("cloned primary's retention lease as [{}]", clonedLease);
+                cloneRetentionLeaseStep.whenComplete(
+                    rr -> listener.onResponse(clonedLease.retainingSequenceNumber() - 1),
+                    listener::onFailure);
+            } catch (RetentionLeaseNotFoundException e) {
+                // it's possible that the primary has no retention lease yet if we are doing a rolling upgrade from a version before
+                // 7.4, and in that case we just create a lease using the local checkpoint of the safe commit which we're using for
+                // recovery as a conservative estimate for the global checkpoint.
+                assert shard.indexSettings().getIndexVersionCreated().before(Version.V_7_4_0);
+                final StepListener<ReplicationResponse> addRetentionLeaseStep = new StepListener<>();
+                final long estimatedGlobalCheckpoint = startingSeqNo - 1;
+                shard.addPeerRecoveryRetentionLease(request.targetNode().getId(),
+                    estimatedGlobalCheckpoint, new ThreadedActionListener<>(logger, shard.getThreadPool(),
+                        ThreadPool.Names.GENERIC, addRetentionLeaseStep, false));
+                addRetentionLeaseStep.whenComplete(
+                    rr -> listener.onResponse(estimatedGlobalCheckpoint),
+                    listener::onFailure);
+                logger.trace("created retention lease with estimated checkpoint of [{}]", estimatedGlobalCheckpoint);
+            }
+        }, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]",
+            shard, cancellableThreads, logger);
+    }
+
     boolean canSkipPhase1(Store.MetadataSnapshot source, Store.MetadataSnapshot target) {
         if (source.getSyncId() == null || source.getSyncId().equals(target.getSyncId()) == false) {
             return false;
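Taken together, the change moves the global-checkpoint lookup out of the caller and into a Consumer<ActionListener<Long>> that phase1 only invokes after the segment files have been sent: when retention leases are in use the value comes from the freshly cloned lease (retainingSequenceNumber() - 1), otherwise it falls back to shard.getLastKnownGlobalCheckpoint(). Below is a minimal, self-contained sketch of that deferral pattern; the LongListener interface, class name, and values are simplified stand-ins for illustration, not the Elasticsearch API.

import java.util.function.Consumer;

// Simplified stand-in for ActionListener<Long>; not the Elasticsearch interface.
interface LongListener {
    void onResponse(long value);
    void onFailure(Exception e);
}

class DeferredGlobalCheckpointSketch {

    // The file-copy step runs first; only then is the checkpoint consumer asked to
    // produce a value and hand it to the "clean files" listener.
    static void phase1(Runnable copyFiles, Consumer<LongListener> getGlobalCheckpoint, LongListener cleanFilesStep) {
        copyFiles.run();
        getGlobalCheckpoint.accept(cleanFilesStep);
    }

    public static void main(String[] args) {
        boolean useRetentionLeases = true;
        long lastKnownGlobalCheckpoint = 41;  // fallback when leases are not in use
        long clonedLeaseRetainingSeqNo = 43;  // pretend this came from a cloned retention lease

        final Consumer<LongListener> getGlobalCheckpoint;
        if (useRetentionLeases) {
            // In the real handler this is createRetentionLease(startingSeqNo, l), which
            // completes with clonedLease.retainingSequenceNumber() - 1.
            getGlobalCheckpoint = l -> l.onResponse(clonedLeaseRetainingSeqNo - 1);
        } else {
            final long globalCheckpoint = lastKnownGlobalCheckpoint;
            getGlobalCheckpoint = l -> l.onResponse(globalCheckpoint);
        }

        phase1(() -> System.out.println("sending segment files..."),
            getGlobalCheckpoint,
            new LongListener() {
                @Override
                public void onResponse(long globalCheckpoint) {
                    System.out.println("cleanFiles using global checkpoint " + globalCheckpoint);
                }

                @Override
                public void onFailure(Exception e) {
                    e.printStackTrace();
                }
            });
    }
}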

server/src/test/java/org/elasticsearch/indices/recovery/RecoverySourceHandlerTests.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
import java.util.concurrent.atomic.AtomicBoolean;
102102
import java.util.concurrent.atomic.AtomicInteger;
103103
import java.util.concurrent.atomic.AtomicLong;
104+
import java.util.function.Consumer;
104105
import java.util.function.IntSupplier;
105106
import java.util.zip.CRC32;
106107

@@ -466,9 +467,10 @@ public void testThrowExceptionOnPrimaryRelocatedBeforePhase1Started() throws IOE
             between(1, 8)) {
 
             @Override
-            void phase1(IndexCommit snapshot, long globalCheckpoint, IntSupplier translogOps, ActionListener<SendFileResult> listener) {
+            void phase1(IndexCommit snapshot, Consumer<ActionListener<Long>> getGlobalCheckpoint,
+                        IntSupplier translogOps, ActionListener<SendFileResult> listener) {
                 phase1Called.set(true);
-                super.phase1(snapshot, globalCheckpoint, translogOps, listener);
+                super.phase1(snapshot, getGlobalCheckpoint, translogOps, listener);
             }
 
             @Override
@@ -683,7 +685,9 @@ public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.Metada
         final StepListener<RecoverySourceHandler.SendFileResult> phase1Listener = new StepListener<>();
         try {
             final CountDownLatch latch = new CountDownLatch(1);
-            handler.phase1(DirectoryReader.listCommits(dir).get(0), randomNonNegativeLong(), () -> 0,
+            handler.phase1(DirectoryReader.listCommits(dir).get(0),
+                l -> recoveryExecutor.execute(() -> l.onResponse(randomNonNegativeLong())),
+                () -> 0,
                 new LatchedActionListener<>(phase1Listener, latch));
             latch.await();
             phase1Listener.result();
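As the test change shows, a caller that already has the checkpoint in hand only needs to wrap it in a listener callback (the test additionally dispatches onto the recovery executor so the listener fires on the expected thread). A hedged one-line adapter, with illustrative variable names that are not from this commit:

long knownGlobalCheckpoint = shard.getLastKnownGlobalCheckpoint();
handler.phase1(indexCommit, l -> l.onResponse(knownGlobalCheckpoint), () -> estimatedOps, sendFileListener);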
