Skip to content

Commit b38f464

Browse files
authored
Do not send recovery requests with CancellableThreads (#46287)
Previously, we send recovery requests using CancellableThreads because we send requests and wait for responses in a blocking manner. With async recovery, we no longer need to do so. Moreover, if we fail to submit a request, then we can release the Store using an interruptible thread which can risk invalidating the node lock. This PR is the first step to avoid forking when releasing the Store. Relates #45409 Relates #46178
1 parent c43ad1e commit b38f464

File tree

1 file changed

+36
-34
lines changed

1 file changed

+36
-34
lines changed

server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java

+36-34
Original file line numberDiff line numberDiff line change
@@ -517,9 +517,9 @@ void phase1(IndexCommit snapshot, Consumer<ActionListener<RetentionLease>> creat
517517
final StepListener<Void> sendFilesStep = new StepListener<>();
518518
final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
519519
final StepListener<Void> cleanFilesStep = new StepListener<>();
520-
cancellableThreads.execute(() ->
521-
recoveryTarget.receiveFileInfo(phase1FileNames, phase1FileSizes, phase1ExistingFileNames,
522-
phase1ExistingFileSizes, translogOps.getAsInt(), sendFileInfoStep));
520+
cancellableThreads.checkForCancel();
521+
recoveryTarget.receiveFileInfo(phase1FileNames, phase1FileSizes, phase1ExistingFileNames,
522+
phase1ExistingFileSizes, translogOps.getAsInt(), sendFileInfoStep);
523523

524524
sendFileInfoStep.whenComplete(r ->
525525
sendFiles(store, phase1Files.toArray(new StoreFileMetaData[0]), translogOps, sendFilesStep), listener::onFailure);
@@ -634,8 +634,8 @@ void prepareTargetForTranslog(int totalTranslogOps, ActionListener<TimeValue> li
634634
// Send a request preparing the new shard's translog to receive operations. This ensures the shard engine is started and disables
635635
// garbage collection (not the JVM's GC!) of tombstone deletes.
636636
logger.trace("recovery [phase1]: prepare remote engine for translog");
637-
cancellableThreads.execute(() ->
638-
recoveryTarget.prepareForTranslogOperations(totalTranslogOps, wrappedListener));
637+
cancellableThreads.checkForCancel();
638+
recoveryTarget.prepareForTranslogOperations(totalTranslogOps, wrappedListener);
639639
}
640640

641641
/**
@@ -741,30 +741,29 @@ private void sendBatch(
741741
final List<Translog.Operation> operations = nextBatch.get();
742742
// send the leftover operations or if no operations were sent, request the target to respond with its local checkpoint
743743
if (operations.isEmpty() == false || firstBatch) {
744-
cancellableThreads.execute(() -> {
745-
recoveryTarget.indexTranslogOperations(
746-
operations,
747-
totalTranslogOps,
748-
maxSeenAutoIdTimestamp,
749-
maxSeqNoOfUpdatesOrDeletes,
750-
retentionLeases,
751-
mappingVersionOnPrimary,
752-
ActionListener.wrap(
753-
newCheckpoint -> {
754-
sendBatch(
755-
nextBatch,
756-
false,
757-
SequenceNumbers.max(targetLocalCheckpoint, newCheckpoint),
758-
totalTranslogOps,
759-
maxSeenAutoIdTimestamp,
760-
maxSeqNoOfUpdatesOrDeletes,
761-
retentionLeases,
762-
mappingVersionOnPrimary,
763-
listener);
764-
},
765-
listener::onFailure
766-
));
767-
});
744+
cancellableThreads.checkForCancel();
745+
recoveryTarget.indexTranslogOperations(
746+
operations,
747+
totalTranslogOps,
748+
maxSeenAutoIdTimestamp,
749+
maxSeqNoOfUpdatesOrDeletes,
750+
retentionLeases,
751+
mappingVersionOnPrimary,
752+
ActionListener.wrap(
753+
newCheckpoint -> {
754+
sendBatch(
755+
nextBatch,
756+
false,
757+
SequenceNumbers.max(targetLocalCheckpoint, newCheckpoint),
758+
totalTranslogOps,
759+
maxSeenAutoIdTimestamp,
760+
maxSeqNoOfUpdatesOrDeletes,
761+
retentionLeases,
762+
mappingVersionOnPrimary,
763+
listener);
764+
},
765+
listener::onFailure
766+
));
768767
} else {
769768
listener.onResponse(targetLocalCheckpoint);
770769
}
@@ -787,7 +786,8 @@ void finalizeRecovery(long targetLocalCheckpoint, long trimAboveSeqNo, ActionLis
787786
shardId + " marking " + request.targetAllocationId() + " as in sync", shard, cancellableThreads, logger);
788787
final long globalCheckpoint = shard.getLastKnownGlobalCheckpoint(); // this global checkpoint is persisted in finalizeRecovery
789788
final StepListener<Void> finalizeListener = new StepListener<>();
790-
cancellableThreads.executeIO(() -> recoveryTarget.finalizeRecovery(globalCheckpoint, trimAboveSeqNo, finalizeListener));
789+
cancellableThreads.checkForCancel();
790+
recoveryTarget.finalizeRecovery(globalCheckpoint, trimAboveSeqNo, finalizeListener);
791791
finalizeListener.whenComplete(r -> {
792792
runUnderPrimaryPermit(() -> shard.updateGlobalCheckpointForShard(request.targetAllocationId(), globalCheckpoint),
793793
shardId + " updating " + request.targetAllocationId() + "'s global checkpoint", shard, cancellableThreads, logger);
@@ -894,8 +894,9 @@ protected FileChunk nextChunkRequest(StoreFileMetaData md) throws IOException {
894894

895895
@Override
896896
protected void sendChunkRequest(FileChunk request, ActionListener<Void> listener) {
897-
cancellableThreads.execute(() -> recoveryTarget.writeFileChunk(
898-
request.md, request.position, request.content, request.lastChunk, translogOps.getAsInt(), listener));
897+
cancellableThreads.checkForCancel();
898+
recoveryTarget.writeFileChunk(
899+
request.md, request.position, request.content, request.lastChunk, translogOps.getAsInt(), listener);
899900
}
900901

901902
@Override
@@ -922,13 +923,14 @@ private void cleanFiles(Store store, Store.MetadataSnapshot sourceMetadata, IntS
922923
// Once the files have been renamed, any other files that are not
923924
// related to this recovery (out of date segments, for example)
924925
// are deleted
925-
cancellableThreads.execute(() -> recoveryTarget.cleanFiles(translogOps.getAsInt(), globalCheckpoint, sourceMetadata,
926+
cancellableThreads.checkForCancel();
927+
recoveryTarget.cleanFiles(translogOps.getAsInt(), globalCheckpoint, sourceMetadata,
926928
ActionListener.delegateResponse(listener, (l, e) -> ActionListener.completeWith(l, () -> {
927929
StoreFileMetaData[] mds = StreamSupport.stream(sourceMetadata.spliterator(), false).toArray(StoreFileMetaData[]::new);
928930
ArrayUtil.timSort(mds, Comparator.comparingLong(StoreFileMetaData::length)); // check small files first
929931
handleErrorOnSendFiles(store, e, mds);
930932
throw e;
931-
}))));
933+
})));
932934
}
933935

934936
private void handleErrorOnSendFiles(Store store, Exception e, StoreFileMetaData[] mds) throws Exception {

0 commit comments

Comments
 (0)