@@ -197,51 +197,51 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
         assert requiredSeqNoRangeStart >= startingSeqNo : "requiredSeqNoRangeStart [" + requiredSeqNoRangeStart + "] is lower than ["
             + startingSeqNo + "]";
 
-        final TimeValue prepareEngineTime;
-        try {
-            // For a sequence based recovery, the target can keep its local translog
-            prepareEngineTime = prepareTargetForTranslog(isSequenceNumberBasedRecovery == false,
-                shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo));
-        } catch (final Exception e) {
-            throw new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e);
-        }
+        final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
+        // For a sequence based recovery, the target can keep its local translog
+        prepareTargetForTranslog(isSequenceNumberBasedRecovery == false,
+            shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep);
+        final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
+        prepareEngineStep.whenComplete(prepareEngineTime -> {
+            /*
+             * add shard to replication group (shard will receive replication requests from this point on) now that engine is open.
+             * This means that any document indexed into the primary after this will be replicated to this replica as well
+             * make sure to do this before sampling the max sequence number in the next step, to ensure that we send
+             * all documents up to maxSeqNo in phase2.
+             */
+            runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId()),
+                shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger);
 
-        /*
-         * add shard to replication group (shard will receive replication requests from this point on) now that engine is open.
-         * This means that any document indexed into the primary after this will be replicated to this replica as well
-         * make sure to do this before sampling the max sequence number in the next step, to ensure that we send
-         * all documents up to maxSeqNo in phase2.
-         */
-        runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId()),
-            shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger);
-
-        final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
-        /*
-         * We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
-         * operations in the required range will be available for replaying from the translog of the source.
-         */
-        cancellableThreads.execute(() -> shard.waitForOpsToComplete(endingSeqNo));
-
-        if (logger.isTraceEnabled()) {
-            logger.trace("all operations up to [{}] completed, which will be used as an ending sequence number", endingSeqNo);
-            logger.trace("snapshot translog for recovery; current size is [{}]",
-                shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo));
-        }
+            final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
+            /*
+             * We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
+             * operations in the required range will be available for replaying from the translog of the source.
+             */
+            cancellableThreads.execute(() -> shard.waitForOpsToComplete(endingSeqNo));
+            if (logger.isTraceEnabled()) {
+                logger.trace("all operations up to [{}] completed, which will be used as an ending sequence number", endingSeqNo);
+                logger.trace("snapshot translog for recovery; current size is [{}]",
+                    shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo));
+            }
+            final Translog.Snapshot phase2Snapshot = shard.getHistoryOperations("peer-recovery", startingSeqNo);
+            resources.add(phase2Snapshot);
+            // we can release the retention lock here because the snapshot itself will retain the required operations.
+            retentionLock.close();
+            // we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
+            // are at least as high as the corresponding values on the primary when any of these operations were executed on it.
+            final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
+            final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
+            phase2(startingSeqNo, requiredSeqNoRangeStart, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp,
+                maxSeqNoOfUpdatesOrDeletes, sendSnapshotStep);
+            sendSnapshotStep.whenComplete(
+                r -> IOUtils.close(phase2Snapshot),
+                e -> {
+                    IOUtils.closeWhileHandlingException(phase2Snapshot);
+                    onFailure.accept(new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", e));
+                });
+
+        }, onFailure);
 
-        final Translog.Snapshot phase2Snapshot = shard.getHistoryOperations("peer-recovery", startingSeqNo);
-        resources.add(phase2Snapshot);
-        // we can release the retention lock here because the snapshot itself will retain the required operations.
-        IOUtils.close(retentionLock);
-        // we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
-        // are at least as high as the corresponding values on the primary when any of these operations were executed on it.
-        final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
-        final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
-        final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
-        phase2(startingSeqNo, requiredSeqNoRangeStart, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp,
-            maxSeqNoOfUpdatesOrDeletes, sendSnapshotStep);
-        sendSnapshotStep.whenComplete(
-            r -> IOUtils.close(phase2Snapshot),
-            e -> onFailure.accept(new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", e)));
         final StepListener<Void> finalizeStep = new StepListener<>();
         sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, finalizeStep), onFailure);
 
@@ -251,7 +251,7 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
             final RecoveryResponse response = new RecoveryResponse(sendFileResult.phase1FileNames, sendFileResult.phase1FileSizes,
                 sendFileResult.phase1ExistingFileNames, sendFileResult.phase1ExistingFileSizes, sendFileResult.totalSize,
                 sendFileResult.existingTotalSize, sendFileResult.took.millis(), phase1ThrottlingWaitTime,
-                prepareEngineTime.millis(), sendSnapshotResult.totalOperations, sendSnapshotResult.tookTime.millis());
+                prepareEngineStep.result().millis(), sendSnapshotResult.totalOperations, sendSnapshotResult.tookTime.millis());
             try {
                 wrappedListener.onResponse(response);
             } finally {
@@ -484,16 +484,21 @@ public SendFileResult phase1(final IndexCommit snapshot, final Supplier<Integer>
         }
     }
 
-    TimeValue prepareTargetForTranslog(final boolean fileBasedRecovery, final int totalTranslogOps) throws IOException {
+    void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, ActionListener<TimeValue> listener) {
         StopWatch stopWatch = new StopWatch().start();
-        logger.trace("recovery [phase1]: prepare remote engine for translog");
+        final ActionListener<Void> wrappedListener = ActionListener.wrap(
+            nullVal -> {
+                stopWatch.stop();
+                final TimeValue tookTime = stopWatch.totalTime();
+                logger.trace("recovery [phase1]: remote engine start took [{}]", tookTime);
+                listener.onResponse(tookTime);
+            },
+            e -> listener.onFailure(new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e)));
         // Send a request preparing the new shard's translog to receive operations. This ensures the shard engine is started and disables
         // garbage collection (not the JVM's GC!) of tombstone deletes.
-        cancellableThreads.executeIO(() -> recoveryTarget.prepareForTranslogOperations(fileBasedRecovery, totalTranslogOps));
-        stopWatch.stop();
-        final TimeValue tookTime = stopWatch.totalTime();
-        logger.trace("recovery [phase1]: remote engine start took [{}]", tookTime);
-        return tookTime;
+        logger.trace("recovery [phase1]: prepare remote engine for translog");
+        cancellableThreads.execute(() ->
+            recoveryTarget.prepareForTranslogOperations(fileBasedRecovery, totalTranslogOps, wrappedListener));
     }
 
     /**
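The diff above replaces the blocking prepare-engine call with listener chaining: prepareTargetForTranslog now completes a StepListener instead of returning a TimeValue, and the rest of the recovery (initiating tracking, snapshotting the translog, phase2) runs inside its whenComplete callback, so no thread blocks while the target opens its engine. Below is a minimal, self-contained sketch of that chaining pattern. SimpleStepListener and RecoveryChainSketch are hypothetical stand-ins used only for illustration; they are not Elasticsearch's StepListener/ActionListener classes.

import java.util.function.Consumer;

// Hypothetical, simplified stand-in for a step listener: it remembers its result
// and hands it to the callbacks registered via whenComplete.
final class SimpleStepListener<T> {
    private Consumer<T> onResponse = r -> {};
    private Consumer<Exception> onFailure = e -> {};
    private boolean done;
    private T result;
    private Exception failure;

    // Register the next step; runs immediately if this step has already completed.
    synchronized void whenComplete(Consumer<T> onResponse, Consumer<Exception> onFailure) {
        this.onResponse = onResponse;
        this.onFailure = onFailure;
        if (done) {
            deliver();
        }
    }

    synchronized void onResponse(T result) {
        this.result = result;
        this.done = true;
        deliver();
    }

    synchronized void onFailure(Exception e) {
        this.failure = e;
        this.done = true;
        deliver();
    }

    private void deliver() {
        if (failure != null) {
            onFailure.accept(failure);
        } else {
            onResponse.accept(result);
        }
    }
}

public class RecoveryChainSketch {
    public static void main(String[] args) {
        SimpleStepListener<Long> prepareEngineStep = new SimpleStepListener<>();
        SimpleStepListener<Integer> sendSnapshotStep = new SimpleStepListener<>();

        // Mirrors the diff: the work that used to follow the blocking call now runs
        // in the callback of the prepare-engine step, and only then starts phase2.
        prepareEngineStep.whenComplete(
            tookMillis -> {
                System.out.println("engine prepared in " + tookMillis + "ms; starting phase2");
                sendSnapshotStep.onResponse(42);   // pretend 42 operations were replayed
            },
            e -> System.err.println("prepare target for translog failed: " + e));

        sendSnapshotStep.whenComplete(
            ops -> System.out.println("phase2 sent " + ops + " operations"),
            e -> System.err.println("phase2 failed: " + e));

        // Simulate the asynchronous completion of the prepare-engine step.
        prepareEngineStep.onResponse(15L);
    }
}

The design point the sketch tries to capture: each recovery step exposes its completion as a listener, so failure handling (the onFailure consumer) and resource cleanup stay attached to the step that produced them rather than relying on a surrounding try/catch on a blocked thread.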