Skip to content

Commit e851fa8

Browse files
dnhatnywelsch
andcommitted
Do not wait for advancement of checkpoint in recovery (#39006)
With this change, we won't wait for the local checkpoint to advance to the max_seq_no before starting phase2 of peer-recovery. We also remove the sequence number range check in peer-recovery. We can safely do these thanks to Yannick's finding. The replication group to be used is currently sampled after indexing into the primary (see `ReplicationOperation` class). This means that when initiating tracking of a new replica, we have to consider the following two cases: - There are operations for which the replication group has not been sampled yet. As we initiated the new replica as tracking, we know that those operations will be replicated to the new replica and follow the typical replication group semantics (e.g. marked as stale when unavailable). - There are operations for which the replication group has already been sampled. These operations will not be sent to the new replica. However, we know that those operations are already indexed into Lucene and the translog on the primary, as the sampling is happening after that. This means that by taking a snapshot of Lucene or the translog, we will be getting those ops as well. What we cannot guarantee anymore is that all ops up to `endingSeqNo` are available in the snapshot (i.e. also see comment in `RecoverySourceHandler` saying `We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all operations in the required range will be available for replaying from the translog of the source.`). This is not needed, though, as we can no longer guarantee that max seq no == local checkpoint. Relates #39000 Closes #38949 Co-authored-by: Yannick Welsch <[email protected]>
1 parent e8aa85a commit e851fa8

File tree

9 files changed

+33
-100
lines changed

9 files changed

+33
-100
lines changed

server/src/main/java/org/elasticsearch/index/engine/Engine.java

-8
Original file line numberDiff line numberDiff line change
@@ -808,14 +808,6 @@ public final CommitStats commitStats() {
808808
*/
809809
public abstract long getLocalCheckpoint();
810810

811-
/**
812-
* Waits for all operations up to the provided sequence number to complete.
813-
*
814-
* @param seqNo the sequence number that the checkpoint must advance to before this method returns
815-
* @throws InterruptedException if the thread was interrupted while blocking on the condition
816-
*/
817-
public abstract void waitForOpsToComplete(long seqNo) throws InterruptedException;
818-
819811
/**
820812
* @return a {@link SeqNoStats} object, using local state and the supplied global checkpoint
821813
*/

server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

-5
Original file line numberDiff line numberDiff line change
@@ -2424,11 +2424,6 @@ public long getLocalCheckpoint() {
24242424
return localCheckpointTracker.getCheckpoint();
24252425
}
24262426

2427-
@Override
2428-
public void waitForOpsToComplete(long seqNo) throws InterruptedException {
2429-
localCheckpointTracker.waitForOpsToComplete(seqNo);
2430-
}
2431-
24322427
/**
24332428
* Marks the given seq_no as seen and advances the max_seq_no of this engine to at least that value.
24342429
*/

server/src/main/java/org/elasticsearch/index/engine/ReadOnlyEngine.java

-4
Original file line numberDiff line numberDiff line change
@@ -321,10 +321,6 @@ public long getLocalCheckpoint() {
321321
return seqNoStats.getLocalCheckpoint();
322322
}
323323

324-
@Override
325-
public void waitForOpsToComplete(long seqNo) {
326-
}
327-
328324
@Override
329325
public SeqNoStats getSeqNoStats(long globalCheckpoint) {
330326
return new SeqNoStats(seqNoStats.getMaxSeqNo(), seqNoStats.getLocalCheckpoint(), globalCheckpoint);

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

-10
Original file line numberDiff line numberDiff line change
@@ -2042,16 +2042,6 @@ public void syncRetentionLeases() {
20422042
}
20432043
}
20442044

2045-
/**
2046-
* Waits for all operations up to the provided sequence number to complete.
2047-
*
2048-
* @param seqNo the sequence number that the checkpoint must advance to before this method returns
2049-
* @throws InterruptedException if the thread was interrupted while blocking on the condition
2050-
*/
2051-
public void waitForOpsToComplete(final long seqNo) throws InterruptedException {
2052-
getEngine().waitForOpsToComplete(seqNo);
2053-
}
2054-
20552045
/**
20562046
* Called when the recovery process for a shard has opened the engine on the target shard. Ensures that the right data structures
20572047
* have been set up locally to track local checkpoint information for the shard and that the shard is added to the replication group.

server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java

+3-37
Original file line numberDiff line numberDiff line change
@@ -158,14 +158,12 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
158158
final Closeable retentionLock = shard.acquireRetentionLock();
159159
resources.add(retentionLock);
160160
final long startingSeqNo;
161-
final long requiredSeqNoRangeStart;
162161
final boolean isSequenceNumberBasedRecovery = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO &&
163162
isTargetSameHistory() && shard.hasCompleteHistoryOperations("peer-recovery", request.startingSeqNo());
164163
final SendFileResult sendFileResult;
165164
if (isSequenceNumberBasedRecovery) {
166165
logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
167166
startingSeqNo = request.startingSeqNo();
168-
requiredSeqNoRangeStart = startingSeqNo;
169167
sendFileResult = SendFileResult.EMPTY;
170168
} else {
171169
final Engine.IndexCommitRef phase1Snapshot;
@@ -174,9 +172,6 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
174172
} catch (final Exception e) {
175173
throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
176174
}
177-
// We must have everything above the local checkpoint in the commit
178-
requiredSeqNoRangeStart =
179-
Long.parseLong(phase1Snapshot.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1;
180175
// We need to set this to 0 to create a translog roughly according to the retention policy on the target. Note that it will
181176
// still filter out legacy operations without seqNo.
182177
startingSeqNo = 0;
@@ -194,8 +189,6 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
194189
}
195190
}
196191
assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
197-
assert requiredSeqNoRangeStart >= startingSeqNo : "requiredSeqNoRangeStart [" + requiredSeqNoRangeStart + "] is lower than ["
198-
+ startingSeqNo + "]";
199192

200193
final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
201194
// For a sequence based recovery, the target can keep its local translog
@@ -213,13 +206,7 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
213206
shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger);
214207

215208
final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
216-
/*
217-
* We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
218-
* operations in the required range will be available for replaying from the translog of the source.
219-
*/
220-
cancellableThreads.execute(() -> shard.waitForOpsToComplete(endingSeqNo));
221209
if (logger.isTraceEnabled()) {
222-
logger.trace("all operations up to [{}] completed, which will be used as an ending sequence number", endingSeqNo);
223210
logger.trace("snapshot translog for recovery; current size is [{}]",
224211
shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo));
225212
}
@@ -232,15 +219,8 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
232219
final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
233220
final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
234221
final RetentionLeases retentionLeases = shard.getRetentionLeases();
235-
phase2(
236-
startingSeqNo,
237-
requiredSeqNoRangeStart,
238-
endingSeqNo,
239-
phase2Snapshot,
240-
maxSeenAutoIdTimestamp,
241-
maxSeqNoOfUpdatesOrDeletes,
242-
retentionLeases,
243-
sendSnapshotStep);
222+
phase2(startingSeqNo, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes,
223+
retentionLeases, sendSnapshotStep);
244224
sendSnapshotStep.whenComplete(
245225
r -> IOUtils.close(phase2Snapshot),
246226
e -> {
@@ -518,7 +498,6 @@ void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, A
518498
*
519499
* @param startingSeqNo the sequence number to start recovery from, or {@link SequenceNumbers#UNASSIGNED_SEQ_NO} if all
520500
* ops should be sent
521-
* @param requiredSeqNoRangeStart the lower sequence number of the required range (ending with endingSeqNo)
522501
* @param endingSeqNo the highest sequence number that should be sent
523502
* @param snapshot a snapshot of the translog
524503
* @param maxSeenAutoIdTimestamp the max auto_id_timestamp of append-only requests on the primary
@@ -527,26 +506,19 @@ void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, A
527506
*/
528507
void phase2(
529508
final long startingSeqNo,
530-
final long requiredSeqNoRangeStart,
531509
final long endingSeqNo,
532510
final Translog.Snapshot snapshot,
533511
final long maxSeenAutoIdTimestamp,
534512
final long maxSeqNoOfUpdatesOrDeletes,
535513
final RetentionLeases retentionLeases,
536514
final ActionListener<SendSnapshotResult> listener) throws IOException {
537-
assert requiredSeqNoRangeStart <= endingSeqNo + 1:
538-
"requiredSeqNoRangeStart " + requiredSeqNoRangeStart + " is larger than endingSeqNo " + endingSeqNo;
539-
assert startingSeqNo <= requiredSeqNoRangeStart :
540-
"startingSeqNo " + startingSeqNo + " is larger than requiredSeqNoRangeStart " + requiredSeqNoRangeStart;
541515
if (shard.state() == IndexShardState.CLOSED) {
542516
throw new IndexShardClosedException(request.shardId());
543517
}
544-
logger.trace("recovery [phase2]: sending transaction log operations (seq# from [" + startingSeqNo + "], " +
545-
"required [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "]");
518+
logger.trace("recovery [phase2]: sending transaction log operations (from [" + startingSeqNo + "] to [" + endingSeqNo + "]");
546519

547520
final AtomicInteger skippedOps = new AtomicInteger();
548521
final AtomicInteger totalSentOps = new AtomicInteger();
549-
final LocalCheckpointTracker requiredOpsTracker = new LocalCheckpointTracker(endingSeqNo, requiredSeqNoRangeStart - 1);
550522
final AtomicInteger lastBatchCount = new AtomicInteger(); // used to estimate the count of the subsequent batch.
551523
final CheckedSupplier<List<Translog.Operation>, IOException> readNextBatch = () -> {
552524
// We need to synchronized Snapshot#next() because it's called by different threads through sendBatch.
@@ -568,7 +540,6 @@ void phase2(
568540
ops.add(operation);
569541
batchSizeInBytes += operation.estimateSize();
570542
totalSentOps.incrementAndGet();
571-
requiredOpsTracker.markSeqNoAsCompleted(seqNo);
572543

573544
// check if this request is past bytes threshold, and if so, send it off
574545
if (batchSizeInBytes >= chunkSizeInBytes) {
@@ -586,11 +557,6 @@ void phase2(
586557
assert snapshot.totalOperations() == snapshot.skippedOperations() + skippedOps.get() + totalSentOps.get()
587558
: String.format(Locale.ROOT, "expected total [%d], overridden [%d], skipped [%d], total sent [%d]",
588559
snapshot.totalOperations(), snapshot.skippedOperations(), skippedOps.get(), totalSentOps.get());
589-
if (requiredOpsTracker.getCheckpoint() < endingSeqNo) {
590-
throw new IllegalStateException("translog replay failed to cover required sequence numbers" +
591-
" (required range [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "). first missing op is ["
592-
+ (requiredOpsTracker.getCheckpoint() + 1) + "]");
593-
}
594560
stopWatch.stop();
595561
final TimeValue tookTime = stopWatch.totalTime();
596562
logger.trace("recovery [phase2]: took [{}]", tookTime);

server/src/test/java/org/elasticsearch/indices/recovery/RecoverySourceHandlerTests.java

+5-19
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@
9797
import java.util.concurrent.atomic.AtomicReference;
9898
import java.util.function.IntSupplier;
9999
import java.util.function.Supplier;
100-
import java.util.stream.Collectors;
101100
import java.util.zip.CRC32;
102101

103102
import static java.util.Collections.emptyMap;
@@ -231,8 +230,7 @@ public void testSendSnapshotSendsOps() throws IOException {
231230
operations.add(new Translog.Index(index, new Engine.IndexResult(1, 1, i - initialNumberOfDocs, true)));
232231
}
233232
final long startingSeqNo = randomIntBetween(0, numberOfDocsWithValidSequenceNumbers - 1);
234-
final long requiredStartingSeqNo = randomIntBetween((int) startingSeqNo, numberOfDocsWithValidSequenceNumbers - 1);
235-
final long endingSeqNo = randomIntBetween((int) requiredStartingSeqNo - 1, numberOfDocsWithValidSequenceNumbers - 1);
233+
final long endingSeqNo = randomLongBetween(startingSeqNo, numberOfDocsWithValidSequenceNumbers - 1);
236234

237235
final List<Translog.Operation> shippedOps = new ArrayList<>();
238236
final AtomicLong checkpointOnTarget = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
@@ -247,7 +245,7 @@ public void indexTranslogOperations(List<Translog.Operation> operations, int tot
247245
};
248246
RecoverySourceHandler handler = new RecoverySourceHandler(shard, recoveryTarget, request, fileChunkSizeInBytes, between(1, 10));
249247
PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> future = new PlainActionFuture<>();
250-
handler.phase2(startingSeqNo, requiredStartingSeqNo, endingSeqNo, newTranslogSnapshot(operations, Collections.emptyList()),
248+
handler.phase2(startingSeqNo, endingSeqNo, newTranslogSnapshot(operations, Collections.emptyList()),
251249
randomNonNegativeLong(), randomNonNegativeLong(), RetentionLeases.EMPTY, future);
252250
final int expectedOps = (int) (endingSeqNo - startingSeqNo + 1);
253251
RecoverySourceHandler.SendSnapshotResult result = future.actionGet();
@@ -258,18 +256,6 @@ public void indexTranslogOperations(List<Translog.Operation> operations, int tot
258256
assertThat(shippedOps.get(i), equalTo(operations.get(i + (int) startingSeqNo + initialNumberOfDocs)));
259257
}
260258
assertThat(result.targetLocalCheckpoint, equalTo(checkpointOnTarget.get()));
261-
if (endingSeqNo >= requiredStartingSeqNo + 1) {
262-
// check that missing ops blows up
263-
List<Translog.Operation> requiredOps = operations.subList(0, operations.size() - 1).stream() // remove last null marker
264-
.filter(o -> o.seqNo() >= requiredStartingSeqNo && o.seqNo() <= endingSeqNo).collect(Collectors.toList());
265-
List<Translog.Operation> opsToSkip = randomSubsetOf(randomIntBetween(1, requiredOps.size()), requiredOps);
266-
PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> failedFuture = new PlainActionFuture<>();
267-
expectThrows(IllegalStateException.class, () -> {
268-
handler.phase2(startingSeqNo, requiredStartingSeqNo, endingSeqNo, newTranslogSnapshot(operations, opsToSkip),
269-
randomNonNegativeLong(), randomNonNegativeLong(), RetentionLeases.EMPTY, failedFuture);
270-
failedFuture.actionGet();
271-
});
272-
}
273259
}
274260

275261
public void testSendSnapshotStopOnError() throws Exception {
@@ -299,7 +285,7 @@ public void indexTranslogOperations(List<Translog.Operation> operations, int tot
299285
PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> future = new PlainActionFuture<>();
300286
final long startingSeqNo = randomLongBetween(0, ops.size() - 1L);
301287
final long endingSeqNo = randomLongBetween(startingSeqNo, ops.size() - 1L);
302-
handler.phase2(startingSeqNo, startingSeqNo, endingSeqNo, newTranslogSnapshot(ops, Collections.emptyList()),
288+
handler.phase2(startingSeqNo, endingSeqNo, newTranslogSnapshot(ops, Collections.emptyList()),
303289
randomNonNegativeLong(), randomNonNegativeLong(), RetentionLeases.EMPTY, future);
304290
if (wasFailed.get()) {
305291
assertThat(expectThrows(RuntimeException.class, () -> future.actionGet()).getMessage(), equalTo("test - failed to index"));
@@ -498,11 +484,11 @@ void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, A
498484
}
499485

500486
@Override
501-
void phase2(long startingSeqNo, long requiredSeqNoRangeStart, long endingSeqNo, Translog.Snapshot snapshot,
487+
void phase2(long startingSeqNo, long endingSeqNo, Translog.Snapshot snapshot,
502488
long maxSeenAutoIdTimestamp, long maxSeqNoOfUpdatesOrDeletes, RetentionLeases retentionLeases,
503489
ActionListener<SendSnapshotResult> listener) throws IOException {
504490
phase2Called.set(true);
505-
super.phase2(startingSeqNo, requiredSeqNoRangeStart, endingSeqNo, snapshot,
491+
super.phase2(startingSeqNo, endingSeqNo, snapshot,
506492
maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, retentionLeases, listener);
507493
}
508494

test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java

+10
Original file line numberDiff line numberDiff line change
@@ -1109,6 +1109,16 @@ public static Translog getTranslog(Engine engine) {
11091109
return internalEngine.getTranslog();
11101110
}
11111111

1112+
/**
1113+
* Waits for all operations up to the provided sequence number to complete in the given internal engine.
1114+
*
1115+
* @param seqNo the sequence number that the checkpoint must advance to before this method returns
1116+
* @throws InterruptedException if the thread was interrupted while blocking on the condition
1117+
*/
1118+
public static void waitForOpsToComplete(InternalEngine engine, long seqNo) throws InterruptedException {
1119+
engine.getLocalCheckpointTracker().waitForOpsToComplete(seqNo);
1120+
}
1121+
11121122
public static boolean hasSnapshottedCommits(Engine engine) {
11131123
assert engine instanceof InternalEngine : "only InternalEngines have snapshotted commits, got: " + engine.getClass();
11141124
InternalEngine internalEngine = (InternalEngine) engine;

x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/IndexFollowingIT.java

-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@
114114

115115
public class IndexFollowingIT extends CcrIntegTestCase {
116116

117-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/38949")
118117
public void testFollowIndex() throws Exception {
119118
final int numberOfPrimaryShards = randomIntBetween(1, 3);
120119
int numberOfReplicas = between(0, 1);
@@ -221,7 +220,6 @@ public void testFollowIndex() throws Exception {
221220
}
222221
}
223222

224-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/38949")
225223
public void testFollowIndexWithConcurrentMappingChanges() throws Exception {
226224
final int numberOfPrimaryShards = randomIntBetween(1, 3);
227225
final String leaderIndexSettings = getIndexSettings(numberOfPrimaryShards, between(0, 1),

0 commit comments

Comments
 (0)