Skip to content

Commit 7f333ea

Browse files
Fix testMasterFailoverDuringCloneStep1 (#63580) (#64126)
Assuming the clone failed when the request failed is not sufficient. There are failure modes where the request fails but the clone still works out because the data node resent the requeest after the first clone had already been failed and removed from the cluster state when master was restarted. Closes #63473
1 parent 4369211 commit 7f333ea

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/CloneSnapshotIT.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,9 @@ public void testMasterFailoverDuringCloneStep1() throws Exception {
358358
createFullSnapshot(repoName, sourceSnapshot);
359359

360360
blockMasterOnReadIndexMeta(repoName);
361+
final String cloneName = "target-snapshot";
361362
final ActionFuture<AcknowledgedResponse> cloneFuture =
362-
startCloneFromDataNode(repoName, sourceSnapshot, "target-snapshot", testIndex);
363+
startCloneFromDataNode(repoName, sourceSnapshot, cloneName, testIndex);
363364
awaitNumberOfSnapshotsInProgress(1);
364365
final String masterNode = internalCluster().getMasterName();
365366
waitForBlock(masterNode, repoName);
@@ -375,6 +376,9 @@ public void testMasterFailoverDuringCloneStep1() throws Exception {
375376

376377
awaitNoMoreRunningOperations();
377378

379+
// Check if the clone operation worked out by chance as a result of the clone request being retried because of the master failover
380+
cloneSucceeded = cloneSucceeded ||
381+
getRepositoryData(repoName).getSnapshotIds().stream().anyMatch(snapshotId -> snapshotId.getName().equals(cloneName));
378382
assertAllSnapshotsSuccessful(getRepositoryData(repoName), cloneSucceeded ? 2 : 1);
379383
}
380384

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,8 @@ public void createSnapshot(final CreateSnapshotRequest request, final ActionList
359359
final String repositoryName = request.repository();
360360
final String snapshotName = indexNameExpressionResolver.resolveDateMathExpression(request.snapshot());
361361
validate(repositoryName, snapshotName);
362+
// TODO: create snapshot UUID in CreateSnapshotRequest and make this operation idempotent to cleanly deal with transport layer
363+
// retries
362364
final SnapshotId snapshotId = new SnapshotId(snapshotName, UUIDs.randomBase64UUID()); // new UUID for the snapshot
363365
Repository repository = repositoriesService.repository(request.repository());
364366
if (repository.isReadOnly()) {
@@ -486,6 +488,8 @@ public void cloneSnapshot(CloneSnapshotRequest request, ActionListener<Void> lis
486488
}
487489
final String snapshotName = indexNameExpressionResolver.resolveDateMathExpression(request.target());
488490
validate(repositoryName, snapshotName);
491+
// TODO: create snapshot UUID in CloneSnapshotRequest and make this operation idempotent to cleanly deal with transport layer
492+
// retries
489493
final SnapshotId snapshotId = new SnapshotId(snapshotName, UUIDs.randomBase64UUID());
490494
final Snapshot snapshot = new Snapshot(repositoryName, snapshotId);
491495
initializingClones.add(snapshot);

0 commit comments

Comments
 (0)