Skip to content

Commit 93c6d77

Browse files
Fix Incorrect Concurrent SnapshotException on Master Failover (#54877)
If we run into an INIT-state snapshot that the current master didn't create, it will be removed anyway, so there is no need to let it block another snapshot from starting. This has practical relevance because, on master fail-over after snapshot INIT but before start, the create-snapshot request will be retried by the client (as it's a transport master node action) and would needlessly fail with an unexpected exception (the snapshot clearly didn't exist, which is confusing to the user). This also allowed making two disruption-type tests stricter.
1 parent a5ec204 commit 93c6d77

File tree

3 files changed

+15
-18
lines changed

3 files changed

+15
-18
lines changed

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,11 @@ public ClusterState execute(ClusterState currentState) {
185185
"cannot snapshot while a repository cleanup is in-progress in [" + repositoryCleanupInProgress + "]");
186186
}
187187
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
188-
if (snapshots != null && snapshots.entries().isEmpty() == false) {
188+
// Fail if there are any concurrently running snapshots. The only exception to this being a snapshot in INIT state from a
189+
// previous master that we can simply ignore and remove from the cluster state because we would clean it up from the
190+
// cluster state anyway in #applyClusterState.
191+
if (snapshots != null && snapshots.entries().stream().anyMatch(entry ->
192+
(entry.state() == State.INIT && initializingSnapshots.contains(entry.snapshot()) == false) == false)) {
189193
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, " a snapshot is already running");
190194
}
191195
// Store newSnapshot here to be processed in clusterStateProcessed

server/src/test/java/org/elasticsearch/discovery/SnapshotDisruptionIT.java

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import org.elasticsearch.common.settings.Settings;
3535
import org.elasticsearch.common.unit.ByteSizeUnit;
3636
import org.elasticsearch.plugins.Plugin;
37-
import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
3837
import org.elasticsearch.snapshots.SnapshotException;
3938
import org.elasticsearch.snapshots.SnapshotInfo;
4039
import org.elasticsearch.snapshots.SnapshotMissingException;
@@ -149,17 +148,7 @@ public void clusterChanged(ClusterChangedEvent event) {
149148
ensureStableCluster(4, masterNode1);
150149
logger.info("--> done");
151150

152-
try {
153-
future.get();
154-
} catch (Exception ex) {
155-
Throwable cause = ex.getCause();
156-
if (cause.getCause() instanceof ConcurrentSnapshotExecutionException) {
157-
logger.info("--> got exception from race in master operation retries");
158-
} else {
159-
logger.info("--> got exception from hanged master", ex);
160-
}
161-
}
162-
151+
future.get();
163152
assertAllSnapshotsCompleted();
164153
}
165154

server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,8 @@ public void testSnapshotPrimaryRelocations() {
716716
continueOrDie(createRepoAndIndex(repoName, index, shards),
717717
createIndexResponse -> client().admin().cluster().state(new ClusterStateRequest(), clusterStateResponseStepListener));
718718

719+
final StepListener<CreateSnapshotResponse> snapshotStartedListener = new StepListener<>();
720+
719721
continueOrDie(clusterStateResponseStepListener, clusterStateResponse -> {
720722
final ShardRouting shardToRelocate = clusterStateResponse.getState().routingTable().allShards(index).get(0);
721723
final TestClusterNodes.TestClusterNode currentPrimaryNode = testClusterNodes.nodeById(shardToRelocate.currentNodeId());
@@ -734,11 +736,7 @@ public void run() {
734736
scheduleNow(() -> testClusterNodes.stopNode(masterNode));
735737
}
736738
testClusterNodes.randomDataNodeSafe().client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
737-
.execute(ActionListener.wrap(() -> {
738-
createdSnapshot.set(true);
739-
testClusterNodes.randomDataNodeSafe().client.admin().cluster().deleteSnapshot(
740-
new DeleteSnapshotRequest(repoName, snapshotName), noopListener());
741-
}));
739+
.execute(snapshotStartedListener);
742740
scheduleNow(
743741
() -> testClusterNodes.randomMasterNodeSafe().client.admin().cluster().reroute(
744742
new ClusterRerouteRequest().add(new AllocateEmptyPrimaryAllocationCommand(
@@ -751,6 +749,12 @@ public void run() {
751749
});
752750
});
753751

752+
continueOrDie(snapshotStartedListener, snapshotResponse -> {
753+
createdSnapshot.set(true);
754+
testClusterNodes.randomDataNodeSafe().client.admin().cluster().deleteSnapshot(
755+
new DeleteSnapshotRequest(repoName, snapshotName), noopListener());
756+
});
757+
754758
runUntil(() -> testClusterNodes.randomMasterNode().map(master -> {
755759
if (createdSnapshot.get() == false) {
756760
return false;

0 commit comments

Comments
 (0)