Skip to content

Commit 258f4b3

Browse files
Fix Incorrect Concurrent SnapshotException on Master Failover (#54877) (#55448)
If we run into an INIT-state snapshot that the current master did not create, it will be removed anyway, so there is no need to let it block another snapshot from starting. This has practical relevance because on master fail-over after snapshot INIT but before start, the create-snapshot request is retried by the client (as it is a transport master node action) and would needlessly fail with an unexpected exception (the snapshot clearly did not exist, so this is confusing to the user). This change also allowed making two disruption-type tests stricter.
1 parent 5550d8f commit 258f4b3

File tree

3 files changed

+15
-18
lines changed

3 files changed

+15
-18
lines changed

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,11 @@ public ClusterState execute(ClusterState currentState) {
191191
"cannot snapshot while a repository cleanup is in-progress in [" + repositoryCleanupInProgress + "]");
192192
}
193193
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
194-
if (snapshots != null && snapshots.entries().isEmpty() == false) {
194+
// Fail if there are any concurrently running snapshots. The only exception to this being a snapshot in INIT state from a
195+
// previous master that we can simply ignore and remove from the cluster state because we would clean it up from the
196+
// cluster state anyway in #applyClusterState.
197+
if (snapshots != null && snapshots.entries().stream().anyMatch(entry ->
198+
(entry.state() == State.INIT && initializingSnapshots.contains(entry.snapshot()) == false) == false)) {
195199
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, " a snapshot is already running");
196200
}
197201
// Store newSnapshot here to be processed in clusterStateProcessed

server/src/test/java/org/elasticsearch/discovery/SnapshotDisruptionIT.java

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import org.elasticsearch.common.settings.Settings;
3535
import org.elasticsearch.common.unit.ByteSizeUnit;
3636
import org.elasticsearch.plugins.Plugin;
37-
import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
3837
import org.elasticsearch.snapshots.SnapshotException;
3938
import org.elasticsearch.snapshots.SnapshotInfo;
4039
import org.elasticsearch.snapshots.SnapshotMissingException;
@@ -149,17 +148,7 @@ public void clusterChanged(ClusterChangedEvent event) {
149148
ensureStableCluster(4, masterNode1);
150149
logger.info("--> done");
151150

152-
try {
153-
future.get();
154-
} catch (Exception ex) {
155-
Throwable cause = ex.getCause();
156-
if (cause.getCause() instanceof ConcurrentSnapshotExecutionException) {
157-
logger.info("--> got exception from race in master operation retries");
158-
} else {
159-
logger.info("--> got exception from hanged master", ex);
160-
}
161-
}
162-
151+
future.get();
163152
assertAllSnapshotsCompleted();
164153
}
165154

server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,8 @@ public void testSnapshotPrimaryRelocations() {
721721
continueOrDie(createRepoAndIndex(repoName, index, shards),
722722
createIndexResponse -> client().admin().cluster().state(new ClusterStateRequest(), clusterStateResponseStepListener));
723723

724+
final StepListener<CreateSnapshotResponse> snapshotStartedListener = new StepListener<>();
725+
724726
continueOrDie(clusterStateResponseStepListener, clusterStateResponse -> {
725727
final ShardRouting shardToRelocate = clusterStateResponse.getState().routingTable().allShards(index).get(0);
726728
final TestClusterNodes.TestClusterNode currentPrimaryNode = testClusterNodes.nodeById(shardToRelocate.currentNodeId());
@@ -739,11 +741,7 @@ public void run() {
739741
scheduleNow(() -> testClusterNodes.stopNode(masterNode));
740742
}
741743
testClusterNodes.randomDataNodeSafe().client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
742-
.execute(ActionListener.wrap(() -> {
743-
createdSnapshot.set(true);
744-
testClusterNodes.randomDataNodeSafe().client.admin().cluster().deleteSnapshot(
745-
new DeleteSnapshotRequest(repoName, snapshotName), noopListener());
746-
}));
744+
.execute(snapshotStartedListener);
747745
scheduleNow(
748746
() -> testClusterNodes.randomMasterNodeSafe().client.admin().cluster().reroute(
749747
new ClusterRerouteRequest().add(new AllocateEmptyPrimaryAllocationCommand(
@@ -756,6 +754,12 @@ public void run() {
756754
});
757755
});
758756

757+
continueOrDie(snapshotStartedListener, snapshotResponse -> {
758+
createdSnapshot.set(true);
759+
testClusterNodes.randomDataNodeSafe().client.admin().cluster().deleteSnapshot(
760+
new DeleteSnapshotRequest(repoName, snapshotName), noopListener());
761+
});
762+
759763
runUntil(() -> testClusterNodes.randomMasterNode().map(master -> {
760764
if (createdSnapshot.get() == false) {
761765
return false;

0 commit comments

Comments
 (0)