Skip to content

Commit 2f6afd2

Browse files
Fix Concurrent Snapshot Ending And Stabilize Snapshot Finalization (#38368)
* The problem in #38226 is that, in some corner cases, multiple calls to `endSnapshot` were made concurrently, leading to non-deterministic behavior (`beginSnapshot` was triggering a repository finalization while one triggered by `deleteSnapshot` was already in progress).
* Fixed by:
  * Making all `endSnapshot` calls originate from the cluster state being in a "completed" state (apart from the short-circuit when initializing an empty snapshot). This required putting the failure string into `SnapshotsInProgress.Entry`.
  * Adding deduplication logic to `endSnapshot`.
* Also:
  * Streamlined the init behavior to work the same way (keeping state on the `SnapshotsService` to decide which snapshot entries are stale).
* Closes #38226
1 parent d862453 commit 2f6afd2

File tree

6 files changed

+234
-258
lines changed

6 files changed

+234
-258
lines changed

server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java

+32-4
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
2525
import org.elasticsearch.Version;
2626
import org.elasticsearch.cluster.ClusterState.Custom;
27+
import org.elasticsearch.common.Nullable;
2728
import org.elasticsearch.common.collect.ImmutableOpenMap;
2829
import org.elasticsearch.common.io.stream.StreamInput;
2930
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -87,9 +88,11 @@ public static class Entry {
8788
private final ImmutableOpenMap<String, List<ShardId>> waitingIndices;
8889
private final long startTime;
8990
private final long repositoryStateId;
91+
@Nullable private final String failure;
9092

9193
public Entry(Snapshot snapshot, boolean includeGlobalState, boolean partial, State state, List<IndexId> indices,
92-
long startTime, long repositoryStateId, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
94+
long startTime, long repositoryStateId, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards,
95+
String failure) {
9396
this.state = state;
9497
this.snapshot = snapshot;
9598
this.includeGlobalState = includeGlobalState;
@@ -104,15 +107,26 @@ public Entry(Snapshot snapshot, boolean includeGlobalState, boolean partial, Sta
104107
this.waitingIndices = findWaitingIndices(shards);
105108
}
106109
this.repositoryStateId = repositoryStateId;
110+
this.failure = failure;
111+
}
112+
113+
public Entry(Snapshot snapshot, boolean includeGlobalState, boolean partial, State state, List<IndexId> indices,
114+
long startTime, long repositoryStateId, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
115+
this(snapshot, includeGlobalState, partial, state, indices, startTime, repositoryStateId, shards, null);
107116
}
108117

109118
public Entry(Entry entry, State state, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
110119
this(entry.snapshot, entry.includeGlobalState, entry.partial, state, entry.indices, entry.startTime,
111-
entry.repositoryStateId, shards);
120+
entry.repositoryStateId, shards, entry.failure);
121+
}
122+
123+
public Entry(Entry entry, State state, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards, String failure) {
124+
this(entry.snapshot, entry.includeGlobalState, entry.partial, state, entry.indices, entry.startTime,
125+
entry.repositoryStateId, shards, failure);
112126
}
113127

114128
public Entry(Entry entry, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
115-
this(entry, entry.state, shards);
129+
this(entry, entry.state, shards, entry.failure);
116130
}
117131

118132
public Snapshot snapshot() {
@@ -151,6 +165,10 @@ public long getRepositoryStateId() {
151165
return repositoryStateId;
152166
}
153167

168+
public String failure() {
169+
return failure;
170+
}
171+
154172
@Override
155173
public boolean equals(Object o) {
156174
if (this == o) return true;
@@ -427,14 +445,21 @@ public SnapshotsInProgress(StreamInput in) throws IOException {
427445
}
428446
}
429447
long repositoryStateId = in.readLong();
448+
final String failure;
449+
if (in.getVersion().onOrAfter(Version.V_7_0_0)) {
450+
failure = in.readOptionalString();
451+
} else {
452+
failure = null;
453+
}
430454
entries[i] = new Entry(snapshot,
431455
includeGlobalState,
432456
partial,
433457
state,
434458
Collections.unmodifiableList(indexBuilder),
435459
startTime,
436460
repositoryStateId,
437-
builder.build());
461+
builder.build(),
462+
failure);
438463
}
439464
this.entries = Arrays.asList(entries);
440465
}
@@ -463,6 +488,9 @@ public void writeTo(StreamOutput out) throws IOException {
463488
}
464489
}
465490
out.writeLong(entry.repositoryStateId);
491+
if (out.getVersion().onOrAfter(Version.V_7_0_0)) {
492+
out.writeOptionalString(entry.failure);
493+
}
466494
}
467495
}
468496

server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

-2
Original file line numberDiff line numberDiff line change
@@ -591,8 +591,6 @@ private class SnapshotStateExecutor implements ClusterStateTaskExecutor<UpdateIn
591591
// TODO: Add PARTIAL_SUCCESS status?
592592
SnapshotsInProgress.Entry updatedEntry = new SnapshotsInProgress.Entry(entry, State.SUCCESS, shards.build());
593593
entries.add(updatedEntry);
594-
// Finalize snapshot in the repository
595-
snapshotsService.endSnapshot(updatedEntry);
596594
}
597595
} else {
598596
entries.add(entry);

0 commit comments

Comments
 (0)