
Commit e70c7c9

Fix Index Deletion During Partial Snapshot Create (#50234)
We can simply filter out shard generation updates for indices that were concurrently removed from the cluster state. This fixes index deletion during partial snapshots, since it completely removes any reference to those shards from the snapshot.

Follow-up to #50202
Closes #50200
1 parent 08214eb commit e70c7c9
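The fix described above boils down to: when finalizing a snapshot, only record shard generations for indices that still exist in the cluster metadata. A minimal, self-contained sketch of that filtering idea, using plain collections rather than the actual ShardGenerations/MetaData classes (all names below are illustrative, not Elasticsearch API):

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

public class FilterDeletedIndicesSketch {

    // Drop shard-generation entries whose index is no longer part of the cluster state,
    // so a partial snapshot keeps no reference to shards of a concurrently deleted index.
    static Map<String, List<String>> filterGenerations(Map<String, List<String>> generationsByIndex,
                                                       Set<String> indicesStillInClusterState) {
        return generationsByIndex.entrySet().stream()
            .filter(entry -> indicesStillInClusterState.contains(entry.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }

    public static void main(String[] args) {
        Map<String, List<String>> generationsByIndex = Map.of(
            "index-kept", List.of("gen-0", "gen-1"),
            "index-deleted", List.of("gen-0"));
        // "index-deleted" was removed from the cluster state while the snapshot was running
        System.out.println(filterGenerations(generationsByIndex, Set.of("index-kept")));
        // prints {index-kept=[gen-0, gen-1]}
    }
}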

File tree: 3 files changed, +66 −11 lines

server/src/main/java/org/elasticsearch/repositories/ShardGenerations.java (+7)

@@ -54,6 +54,13 @@ private ShardGenerations(Map<IndexId, List<String>> shardGenerations) {
         this.shardGenerations = shardGenerations;
     }
 
+    /**
+     * Returns the total number of shards tracked by this instance.
+     */
+    public int totalShards() {
+        return shardGenerations.values().stream().mapToInt(List::size).sum();
+    }
+
     /**
      * Returns all indices for which shard generations are tracked.
      *
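The new totalShards() helper simply sums the sizes of the per-index shard lists, so for a partial snapshot it counts only the shards that remain after the filtering described above. A rough standalone illustration of that computation, again with plain collections standing in for ShardGenerations (names are illustrative):

import java.util.List;
import java.util.Map;

public class TotalShardsSketch {
    public static void main(String[] args) {
        // index -> one generation entry per shard (simplified stand-in for ShardGenerations)
        Map<String, List<String>> shardGenerations = Map.of(
            "index-0", List.of("gen-a", "gen-b"),   // 2 shards
            "index-1", List.of("gen-c"));           // 1 shard

        int totalShards = shardGenerations.values().stream().mapToInt(List::size).sum();
        System.out.println(totalShards); // 3 -- used as the snapshot's shard total when the snapshot is partial
    }
}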

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java (+18 −6)

@@ -568,16 +568,17 @@ public void onNoLongerMaster() {
     private void cleanupAfterError(Exception exception) {
         threadPool.generic().execute(() -> {
             if (snapshotCreated) {
+                final MetaData metaData = clusterService.state().metaData();
                 repositoriesService.repository(snapshot.snapshot().getRepository())
                     .finalizeSnapshot(snapshot.snapshot().getSnapshotId(),
-                        buildGenerations(snapshot),
+                        buildGenerations(snapshot, metaData),
                         snapshot.startTime(),
                         ExceptionsHelper.stackTrace(exception),
                         0,
                         Collections.emptyList(),
                         snapshot.repositoryStateId(),
                         snapshot.includeGlobalState(),
-                        metaDataForSnapshot(snapshot, clusterService.state().metaData()),
+                        metaDataForSnapshot(snapshot, metaData),
                         snapshot.userMetadata(),
                         snapshot.useShardGenerations(),
                         ActionListener.runAfter(ActionListener.wrap(ignored -> {

@@ -593,11 +594,21 @@ private void cleanupAfterError(Exception exception) {
         }
     }
 
-    private static ShardGenerations buildGenerations(SnapshotsInProgress.Entry snapshot) {
+    private static ShardGenerations buildGenerations(SnapshotsInProgress.Entry snapshot, MetaData metaData) {
         ShardGenerations.Builder builder = ShardGenerations.builder();
         final Map<String, IndexId> indexLookup = new HashMap<>();
         snapshot.indices().forEach(idx -> indexLookup.put(idx.getName(), idx));
-        snapshot.shards().forEach(c -> builder.put(indexLookup.get(c.key.getIndexName()), c.key.id(), c.value.generation()));
+        snapshot.shards().forEach(c -> {
+            if (metaData.index(c.key.getIndex()) == null) {
+                assert snapshot.partial() :
+                    "Index [" + c.key.getIndex() + "] was deleted during a snapshot but snapshot was not partial.";
+                return;
+            }
+            final IndexId indexId = indexLookup.get(c.key.getIndexName());
+            if (indexId != null) {
+                builder.put(indexId, c.key.id(), c.value.generation());
+            }
+        });
         return builder.build();
     }

@@ -1032,12 +1043,13 @@ protected void doRun() {
                         shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, status.reason()));
                     }
                 }
+                final ShardGenerations shardGenerations = buildGenerations(entry, metaData);
                 repository.finalizeSnapshot(
                     snapshot.getSnapshotId(),
-                    buildGenerations(entry),
+                    shardGenerations,
                     entry.startTime(),
                     failure,
-                    entry.shards().size(),
+                    entry.partial() ? shardGenerations.totalShards() : entry.shards().size(),
                     unmodifiableList(shardFailures),
                     entry.repositoryStateId(),
                     entry.includeGlobalState(),

server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java (+41 −5)

@@ -21,6 +21,7 @@
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.apache.lucene.util.SetOnce;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.ActionListener;

@@ -143,6 +144,7 @@
 import org.elasticsearch.env.TestEnvironment;
 import org.elasticsearch.gateway.MetaStateService;
 import org.elasticsearch.gateway.TransportNodesListGatewayStartedShards;
+import org.elasticsearch.index.Index;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.seqno.GlobalCheckpointSyncAction;
 import org.elasticsearch.index.seqno.RetentionLeaseSyncer;

@@ -211,6 +213,7 @@
 import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;

@@ -503,7 +506,7 @@ public void testConcurrentSnapshotCreateAndDeleteOther() {
         }
     }
 
-    public void testConcurrentSnapshotDeleteAndDeleteIndex() {
+    public void testConcurrentSnapshotDeleteAndDeleteIndex() throws IOException {
         setupTestCluster(randomFrom(1, 3, 5), randomIntBetween(2, 10));
 
         String repoName = "repo";

@@ -514,11 +517,13 @@ public void testConcurrentSnapshotDeleteAndDeleteIndex() {
         testClusterNodes.currentMaster(testClusterNodes.nodes.values().iterator().next().clusterService.state());
 
         final StepListener<Collection<CreateIndexResponse>> createIndicesListener = new StepListener<>();
+        final int indices = randomIntBetween(5, 20);
 
+        final SetOnce<Index> firstIndex = new SetOnce<>();
         continueOrDie(createRepoAndIndex(repoName, index, 1), createIndexResponse -> {
+            firstIndex.set(masterNode.clusterService.state().metaData().index(index).getIndex());
             // create a few more indices to make it more likely that the subsequent index delete operation happens before snapshot
             // finalization
-            final int indices = randomIntBetween(5, 20);
             final GroupedActionListener<CreateIndexResponse> listener = new GroupedActionListener<>(createIndicesListener, indices);
             for (int i = 0; i < indices; ++i) {
                 client().admin().indices().create(new CreateIndexRequest("index-" + i), listener);

@@ -527,23 +532,54 @@ public void testConcurrentSnapshotDeleteAndDeleteIndex() {
 
         final StepListener<CreateSnapshotResponse> createSnapshotResponseStepListener = new StepListener<>();
 
+        final boolean partialSnapshot = randomBoolean();
+
         continueOrDie(createIndicesListener, createIndexResponses ->
             client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(false)
-                .execute(createSnapshotResponseStepListener));
+                .setPartial(partialSnapshot).execute(createSnapshotResponseStepListener));
 
         continueOrDie(createSnapshotResponseStepListener,
-            createSnapshotResponse -> client().admin().indices().delete(new DeleteIndexRequest(index), noopListener()));
+            createSnapshotResponse -> client().admin().indices().delete(new DeleteIndexRequest(index), new ActionListener<>() {
+                @Override
+                public void onResponse(AcknowledgedResponse acknowledgedResponse) {
+                    if (partialSnapshot) {
+                        // Recreate index by the same name to test that we don't snapshot conflicting metadata in this scenario
+                        client().admin().indices().create(new CreateIndexRequest(index), noopListener());
+                    }
+                }
+
+                @Override
+                public void onFailure(Exception e) {
+                    if (partialSnapshot) {
+                        throw new AssertionError("Delete index should always work during partial snapshots", e);
+                    }
+                }
+            }));
 
         deterministicTaskQueue.runAllRunnableTasks();
 
         SnapshotsInProgress finalSnapshotsInProgress = masterNode.clusterService.state().custom(SnapshotsInProgress.TYPE);
         assertFalse(finalSnapshotsInProgress.entries().stream().anyMatch(entry -> entry.state().completed() == false));
         final Repository repository = masterNode.repositoriesService.repository(repoName);
-        Collection<SnapshotId> snapshotIds = getRepositoryData(repository).getSnapshotIds();
+        final RepositoryData repositoryData = getRepositoryData(repository);
+        Collection<SnapshotId> snapshotIds = repositoryData.getSnapshotIds();
         assertThat(snapshotIds, hasSize(1));
 
         final SnapshotInfo snapshotInfo = repository.getSnapshotInfo(snapshotIds.iterator().next());
         assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
+        if (partialSnapshot) {
+            // Single shard for each index so we either get all indices or all except for the deleted index
+            assertThat(snapshotInfo.successfulShards(), either(is(indices + 1)).or(is(indices)));
+            if (snapshotInfo.successfulShards() == indices + 1) {
+                final IndexMetaData indexMetaData =
+                    repository.getSnapshotIndexMetaData(snapshotInfo.snapshotId(), repositoryData.resolveIndexId(index));
+                // Make sure we snapshotted the metadata of this index and not the recreated version
+                assertEquals(indexMetaData.getIndex(), firstIndex.get());
+            }
+        } else {
+            // Index delete must be blocked for non-partial snapshots and we get a snapshot for every index
+            assertEquals(snapshotInfo.successfulShards(), indices + 1);
+        }
         assertEquals(0, snapshotInfo.failedShards());
     }
