Skip to content

Commit a0f80bd

Browse files
Cleanup Concurrent RepositoryData Loading (#48329)
The loading of `RepositoryData` is not an atomic operation. It uses a list + get combination of calls. This lead to accidentally returning an empty repository data for generations >=0 which can never not exist unless the repository is corrupted. In the test #48122 (and other SLM tests) there was a low chance of running into this concurrent modification scenario and the repository actually moving two index generations between listing out the index-N and loading the latest version of it. Since we only keep two index-N around at a time this lead to unexpectedly absent snapshots in status APIs. Fixing the behavior to be more resilient is non-trivial but in the works. For now I think we should simply throw in this scenario. This will also help prevent corruption in the unlikely event but possible of running into this issue in a snapshot create or delete operation on master failover on a repository like S3 which doesn't have the "no overwrites" protection on writing a new index-N. Fixes #48122
1 parent 3c2e27a commit a0f80bd

File tree

2 files changed

+18
-19
lines changed

2 files changed

+18
-19
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

+1-9
Original file line numberDiff line numberDiff line change
@@ -902,9 +902,6 @@ public void endVerification(String seed) {
902902
public RepositoryData getRepositoryData() {
903903
try {
904904
return getRepositoryData(latestIndexBlobId());
905-
} catch (NoSuchFileException ex) {
906-
// repository doesn't have an index blob, its a new blank repo
907-
return RepositoryData.EMPTY;
908905
} catch (IOException ioe) {
909906
throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe);
910907
}
@@ -917,17 +914,12 @@ private RepositoryData getRepositoryData(long indexGen) {
917914
try {
918915
final String snapshotsIndexBlobName = INDEX_FILE_PREFIX + Long.toString(indexGen);
919916

920-
RepositoryData repositoryData;
921917
// EMPTY is safe here because RepositoryData#fromXContent calls namedObject
922918
try (InputStream blob = blobContainer().readBlob(snapshotsIndexBlobName);
923919
XContentParser parser = XContentType.JSON.xContent().createParser(NamedXContentRegistry.EMPTY,
924920
LoggingDeprecationHandler.INSTANCE, blob)) {
925-
repositoryData = RepositoryData.snapshotsFromXContent(parser, indexGen);
921+
return RepositoryData.snapshotsFromXContent(parser, indexGen);
926922
}
927-
return repositoryData;
928-
} catch (NoSuchFileException ex) {
929-
// repository doesn't have an index blob, its a new blank repo
930-
return RepositoryData.EMPTY;
931923
} catch (IOException ioe) {
932924
throw new RepositoryException(metadata.name(), "could not read repository data from index blob", ioe);
933925
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

+17-10
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.elasticsearch.index.query.QueryBuilders;
2222
import org.elasticsearch.plugins.Plugin;
2323
import org.elasticsearch.repositories.RepositoriesService;
24+
import org.elasticsearch.repositories.RepositoryException;
2425
import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
2526
import org.elasticsearch.snapshots.SnapshotInfo;
2627
import org.elasticsearch.snapshots.SnapshotMissingException;
@@ -355,18 +356,24 @@ private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Ex
355356
logger.info("--> waiting for {} snapshot [{}] to be deleted", expectedUnsuccessfulState, failedSnapshotName.get());
356357
assertBusy(() -> {
357358
try {
359+
try {
360+
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
361+
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
362+
assertThat(snapshotsStatusResponse.getSnapshots(REPO), empty());
363+
} catch (SnapshotMissingException e) {
364+
// This is what we want to happen
365+
}
366+
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
367+
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
358368
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
359-
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
360-
assertThat(snapshotsStatusResponse.getSnapshots(REPO), empty());
361-
} catch (SnapshotMissingException e) {
362-
// This is what we want to happen
369+
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
370+
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots(REPO).get(0);
371+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
372+
} catch (RepositoryException re) {
373+
// Concurrent status calls and write operations may lead to failures in determining the current repository generation
374+
// TODO: Remove this hack once tracking the current repository generation has been made consistent
375+
throw new AssertionError(re);
363376
}
364-
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
365-
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
366-
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
367-
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
368-
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots(REPO).get(0);
369-
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
370377
});
371378
}
372379
}

0 commit comments

Comments
 (0)