Skip to content

Commit d159e5d

Browse files
Cleanup Concurrent RepositoryData Loading (#48329) (#48835)
The loading of `RepositoryData` is not an atomic operation. It uses a list + get combination of calls. This lead to accidentally returning an empty repository data for generations >=0 which can never not exist unless the repository is corrupted. In the test #48122 (and other SLM tests) there was a low chance of running into this concurrent modification scenario and the repository actually moving two index generations between listing out the index-N and loading the latest version of it. Since we only keep two index-N around at a time this lead to unexpectedly absent snapshots in status APIs. Fixing the behavior to be more resilient is non-trivial but in the works. For now I think we should simply throw in this scenario. This will also help prevent corruption in the unlikely event but possible of running into this issue in a snapshot create or delete operation on master failover on a repository like S3 which doesn't have the "no overwrites" protection on writing a new index-N. Fixes #48122
1 parent 7c2b737 commit d159e5d

File tree

2 files changed

+17
-19
lines changed

2 files changed

+17
-19
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

+1-9
Original file line numberDiff line numberDiff line change
@@ -849,9 +849,6 @@ public void endVerification(String seed) {
849849
public RepositoryData getRepositoryData() {
850850
try {
851851
return getRepositoryData(latestIndexBlobId());
852-
} catch (NoSuchFileException ex) {
853-
// repository doesn't have an index blob, its a new blank repo
854-
return RepositoryData.EMPTY;
855852
} catch (IOException ioe) {
856853
throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe);
857854
}
@@ -864,17 +861,12 @@ private RepositoryData getRepositoryData(long indexGen) {
864861
try {
865862
final String snapshotsIndexBlobName = INDEX_FILE_PREFIX + Long.toString(indexGen);
866863

867-
RepositoryData repositoryData;
868864
// EMPTY is safe here because RepositoryData#fromXContent calls namedObject
869865
try (InputStream blob = blobContainer().readBlob(snapshotsIndexBlobName);
870866
XContentParser parser = XContentType.JSON.xContent().createParser(NamedXContentRegistry.EMPTY,
871867
LoggingDeprecationHandler.INSTANCE, blob)) {
872-
repositoryData = RepositoryData.snapshotsFromXContent(parser, indexGen);
868+
return RepositoryData.snapshotsFromXContent(parser, indexGen);
873869
}
874-
return repositoryData;
875-
} catch (NoSuchFileException ex) {
876-
// repository doesn't have an index blob, its a new blank repo
877-
return RepositoryData.EMPTY;
878870
} catch (IOException ioe) {
879871
throw new RepositoryException(metadata.name(), "could not read repository data from index blob", ioe);
880872
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

+16-10
Original file line numberDiff line numberDiff line change
@@ -410,18 +410,24 @@ private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Ex
410410
logger.info("--> waiting for {} snapshot [{}] to be deleted", expectedUnsuccessfulState, failedSnapshotName.get());
411411
assertBusy(() -> {
412412
try {
413+
try {
414+
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
415+
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
416+
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
417+
} catch (SnapshotMissingException e) {
418+
// This is what we want to happen
419+
}
420+
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
421+
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
413422
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
414-
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
415-
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
416-
} catch (SnapshotMissingException e) {
417-
// This is what we want to happen
423+
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
424+
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
425+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
426+
} catch (RepositoryException re) {
427+
// Concurrent status calls and write operations may lead to failures in determining the current repository generation
428+
// TODO: Remove this hack once tracking the current repository generation has been made consistent
429+
throw new AssertionError(re);
418430
}
419-
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
420-
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
421-
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
422-
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
423-
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
424-
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
425431
});
426432
}
427433
}

0 commit comments

Comments
 (0)