Skip to content

Commit 3c20541

Browse files
Cleanup Concurrent RepositoryData Loading (#48329) (#48834)
The loading of `RepositoryData` is not an atomic operation. It uses a list + get combination of calls. This lead to accidentally returning an empty repository data for generations >=0 which can never not exist unless the repository is corrupted. In the test #48122 (and other SLM tests) there was a low chance of running into this concurrent modification scenario and the repository actually moving two index generations between listing out the index-N and loading the latest version of it. Since we only keep two index-N around at a time this lead to unexpectedly absent snapshots in status APIs. Fixing the behavior to be more resilient is non-trivial but in the works. For now I think we should simply throw in this scenario. This will also help prevent corruption in the unlikely event but possible of running into this issue in a snapshot create or delete operation on master failover on a repository like S3 which doesn't have the "no overwrites" protection on writing a new index-N. Fixes #48122
1 parent a22f6fb commit 3c20541

File tree

2 files changed

+17
-19
lines changed

2 files changed

+17
-19
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

+1-9
Original file line numberDiff line numberDiff line change
@@ -907,9 +907,6 @@ public void endVerification(String seed) {
907907
public RepositoryData getRepositoryData() {
908908
try {
909909
return getRepositoryData(latestIndexBlobId());
910-
} catch (NoSuchFileException ex) {
911-
// repository doesn't have an index blob, its a new blank repo
912-
return RepositoryData.EMPTY;
913910
} catch (IOException ioe) {
914911
throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe);
915912
}
@@ -922,17 +919,12 @@ private RepositoryData getRepositoryData(long indexGen) {
922919
try {
923920
final String snapshotsIndexBlobName = INDEX_FILE_PREFIX + Long.toString(indexGen);
924921

925-
RepositoryData repositoryData;
926922
// EMPTY is safe here because RepositoryData#fromXContent calls namedObject
927923
try (InputStream blob = blobContainer().readBlob(snapshotsIndexBlobName);
928924
XContentParser parser = XContentType.JSON.xContent().createParser(NamedXContentRegistry.EMPTY,
929925
LoggingDeprecationHandler.INSTANCE, blob)) {
930-
repositoryData = RepositoryData.snapshotsFromXContent(parser, indexGen);
926+
return RepositoryData.snapshotsFromXContent(parser, indexGen);
931927
}
932-
return repositoryData;
933-
} catch (NoSuchFileException ex) {
934-
// repository doesn't have an index blob, its a new blank repo
935-
return RepositoryData.EMPTY;
936928
} catch (IOException ioe) {
937929
throw new RepositoryException(metadata.name(), "could not read repository data from index blob", ioe);
938930
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

+16-10
Original file line numberDiff line numberDiff line change
@@ -390,18 +390,24 @@ private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Ex
390390
logger.info("--> waiting for {} snapshot [{}] to be deleted", expectedUnsuccessfulState, failedSnapshotName.get());
391391
assertBusy(() -> {
392392
try {
393+
try {
394+
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
395+
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
396+
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
397+
} catch (SnapshotMissingException e) {
398+
// This is what we want to happen
399+
}
400+
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
401+
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
393402
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
394-
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
395-
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
396-
} catch (SnapshotMissingException e) {
397-
// This is what we want to happen
403+
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
404+
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
405+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
406+
} catch (RepositoryException re) {
407+
// Concurrent status calls and write operations may lead to failures in determining the current repository generation
408+
// TODO: Remove this hack once tracking the current repository generation has been made consistent
409+
throw new AssertionError(re);
398410
}
399-
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
400-
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
401-
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
402-
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
403-
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
404-
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
405411
});
406412
}
407413
}

0 commit comments

Comments
 (0)