Skip to content

Commit e911704

Browse files
Fix Infinite Retry Loop in loading RepositoryData (elastic#50987)
* Fix Infinite Retry Loop in loading RepositoryData We were running into an infinite loop when trying to load corrupted (or otherwise un-loadable) repository data for a repo that uses best effort consistency (e.g. that was just freshly mounted as done in the test) because we kepy resetting to `-1` on `IOException`, listing and finding the broken generation `N` and then interpreted the subsequent reset to `-1` as a concurrent change to the repository.
1 parent b345c7f commit e911704

File tree

2 files changed

+51
-4
lines changed

2 files changed

+51
-4
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

+11-4
Original file line numberDiff line numberDiff line change
@@ -1065,6 +1065,9 @@ public void getRepositoryData(ActionListener<RepositoryData> listener) {
10651065
return;
10661066
}
10671067
// Retry loading RepositoryData in a loop in case we run into concurrent modifications of the repository.
1068+
// Keep track of the most recent generation we failed to load so we can break out of the loop if we fail to load the same
1069+
// generation repeatedly.
1070+
long lastFailedGeneration = RepositoryData.UNKNOWN_REPO_GEN;
10681071
while (true) {
10691072
final long genToLoad;
10701073
if (bestEffortConsistency) {
@@ -1074,7 +1077,9 @@ public void getRepositoryData(ActionListener<RepositoryData> listener) {
10741077
try {
10751078
generation = latestIndexBlobId();
10761079
} catch (IOException ioe) {
1077-
throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe);
1080+
listener.onFailure(
1081+
new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe));
1082+
return;
10781083
}
10791084
genToLoad = latestKnownRepoGen.updateAndGet(known -> Math.max(known, generation));
10801085
if (genToLoad > generation) {
@@ -1089,7 +1094,9 @@ public void getRepositoryData(ActionListener<RepositoryData> listener) {
10891094
listener.onResponse(getRepositoryData(genToLoad));
10901095
return;
10911096
} catch (RepositoryException e) {
1092-
if (genToLoad != latestKnownRepoGen.get()) {
1097+
// If the generation to load changed concurrently and we didn't just try loading the same generation before we retry
1098+
if (genToLoad != latestKnownRepoGen.get() && genToLoad != lastFailedGeneration) {
1099+
lastFailedGeneration = genToLoad;
10931100
logger.warn("Failed to load repository data generation [" + genToLoad +
10941101
"] because a concurrent operation moved the current generation to [" + latestKnownRepoGen.get() + "]", e);
10951102
continue;
@@ -1099,10 +1106,10 @@ public void getRepositoryData(ActionListener<RepositoryData> listener) {
10991106
// of N so we mark this repository as corrupted.
11001107
markRepoCorrupted(genToLoad, e,
11011108
ActionListener.wrap(v -> listener.onFailure(corruptedStateException(e)), listener::onFailure));
1102-
return;
11031109
} else {
1104-
throw e;
1110+
listener.onFailure(e);
11051111
}
1112+
return;
11061113
}
11071114
}
11081115
}

server/src/test/java/org/elasticsearch/snapshots/CorruptedBlobStoreRepositoryIT.java

+40
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,46 @@ public void testHandlingMissingRootLevelSnapshotMetadata() throws Exception {
303303
}
304304
}
305305

306+
public void testMountCorruptedRepositoryData() throws Exception {
307+
disableRepoConsistencyCheck("This test intentionally corrupts the repository contents");
308+
Client client = client();
309+
310+
Path repo = randomRepoPath();
311+
final String repoName = "test-repo";
312+
logger.info("--> creating repository at {}", repo.toAbsolutePath());
313+
assertAcked(client.admin().cluster().preparePutRepository(repoName)
314+
.setType("fs").setSettings(Settings.builder()
315+
.put("location", repo)
316+
.put("compress", false)));
317+
318+
final String snapshot = "test-snap";
319+
320+
logger.info("--> creating snapshot");
321+
CreateSnapshotResponse createSnapshotResponse = client.admin().cluster().prepareCreateSnapshot(repoName, snapshot)
322+
.setWaitForCompletion(true).setIndices("test-idx-*").get();
323+
assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(),
324+
equalTo(createSnapshotResponse.getSnapshotInfo().totalShards()));
325+
326+
logger.info("--> corrupt index-N blob");
327+
final Repository repository = internalCluster().getCurrentMasterNodeInstance(RepositoriesService.class).repository(repoName);
328+
final RepositoryData repositoryData = getRepositoryData(repository);
329+
Files.write(repo.resolve("index-" + repositoryData.getGenId()), randomByteArrayOfLength(randomIntBetween(1, 100)));
330+
331+
logger.info("--> verify loading repository data throws RepositoryException");
332+
expectThrows(RepositoryException.class, () -> getRepositoryData(repository));
333+
334+
logger.info("--> mount repository path in a new repository");
335+
final String otherRepoName = "other-repo";
336+
assertAcked(client.admin().cluster().preparePutRepository(otherRepoName)
337+
.setType("fs").setSettings(Settings.builder()
338+
.put("location", repo)
339+
.put("compress", false)));
340+
final Repository otherRepo = internalCluster().getCurrentMasterNodeInstance(RepositoriesService.class).repository(otherRepoName);
341+
342+
logger.info("--> verify loading repository data from newly mounted repository throws RepositoryException");
343+
expectThrows(RepositoryException.class, () -> getRepositoryData(otherRepo));
344+
}
345+
306346
private void assertRepositoryBlocked(Client client, String repo, String existingSnapshot) {
307347
logger.info("--> try to delete snapshot");
308348
final RepositoryException repositoryException3 = expectThrows(RepositoryException.class,

0 commit comments

Comments
 (0)