Skip to content

Commit 21c3053

Browse files
Fix Broken Index Shard Snapshot File Preventing Snapshot Creation (#41310) (#41476)
* The problem here is that if we run into a corrupted index-N file, instead of generating a new index-(N+1) file, we set the newest index generation to -1 and thus try to create `index-0`
* If `index-0` is corrupt, this prevents us from ever creating a new snapshot using the broken shard, because we are unable to create `index-0` since it already exists
* Fixed by still using the index generation for naming the next index file, even if it was a broken index file
* Added a test that makes sure restoring as well as snapshotting on top of the broken shard index file work as expected
* Closes #41304
1 parent c423127 commit 21c3053

File tree

3 files changed

+120
-17
lines changed

3 files changed

+120
-17
lines changed

server/src/main/java/org/elasticsearch/index/snapshots/blobstore/BlobStoreIndexShardSnapshots.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@
3737
import static java.util.Collections.unmodifiableMap;
3838

3939
/**
40-
* Contains information about all snapshot for the given shard in repository
40+
* Contains information about all snapshots for the given shard in repository
4141
* <p>
4242
* This class is used to find files that were already snapshotted and clear out files that are no longer referenced by any
43-
* snapshots
43+
* snapshots.
4444
*/
4545
public class BlobStoreIndexShardSnapshots implements Iterable<SnapshotFiles>, ToXContentFragment {
4646

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -123,27 +123,28 @@
123123
* <pre>
124124
* {@code
125125
* STORE_ROOT
126-
* |- index-N - list of all snapshot ids and the indices belonging to each snapshot, N is the generation of the file
126+
* |- index-N - JSON serialized {@link RepositoryData} containing a list of all snapshot ids and the indices belonging to
127+
* | each snapshot, N is the generation of the file
127128
* |- index.latest - contains the numeric value of the latest generation of the index file (i.e. N from above)
128129
* |- incompatible-snapshots - list of all snapshot ids that are no longer compatible with the current version of the cluster
129-
* |- snap-20131010 - JSON serialized Snapshot for snapshot "20131010"
130-
* |- meta-20131010.dat - JSON serialized MetaData for snapshot "20131010" (includes only global metadata)
131-
* |- snap-20131011 - JSON serialized Snapshot for snapshot "20131011"
132-
* |- meta-20131011.dat - JSON serialized MetaData for snapshot "20131011"
130+
* |- snap-20131010.dat - SMILE serialized {@link SnapshotInfo} for snapshot "20131010"
131+
* |- meta-20131010.dat - SMILE serialized {@link MetaData} for snapshot "20131010" (includes only global metadata)
132+
* |- snap-20131011.dat - SMILE serialized {@link SnapshotInfo} for snapshot "20131011"
133+
* |- meta-20131011.dat - SMILE serialized {@link MetaData} for snapshot "20131011"
133134
* .....
134135
* |- indices/ - data for all indices
135136
* |- Ac1342-B_x/ - data for index "foo" which was assigned the unique id of Ac1342-B_x in the repository
136-
* | |- meta-20131010.dat - JSON Serialized IndexMetaData for index "foo"
137+
* | |- meta-20131010.dat - JSON Serialized {@link IndexMetaData} for index "foo"
137138
* | |- 0/ - data for shard "0" of index "foo"
138-
* | | |- __1 \
139-
* | | |- __2 |
140-
* | | |- __3 |- files from different segments see snapshot-* for their mappings to real segment files
141-
* | | |- __4 |
142-
* | | |- __5 /
139+
* | | |- __1 \ (files with numeric names were created by older ES versions)
140+
* | | |- __2 |
141+
* | | |- __VPO5oDMVT5y4Akv8T_AO_A |- files from different segments see snap-* for their mappings to real segment files
142+
* | | |- __1gbJy18wS_2kv1qI7FgKuQ |
143+
* | | |- __R8JvZAHlSMyMXyZc2SS8Zg /
143144
* | | .....
144-
* | | |- snap-20131010.dat - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131010"
145-
* | | |- snap-20131011.dat - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131011"
146-
* | | |- list-123 - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131011"
145+
* | | |- snap-20131010.dat - SMILE serialized {@link BlobStoreIndexShardSnapshot} for snapshot "20131010"
146+
* | | |- snap-20131011.dat - SMILE serialized {@link BlobStoreIndexShardSnapshot} for snapshot "20131011"
147+
* | | |- index-123 - SMILE serialized {@link BlobStoreIndexShardSnapshots} for the shard
147148
* | |
148149
* | |- 1/ - data for shard "1" of index "foo"
149150
* | | |- __1
@@ -1142,7 +1143,7 @@ protected Tuple<BlobStoreIndexShardSnapshots, Integer> buildBlobStoreIndexShardS
11421143
logger.warn(() -> new ParameterizedMessage("failed to read commit point [{}]", name), e);
11431144
}
11441145
}
1145-
return new Tuple<>(new BlobStoreIndexShardSnapshots(snapshots), -1);
1146+
return new Tuple<>(new BlobStoreIndexShardSnapshots(snapshots), latest);
11461147
}
11471148
}
11481149

server/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2963,6 +2963,108 @@ public void testRestoreSnapshotWithCorruptedIndexMetadata() throws Exception {
29632963
assertAcked(client().admin().cluster().prepareDeleteSnapshot("test-repo", snapshotInfo.snapshotId().getName()).get());
29642964
}
29652965

2966+
/**
2967+
* Tests that a shard snapshot with a corrupted shard index file can still be used for restore and incremental snapshots.
2968+
*/
2969+
public void testSnapshotWithCorruptedShardIndexFile() throws Exception {
2970+
final Client client = client();
2971+
final Path repo = randomRepoPath();
2972+
final String indexName = "test-idx";
2973+
final int nDocs = randomIntBetween(1, 10);
2974+
2975+
logger.info("--> creating index [{}] with [{}] documents in it", indexName, nDocs);
2976+
assertAcked(prepareCreate(indexName).setSettings(Settings.builder()
2977+
.put(SETTING_NUMBER_OF_SHARDS, 1).put(SETTING_NUMBER_OF_REPLICAS, 0)));
2978+
2979+
final IndexRequestBuilder[] documents = new IndexRequestBuilder[nDocs];
2980+
for (int j = 0; j < nDocs; j++) {
2981+
documents[j] = client.prepareIndex(indexName, "_doc").setSource("foo", "bar");
2982+
}
2983+
indexRandom(true, documents);
2984+
flushAndRefresh();
2985+
2986+
logger.info("--> creating repository");
2987+
assertAcked(client().admin().cluster().preparePutRepository("test-repo")
2988+
.setType("fs")
2989+
.setSettings(Settings.builder()
2990+
.put("location", repo)));
2991+
2992+
final String snapshot1 = "test-snap-1";
2993+
logger.info("--> creating snapshot [{}]", snapshot1);
2994+
final SnapshotInfo snapshotInfo = client().admin().cluster().prepareCreateSnapshot("test-repo", snapshot1)
2995+
.setWaitForCompletion(true)
2996+
.get()
2997+
.getSnapshotInfo();
2998+
assertThat(snapshotInfo.failedShards(), equalTo(0));
2999+
assertThat(snapshotInfo.successfulShards(), equalTo(snapshotInfo.totalShards()));
3000+
assertThat(snapshotInfo.indices(), hasSize(1));
3001+
3002+
RepositoriesService service = internalCluster().getInstance(RepositoriesService.class, internalCluster().getMasterName());
3003+
Repository repository = service.repository("test-repo");
3004+
3005+
final RepositoryData repositoryData = getRepositoryData(repository);
3006+
final Map<String, IndexId> indexIds = repositoryData.getIndices();
3007+
assertThat(indexIds.size(), equalTo(1));
3008+
3009+
final IndexId corruptedIndex = indexIds.get(indexName);
3010+
final Path shardIndexFile = repo.resolve("indices")
3011+
.resolve(corruptedIndex.getId()).resolve("0")
3012+
.resolve("index-0");
3013+
3014+
logger.info("--> truncating shard index file [{}]", shardIndexFile);
3015+
try (SeekableByteChannel outChan = Files.newByteChannel(shardIndexFile, StandardOpenOption.WRITE)) {
3016+
outChan.truncate(randomInt(10));
3017+
}
3018+
3019+
logger.info("--> verifying snapshot state for [{}]", snapshot1);
3020+
List<SnapshotInfo> snapshotInfos = client().admin().cluster().prepareGetSnapshots("test-repo").get().getSnapshots();
3021+
assertThat(snapshotInfos.size(), equalTo(1));
3022+
assertThat(snapshotInfos.get(0).state(), equalTo(SnapshotState.SUCCESS));
3023+
assertThat(snapshotInfos.get(0).snapshotId().getName(), equalTo(snapshot1));
3024+
3025+
logger.info("--> deleting index [{}]", indexName);
3026+
assertAcked(client().admin().indices().prepareDelete(indexName));
3027+
3028+
logger.info("--> restoring snapshot [{}]", snapshot1);
3029+
client().admin().cluster().prepareRestoreSnapshot("test-repo", snapshot1)
3030+
.setRestoreGlobalState(randomBoolean())
3031+
.setWaitForCompletion(true)
3032+
.get();
3033+
ensureGreen();
3034+
3035+
assertHitCount(client().prepareSearch(indexName).setSize(0).get(), nDocs);
3036+
3037+
logger.info("--> indexing [{}] more documents into [{}]", nDocs, indexName);
3038+
for (int j = 0; j < nDocs; j++) {
3039+
documents[j] = client.prepareIndex(indexName, "_doc").setSource("foo2", "bar2");
3040+
}
3041+
indexRandom(true, documents);
3042+
3043+
final String snapshot2 = "test-snap-2";
3044+
logger.info("--> creating snapshot [{}]", snapshot2);
3045+
final SnapshotInfo snapshotInfo2 = client().admin().cluster().prepareCreateSnapshot("test-repo", snapshot2)
3046+
.setWaitForCompletion(true)
3047+
.get()
3048+
.getSnapshotInfo();
3049+
assertThat(snapshotInfo2.state(), equalTo(SnapshotState.SUCCESS));
3050+
assertThat(snapshotInfo2.failedShards(), equalTo(0));
3051+
assertThat(snapshotInfo2.successfulShards(), equalTo(snapshotInfo.totalShards()));
3052+
assertThat(snapshotInfo2.indices(), hasSize(1));
3053+
3054+
logger.info("--> deleting index [{}]", indexName);
3055+
assertAcked(client().admin().indices().prepareDelete(indexName));
3056+
3057+
logger.info("--> restoring snapshot [{}]", snapshot2);
3058+
client().admin().cluster().prepareRestoreSnapshot("test-repo", snapshot2)
3059+
.setRestoreGlobalState(randomBoolean())
3060+
.setWaitForCompletion(true)
3061+
.get();
3062+
3063+
ensureGreen();
3064+
3065+
assertHitCount(client().prepareSearch(indexName).setSize(0).get(), 2 * nDocs);
3066+
}
3067+
29663068
public void testCannotCreateSnapshotsWithSameName() throws Exception {
29673069
final String repositoryName = "test-repo";
29683070
final String snapshotName = "test-snap";

0 commit comments

Comments
 (0)