Skip to content

Commit e1dbe26

Browse files
authored
Use snapshot information to build searchable snapshot store MetadataSnapshot (elastic#56289)
While investigating possible optimizations to speed up searchable snapshots shard restores, we noticed that Elasticsearch builds the list of shard files on local disk in order to compare it with the list of files contained in the snapshot to restore. This list of files is materialized with a MetadataSnapshot object whose construction involves to read the footer checksum of every files of the shard using Store.checksumFromLuceneFile() method. Further investigation shows that a MetadataSnapshot object is also created for other types of operations like building the list of files to recover in a peer recovery (and primary shard relocation) or in order to assign a shard to a node. These operations use the Store.getMetadata(IndexCommit) method to build the list of files and checksums. In the case of searchable snapshots building the MetadataSnapshot object can potentially trigger cache misses, which in turn can cause the download and the writing in cache of the last range of the file in order to check the 16 bytes footer. This in turn can cause more evictions. Since searchable snapshots already contains the footer information of every file in BlobStoreIndexShardSnapshot it can directly read the checksum from it and avoid to use the cache at all to create a MetadataSnapshot for the operations mentioned above. This commit adds a shortcut to the SearchableSnapshotDirectory.openInput() method - similarly to what already exists for segment infos - so that it creates a specific IndexInput for checksum reading operation.
1 parent 51052f9 commit e1dbe26

File tree

5 files changed

+334
-11
lines changed

5 files changed

+334
-11
lines changed

server/src/main/java/org/elasticsearch/index/store/Store.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
135135
public static final Setting<TimeValue> INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING =
136136
Setting.timeSetting("index.store.stats_refresh_interval", TimeValue.timeValueSeconds(10), Property.IndexScope);
137137

138+
/**
139+
* Specific {@link IOContext} used to verify Lucene files footer checksums.
140+
* See {@link MetadataSnapshot#checksumFromLuceneFile(Directory, String, Map, Logger, Version, boolean)}
141+
*/
142+
public static final IOContext READONCE_CHECKSUM = new IOContext(IOContext.READONCE.context);
143+
138144
private final AtomicBoolean isClosed = new AtomicBoolean(false);
139145
private final StoreDirectory directory;
140146
private final ReentrantReadWriteLock metadataLock = new ReentrantReadWriteLock();
@@ -859,7 +865,7 @@ private static void checksumFromLuceneFile(Directory directory, String file, Map
859865
Logger logger, Version version, boolean readFileAsHash) throws IOException {
860866
final String checksum;
861867
final BytesRefBuilder fileHash = new BytesRefBuilder();
862-
try (IndexInput in = directory.openInput(file, IOContext.READONCE)) {
868+
try (IndexInput in = directory.openInput(file, READONCE_CHECKSUM)) {
863869
final long length;
864870
try {
865871
length = in.length();

x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/index/store/SearchableSnapshotDirectory.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.elasticsearch.index.store.cache.CacheFile;
3939
import org.elasticsearch.index.store.cache.CacheKey;
4040
import org.elasticsearch.index.store.cache.CachedBlobContainerIndexInput;
41+
import org.elasticsearch.index.store.checksum.ChecksumBlobContainerIndexInput;
4142
import org.elasticsearch.index.store.direct.DirectBlobContainerIndexInput;
4243
import org.elasticsearch.repositories.IndexId;
4344
import org.elasticsearch.repositories.RepositoriesService;
@@ -323,6 +324,9 @@ public IndexInput openInput(final String name, final IOContext context) throws I
323324
final BytesRef content = fileInfo.metadata().hash();
324325
return new ByteArrayIndexInput("ByteArrayIndexInput(" + name + ')', content.bytes, content.offset, content.length);
325326
}
327+
if (context == Store.READONCE_CHECKSUM) {
328+
return ChecksumBlobContainerIndexInput.create(fileInfo.physicalName(), fileInfo.length(), fileInfo.checksum(), context);
329+
}
326330

327331
final IndexInputStats inputStats = stats.computeIfAbsent(name, n -> createIndexInputStats(fileInfo.length()));
328332
if (useCache && isExcludedFromCache(name) == false) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
package org.elasticsearch.index.store.checksum;
8+
9+
import org.apache.lucene.codecs.CodecUtil;
10+
import org.apache.lucene.store.ByteBuffersDataOutput;
11+
import org.apache.lucene.store.ByteBuffersIndexOutput;
12+
import org.apache.lucene.store.IOContext;
13+
import org.apache.lucene.store.IndexInput;
14+
import org.apache.lucene.store.IndexOutput;
15+
import org.elasticsearch.index.store.Store;
16+
17+
import java.io.EOFException;
18+
import java.io.IOException;
19+
import java.util.Arrays;
20+
import java.util.Objects;
21+
22+
/**
23+
* A {@link IndexInput} that can only be used to verify footer checksums.
24+
*/
25+
public class ChecksumBlobContainerIndexInput extends IndexInput {
26+
27+
private final byte[] checksum;
28+
private final long length;
29+
30+
private long position;
31+
32+
private ChecksumBlobContainerIndexInput(String name, long length, byte[] checksum, IOContext context) {
33+
super("ChecksumBlobContainerIndexInput(" + name + ')');
34+
ensureReadOnceChecksumContext(context);
35+
assert checksum.length == CodecUtil.footerLength();
36+
this.checksum = Objects.requireNonNull(checksum);
37+
assert length >= this.checksum.length;
38+
this.length = length;
39+
this.position = 0L;
40+
}
41+
42+
@Override
43+
public long length() {
44+
return length;
45+
}
46+
47+
@Override
48+
public long getFilePointer() {
49+
return position;
50+
}
51+
52+
@Override
53+
public byte readByte() throws IOException {
54+
if (getFilePointer() >= length()) {
55+
throw new EOFException("seek past EOF");
56+
}
57+
return checksum[checksumPositionOrThrow(position++)];
58+
}
59+
60+
@Override
61+
public void readBytes(final byte[] b, final int off, int len) throws IOException {
62+
if (getFilePointer() + len > length()) {
63+
throw new EOFException("seek past EOF");
64+
}
65+
System.arraycopy(checksum, checksumPositionOrThrow(position), b, off, len);
66+
position += len;
67+
}
68+
69+
@Override
70+
public void seek(long pos) throws IOException {
71+
if (pos < 0) {
72+
throw new IllegalArgumentException("Seeking to negative position: " + pos);
73+
} else if (pos > length()) {
74+
throw new EOFException("seek past EOF");
75+
}
76+
checksumPositionOrThrow(pos);
77+
position = pos;
78+
}
79+
80+
@Override
81+
public IndexInput slice(String sliceDescription, long offset, long length) {
82+
assert false : "unexpected slicing (" + sliceDescription + ") for " + this;
83+
throw new UnsupportedOperationException();
84+
}
85+
86+
@Override
87+
public IndexInput clone() {
88+
assert false : "unexpected cloning for " + this;
89+
throw new UnsupportedOperationException();
90+
}
91+
92+
@Override
93+
public void close() {}
94+
95+
@Override
96+
public String toString() {
97+
return "ChecksumBlobContainerIndexInput{"
98+
+ "checksum="
99+
+ Arrays.toString(checksum)
100+
+ ", length="
101+
+ length
102+
+ ", position="
103+
+ position
104+
+ '}';
105+
}
106+
107+
private int checksumPositionOrThrow(long pos) {
108+
final long checksumPosition = length - checksum.length;
109+
if (pos < checksumPosition) {
110+
assert false : "unexpected read or seek at position [" + pos + "] but checksum starts at [" + checksumPosition + ']';
111+
throw new IllegalArgumentException("Can't read or seek before footer checksum");
112+
}
113+
return Math.toIntExact(checksum.length - (length - pos));
114+
}
115+
116+
private static void ensureReadOnceChecksumContext(IOContext context) {
117+
if (context != Store.READONCE_CHECKSUM) {
118+
assert false : "expected READONCE_CHECKSUM but got " + context;
119+
throw new IllegalArgumentException("ChecksumBlobContainerIndexInput should only be used with READONCE_CHECKSUM context");
120+
}
121+
}
122+
123+
/**
124+
* Creates a {@link ChecksumBlobContainerIndexInput} that can be used to verify a Lucene file's footer checksum without opening the
125+
* file on disk. The checksum verification should be executed using {@link CodecUtil#retrieveChecksum(IndexInput)}.
126+
*
127+
* @param name the physical name of the file
128+
* @param length the total length of the file
129+
* @param checksum the footer checksum provided as a {@link String}
130+
* @return a {@link ChecksumBlobContainerIndexInput}
131+
* @throws IOException if something goes wrong when creating the {@link ChecksumBlobContainerIndexInput}
132+
*/
133+
public static ChecksumBlobContainerIndexInput create(String name, long length, String checksum, IOContext context) throws IOException {
134+
final ByteBuffersDataOutput out = new ByteBuffersDataOutput();
135+
try (IndexOutput output = new ByteBuffersIndexOutput(out, "tmp", name)) {
136+
// reverse CodecUtil.writeFooter()
137+
output.writeInt(CodecUtil.FOOTER_MAGIC);
138+
output.writeInt(0);
139+
output.writeLong(Long.parseLong(checksum, Character.MAX_RADIX));
140+
output.close();
141+
return new ChecksumBlobContainerIndexInput(name, length, out.toArrayCopy(), context);
142+
}
143+
}
144+
}

x-pack/plugin/searchable-snapshots/src/test/java/org/elasticsearch/index/store/SearchableSnapshotDirectoryTests.java

Lines changed: 130 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*/
66
package org.elasticsearch.index.store;
77

8+
import org.apache.lucene.codecs.CodecUtil;
89
import org.apache.lucene.document.Document;
910
import org.apache.lucene.document.Field;
1011
import org.apache.lucene.document.NumericDocValuesField;
@@ -14,6 +15,7 @@
1415
import org.apache.lucene.index.FieldInfo;
1516
import org.apache.lucene.index.FieldInfos;
1617
import org.apache.lucene.index.IndexCommit;
18+
import org.apache.lucene.index.IndexFileNames;
1719
import org.apache.lucene.index.IndexOptions;
1820
import org.apache.lucene.index.IndexWriter;
1921
import org.apache.lucene.index.IndexWriterConfig;
@@ -47,6 +49,7 @@
4749
import org.elasticsearch.common.lease.Releasables;
4850
import org.elasticsearch.common.lucene.BytesRefs;
4951
import org.elasticsearch.common.lucene.Lucene;
52+
import org.elasticsearch.common.lucene.store.ByteArrayIndexInput;
5053
import org.elasticsearch.common.settings.Settings;
5154
import org.elasticsearch.common.unit.ByteSizeUnit;
5255
import org.elasticsearch.common.unit.ByteSizeValue;
@@ -59,6 +62,7 @@
5962
import org.elasticsearch.index.shard.ShardId;
6063
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
6164
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot;
65+
import org.elasticsearch.index.store.checksum.ChecksumBlobContainerIndexInput;
6266
import org.elasticsearch.index.translog.Translog;
6367
import org.elasticsearch.repositories.IndexId;
6468
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
@@ -98,8 +102,10 @@
98102
import static org.hamcrest.Matchers.equalTo;
99103
import static org.hamcrest.Matchers.greaterThan;
100104
import static org.hamcrest.Matchers.hasSize;
105+
import static org.hamcrest.Matchers.instanceOf;
101106
import static org.hamcrest.Matchers.is;
102107
import static org.hamcrest.Matchers.lessThanOrEqualTo;
108+
import static org.hamcrest.Matchers.nullValue;
103109
import static org.hamcrest.Matchers.sameInstance;
104110

105111
public class SearchableSnapshotDirectoryTests extends ESTestCase {
@@ -308,21 +314,125 @@ public void testReadBytes() throws Exception {
308314
});
309315
}
310316

317+
public void testChecksumBlobContainerIndexInput() throws Exception {
318+
testDirectories(
319+
randomBoolean(),
320+
false, // no prewarming in this test because we want to ensure that files are accessed on purpose
321+
(directory, snapshotDirectory) -> {
322+
for (String fileName : randomSubsetOf(Arrays.asList(snapshotDirectory.listAll()))) {
323+
final long checksum;
324+
try (IndexInput input = directory.openInput(fileName, Store.READONCE_CHECKSUM)) {
325+
checksum = CodecUtil.checksumEntireFile(input);
326+
}
327+
328+
final long snapshotChecksum;
329+
try (IndexInput input = snapshotDirectory.openInput(fileName, Store.READONCE_CHECKSUM)) {
330+
snapshotChecksum = CodecUtil.retrieveChecksum(input);
331+
assertThat(
332+
input,
333+
"si".equals(IndexFileNames.getExtension(fileName)) || fileName.startsWith(IndexFileNames.SEGMENTS)
334+
? instanceOf(ByteArrayIndexInput.class)
335+
: instanceOf(ChecksumBlobContainerIndexInput.class)
336+
);
337+
}
338+
339+
assertThat(
340+
"Expected checksum [" + checksum + "] but got [" + snapshotChecksum + ']',
341+
snapshotChecksum,
342+
equalTo(checksum)
343+
);
344+
assertThat(
345+
"File [" + fileName + "] should have been read from heap",
346+
snapshotDirectory.getStats(fileName),
347+
nullValue()
348+
);
349+
}
350+
}
351+
);
352+
}
353+
354+
public void testMetadataSnapshotsDoesNotAccessFilesOnDisk() throws Exception {
355+
final ShardId shardId = new ShardId("_name", "_id", 0);
356+
final IndexSettings indexSettings = newIndexSettings();
357+
358+
// sometimes load store's MetadataSnapshot using an IndexCommit
359+
final boolean useIndexCommit = randomBoolean();
360+
logger.info("--> loading Store.MetadataSnapshot using index commit is [{}]", useIndexCommit);
361+
final CheckedFunction<Store, Store.MetadataSnapshot, IOException> loader = store -> {
362+
if (useIndexCommit) {
363+
return store.getMetadata(Lucene.getIndexCommit(Lucene.readSegmentInfos(store.directory()), store.directory()));
364+
} else {
365+
return store.getMetadata(null, true);
366+
}
367+
};
368+
369+
testDirectories(
370+
randomBoolean(),
371+
false, // no prewarming in this test because we want to ensure that files are accessed on purpose
372+
((directory, snapshotDirectory) -> {
373+
final Store.MetadataSnapshot metadata;
374+
try (Store store = new Store(shardId, indexSettings, directory, new DummyShardLock(shardId))) {
375+
metadata = loader.apply(store);
376+
assertNotNull(metadata);
377+
}
378+
379+
final Store.MetadataSnapshot snapshotMetadata;
380+
try (Store store = new Store(shardId, indexSettings, snapshotDirectory, new DummyShardLock(shardId))) {
381+
assertTrue("No files should have been read yet", snapshotDirectory.getStats().isEmpty());
382+
snapshotMetadata = store.getMetadata(null);
383+
assertTrue("No files should have been read to compute MetadataSnapshot", snapshotDirectory.getStats().isEmpty());
384+
assertNotNull(snapshotMetadata);
385+
}
386+
387+
final Store.RecoveryDiff diff = randomBoolean()
388+
? metadata.recoveryDiff(snapshotMetadata)
389+
: snapshotMetadata.recoveryDiff(metadata);
390+
391+
assertThat(
392+
"List of different files should be empty but got [" + metadata.asMap() + "] and [" + snapshotMetadata.asMap() + ']',
393+
diff.different.isEmpty(),
394+
is(true)
395+
);
396+
assertThat(
397+
"List of missing files should be empty but got [" + metadata.asMap() + "] and [" + snapshotMetadata.asMap() + ']',
398+
diff.missing.isEmpty(),
399+
is(true)
400+
);
401+
assertThat(
402+
"List of files should be identical [" + metadata.asMap() + "] and [" + snapshotMetadata.asMap() + ']',
403+
diff.identical.size(),
404+
equalTo(metadata.size())
405+
);
406+
assertThat("Number of files should be identical", snapshotMetadata.size(), equalTo(metadata.size()));
407+
408+
for (StoreFileMetadata storeFileMetadata : metadata) {
409+
final StoreFileMetadata snapshotFileMetadata = snapshotMetadata.get(storeFileMetadata.name());
410+
assertTrue(
411+
storeFileMetadata + " should be identical but got [" + snapshotFileMetadata + ']',
412+
storeFileMetadata.isSame(snapshotFileMetadata)
413+
);
414+
}
415+
})
416+
);
417+
}
418+
311419
/**
312420
* This method :
313421
* - sets up a default {@link Directory} and index random documents
314422
* - snapshots the directory using a FS repository
315423
* - creates a {@link SearchableSnapshotDirectory} instance based on the snapshotted files
316424
* - consumes the default and the searchable snapshot directories using the {@link CheckedBiConsumer}.
317425
*/
318-
private void testDirectories(final CheckedBiConsumer<Directory, Directory, Exception> consumer) throws Exception {
319-
final IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(
320-
"_index",
321-
Settings.builder()
322-
.put(IndexMetadata.SETTING_INDEX_UUID, UUIDs.randomBase64UUID(random()))
323-
.put(IndexMetadata.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT)
324-
.build()
325-
);
426+
private void testDirectories(final CheckedBiConsumer<Directory, SearchableSnapshotDirectory, Exception> consumer) throws Exception {
427+
testDirectories(randomBoolean(), randomBoolean(), consumer);
428+
}
429+
430+
private void testDirectories(
431+
final boolean enableCache,
432+
final boolean prewarmCache,
433+
final CheckedBiConsumer<Directory, SearchableSnapshotDirectory, Exception> consumer
434+
) throws Exception {
435+
final IndexSettings indexSettings = newIndexSettings();
326436
final ShardId shardId = new ShardId(indexSettings.getIndex(), randomIntBetween(0, 10));
327437
final List<Releasable> releasables = new ArrayList<>();
328438

@@ -442,8 +552,8 @@ protected void assertSnapshotOrGenericThread() {
442552
indexId,
443553
shardId,
444554
Settings.builder()
445-
.put(SNAPSHOT_CACHE_ENABLED_SETTING.getKey(), randomBoolean())
446-
.put(SNAPSHOT_CACHE_PREWARM_ENABLED_SETTING.getKey(), randomBoolean())
555+
.put(SNAPSHOT_CACHE_ENABLED_SETTING.getKey(), enableCache)
556+
.put(SNAPSHOT_CACHE_PREWARM_ENABLED_SETTING.getKey(), prewarmCache)
447557
.build(),
448558
() -> 0L,
449559
cacheService,
@@ -606,4 +716,14 @@ private void assertListOfFiles(Path cacheDir, Matcher<Integer> matchNumberOfFile
606716
assertThat("Number of files (" + files.size() + ") mismatch, got : " + files.keySet(), files.size(), matchNumberOfFiles);
607717
assertThat("Sum of file sizes mismatch, got: " + files, files.values().stream().mapToLong(Long::longValue).sum(), matchSizeOfFiles);
608718
}
719+
720+
private static IndexSettings newIndexSettings() {
721+
return IndexSettingsModule.newIndexSettings(
722+
"_index",
723+
Settings.builder()
724+
.put(IndexMetadata.SETTING_INDEX_UUID, UUIDs.randomBase64UUID(random()))
725+
.put(IndexMetadata.SETTING_VERSION_CREATED, org.elasticsearch.Version.CURRENT)
726+
.build()
727+
);
728+
}
609729
}

0 commit comments

Comments
 (0)