
Commit 30b5553

Optimize sequential reads in SearchableSnapshotIndexInput (#51230)
Today `SearchableSnapshotIndexInput` translates each `readBytesInternal` call to one or more calls to `readBlob` on the underlying repository. We make a lot of small `readBytesInternal` calls since they are used to fill a small in-memory buffer. Calls to `readBlob` are expensive: blob storage providers like AWS S3 charge money per API call. A common usage pattern is to take a brand-new `IndexInput`, seek to a particular location, and then sequentially read a substantial amount of data and stream it to disk. This commit optimizes the implementation for that specific usage pattern. Rather than calling `readBlob` each time the internal buffer needs filling we instead request a (potentially much larger) range of the blob and consume the response bit-by-bit as needed by a sequentially-reading client.
1 parent f1b0991 commit 30b5553
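
To make the optimized pattern concrete, here is a minimal sketch of the kind of client this commit helps. It is illustrative only and not part of the commit: the class, method, and variable names (RangeCopier, copyRange, out, bytesToRead) are assumptions, and only Directory/IndexInput come from Lucene.

import java.io.IOException;
import java.io.OutputStream;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

// Copy a range of a snapshotted file to disk. Every ~8kB refill of the
// IndexInput's internal buffer used to cost one readBlob API call; after this
// change a single larger ranged readBlob is consumed across many refills.
class RangeCopier {
    static void copyRange(Directory directory, String name, long position, long bytesToRead,
                          OutputStream out) throws IOException {
        try (IndexInput input = directory.openInput(name, IOContext.READONCE)) {
            input.seek(position);
            final byte[] buffer = new byte[8192];
            for (long remaining = bytesToRead; remaining > 0L; ) {
                final int chunk = Math.toIntExact(Math.min(buffer.length, remaining));
                input.readBytes(buffer, 0, chunk); // refills the internal buffer as needed
                out.write(buffer, 0, chunk);
                remaining -= chunk;
            }
        }
    }
}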

File tree: 6 files changed, +293 −27 lines


plugins/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobContainer.java

Lines changed: 9 additions & 1 deletion
@@ -46,6 +46,8 @@
 import org.elasticsearch.common.blobstore.support.AbstractBlobContainer;
 import org.elasticsearch.common.blobstore.support.PlainBlobMetaData;
 import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.unit.ByteSizeUnit;
+import org.elasticsearch.common.unit.ByteSizeValue;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@@ -88,7 +90,7 @@ public InputStream readBlob(String blobName) throws IOException {
     }
 
     @Override
-    public InputStream readBlob(String blobName, long position, int length) throws IOException {
+    public InputStream readBlob(String blobName, long position, long length) throws IOException {
         if (position < 0L) {
             throw new IllegalArgumentException("position must be non-negative");
         }
@@ -102,6 +104,12 @@ public InputStream readBlob(String blobName, long position, int length) throws IOException {
         }
     }
 
+    @Override
+    public long readBlobPreferredLength() {
+        // This container returns streams that must be fully consumed, so we tell consumers to make bounded requests.
+        return new ByteSizeValue(32, ByteSizeUnit.MB).getBytes();
+    }
+
     /**
      * This implementation ignores the failIfAlreadyExists flag as the S3 API has no way to enforce this due to its weak consistency model.
      */
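
For reference, the bounded hint above works out to 32 * 1024 * 1024 = 33,554,432 bytes. The bound matters because, as the new BlobContainer javadoc below notes, closing an S3 stream before it is fully consumed is not free: the unconsumed remainder must be drained or the connection aborted, so capping the requested length caps the worst-case cost of a read that is abandoned early.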

server/src/main/java/org/elasticsearch/common/blobstore/BlobContainer.java

Lines changed: 19 additions & 1 deletion
@@ -61,7 +61,25 @@ public interface BlobContainer {
      * @throws NoSuchFileException if the blob does not exist
      * @throws IOException if the blob can not be read.
      */
-    default InputStream readBlob(final String blobName, final long position, final int length) throws IOException {
+    default InputStream readBlob(final String blobName, final long position, final long length) throws IOException {
+        throw new UnsupportedOperationException(); // NORELEASE
+    }
+
+    /**
+     * Provides a hint to clients for a suitable length to use with {@link BlobContainer#readBlob(String, long, long)}.
+     *
+     * Some blob containers have nontrivial costs attached to each readBlob call, so it is a good idea for consumers to speculatively
+     * request more data than they need right now and to re-use this stream for future needs if possible.
+     *
+     * Also, some blob containers return streams that are expensive to close before the stream has been fully consumed, and the cost may
+     * depend on the length of the data that was left unconsumed. For these containers it's best to bound the cost of a partial read by
+     * bounding the length of the data requested.
+     *
+     * @return a hint to consumers regarding the length of data to request if there is a good chance that future reads can be satisfied
+     *         from the same stream.
+     */
+    default long readBlobPreferredLength() {
         throw new UnsupportedOperationException(); // NORELEASE
     }
 
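
A sketch of how a consumer might act on this new hint. This is illustrative, not code from the commit: HintedReader, openRange, and blobLength are assumed names; only the two BlobContainer methods added above are real.

import java.io.IOException;
import java.io.InputStream;

import org.elasticsearch.common.blobstore.BlobContainer;

// Request min(preferred, remaining) bytes so the returned stream can serve
// several subsequent small sequential reads before it is exhausted.
class HintedReader {
    static InputStream openRange(BlobContainer container, String blobName,
                                 long position, long blobLength) throws IOException {
        final long length = Math.min(container.readBlobPreferredLength(), blobLength - position);
        return container.readBlob(blobName, position, length);
    }
}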

server/src/main/java/org/elasticsearch/common/blobstore/fs/FsBlobContainer.java

Lines changed: 7 additions & 1 deletion
@@ -153,13 +153,19 @@ public InputStream readBlob(String name) throws IOException {
     }
 
     @Override
-    public InputStream readBlob(String blobName, long position, int length) throws IOException {
+    public InputStream readBlob(String blobName, long position, long length) throws IOException {
         final InputStream inputStream = readBlob(blobName);
         long skipped = inputStream.skip(position); // NORELEASE
         assert skipped == position;
         return org.elasticsearch.common.io.Streams.limitStream(inputStream, length);
     }
 
+    @Override
+    public long readBlobPreferredLength() {
+        // This container returns streams that are cheap to close early, so we can tell consumers to request as much data as possible.
+        return Long.MAX_VALUE;
+    }
+
     @Override
     public void writeBlob(String blobName, InputStream inputStream, long blobSize, boolean failIfAlreadyExists) throws IOException {
         if (failIfAlreadyExists == false) {
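
The filesystem implementation above reduces a ranged read to "skip to position, then limit to length". A quick illustration of those two steps on an in-memory stream, assuming only the Streams.limitStream helper the diff itself calls (SkipAndLimit and range are made-up names):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.elasticsearch.common.io.Streams;

// The same skip-then-limit steps as in FsBlobContainer#readBlob, in isolation.
class SkipAndLimit {
    static InputStream range(byte[] bytes, long position, long length) throws IOException {
        final InputStream in = new ByteArrayInputStream(bytes);
        final long skipped = in.skip(position); // advance to the requested offset
        assert skipped == position;
        return Streams.limitStream(in, length); // cap the bytes a caller may read
    }
}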

x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/index/store/SearchableSnapshotDirectory.java

Lines changed: 3 additions & 1 deletion
@@ -6,6 +6,7 @@
 package org.elasticsearch.index.store;
 
 import org.apache.lucene.store.BaseDirectory;
+import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
@@ -68,7 +69,8 @@ public long fileLength(final String name) throws IOException {
     @Override
     public IndexInput openInput(final String name, final IOContext context) throws IOException {
         ensureOpen();
-        return new SearchableSnapshotIndexInput(blobContainer, fileInfo(name));
+        return new SearchableSnapshotIndexInput(blobContainer, fileInfo(name), blobContainer.readBlobPreferredLength(),
+            BufferedIndexInput.BUFFER_SIZE);
     }
 
     @Override

x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/index/store/SearchableSnapshotIndexInput.java

Lines changed: 165 additions & 17 deletions
@@ -7,9 +7,12 @@
 
 import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.store.IndexInput;
+import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.blobstore.BlobContainer;
+import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo;
 
+import java.io.Closeable;
 import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -32,6 +35,12 @@
  * next read will occur. In the case of a Lucene file snapshotted into multiple parts, this position is used to identify which part must
  * be read at which position (see {@link #readInternal(byte[], int, int)}. This position is also passed over to cloned and sliced input
  * along with the {@link FileInfo} so that they can also track their reading position.
+ *
+ * The {@code sequentialReadSize} constructor parameter configures the {@link SearchableSnapshotIndexInput} to perform a larger read on the
+ * underlying {@link BlobContainer} than it needs in order to fill its internal buffer, on the assumption that the client is reading
+ * sequentially from this file and will consume the rest of this stream in due course. It keeps hold of the partially-consumed
+ * {@link InputStream} in {@code streamForSequentialReads}. Clones and slices, however, do not expect to be read sequentially and so make
+ * a new request to the {@link BlobContainer} each time their internal buffer needs refilling.
  */
 public class SearchableSnapshotIndexInput extends BufferedIndexInput {
 
@@ -41,20 +50,30 @@ public class SearchableSnapshotIndexInput extends BufferedIndexInput {
     private final long length;
 
     private long position;
-    private boolean closed;
+    private volatile boolean closed;
+
+    @Nullable // if not currently reading sequentially
+    private StreamForSequentialReads streamForSequentialReads;
+    private long sequentialReadSize;
+    private static final long NO_SEQUENTIAL_READ_OPTIMIZATION = 0L;
+
 
-    public SearchableSnapshotIndexInput(final BlobContainer blobContainer, final FileInfo fileInfo) {
-        this("SearchableSnapshotIndexInput(" + fileInfo.physicalName() + ")", blobContainer, fileInfo, 0L, 0L, fileInfo.length());
+    SearchableSnapshotIndexInput(final BlobContainer blobContainer, final FileInfo fileInfo, long sequentialReadSize, int bufferSize) {
+        this("SearchableSnapshotIndexInput(" + fileInfo.physicalName() + ")", blobContainer, fileInfo, 0L, 0L, fileInfo.length(),
+            sequentialReadSize, bufferSize);
     }
 
-    private SearchableSnapshotIndexInput(final String resourceDesc, final BlobContainer blobContainer,
-                                         final FileInfo fileInfo, final long position, final long offset, final long length) {
-        super(resourceDesc);
+    private SearchableSnapshotIndexInput(final String resourceDesc, final BlobContainer blobContainer, final FileInfo fileInfo,
+                                         final long position, final long offset, final long length, final long sequentialReadSize,
+                                         final int bufferSize) {
+        super(resourceDesc, bufferSize);
         this.blobContainer = Objects.requireNonNull(blobContainer);
         this.fileInfo = Objects.requireNonNull(fileInfo);
         this.offset = offset;
         this.length = length;
         this.position = position;
+        assert sequentialReadSize >= 0;
+        this.sequentialReadSize = sequentialReadSize;
        this.closed = false;
     }
 
@@ -73,12 +92,12 @@ private void ensureOpen() throws IOException {
     protected void readInternal(byte[] b, int offset, int length) throws IOException {
         ensureOpen();
         if (fileInfo.numberOfParts() == 1L) {
-            readInternalBytes(0L, position, b, offset, length);
+            readInternalBytes(0, position, b, offset, length);
         } else {
             int len = length;
             int off = offset;
             while (len > 0) {
-                long currentPart = position / fileInfo.partSize().getBytes();
+                int currentPart = Math.toIntExact(position / fileInfo.partSize().getBytes());
                 int remainingBytesInPart;
                 if (currentPart < (fileInfo.numberOfParts() - 1)) {
                     remainingBytesInPart = Math.toIntExact(((currentPart + 1L) * fileInfo.partSize().getBytes()) - position);
@@ -93,12 +112,93 @@ protected void readInternal(byte[] b, int offset, int length) throws IOException {
         }
     }
 
-    private void readInternalBytes(final long part, final long pos, byte[] b, int offset, int length) throws IOException {
-        try (InputStream inputStream = blobContainer.readBlob(fileInfo.partName(part), pos, length)) {
-            int read = inputStream.read(b, offset, length);
-            assert read == length;
-            position += read;
+    private void readInternalBytes(final int part, long pos, final byte[] b, int offset, int length) throws IOException {
+        int optimizedReadSize = readOptimized(part, pos, b, offset, length);
+        assert optimizedReadSize <= length;
+        position += optimizedReadSize;
+
+        if (optimizedReadSize < length) {
+            // we did not read everything in an optimized fashion, so read the remainder directly
+            try (InputStream inputStream
+                     = blobContainer.readBlob(fileInfo.partName(part), pos + optimizedReadSize, length - optimizedReadSize)) {
+                final int directReadSize = inputStream.read(b, offset + optimizedReadSize, length - optimizedReadSize);
+                assert optimizedReadSize + directReadSize == length : optimizedReadSize + " and " + directReadSize + " vs " + length;
+                position += directReadSize;
+            }
+        }
+    }
+
+    /**
+     * Attempt to satisfy this read in an optimized fashion using {@code streamForSequentialReadsRef}.
+     * @return the number of bytes read
+     */
+    private int readOptimized(int part, long pos, byte[] b, int offset, int length) throws IOException {
+        if (sequentialReadSize == NO_SEQUENTIAL_READ_OPTIMIZATION) {
+            return 0;
+        }
+
+        int read = 0;
+        if (streamForSequentialReads == null) {
+            // starting a new sequential read
+            read = readFromNewSequentialStream(part, pos, b, offset, length);
+        } else if (streamForSequentialReads.canContinueSequentialRead(part, pos)) {
+            // continuing a sequential read that we started previously
+            read = streamForSequentialReads.read(b, offset, length);
+            if (streamForSequentialReads.isFullyRead()) {
+                // the current stream was exhausted by this read, so it should be closed
+                streamForSequentialReads.close();
+                streamForSequentialReads = null;
+            } else {
+                // the current stream contained enough data for this read and more besides, so we leave it in place
+                assert read == length : length + " remaining";
+            }
+
+            if (read < length) {
+                // the current stream didn't contain enough data for this read, so we must read more
+                read += readFromNewSequentialStream(part, pos + read, b, offset + read, length - read);
+            }
+        } else {
+            // not a sequential read, so stop optimizing for this usage pattern and fall through to the unoptimized behaviour
+            assert streamForSequentialReads.isFullyRead() == false;
+            sequentialReadSize = NO_SEQUENTIAL_READ_OPTIMIZATION;
+            closeStreamForSequentialReads();
         }
+        return read;
+    }
+
+    private void closeStreamForSequentialReads() throws IOException {
+        try {
+            IOUtils.close(streamForSequentialReads);
+        } finally {
+            streamForSequentialReads = null;
+        }
+    }
+
+    /**
+     * If appropriate, open a new stream for sequential reading and satisfy the given read using it.
+     * @return the number of bytes read; if a new stream wasn't opened then nothing was read so the caller should perform the read directly.
+     */
+    private int readFromNewSequentialStream(int part, long pos, byte[] b, int offset, int length) throws IOException {
+
+        assert streamForSequentialReads == null : "should only be called when a new stream is needed";
+        assert sequentialReadSize > 0L : "should only be called if optimizing sequential reads";
+
+        final long streamLength = Math.min(sequentialReadSize, fileInfo.partBytes(part) - pos);
+        if (streamLength <= length) {
+            // streamLength <= length so this single read will consume the entire stream, so there is no need to keep hold of it, so we can
+            // tell the caller to read the data directly
+            return 0;
+        }
+
+        // if we open a stream of length streamLength then it will not be completely consumed by this read, so it is worthwhile to open
+        // it and keep it open for future reads
+        final InputStream inputStream = blobContainer.readBlob(fileInfo.partName(part), pos, streamLength);
+        streamForSequentialReads = new StreamForSequentialReads(inputStream, part, pos, streamLength);
+
+        final int read = streamForSequentialReads.read(b, offset, length);
+        assert read == length : read + " vs " + length;
+        assert streamForSequentialReads.isFullyRead() == false;
+        return read;
     }
 
     @Override
@@ -108,19 +208,30 @@ protected void seekInternal(long pos) throws IOException {
         } else if (pos < 0L) {
             throw new IOException("Seeking to negative position [" + pos + "] for " + toString());
         }
-        this.position = offset + pos;
+        if (position != offset + pos) {
+            position = offset + pos;
+            closeStreamForSequentialReads();
+        }
     }
 
     @Override
     public BufferedIndexInput clone() {
-        return new SearchableSnapshotIndexInput("clone(" + this + ")", blobContainer, fileInfo, position, offset, length);
+        return new SearchableSnapshotIndexInput("clone(" + this + ")", blobContainer, fileInfo, position, offset, length,
+            // Clones might not be closed when they are no longer needed, but we must always close streamForSequentialReads. The simple
+            // solution: do not optimize sequential reads on clones.
+            NO_SEQUENTIAL_READ_OPTIMIZATION,
+            getBufferSize());
     }
 
     @Override
     public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
         if ((offset >= 0L) && (length >= 0L) && (offset + length <= length())) {
-            final SearchableSnapshotIndexInput slice =
-                new SearchableSnapshotIndexInput(sliceDescription, blobContainer, fileInfo, position, this.offset + offset, length);
+            final SearchableSnapshotIndexInput slice = new SearchableSnapshotIndexInput(sliceDescription, blobContainer, fileInfo, position,
+                this.offset + offset, length,
+                // Slices might not be closed when they are no longer needed, but we must always close streamForSequentialReads. The simple
+                // solution: do not optimize sequential reads on slices.
+                NO_SEQUENTIAL_READ_OPTIMIZATION,
+                getBufferSize());
             slice.seek(0L);
             return slice;
         } else {
@@ -132,6 +243,7 @@ public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
     @Override
     public void close() throws IOException {
         closed = true;
+        closeStreamForSequentialReads();
     }
 
     @Override
@@ -144,4 +256,40 @@ public String toString() {
         ", position=" + position +
         '}';
     }
+
+    private static class StreamForSequentialReads implements Closeable {
+        private final InputStream inputStream;
+        private final int part;
+        private long pos; // position within this part
+        private final long maxPos;
+
+        StreamForSequentialReads(InputStream inputStream, int part, long pos, long streamLength) {
+            this.inputStream = Objects.requireNonNull(inputStream);
+            this.part = part;
+            this.pos = pos;
+            this.maxPos = pos + streamLength;
+        }
+
+        boolean canContinueSequentialRead(int part, long pos) {
+            return this.part == part && this.pos == pos;
+        }
+
+        int read(byte[] b, int offset, int length) throws IOException {
+            assert this.pos < maxPos : "should not try and read from a fully-read stream";
+            int read = inputStream.read(b, offset, length);
+            assert read <= length : read + " vs " + length;
+            pos += read;
+            return read;
+        }
+
+        boolean isFullyRead() {
+            assert this.pos <= maxPos;
+            return this.pos >= maxPos;
+        }
+
+        @Override
+        public void close() throws IOException {
+            inputStream.close();
+        }
+    }
 }
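
The heart of the change is the bookkeeping above: keep one partially-consumed stream, continue from it when the next read starts exactly where the stream stopped, and discard it otherwise. Below is a condensed, self-contained model of that state machine, assuming nothing from Elasticsearch beyond the idea itself; all names here (SequentialReadModel, rangedRequests, and so on) are illustrative, and a byte array stands in for the remote blob.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

// A read that continues exactly where the previous one ended re-uses the open
// stream; any other access pattern closes it and opens a fresh ranged request.
class SequentialReadModel {
    private final byte[] blob;           // stands in for the remote blob (fits in memory)
    private final long preferredLength;  // cf. BlobContainer#readBlobPreferredLength()
    private InputStream stream;          // partially-consumed stream, if any
    private long streamPos, streamEnd;   // window of the blob served by the stream
    int rangedRequests;                  // counts simulated readBlob calls

    SequentialReadModel(byte[] blob, long preferredLength) {
        this.blob = blob;
        this.preferredLength = preferredLength;
    }

    int read(long pos, byte[] b, int off, int len) throws IOException {
        if (stream == null || pos != streamPos) {
            closeStream(); // not sequential, so discard any open stream
            final long end = Math.min(pos + preferredLength, blob.length);
            rangedRequests++; // one simulated readBlob(name, pos, end - pos) call
            stream = new ByteArrayInputStream(blob, Math.toIntExact(pos), Math.toIntExact(end - pos));
            streamPos = pos;
            streamEnd = end;
        }
        final int read = stream.read(b, off, Math.toIntExact(Math.min(len, streamEnd - streamPos)));
        streamPos += read;
        if (streamPos == streamEnd) {
            closeStream(); // fully consumed, nothing left to re-use
        }
        return read;
    }

    private void closeStream() throws IOException {
        if (stream != null) {
            stream.close();
            stream = null;
        }
    }
}

Reading 4096 sequential 8kB chunks through this model with a 32MB preferred length issues one ranged request instead of 4096. The real implementation above goes further: once it observes a non-sequential read it disables the optimization for good, and clones and slices are created with NO_SEQUENTIAL_READ_OPTIMIZATION because they may never be closed, whereas the retained stream must always be.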
