
Commit 30b5553

Optimize sequential reads in SearchableSnapshotIndexInput (#51230)
Today `SearchableSnapshotIndexInput` translates each `readBytesInternal` call to one or more calls to `readBlob` on the underlying repository. We make a lot of small `readBytesInternal` calls since they are used to fill a small in-memory buffer. Calls to `readBlob` are expensive: blob storage providers like AWS S3 charge money per API call. A common usage pattern is to take a brand-new `IndexInput`, seek to a particular location, and then sequentially read a substantial amount of data and stream it to disk. This commit optimizes the implementation for that specific usage pattern. Rather than calling `readBlob` each time the internal buffer needs filling we instead request a (potentially much larger) range of the blob and consume the response bit-by-bit as needed by a sequentially-reading client.
1 parent f1b0991 commit 30b5553
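
To make the optimized pattern concrete, here is a minimal sketch of the kind of client this commit helps. It is illustrative only and not part of the commit: the class, method, and variable names (RangeCopier, copyRange, out, bytesToRead) are assumptions, and only Directory/IndexInput come from Lucene.

import java.io.IOException;
import java.io.OutputStream;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

// Copy a range of a snapshotted file to disk. Every ~8kB refill of the
// IndexInput's internal buffer used to cost one readBlob API call; after this
// change a single larger ranged readBlob is consumed across many refills.
class RangeCopier {
    static void copyRange(Directory directory, String name, long position, long bytesToRead,
                          OutputStream out) throws IOException {
        try (IndexInput input = directory.openInput(name, IOContext.READONCE)) {
            input.seek(position);
            final byte[] buffer = new byte[8192];
            for (long remaining = bytesToRead; remaining > 0L; ) {
                final int chunk = Math.toIntExact(Math.min(buffer.length, remaining));
                input.readBytes(buffer, 0, chunk); // refills the internal buffer as needed
                out.write(buffer, 0, chunk);
                remaining -= chunk;
            }
        }
    }
}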

File tree: 6 files changed, +293 −27 lines


plugins/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobContainer.java

Lines changed: 9 additions & 1 deletion
@@ -46,6 +46,8 @@
 import org.elasticsearch.common.blobstore.support.AbstractBlobContainer;
 import org.elasticsearch.common.blobstore.support.PlainBlobMetaData;
 import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.unit.ByteSizeUnit;
+import org.elasticsearch.common.unit.ByteSizeValue;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@@ -88,7 +90,7 @@ public InputStream readBlob(String blobName) throws IOException {
     }
 
     @Override
-    public InputStream readBlob(String blobName, long position, int length) throws IOException {
+    public InputStream readBlob(String blobName, long position, long length) throws IOException {
         if (position < 0L) {
             throw new IllegalArgumentException("position must be non-negative");
         }
@@ -102,6 +104,12 @@ public InputStream readBlob(String blobName, long position, int length) throws IOException {
         }
     }
 
+    @Override
+    public long readBlobPreferredLength() {
+        // This container returns streams that must be fully consumed, so we tell consumers to make bounded requests.
+        return new ByteSizeValue(32, ByteSizeUnit.MB).getBytes();
+    }
+
     /**
      * This implementation ignores the failIfAlreadyExists flag as the S3 API has no way to enforce this due to its weak consistency model.
      */
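
For reference, the bounded hint above works out to 32 * 1024 * 1024 = 33,554,432 bytes. The bound matters because, as the new BlobContainer javadoc below notes, closing an S3 stream before it is fully consumed is not free: the unconsumed remainder must be drained or the connection aborted, so capping the requested length caps the worst-case cost of a read that is abandoned early.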

server/src/main/java/org/elasticsearch/common/blobstore/BlobContainer.java

Lines changed: 19 additions & 1 deletion
@@ -61,7 +61,25 @@ public interface BlobContainer {
      * @throws NoSuchFileException if the blob does not exist
      * @throws IOException if the blob can not be read.
      */
-    default InputStream readBlob(final String blobName, final long position, final int length) throws IOException {
+    default InputStream readBlob(final String blobName, final long position, final long length) throws IOException {
+        throw new UnsupportedOperationException(); // NORELEASE
+    }
+
+    /**
+     * Provides a hint to clients for a suitable length to use with {@link BlobContainer#readBlob(String, long, long)}.
+     *
+     * Some blob containers have nontrivial costs attached to each readBlob call, so it is a good idea for consumers to speculatively
+     * request more data than they need right now and to re-use this stream for future needs if possible.
+     *
+     * Also, some blob containers return streams that are expensive to close before the stream has been fully consumed, and the cost may
+     * depend on the length of the data that was left unconsumed. For these containers it's best to bound the cost of a partial read by
+     * bounding the length of the data requested.
+     *
+     * @return a hint to consumers regarding the length of data to request if there is a good chance that future reads can be satisfied
+     *         from the same stream.
+     */
+    default long readBlobPreferredLength() {
         throw new UnsupportedOperationException(); // NORELEASE
     }
 
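
A sketch of how a consumer might act on this new hint. This is illustrative, not code from the commit: HintedReader, openRange, and blobLength are assumed names; only the two BlobContainer methods added above are real.

import java.io.IOException;
import java.io.InputStream;

import org.elasticsearch.common.blobstore.BlobContainer;

// Request min(preferred, remaining) bytes so the returned stream can serve
// several subsequent small sequential reads before it is exhausted.
class HintedReader {
    static InputStream openRange(BlobContainer container, String blobName,
                                 long position, long blobLength) throws IOException {
        final long length = Math.min(container.readBlobPreferredLength(), blobLength - position);
        return container.readBlob(blobName, position, length);
    }
}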

server/src/main/java/org/elasticsearch/common/blobstore/fs/FsBlobContainer.java

Lines changed: 7 additions & 1 deletion
@@ -153,13 +153,19 @@ public InputStream readBlob(String name) throws IOException {
     }
 
     @Override
-    public InputStream readBlob(String blobName, long position, int length) throws IOException {
+    public InputStream readBlob(String blobName, long position, long length) throws IOException {
         final InputStream inputStream = readBlob(blobName);
         long skipped = inputStream.skip(position); // NORELEASE
         assert skipped == position;
         return org.elasticsearch.common.io.Streams.limitStream(inputStream, length);
     }
 
+    @Override
+    public long readBlobPreferredLength() {
+        // This container returns streams that are cheap to close early, so we can tell consumers to request as much data as possible.
+        return Long.MAX_VALUE;
+    }
+
     @Override
     public void writeBlob(String blobName, InputStream inputStream, long blobSize, boolean failIfAlreadyExists) throws IOException {
         if (failIfAlreadyExists == false) {
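
The filesystem implementation above reduces a ranged read to "skip to position, then limit to length". A quick illustration of those two steps on an in-memory stream, assuming only the Streams.limitStream helper the diff itself calls (SkipAndLimit and range are made-up names):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.elasticsearch.common.io.Streams;

// The same skip-then-limit steps as in FsBlobContainer#readBlob, in isolation.
class SkipAndLimit {
    static InputStream range(byte[] bytes, long position, long length) throws IOException {
        final InputStream in = new ByteArrayInputStream(bytes);
        final long skipped = in.skip(position); // advance to the requested offset
        assert skipped == position;
        return Streams.limitStream(in, length); // cap the bytes a caller may read
    }
}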

x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/index/store/SearchableSnapshotDirectory.java

Lines changed: 3 additions & 1 deletion
@@ -6,6 +6,7 @@
 package org.elasticsearch.index.store;
 
 import org.apache.lucene.store.BaseDirectory;
+import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
@@ -68,7 +69,8 @@ public long fileLength(final String name) throws IOException {
     @Override
     public IndexInput openInput(final String name, final IOContext context) throws IOException {
         ensureOpen();
-        return new SearchableSnapshotIndexInput(blobContainer, fileInfo(name));
+        return new SearchableSnapshotIndexInput(blobContainer, fileInfo(name), blobContainer.readBlobPreferredLength(),
+            BufferedIndexInput.BUFFER_SIZE);
     }
 
     @Override

x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/index/store/SearchableSnapshotIndexInput.java

Lines changed: 165 additions & 17 deletions
@@ -7,9 +7,12 @@
 
 import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.store.IndexInput;
+import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.blobstore.BlobContainer;
+import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo;
 
+import java.io.Closeable;
 import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -32,6 +35,12 @@
  * next read will occur. In the case of a Lucene file snapshotted into multiple parts, this position is used to identify which part must
  * be read at which position (see {@link #readInternal(byte[], int, int)}. This position is also passed over to cloned and sliced input
  * along with the {@link FileInfo} so that they can also track their reading position.
+ *
+ * The {@code sequentialReadSize} constructor parameter configures the {@link SearchableSnapshotIndexInput} to perform a larger read on the
+ * underlying {@link BlobContainer} than it needs in order to fill its internal buffer, on the assumption that the client is reading
+ * sequentially from this file and will consume the rest of this stream in due course. It keeps hold of the partially-consumed
+ * {@link InputStream} in {@code streamForSequentialReads}. Clones and slices, however, do not expect to be read sequentially and so make
+ * a new request to the {@link BlobContainer} each time their internal buffer needs refilling.
  */
 public class SearchableSnapshotIndexInput extends BufferedIndexInput {
 
@@ -41,20 +50,30 @@ public class SearchableSnapshotIndexInput extends BufferedIndexInput {
     private final long length;
 
     private long position;
-    private boolean closed;
+    private volatile boolean closed;
+
+    @Nullable // if not currently reading sequentially
+    private StreamForSequentialReads streamForSequentialReads;
+    private long sequentialReadSize;
+    private static final long NO_SEQUENTIAL_READ_OPTIMIZATION = 0L;
+
 
-    public SearchableSnapshotIndexInput(final BlobContainer blobContainer, final FileInfo fileInfo) {
-        this("SearchableSnapshotIndexInput(" + fileInfo.physicalName() + ")", blobContainer, fileInfo, 0L, 0L, fileInfo.length());
+    SearchableSnapshotIndexInput(final BlobContainer blobContainer, final FileInfo fileInfo, long sequentialReadSize, int bufferSize) {
+        this("SearchableSnapshotIndexInput(" + fileInfo.physicalName() + ")", blobContainer, fileInfo, 0L, 0L, fileInfo.length(),
+            sequentialReadSize, bufferSize);
     }
 
-    private SearchableSnapshotIndexInput(final String resourceDesc, final BlobContainer blobContainer,
-                                         final FileInfo fileInfo, final long position, final long offset, final long length) {
-        super(resourceDesc);
+    private SearchableSnapshotIndexInput(final String resourceDesc, final BlobContainer blobContainer, final FileInfo fileInfo,
+                                         final long position, final long offset, final long length, final long sequentialReadSize,
+                                         final int bufferSize) {
+        super(resourceDesc, bufferSize);
         this.blobContainer = Objects.requireNonNull(blobContainer);
         this.fileInfo = Objects.requireNonNull(fileInfo);
         this.offset = offset;
         this.length = length;
         this.position = position;
+        assert sequentialReadSize >= 0;
+        this.sequentialReadSize = sequentialReadSize;
        this.closed = false;
     }
 
@@ -73,12 +92,12 @@ private void ensureOpen() throws IOException {
     protected void readInternal(byte[] b, int offset, int length) throws IOException {
         ensureOpen();
         if (fileInfo.numberOfParts() == 1L) {
-            readInternalBytes(0L, position, b, offset, length);
+            readInternalBytes(0, position, b, offset, length);
         } else {
             int len = length;
             int off = offset;
             while (len > 0) {
-                long currentPart = position / fileInfo.partSize().getBytes();
+                int currentPart = Math.toIntExact(position / fileInfo.partSize().getBytes());
                 int remainingBytesInPart;
                 if (currentPart < (fileInfo.numberOfParts() - 1)) {
                     remainingBytesInPart = Math.toIntExact(((currentPart + 1L) * fileInfo.partSize().getBytes()) - position);
@@ -93,12 +112,93 @@ protected void readInternal(byte[] b, int offset, int length) throws IOException {
         }
     }
 
-    private void readInternalBytes(final long part, final long pos, byte[] b, int offset, int length) throws IOException {
-        try (InputStream inputStream = blobContainer.readBlob(fileInfo.partName(part), pos, length)) {
-            int read = inputStream.read(b, offset, length);
-            assert read == length;
-            position += read;
+    private void readInternalBytes(final int part, long pos, final byte[] b, int offset, int length) throws IOException {
+        int optimizedReadSize = readOptimized(part, pos, b, offset, length);
+        assert optimizedReadSize <= length;
+        position += optimizedReadSize;
+
+        if (optimizedReadSize < length) {
+            // we did not read everything in an optimized fashion, so read the remainder directly
+            try (InputStream inputStream
+                     = blobContainer.readBlob(fileInfo.partName(part), pos + optimizedReadSize, length - optimizedReadSize)) {
+                final int directReadSize = inputStream.read(b, offset + optimizedReadSize, length - optimizedReadSize);
+                assert optimizedReadSize + directReadSize == length : optimizedReadSize + " and " + directReadSize + " vs " + length;
+                position += directReadSize;
+            }
+        }
+    }
+
+    /**
+     * Attempt to satisfy this read in an optimized fashion using {@code streamForSequentialReadsRef}.
+     * @return the number of bytes read
+     */
+    private int readOptimized(int part, long pos, byte[] b, int offset, int length) throws IOException {
+        if (sequentialReadSize == NO_SEQUENTIAL_READ_OPTIMIZATION) {
+            return 0;
+        }
+
+        int read = 0;
+        if (streamForSequentialReads == null) {
+            // starting a new sequential read
+            read = readFromNewSequentialStream(part, pos, b, offset, length);
+        } else if (streamForSequentialReads.canContinueSequentialRead(part, pos)) {
+            // continuing a sequential read that we started previously
+            read = streamForSequentialReads.read(b, offset, length);
+            if (streamForSequentialReads.isFullyRead()) {
+                // the current stream was exhausted by this read, so it should be closed
+                streamForSequentialReads.close();
+                streamForSequentialReads = null;
+            } else {
+                // the current stream contained enough data for this read and more besides, so we leave it in place
+                assert read == length : length + " remaining";
+            }
+
+            if (read < length) {
+                // the current stream didn't contain enough data for this read, so we must read more
+                read += readFromNewSequentialStream(part, pos + read, b, offset + read, length - read);
+            }
+        } else {
+            // not a sequential read, so stop optimizing for this usage pattern and fall through to the unoptimized behaviour
+            assert streamForSequentialReads.isFullyRead() == false;
+            sequentialReadSize = NO_SEQUENTIAL_READ_OPTIMIZATION;
+            closeStreamForSequentialReads();
         }
+        return read;
+    }
+
+    private void closeStreamForSequentialReads() throws IOException {
+        try {
+            IOUtils.close(streamForSequentialReads);
+        } finally {
+            streamForSequentialReads = null;
+        }
+    }
+
+    /**
+     * If appropriate, open a new stream for sequential reading and satisfy the given read using it.
+     * @return the number of bytes read; if a new stream wasn't opened then nothing was read so the caller should perform the read directly.
+     */
+    private int readFromNewSequentialStream(int part, long pos, byte[] b, int offset, int length) throws IOException {
+
+        assert streamForSequentialReads == null : "should only be called when a new stream is needed";
+        assert sequentialReadSize > 0L : "should only be called if optimizing sequential reads";
+
+        final long streamLength = Math.min(sequentialReadSize, fileInfo.partBytes(part) - pos);
+        if (streamLength <= length) {
+            // streamLength <= length so this single read will consume the entire stream, so there is no need to keep hold of it, so we can
+            // tell the caller to read the data directly
+            return 0;
+        }
+
+        // if we open a stream of length streamLength then it will not be completely consumed by this read, so it is worthwhile to open
+        // it and keep it open for future reads
+        final InputStream inputStream = blobContainer.readBlob(fileInfo.partName(part), pos, streamLength);
+        streamForSequentialReads = new StreamForSequentialReads(inputStream, part, pos, streamLength);
+
+        final int read = streamForSequentialReads.read(b, offset, length);
+        assert read == length : read + " vs " + length;
+        assert streamForSequentialReads.isFullyRead() == false;
+        return read;
     }
 
     @Override
@@ -108,19 +208,30 @@ protected void seekInternal(long pos) throws IOException {
         } else if (pos < 0L) {
             throw new IOException("Seeking to negative position [" + pos + "] for " + toString());
         }
-        this.position = offset + pos;
+        if (position != offset + pos) {
+            position = offset + pos;
+            closeStreamForSequentialReads();
+        }
     }
 
     @Override
     public BufferedIndexInput clone() {
-        return new SearchableSnapshotIndexInput("clone(" + this + ")", blobContainer, fileInfo, position, offset, length);
+        return new SearchableSnapshotIndexInput("clone(" + this + ")", blobContainer, fileInfo, position, offset, length,
+            // Clones might not be closed when they are no longer needed, but we must always close streamForSequentialReads. The simple
+            // solution: do not optimize sequential reads on clones.
+            NO_SEQUENTIAL_READ_OPTIMIZATION,
+            getBufferSize());
     }
 
     @Override
     public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
         if ((offset >= 0L) && (length >= 0L) && (offset + length <= length())) {
-            final SearchableSnapshotIndexInput slice =
-                new SearchableSnapshotIndexInput(sliceDescription, blobContainer, fileInfo, position, this.offset + offset, length);
+            final SearchableSnapshotIndexInput slice = new SearchableSnapshotIndexInput(sliceDescription, blobContainer, fileInfo, position,
+                this.offset + offset, length,
+                // Slices might not be closed when they are no longer needed, but we must always close streamForSequentialReads. The simple
+                // solution: do not optimize sequential reads on slices.
+                NO_SEQUENTIAL_READ_OPTIMIZATION,
+                getBufferSize());
             slice.seek(0L);
             return slice;
         } else {
@@ -132,6 +243,7 @@ public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
     @Override
     public void close() throws IOException {
         closed = true;
+        closeStreamForSequentialReads();
     }
 
     @Override
@@ -144,4 +256,40 @@ public String toString() {
         ", position=" + position +
         '}';
     }
+
+    private static class StreamForSequentialReads implements Closeable {
+        private final InputStream inputStream;
+        private final int part;
+        private long pos; // position within this part
+        private final long maxPos;
+
+        StreamForSequentialReads(InputStream inputStream, int part, long pos, long streamLength) {
+            this.inputStream = Objects.requireNonNull(inputStream);
+            this.part = part;
+            this.pos = pos;
+            this.maxPos = pos + streamLength;
+        }
+
+        boolean canContinueSequentialRead(int part, long pos) {
+            return this.part == part && this.pos == pos;
+        }
+
+        int read(byte[] b, int offset, int length) throws IOException {
+            assert this.pos < maxPos : "should not try and read from a fully-read stream";
+            int read = inputStream.read(b, offset, length);
+            assert read <= length : read + " vs " + length;
+            pos += read;
+            return read;
+        }
+
+        boolean isFullyRead() {
+            assert this.pos <= maxPos;
+            return this.pos >= maxPos;
+        }
+
+        @Override
+        public void close() throws IOException {
+            inputStream.close();
+        }
+    }
 }
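
The heart of the change is the bookkeeping above: keep one partially-consumed stream, continue from it when the next read starts exactly where the stream stopped, and discard it otherwise. Below is a condensed, self-contained model of that state machine, assuming nothing from Elasticsearch beyond the idea itself; all names here (SequentialReadModel, rangedRequests, and so on) are illustrative, and a byte array stands in for the remote blob.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

// A read that continues exactly where the previous one ended re-uses the open
// stream; any other access pattern closes it and opens a fresh ranged request.
class SequentialReadModel {
    private final byte[] blob;           // stands in for the remote blob (fits in memory)
    private final long preferredLength;  // cf. BlobContainer#readBlobPreferredLength()
    private InputStream stream;          // partially-consumed stream, if any
    private long streamPos, streamEnd;   // window of the blob served by the stream
    int rangedRequests;                  // counts simulated readBlob calls

    SequentialReadModel(byte[] blob, long preferredLength) {
        this.blob = blob;
        this.preferredLength = preferredLength;
    }

    int read(long pos, byte[] b, int off, int len) throws IOException {
        if (stream == null || pos != streamPos) {
            closeStream(); // not sequential, so discard any open stream
            final long end = Math.min(pos + preferredLength, blob.length);
            rangedRequests++; // one simulated readBlob(name, pos, end - pos) call
            stream = new ByteArrayInputStream(blob, Math.toIntExact(pos), Math.toIntExact(end - pos));
            streamPos = pos;
            streamEnd = end;
        }
        final int read = stream.read(b, off, Math.toIntExact(Math.min(len, streamEnd - streamPos)));
        streamPos += read;
        if (streamPos == streamEnd) {
            closeStream(); // fully consumed, nothing left to re-use
        }
        return read;
    }

    private void closeStream() throws IOException {
        if (stream != null) {
            stream.close();
            stream = null;
        }
    }
}

Reading 4096 sequential 8kB chunks through this model with a 32MB preferred length issues one ranged request instead of 4096. The real implementation above goes further: once it observes a non-sequential read it disables the optimization for good, and clones and slices are created with NO_SEQUENTIAL_READ_OPTIMIZATION because they may never be closed, whereas the retained stream must always be.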
