Adds average document size to DocsStats #27117


Merged: 7 commits, merged on Oct 28, 2017
Changes from 2 commits
29 changes: 24 additions & 5 deletions core/src/main/java/org/elasticsearch/index/shard/DocsStats.java
@@ -19,6 +19,7 @@

package org.elasticsearch.index.shard;

+import org.elasticsearch.Version;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
@@ -31,22 +32,30 @@ public class DocsStats implements Streamable, ToXContentFragment {

    long count = 0;
    long deleted = 0;
+    long averageSizeInBytes = 0;

    public DocsStats() {

    }

-    public DocsStats(long count, long deleted) {
+    public DocsStats(long count, long deleted, long averageSizeInBytes) {
        this.count = count;
        this.deleted = deleted;
+        this.averageSizeInBytes = averageSizeInBytes;
    }

-    public void add(DocsStats docsStats) {
-        if (docsStats == null) {
+    public void add(DocsStats that) {
Member: Personally that is a bit too close to this for typo avoidance, so I'd prefer other, but it's so minor that it's totally up to you.

Member Author: I also prefer other over that (addressed in 6cca080), but I used that in order to keep the same indentation for this expression (since removed):

-        long totalBytes = this.averageSizeInBytes * (this.count + this.deleted)
-                        + that.averageSizeInBytes * (that.count + that.deleted);

+        if (that == null) {
            return;
        }
-        count += docsStats.count;
-        deleted += docsStats.deleted;
+        long totalBytes = this.averageSizeInBytes * (this.count + this.deleted)
+                        + that.averageSizeInBytes * (that.count + that.deleted);
+        long totalDocs = this.count + this.deleted + that.count + that.deleted;
+        if (totalDocs > 0) {
+            this.averageSizeInBytes = totalBytes / totalDocs;
+        }
+        this.count += that.count;
+        this.deleted += that.deleted;
    }
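As a worked illustration of this merge (numbers borrowed from DocsStatsTests below; the long division truncates):

    DocsStats stats = new DocsStats(10, 2, 10);   // 12 docs (10 live + 2 deleted) averaging 10 bytes
    stats.add(new DocsStats(8, 30, 900));         // 38 docs averaging 900 bytes
    // totalBytes = 10 * 12 + 900 * 38 = 34320 and totalDocs = 12 + 38 = 50,
    // so getAverageSizeInBytes() now returns 34320 / 50 = 686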

    public long getCount() {
@@ -57,16 +66,26 @@ public long getDeleted() {
        return this.deleted;
    }

+    public long getAverageSizeInBytes() {
Contributor: Can you add javadocs?

Member Author: Done.

+        return averageSizeInBytes;
+    }

    @Override
    public void readFrom(StreamInput in) throws IOException {
        count = in.readVLong();
        deleted = in.readVLong();
+        if (in.getVersion().onOrAfter(Version.V_6_1_0)) {
Contributor: You might want to make it Version.V_7_0_0 for now and change it back after this change is backported, in order not to cause failures in the multi-version cluster QA tests.

Member Author: Thanks for the hint. I temporarily made this v7-only (f38e957).
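Concretely, that temporary arrangement would presumably look like the sketch below (the only change from the committed code is the version constant, per the f38e957 reference above):

    if (in.getVersion().onOrAfter(Version.V_7_0_0)) { // temporary; restored to V_6_1_0 once the 6.x backport lands
        averageSizeInBytes = in.readVLong();
    }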

+            averageSizeInBytes = in.readVLong();
+        }
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeVLong(count);
        out.writeVLong(deleted);
+        if (out.getVersion().onOrAfter(Version.V_6_1_0)) {
+            out.writeVLong(averageSizeInBytes);
+        }
    }
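To illustrate what the version gate buys (a sketch only; the peer version V_5_6_0 is an arbitrary pre-6.1 stand-in, and stats is any DocsStats instance), a mixed-version round trip simply omits the new field and the reader keeps its default:

    BytesStreamOutput out = new BytesStreamOutput();
    out.setVersion(Version.V_5_6_0);            // pretend the receiver is a pre-6.1 node
    stats.writeTo(out);                         // averageSizeInBytes is not written
    StreamInput in = out.bytes().streamInput();
    in.setVersion(Version.V_5_6_0);
    DocsStats read = new DocsStats();
    read.readFrom(in);                          // averageSizeInBytes stays at its default of 0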

@Override
@@ -880,8 +880,15 @@ public FlushStats flushStats() {
}

public DocsStats docStats() {
Contributor: I think this should rather be something like this:

    public DocsStats docStats() {
        long numDocs = 0;
        long numDeletedDocs = 0;
        long sizeInBytes = 0;
        List<Segment> segments = segments(false);
        for (Segment segment : segments) {
            if (segment.search) {
                numDocs += segment.getNumDocs();
                numDeletedDocs += segment.getDeletedDocs();
                sizeInBytes += segment.getSizeInBytes();
            }
        }
        return new DocsStats(numDocs, numDeletedDocs, sizeInBytes);
    }

That way we maintain a consistent total, we can calculate the average at read time, and aggregation of doc stats will be much simpler. I also think we should make sure the size in bytes is based on the currently used reader, which is guaranteed by the Segment#search flag. WDYT?

Member Author: Yes, this makes the DocsStats simpler and the average value more accurate. I have updated this in 662f062.
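A rough sketch of that direction (the field and method bodies below are assumptions, not the exact code from 662f062): DocsStats carries the summed segment size and derives the average on read, so add() degenerates to plain sums:

    private long totalSizeInBytes; // hypothetical field replacing averageSizeInBytes

    public long getAverageSizeInBytes() {
        final long totalDocs = count + deleted;
        return totalDocs == 0 ? 0 : totalSizeInBytes / totalDocs; // derived at read time
    }

    public void add(DocsStats that) {
        if (that == null) {
            return;
        }
        this.totalSizeInBytes += that.totalSizeInBytes; // totals just accumulate
        this.count += that.count;
        this.deleted += that.deleted;
    }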

+        long totalDocsInSegments = 0L;
+        long totalBytesInSegments = 0L;
+        for (Segment segment : segments(false)) {
+            totalDocsInSegments += segment.getNumDocs();
+            totalBytesInSegments += segment.getSizeInBytes();
+        }
+        final long avgDocSize = totalDocsInSegments > 0 ? totalBytesInSegments / totalDocsInSegments : 0;
        try (Engine.Searcher searcher = acquireSearcher("doc_stats")) {
-            return new DocsStats(searcher.reader().numDocs(), searcher.reader().numDeletedDocs());
+            return new DocsStats(searcher.reader().numDocs(), searcher.reader().numDeletedDocs(), avgDocSize);
        }
    }

@@ -82,12 +82,12 @@ public void testEvaluateConditions() throws Exception {
            .settings(settings)
            .build();
        final HashSet<Condition> conditions = Sets.newHashSet(maxDocsCondition, maxAgeCondition);
-        Set<Condition.Result> results = evaluateConditions(conditions, new DocsStats(matchMaxDocs, 0L), metaData);
+        Set<Condition.Result> results = evaluateConditions(conditions, new DocsStats(matchMaxDocs, 0L, between(1, 10000)), metaData);
        assertThat(results.size(), equalTo(2));
        for (Condition.Result result : results) {
            assertThat(result.matched, equalTo(true));
        }
-        results = evaluateConditions(conditions, new DocsStats(notMatchMaxDocs, 0), metaData);
+        results = evaluateConditions(conditions, new DocsStats(notMatchMaxDocs, 0, between(1, 10000)), metaData);
        assertThat(results.size(), equalTo(2));
        for (Condition.Result result : results) {
            if (result.condition instanceof MaxAgeCondition) {
@@ -213,10 +213,10 @@ public void testCreateIndexRequest() throws Exception {

    private IndicesStatsResponse createIndecesStatResponse(long totalDocs, long primaryDocs) {
        final CommonStats primaryStats = mock(CommonStats.class);
-        when(primaryStats.getDocs()).thenReturn(new DocsStats(primaryDocs, 0));
+        when(primaryStats.getDocs()).thenReturn(new DocsStats(primaryDocs, 0, between(1, 10000)));

        final CommonStats totalStats = mock(CommonStats.class);
-        when(totalStats.getDocs()).thenReturn(new DocsStats(totalDocs, 0));
+        when(totalStats.getDocs()).thenReturn(new DocsStats(totalDocs, 0, between(1, 10000)));

        final IndicesStatsResponse response = mock(IndicesStatsResponse.class);
        when(response.getPrimaries()).thenReturn(primaryStats);
@@ -73,7 +73,7 @@ public void testErrorCondition() {
        assertTrue(
            expectThrows(IllegalStateException.class, () ->
                TransportShrinkAction.prepareCreateIndexRequest(new ShrinkRequest("target", "source"), state,
-                    (i) -> new DocsStats(Integer.MAX_VALUE, randomIntBetween(1, 1000)), new IndexNameExpressionResolver(Settings.EMPTY))
+                    (i) -> new DocsStats(Integer.MAX_VALUE, between(1, 1000), between(1, 100)), new IndexNameExpressionResolver(Settings.EMPTY))
            ).getMessage().startsWith("Can't merge index with more than [2147483519] docs - too many documents in shards "));


@@ -84,7 +84,7 @@ public void testErrorCondition() {
            ClusterState clusterState = createClusterState("source", 8, 1,
                Settings.builder().put("index.blocks.write", true).build());
            TransportShrinkAction.prepareCreateIndexRequest(req, clusterState,
-                (i) -> i == 2 || i == 3 ? new DocsStats(Integer.MAX_VALUE/2, randomIntBetween(1, 1000)) : null,
+                (i) -> i == 2 || i == 3 ? new DocsStats(Integer.MAX_VALUE / 2, between(1, 1000), between(1, 10000)) : null,
                new IndexNameExpressionResolver(Settings.EMPTY));
            }
        ).getMessage().startsWith("Can't merge index with more than [2147483519] docs - too many documents in shards "));
@@ -106,7 +106,7 @@ public void testErrorCondition() {
        clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

        TransportShrinkAction.prepareCreateIndexRequest(new ShrinkRequest("target", "source"), clusterState,
-            (i) -> new DocsStats(randomIntBetween(1, 1000), randomIntBetween(1, 1000)), new IndexNameExpressionResolver(Settings.EMPTY));
+            (i) -> new DocsStats(between(1, 1000), between(1, 1000), between(0, 10000)), new IndexNameExpressionResolver(Settings.EMPTY));
}

public void testShrinkIndexSettings() {
@@ -128,7 +128,7 @@ public void testShrinkIndexSettings() {
            routingTable.index(indexName).shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
        clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
        int numSourceShards = clusterState.metaData().index(indexName).getNumberOfShards();
-        DocsStats stats = new DocsStats(randomIntBetween(0, (IndexWriter.MAX_DOCS) / numSourceShards), randomIntBetween(1, 1000));
+        DocsStats stats = new DocsStats(between(0, (IndexWriter.MAX_DOCS) / numSourceShards), between(1, 1000), between(1, 10000));
        ShrinkRequest target = new ShrinkRequest("target", indexName);
        final ActiveShardCount activeShardCount = randomBoolean() ? ActiveShardCount.ALL : ActiveShardCount.ONE;
        target.setWaitForActiveShards(activeShardCount);
@@ -0,0 +1,71 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.shard;

import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.test.ESTestCase;

import static org.hamcrest.Matchers.equalTo;

public class DocsStatsTests extends ESTestCase {

    public void testCalculateAverageDocSize() throws Exception {
        DocsStats stats = new DocsStats(10, 2, 10);
        assertThat(stats.getAverageSizeInBytes(), equalTo(10L));

        stats.add(new DocsStats(0, 0, randomNonNegativeLong()));
        assertThat(stats.getAverageSizeInBytes(), equalTo(10L));

        // (38*900 + 12*10) / 50 = 686L
        stats.add(new DocsStats(8, 30, 900));
        assertThat(stats.getCount(), equalTo(18L));
        assertThat(stats.getDeleted(), equalTo(32L));
        assertThat(stats.getAverageSizeInBytes(), equalTo(686L));

        // (50*686 + 40*120) / 90 = 434L
        stats.add(new DocsStats(0, 40, 120));
        assertThat(stats.getCount(), equalTo(18L));
        assertThat(stats.getDeleted(), equalTo(72L));
        assertThat(stats.getAverageSizeInBytes(), equalTo(434L));

        // (90*434 + 35*99) / 125 = 340L
        stats.add(new DocsStats(35, 0, 99));
        assertThat(stats.getCount(), equalTo(53L));
        assertThat(stats.getDeleted(), equalTo(72L));
        assertThat(stats.getAverageSizeInBytes(), equalTo(340L));
    }

    public void testSerialize() throws Exception {
        DocsStats originalStats = new DocsStats(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong());
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            originalStats.writeTo(out);
            BytesReference bytes = out.bytes();
            try (StreamInput in = bytes.streamInput()) {
                DocsStats cloneStats = new DocsStats();
                cloneStats.readFrom(in);
                assertThat(cloneStats.getCount(), equalTo(originalStats.getCount()));
                assertThat(cloneStats.getDeleted(), equalTo(originalStats.getDeleted()));
                assertThat(cloneStats.getAverageSizeInBytes(), equalTo(originalStats.getAverageSizeInBytes()));
            }
        }
    }
}
@@ -67,6 +67,7 @@
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.index.VersionType;
@@ -150,6 +151,7 @@
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.hasToString;
import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;

@@ -2227,6 +2229,7 @@ public void testDocStats() throws IOException {
            final DocsStats docsStats = indexShard.docStats();
            assertThat(docsStats.getCount(), equalTo(numDocs));
            assertThat(docsStats.getDeleted(), equalTo(0L));
+            assertThat(docsStats.getAverageSizeInBytes(), greaterThan(0L));
        }

final List<Integer> ids = randomSubsetOf(
@@ -2263,12 +2266,67 @@ public void testDocStats() throws IOException {
                final DocsStats docStats = indexShard.docStats();
                assertThat(docStats.getCount(), equalTo(numDocs));
                assertThat(docStats.getDeleted(), equalTo(0L));
+                assertThat(docStats.getAverageSizeInBytes(), greaterThan(0L));
            }
        } finally {
            closeShards(indexShard);
        }
    }

    public void testEstimateAverageDocSize() throws Exception {
        IndexShard indexShard = null;
        try {
            indexShard = newStartedShard(true);
            int smallDocNum = randomIntBetween(5, 100);
            for (int i = 0; i < smallDocNum; i++) {
                indexDoc(indexShard, "test", "small-" + i);
Contributor: I think we've been trying to use doc as a type name in new tests whenever possible. Can you rename?

Contributor: ++

Member Author: Done.

            }
            // Average document size is estimated by sampling segments, thus it should be zero without flushing.
            DocsStats withoutFlush = indexShard.docStats();
            assertThat(withoutFlush.averageSizeInBytes, equalTo(0L));

            indexShard.flush(new FlushRequest());
            indexShard.refresh("test");
            DocsStats smallStats = indexShard.docStats();
            assertThat(smallStats.averageSizeInBytes, greaterThan(10L));

            long storedAvgSize = indexShard.storeStats().sizeInBytes() / smallDocNum;
            assertThat("Estimated average document size is too small compared with the average stored size",
                smallStats.averageSizeInBytes, greaterThanOrEqualTo(storedAvgSize * 80/100));
            assertThat("Estimated average document size is too large compared with the average stored size",
                smallStats.averageSizeInBytes, lessThanOrEqualTo(storedAvgSize * 120/100));

            // Indexing large documents should increase the average document size.
            int largeDocNum = randomIntBetween(100, 200);
            for (int i = 0; i < largeDocNum; i++) {
                String doc = XContentFactory.jsonBuilder()
                    .startObject()
                    .field("count", randomInt())
                    .field("point", randomFloat())
                    .field("description", randomUnicodeOfCodepointLength(100))
                    .endObject().string();
                indexDoc(indexShard, "test", "large-" + i, doc);
            }
            indexShard.flush(new FlushRequest());
            indexShard.refresh("test");
            DocsStats largeStats = indexShard.docStats();
            assertThat(largeStats.averageSizeInBytes, greaterThan(100L));
            assertThat(largeStats.averageSizeInBytes, greaterThan(smallStats.averageSizeInBytes));

            int deleteDocs = randomIntBetween(1, smallDocNum / 2);
            for (int i = 0; i < deleteDocs; i++) {
                deleteDoc(indexShard, "test", "small-" + i);
            }
            indexShard.flush(new FlushRequest());
            indexShard.refresh("test");
            DocsStats withDeletedStats = indexShard.docStats();
            assertThat(withDeletedStats.averageSizeInBytes, greaterThan(largeStats.averageSizeInBytes));

        } finally {
            closeShards(indexShard);
        }
    }

/**
* here we are simulating the scenario that happens when we do async shard fetching from GatewaySerivce while we are finishing
* a recovery and concurrently clean files. This should always be possible without any exception. Yet there was a bug where IndexShard