-
Notifications
You must be signed in to change notification settings - Fork 25.2k
Adds average document size to DocsStats #27117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
b964ac8
a381d98
f38e957
662f062
6cca080
96e9be7
ca3023b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
|
||
package org.elasticsearch.index.shard; | ||
|
||
import org.elasticsearch.Version; | ||
import org.elasticsearch.common.io.stream.StreamInput; | ||
import org.elasticsearch.common.io.stream.StreamOutput; | ||
import org.elasticsearch.common.io.stream.Streamable; | ||
|
@@ -31,22 +32,30 @@ public class DocsStats implements Streamable, ToXContentFragment { | |
|
||
long count = 0; | ||
long deleted = 0; | ||
long averageSizeInBytes = 0; | ||
|
||
public DocsStats() { | ||
|
||
} | ||
|
||
public DocsStats(long count, long deleted) { | ||
public DocsStats(long count, long deleted, long averageSizeInBytes) { | ||
this.count = count; | ||
this.deleted = deleted; | ||
this.averageSizeInBytes = averageSizeInBytes; | ||
} | ||
|
||
public void add(DocsStats docsStats) { | ||
if (docsStats == null) { | ||
public void add(DocsStats that) { | ||
if (that == null) { | ||
return; | ||
} | ||
count += docsStats.count; | ||
deleted += docsStats.deleted; | ||
long totalBytes = this.averageSizeInBytes * (this.count + this.deleted) | ||
+ that.averageSizeInBytes * (that.count + that.deleted); | ||
long totalDocs = this.count + this.deleted + that.count + that.deleted; | ||
if (totalDocs > 0) { | ||
this.averageSizeInBytes = totalBytes / totalDocs; | ||
} | ||
this.count += that.count; | ||
this.deleted += that.deleted; | ||
} | ||
|
||
public long getCount() { | ||
|
@@ -57,16 +66,26 @@ public long getDeleted() { | |
return this.deleted; | ||
} | ||
|
||
public long getAverageSizeInBytes() { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add javadocs? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
return averageSizeInBytes; | ||
} | ||
|
||
@Override | ||
public void readFrom(StreamInput in) throws IOException { | ||
count = in.readVLong(); | ||
deleted = in.readVLong(); | ||
if (in.getVersion().onOrAfter(Version.V_6_1_0)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You might want to make it … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the hint. I temporarily made this for v7 only (f38e957) |
||
averageSizeInBytes = in.readVLong(); | ||
} | ||
} | ||
|
||
@Override | ||
public void writeTo(StreamOutput out) throws IOException { | ||
out.writeVLong(count); | ||
out.writeVLong(deleted); | ||
if (out.getVersion().onOrAfter(Version.V_6_1_0)) { | ||
out.writeVLong(averageSizeInBytes); | ||
} | ||
} | ||
|
||
@Override | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -880,8 +880,15 @@ public FlushStats flushStats() { | |
} | ||
|
||
public DocsStats docStats() { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this should rather be something like this: public DocsStats docStats() {
long numDocs = 0;
long numDeletedDocs = 0;
long sizeInByte = 0;
List<Segment> segments = segments(false);
for (Segment segment : segments) {
if (segment.search) {
numDocs += segment.getNumDocs();
numDeletedDocs += segment.getDeletedDocs();
sizeInByte += segment.getSizeInBytes();
}
}
return new DocsStats(numDocs, numDeletedDocs, sizeInByte);
} that way we maintain a consistent total, we can calculate the average at read time, and aggregation of doc stats will be much simpler. I also think we should make sure the size in bytes is based on the currently used reader, which is guaranteed by the … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this makes the DocsStats simpler and the average value more accurate. I have updated this in 662f062 |
||
long totalDocsInSegments = 0L; | ||
long totalBytesInSegments = 0L; | ||
for (Segment segment : segments(false)) { | ||
totalDocsInSegments += segment.getNumDocs(); | ||
totalBytesInSegments += segment.getSizeInBytes(); | ||
} | ||
final long avgDocSize = totalDocsInSegments > 0 ? totalBytesInSegments / totalDocsInSegments : 0; | ||
try (Engine.Searcher searcher = acquireSearcher("doc_stats")) { | ||
return new DocsStats(searcher.reader().numDocs(), searcher.reader().numDeletedDocs()); | ||
return new DocsStats(searcher.reader().numDocs(), searcher.reader().numDeletedDocs(), avgDocSize); | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.index.shard; | ||
|
||
import org.elasticsearch.common.bytes.BytesReference; | ||
import org.elasticsearch.common.io.stream.BytesStreamOutput; | ||
import org.elasticsearch.common.io.stream.StreamInput; | ||
import org.elasticsearch.test.ESTestCase; | ||
|
||
import static org.hamcrest.Matchers.equalTo; | ||
|
||
public class DocsStatsTests extends ESTestCase { | ||
|
||
public void testCalculateAverageDocSize() throws Exception { | ||
DocsStats stats = new DocsStats(10, 2, 10); | ||
assertThat(stats.getAverageSizeInBytes(), equalTo(10L)); | ||
|
||
stats.add(new DocsStats(0, 0, randomNonNegativeLong())); | ||
assertThat(stats.getAverageSizeInBytes(), equalTo(10L)); | ||
|
||
// (38*900 + 12*10) / 50 = 686L | ||
stats.add(new DocsStats(8, 30, 900)); | ||
assertThat(stats.getCount(), equalTo(18L)); | ||
assertThat(stats.getDeleted(), equalTo(32L)); | ||
assertThat(stats.getAverageSizeInBytes(), equalTo(686L)); | ||
|
||
// (50*686 + 40*120) / 90 = 434L | ||
stats.add(new DocsStats(0, 40, 120)); | ||
assertThat(stats.getCount(), equalTo(18L)); | ||
assertThat(stats.getDeleted(), equalTo(72L)); | ||
assertThat(stats.getAverageSizeInBytes(), equalTo(434L)); | ||
|
||
// (90*434 + 35*99) / 125 = 340L | ||
stats.add(new DocsStats(35, 0, 99)); | ||
assertThat(stats.getCount(), equalTo(53L)); | ||
assertThat(stats.getDeleted(), equalTo(72L)); | ||
assertThat(stats.getAverageSizeInBytes(), equalTo(340L)); | ||
} | ||
|
||
public void testSerialize() throws Exception { | ||
DocsStats originalStats = new DocsStats(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()); | ||
try (BytesStreamOutput out = new BytesStreamOutput()) { | ||
originalStats.writeTo(out); | ||
BytesReference bytes = out.bytes(); | ||
try (StreamInput in = bytes.streamInput()) { | ||
DocsStats cloneStats = new DocsStats(); | ||
cloneStats.readFrom(in); | ||
assertThat(cloneStats.getCount(), equalTo(originalStats.getCount())); | ||
assertThat(cloneStats.getDeleted(), equalTo(originalStats.getDeleted())); | ||
assertThat(cloneStats.getAverageSizeInBytes(), equalTo(originalStats.getAverageSizeInBytes())); | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -67,6 +67,7 @@ | |
import org.elasticsearch.common.util.concurrent.ConcurrentCollections; | ||
import org.elasticsearch.common.xcontent.NamedXContentRegistry; | ||
import org.elasticsearch.common.xcontent.XContentBuilder; | ||
import org.elasticsearch.common.xcontent.XContentFactory; | ||
import org.elasticsearch.common.xcontent.XContentType; | ||
import org.elasticsearch.env.NodeEnvironment; | ||
import org.elasticsearch.index.VersionType; | ||
|
@@ -150,6 +151,7 @@ | |
import static org.hamcrest.Matchers.hasSize; | ||
import static org.hamcrest.Matchers.hasToString; | ||
import static org.hamcrest.Matchers.instanceOf; | ||
import static org.hamcrest.Matchers.lessThanOrEqualTo; | ||
import static org.hamcrest.Matchers.notNullValue; | ||
import static org.hamcrest.Matchers.nullValue; | ||
|
||
|
@@ -2227,6 +2229,7 @@ public void testDocStats() throws IOException { | |
final DocsStats docsStats = indexShard.docStats(); | ||
assertThat(docsStats.getCount(), equalTo(numDocs)); | ||
assertThat(docsStats.getDeleted(), equalTo(0L)); | ||
assertThat(docsStats.getAverageSizeInBytes(), greaterThan(0L)); | ||
} | ||
|
||
final List<Integer> ids = randomSubsetOf( | ||
|
@@ -2263,12 +2266,67 @@ public void testDocStats() throws IOException { | |
final DocsStats docStats = indexShard.docStats(); | ||
assertThat(docStats.getCount(), equalTo(numDocs)); | ||
assertThat(docStats.getDeleted(), equalTo(0L)); | ||
assertThat(docStats.getAverageSizeInBytes(), greaterThan(0L)); | ||
} | ||
} finally { | ||
closeShards(indexShard); | ||
} | ||
} | ||
|
||
public void testEstimateAverageDocSize() throws Exception { | ||
IndexShard indexShard = null; | ||
try { | ||
indexShard = newStartedShard(true); | ||
int smallDocNum = randomIntBetween(5, 100); | ||
for (int i = 0; i < smallDocNum; i++) { | ||
indexDoc(indexShard, "test", "small-" + i); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we've been trying to use … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ++ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
} | ||
// Average document size is estimated by sampling segments, thus it should be zero without flushing. | ||
DocsStats withoutFlush = indexShard.docStats(); | ||
assertThat(withoutFlush.averageSizeInBytes, equalTo(0L)); | ||
|
||
indexShard.flush(new FlushRequest()); | ||
indexShard.refresh("test"); | ||
DocsStats smallStats = indexShard.docStats(); | ||
assertThat(smallStats.averageSizeInBytes, greaterThan(10L)); | ||
|
||
long storedAvgSize = indexShard.storeStats().sizeInBytes() / smallDocNum; | ||
assertThat("Estimated average document size is too small compared with the average stored size", | ||
smallStats.averageSizeInBytes, greaterThanOrEqualTo(storedAvgSize * 80/100)); | ||
assertThat("Estimated average document size is too large compared with the average stored size", | ||
smallStats.averageSizeInBytes, lessThanOrEqualTo(storedAvgSize * 120/100)); | ||
|
||
// Indexing large documents should increase the average document size. | ||
int largeDocNum = randomIntBetween(100, 200); | ||
for (int i = 0; i < largeDocNum; i++) { | ||
String doc = XContentFactory.jsonBuilder() | ||
.startObject() | ||
.field("count", randomInt()) | ||
.field("point", randomFloat()) | ||
.field("description", randomUnicodeOfCodepointLength(100)) | ||
.endObject().string(); | ||
indexDoc(indexShard, "test", "large-" + i, doc); | ||
} | ||
indexShard.flush(new FlushRequest()); | ||
indexShard.refresh("test"); | ||
DocsStats largeStats = indexShard.docStats(); | ||
assertThat(largeStats.averageSizeInBytes, greaterThan(100L)); | ||
assertThat(largeStats.averageSizeInBytes, greaterThan(smallStats.averageSizeInBytes)); | ||
|
||
int deleteDocs = randomIntBetween(1, smallDocNum / 2); | ||
for (int i = 0; i < deleteDocs; i++) { | ||
deleteDoc(indexShard, "test", "small-" + i); | ||
} | ||
indexShard.flush(new FlushRequest()); | ||
indexShard.refresh("test"); | ||
DocsStats withDeletedStats = indexShard.docStats(); | ||
assertThat(withDeletedStats.averageSizeInBytes, greaterThan(largeStats.averageSizeInBytes)); | ||
|
||
} finally { | ||
closeShards(indexShard); | ||
} | ||
} | ||
|
||
/** | ||
* here we are simulating the scenario that happens when we do async shard fetching from GatewaySerivce while we are finishing | ||
* a recovery and concurrently clean files. This should always be possible without any exception. Yet there was a bug where IndexShard | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Personally, `that` is a bit too close to `this` for typo avoidance, so I'd prefer `other`, but it's so minor that it's totally up to you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I also prefer `other` over `that` (addressed in 6cca080), but I used `that` in order to have the same indentation for this expression (removed).