diff --git a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc index 0a8a46a0b67b6..bfaeecc1f82d3 100644 --- a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc @@ -448,7 +448,7 @@ If the number of unique terms is greater than `size`, the returned list can be s size buckets was not returned). To ensure better accuracy a multiple of the final `size` is used as the number of terms to request from each shard -using a heuristic based on the number of shards. To take manual control of this setting the `shard_size` parameter +(`2 * (size * 1.5 + 10)`). To take manual control of this setting the `shard_size` parameter can be used to control the volumes of candidate terms produced by each shard. Low-frequency terms can turn out to be the most interesting ones once all results are combined so the diff --git a/docs/reference/aggregations/bucket/significanttext-aggregation.asciidoc b/docs/reference/aggregations/bucket/significanttext-aggregation.asciidoc index a541eb0ac14db..429c822d3623d 100644 --- a/docs/reference/aggregations/bucket/significanttext-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/significanttext-aggregation.asciidoc @@ -364,7 +364,7 @@ If the number of unique terms is greater than `size`, the returned list can be s size buckets was not returned). To ensure better accuracy a multiple of the final `size` is used as the number of terms to request from each shard -using a heuristic based on the number of shards. To take manual control of this setting the `shard_size` parameter +(`2 * (size * 1.5 + 10)`). To take manual control of this setting the `shard_size` parameter can be used to control the volumes of candidate terms produced by each shard. Low-frequency terms can turn out to be the most interesting ones once all results are combined so the diff --git a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc index 1562bf41074e0..188b2ed3774c0 100644 --- a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc @@ -220,8 +220,7 @@ NOTE: `shard_size` cannot be smaller than `size` (as it doesn't make much sens override it and reset it to be equal to `size`. -The default `shard_size` will be `size` if the search request needs to go to a single shard, and `(size * 1.5 + 10)` -otherwise. +The default `shard_size` is `(size * 1.5 + 10)`. ==== Calculating Document Count Error diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketUtils.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketUtils.java index 17b50fa9bef5f..823a2b1e43422 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketUtils.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketUtils.java @@ -31,18 +31,12 @@ private BucketUtils() {} * * @param finalSize * The number of terms required in the final reduce phase. - * @param singleShard - * whether a single shard is being queried, or multiple shards * @return A suggested default for the size of any shard-side PriorityQueues */ - public static int suggestShardSideQueueSize(int finalSize, boolean singleShard) { + public static int suggestShardSideQueueSize(int finalSize) { if (finalSize < 1) { throw new IllegalArgumentException("size must be positive, got " + finalSize); } - if (singleShard) { - // In the case of a single shard, we do not need to over-request - return finalSize; - } // Request 50% more buckets on the shards in order to improve accuracy // as well as a small constant that should help with small values of 'size' final long shardSampleSize = (long) (finalSize * 1.5 + 10); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/geogrid/GeoGridAggregationBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/geogrid/GeoGridAggregationBuilder.java index 353f391f213d6..38469ff875365 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/geogrid/GeoGridAggregationBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/geogrid/GeoGridAggregationBuilder.java @@ -157,7 +157,7 @@ public int shardSize() { if (shardSize < 0) { // Use default heuristic to avoid any wrong-ranking caused by // distributed counting - shardSize = BucketUtils.suggestShardSideQueueSize(requiredSize, context.numberOfShards() == 1); + shardSize = BucketUtils.suggestShardSideQueueSize(requiredSize); } if (requiredSize <= 0 || shardSize <= 0) { diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index 7fe41407af4ca..09fd5877344f6 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -195,8 +195,7 @@ protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator pare // each shard and as // such are impossible to differentiate from non-significant terms // at that early stage. - bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize(), - context.numberOfShards() == 1)); + bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize())); } if (valuesSource instanceof ValuesSource.Bytes) { diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorFactory.java index a51a33defdd00..92a136960395d 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorFactory.java @@ -175,8 +175,7 @@ protected Aggregator createInternal(Aggregator parent, boolean collectsFromSingl // we want to find have only one occurrence on each shard and as // such are impossible to differentiate from non-significant terms // at that early stage. - bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize(), - context.numberOfShards() == 1)); + bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize())); } // TODO - need to check with mapping that this is indeed a text field.... diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java index 25f552075dead..1ff0efd3e8307 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java @@ -121,8 +121,7 @@ protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator pare // The user has not made a shardSize selection. Use default // heuristic to avoid any wrong-ranking caused by distributed // counting - bucketCountThresholds.setShardSize(BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize(), - context.numberOfShards() == 1)); + bucketCountThresholds.setShardSize(BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize())); } bucketCountThresholds.ensureValidity(); if (valuesSource instanceof ValuesSource.Bytes) { diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/BucketUtilsTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/BucketUtilsTests.java index 35f3175f7cfe5..756bc14a498ed 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/BucketUtilsTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/BucketUtilsTests.java @@ -27,22 +27,14 @@ public class BucketUtilsTests extends ESTestCase { public void testBadInput() { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> BucketUtils.suggestShardSideQueueSize(0, randomBoolean())); + () -> BucketUtils.suggestShardSideQueueSize(0)); assertEquals(e.getMessage(), "size must be positive, got 0"); } - public void testOptimizesSingleShard() { - for (int iter = 0; iter < 10; ++iter) { - final int size = randomIntBetween(1, Integer.MAX_VALUE); - assertEquals(size, BucketUtils.suggestShardSideQueueSize( size, true)); - } - } - public void testOverFlow() { for (int iter = 0; iter < 10; ++iter) { final int size = Integer.MAX_VALUE - randomInt(10); - final int numberOfShards = randomIntBetween(1, 10); - final int shardSize = BucketUtils.suggestShardSideQueueSize( size, numberOfShards == 1); + final int shardSize = BucketUtils.suggestShardSideQueueSize( size); assertThat(shardSize, greaterThanOrEqualTo(shardSize)); } } @@ -50,8 +42,7 @@ public void testOverFlow() { public void testShardSizeIsGreaterThanGlobalSize() { for (int iter = 0; iter < 10; ++iter) { final int size = randomIntBetween(1, Integer.MAX_VALUE); - final int numberOfShards = randomIntBetween(1, 10); - final int shardSize = BucketUtils.suggestShardSideQueueSize( size, numberOfShards == 1); + final int shardSize = BucketUtils.suggestShardSideQueueSize( size); assertThat(shardSize, greaterThanOrEqualTo(size)); } } diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java index 0f685ded62c1c..a3311db11350f 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java @@ -23,11 +23,11 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.aggregations.BucketOrder; import org.elasticsearch.search.aggregations.bucket.filter.InternalFilter; import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms; import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory; import org.elasticsearch.search.aggregations.bucket.terms.Terms; -import org.elasticsearch.search.aggregations.BucketOrder; import org.elasticsearch.test.ESIntegTestCase; import java.util.ArrayList; @@ -35,17 +35,18 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse; import static org.elasticsearch.search.aggregations.AggregationBuilders.filter; import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms; import static org.elasticsearch.search.aggregations.AggregationBuilders.terms; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse; import static org.hamcrest.Matchers.equalTo; public class TermsShardMinDocCountIT extends ESIntegTestCase { private static final String index = "someindex"; private static final String type = "testtype"; - public String randomExecutionHint() { + + private static String randomExecutionHint() { return randomBoolean() ? null : randomFrom(SignificantTermsAggregatorFactory.ExecutionMode.values()).toString(); } @@ -74,7 +75,7 @@ public void testShardMinDocCountSignificantTermsTest() throws Exception { SearchResponse response = client().prepareSearch(index) .addAggregation( (filter("inclass", QueryBuilders.termQuery("class", true))) - .subAggregation(significantTerms("mySignificantTerms").field("text").minDocCount(2).size(2) + .subAggregation(significantTerms("mySignificantTerms").field("text").minDocCount(2).size(2).shardSize(2) .executionHint(randomExecutionHint())) ) .get(); @@ -87,16 +88,14 @@ public void testShardMinDocCountSignificantTermsTest() throws Exception { response = client().prepareSearch(index) .addAggregation( (filter("inclass", QueryBuilders.termQuery("class", true))) - .subAggregation(significantTerms("mySignificantTerms").field("text").minDocCount(2) - .shardMinDocCount(2).size(2) - .executionHint(randomExecutionHint())) + .subAggregation(significantTerms("mySignificantTerms").field("text").minDocCount(2).shardSize(2) + .shardMinDocCount(2).size(2).executionHint(randomExecutionHint())) ) .get(); assertSearchResponse(response); filteredBucket = response.getAggregations().get("inclass"); sigterms = filteredBucket.getAggregations().get("mySignificantTerms"); assertThat(sigterms.getBuckets().size(), equalTo(2)); - } private void addTermsDocs(String term, int numInClass, int numNotInClass, List builders) { @@ -133,19 +132,18 @@ public void testShardMinDocCountTermsTest() throws Exception { // first, check that indeed when not setting the shardMinDocCount parameter 0 terms are returned SearchResponse response = client().prepareSearch(index) .addAggregation( - terms("myTerms").field("text").minDocCount(2).size(2).executionHint(randomExecutionHint()) - .order(BucketOrder.key(true)) + terms("myTerms").field("text").minDocCount(2).size(2).shardSize(2).executionHint(randomExecutionHint()) + .order(BucketOrder.key(true)) ) .get(); assertSearchResponse(response); Terms sigterms = response.getAggregations().get("myTerms"); assertThat(sigterms.getBuckets().size(), equalTo(0)); - response = client().prepareSearch(index) .addAggregation( - terms("myTerms").field("text").minDocCount(2).shardMinDocCount(2).size(2).executionHint(randomExecutionHint()) - .order(BucketOrder.key(true)) + terms("myTerms").field("text").minDocCount(2).shardMinDocCount(2).size(2).shardSize(2) + .executionHint(randomExecutionHint()).order(BucketOrder.key(true)) ) .get(); assertSearchResponse(response); @@ -154,11 +152,10 @@ public void testShardMinDocCountTermsTest() throws Exception { } - private void addTermsDocs(String term, int numDocs, List builders) { + private static void addTermsDocs(String term, int numDocs, List builders) { String sourceClass = "{\"text\": \"" + term + "\"}"; for (int i = 0; i < numDocs; i++) { builders.add(client().prepareIndex(index, type).setSource(sourceClass, XContentType.JSON)); } - } }