
Commit 8e6d478

Stop terms agg from losing buckets (#70493)
When the `terms` agg is at the top level it can run as a `filters` agg instead, because that is typically faster. This was added in #68871, and we mistakenly made it so that a bucket without any hits could take up a slot on the way back to the coordinating node. You could trigger this by having a fairly precise `size` on the terms agg and a top-level filter. This fixes the issue by properly mimicking the regular terms aggregator in the "as filters" version: only send back buckets without any matching documents if `min_doc_count` is 0.

Closes #70449
1 parent 7807811 commit 8e6d478
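
The failure mode is easiest to see in miniature. Below is a small, self-contained Java sketch of the slot-stealing behavior described above; it is not Elasticsearch code and all names are illustrative. A shard tracks one bucket per term, the top-level filter leaves one bucket empty, and with a precise size and key ordering the empty bucket displaces a real one unless it is pruned first:

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

public class LostBucketSketch {
    record Bucket(String key, long docCount) {}

    // Keep the first `size` buckets in key order, dropping any bucket below
    // minDocCount first. This mirrors the rule the fix applies on each shard.
    static List<Bucket> topByKey(List<Bucket> shardBuckets, int size, long minDocCount) {
        return shardBuckets.stream()
            .filter(b -> b.docCount() >= minDocCount)
            .sorted(Comparator.comparing(Bucket::key))
            .limit(size)
            .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        // The shard has seen terms a, b, and c, but the top-level query only
        // matches documents for b and c, so the "a" bucket has zero hits.
        List<Bucket> shard = List.of(new Bucket("a", 0), new Bucket("b", 2), new Bucket("c", 3));

        // Before the fix: the empty "a" bucket takes a slot and "c" is lost.
        System.out.println(topByKey(shard, 2, 0)); // buckets a (0 docs) and b (2 docs)

        // After the fix: empty buckets are pruned when min_doc_count > 0.
        System.out.println(topByKey(shard, 2, 1)); // buckets b (2 docs) and c (3 docs)
    }
}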

File tree

4 files changed: +119 −2 lines changed


rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/20_terms.yml

Lines changed: 47 additions & 0 deletions
@@ -1281,3 +1281,50 @@ setup:
   - match: { aggregations.str_terms.buckets.0.key: cow }
   - match: { aggregations.str_terms.buckets.0.doc_count: 1 }
   - match: { aggregations.str_terms.buckets.0.filter.max_number.value: 7.0 }
+
+---
+precise size:
+  - skip:
+      version: " - 7.99.99"
+      reason: "Fixed in 8.0.0, to be backported to 7.13. Once backported this should *always* pass without a need for a skip"
+
+  - do:
+      bulk:
+        index: test_1
+        refresh: true
+        body: |
+          { "index": {} }
+          { "str": "a" }
+          { "index": {} }
+          { "str": "b" }
+          { "index": {} }
+          { "str": "c" }
+          { "index": {} }
+          { "str": "b" }
+          { "index": {} }
+          { "str": "c" }
+          { "index": {} }
+          { "str": "c" }
+
+  - do:
+      search:
+        index: test_1
+        body:
+          size: 0
+          query:
+            terms:
+              str:
+                - b
+                - c
+          aggs:
+            str_terms:
+              terms:
+                size: 2
+                field: str
+                order:
+                  - _key : asc
+  - length: { aggregations.str_terms.buckets: 2 }
+  - match: { aggregations.str_terms.buckets.0.key: b }
+  - match: { aggregations.str_terms.buckets.0.doc_count: 2 }
+  - match: { aggregations.str_terms.buckets.1.key: c }
+  - match: { aggregations.str_terms.buckets.1.doc_count: 3 }

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregatorFromFilters.java

Lines changed: 17 additions & 2 deletions
@@ -154,6 +154,21 @@ protected InternalAggregation adapt(InternalAggregation delegateResult) throws I
         List<StringTerms.Bucket> buckets;
         long otherDocsCount = 0;
         BucketOrder reduceOrder = isKeyOrder(order) ? order : InternalOrder.key(true);
+        /*
+         * We default to a shardMinDocCount of 0 which means we'd keep all
+         * hits, even those that don't have live documents or those that
+         * don't match any documents in the top level query. This is correct
+         * if the minDocCount is also 0, but if it is larger than 0 then we
+         * don't need to send those buckets back to the coordinating node.
+         * GlobalOrdinalsStringTermsAggregator doesn't collect those
+         * buckets either. It's a good thing, too, because if you take them
+         * into account when you sort by, say, key, you might throw away
+         * buckets with actual docs in them.
+         */
+        long minDocCount = bucketCountThresholds.getShardMinDocCount();
+        if (minDocCount == 0 && bucketCountThresholds.getMinDocCount() > 0) {
+            minDocCount = 1;
+        }
         if (filters.getBuckets().size() > bucketCountThresholds.getShardSize()) {
             PriorityQueue<OrdBucket> queue = new PriorityQueue<OrdBucket>(bucketCountThresholds.getShardSize()) {
                 private final Comparator<Bucket> comparator = order.comparator();
@@ -165,7 +180,7 @@ protected boolean lessThan(OrdBucket a, OrdBucket b) {
             };
             OrdBucket spare = null;
             for (InternalFilters.InternalBucket b : filters.getBuckets()) {
-                if (b.getDocCount() < bucketCountThresholds.getShardMinDocCount()) {
+                if (b.getDocCount() < minDocCount) {
                     continue;
                 }
                 if (spare == null) {
@@ -203,7 +218,7 @@ protected boolean lessThan(OrdBucket a, OrdBucket b) {
         } else {
            buckets = new ArrayList<>(filters.getBuckets().size());
            for (InternalFilters.InternalBucket b : filters.getBuckets()) {
-                if (b.getDocCount() < bucketCountThresholds.getShardMinDocCount()) {
+                if (b.getDocCount() < minDocCount) {
                     continue;
                 }
                 buckets.add(buildBucket(b));
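
The core of the change is the effective per-shard threshold computed at the top of that hunk. Here is a restatement of just that rule, pulled out as a standalone helper for readability; the method name is mine, not from the codebase:

/**
 * shard_min_doc_count defaults to 0, which would keep buckets that match no
 * documents. Those are only worth shipping to the coordinating node when
 * min_doc_count is also 0, so raise the shard-level floor to 1 otherwise.
 */
static long effectiveShardMinDocCount(long shardMinDocCount, long minDocCount) {
    if (shardMinDocCount == 0 && minDocCount > 0) {
        return 1;
    }
    return shardMinDocCount;
}

So effectiveShardMinDocCount(0, 1) is 1 and empty buckets are skipped, while effectiveShardMinDocCount(0, 0) stays 0 and a min_doc_count: 0 request still sees its empty buckets.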

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregator.java

Lines changed: 11 additions & 0 deletions
@@ -94,6 +94,13 @@ public void ensureValidity() {
             }
         }

+        /**
+         * The minimum number of documents a bucket must have in order to
+         * be returned from a shard.
+         * <p>
+         * Important: The default for this is 0, but we should only return
+         * 0 document buckets if {@link #getMinDocCount()} is *also* 0.
+         */
         public long getShardMinDocCount() {
             return shardMinDocCount;
         }
@@ -102,6 +109,10 @@ public void setShardMinDocCount(long shardMinDocCount) {
             this.shardMinDocCount = shardMinDocCount;
         }

+        /**
+         * The minimum number of documents a bucket must have in order to
+         * survive the final reduction.
+         */
         public long getMinDocCount() {
             return minDocCount;
         }
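
The two javadoc comments draw a distinction worth spelling out: shardMinDocCount filters buckets on each shard before they are sent anywhere, while minDocCount filters the merged totals during the final reduction. A rough sketch of how the two thresholds divide the work, with illustrative names rather than the actual Elasticsearch reduction code:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Merge per-shard term counts: shardMinDocCount gates what each shard returns,
// minDocCount gates what survives after the coordinating node has summed the
// per-shard responses.
static Map<String, Long> reduceTerms(List<Map<String, Long>> perShardCounts,
                                     long shardMinDocCount, long minDocCount) {
    Map<String, Long> reduced = new HashMap<>();
    for (Map<String, Long> shard : perShardCounts) {
        shard.forEach((term, count) -> {
            if (count >= shardMinDocCount) { // applied on the shard
                reduced.merge(term, count, Long::sum);
            }
        });
    }
    reduced.values().removeIf(total -> total < minDocCount); // final reduction
    return reduced;
}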

server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorTests.java

Lines changed: 44 additions & 0 deletions
@@ -25,6 +25,8 @@
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermInSetQuery;
 import org.apache.lucene.search.TotalHits;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
@@ -1687,6 +1689,48 @@ public void testAsSubAgg() throws IOException {
         }, dft, kft);
     }

+    public void testWithFilterAndPreciseSize() throws IOException {
+        KeywordFieldType kft = new KeywordFieldType("k", true, true, Collections.emptyMap());
+        CheckedConsumer<RandomIndexWriter, IOException> buildIndex = iw -> {
+            iw.addDocument(
+                List.of(
+                    new Field("k", new BytesRef("a"), KeywordFieldMapper.Defaults.FIELD_TYPE),
+                    new SortedSetDocValuesField("k", new BytesRef("a"))
+                )
+            );
+            iw.addDocument(
+                List.of(
+                    new Field("k", new BytesRef("b"), KeywordFieldMapper.Defaults.FIELD_TYPE),
+                    new SortedSetDocValuesField("k", new BytesRef("b"))
+                )
+            );
+            iw.addDocument(
+                List.of(
+                    new Field("k", new BytesRef("c"), KeywordFieldMapper.Defaults.FIELD_TYPE),
+                    new SortedSetDocValuesField("k", new BytesRef("c"))
+                )
+            );
+        };
+        TermsAggregationBuilder builder = new TermsAggregationBuilder("k").field("k");
+        /*
+         * There was a bug where we would accidentally send buckets with 0
+         * docs in them back to the coordinating node which would take up a
+         * slot that a bucket with docs in it deserves. Combination of
+         * ordering by bucket, the precise size, and the top level query
+         * would trigger that bug.
+         */
+        builder.size(2).order(BucketOrder.key(true));
+        Query topLevel = new TermInSetQuery("k", new BytesRef[] {new BytesRef("b"), new BytesRef("c")});
+        testCase(builder, topLevel, buildIndex, (StringTerms terms) -> {
+            assertThat(terms.getBuckets().stream().map(StringTerms.Bucket::getKey).collect(toList()), equalTo(List.of("b", "c")));
+        }, kft);
+        withAggregator(builder, topLevel, buildIndex, (searcher, terms) -> {
+            Map<String, Object> info = new HashMap<>();
+            terms.collectDebugInfo(info::put);
+            assertThat(info, hasEntry("delegate", "FiltersAggregator.FilterByFilter"));
+        }, kft);
+    }
+
     private final SeqNoFieldMapper.SequenceIDFields sequenceIDFields = SeqNoFieldMapper.SequenceIDFields.emptySeqID();

     private List<Document> generateDocsWithNested(String id, int value, int[] nestedValues) {
         List<Document> documents = new ArrayList<>();
