Skip to content

Commit 331a83a

Browse files
committed
Stop terms agg from losing buckets
When the `terms` agg is at the top level it can run as a `filters` agg instead because that is typically faster. This was added in elastic#68871 and we mistakenly made it so that a bucket without any hits could take up a slot on the way back to the coordinating node. You could trigger this by having a fairly precise `size` on the terms agg and a top level filter. This fixes the issue by properly mimicking the regular terms aggregator in the "as filters" version: only send back buckets without any matching documents if the min_doc_count is 0. Closes elastic#70449
1 parent 3417684 commit 331a83a

File tree

3 files changed

+104
-2
lines changed

3 files changed

+104
-2
lines changed

rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/20_terms.yml

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1281,3 +1281,46 @@ setup:
12811281
- match: { aggregations.str_terms.buckets.0.key: cow }
12821282
- match: { aggregations.str_terms.buckets.0.doc_count: 1 }
12831283
- match: { aggregations.str_terms.buckets.0.filter.max_number.value: 7.0 }
1284+
1285+
---
1286+
precise size:
1287+
- do:
1288+
bulk:
1289+
index: test_1
1290+
refresh: true
1291+
body: |
1292+
{ "index": {} }
1293+
{ "str": "a" }
1294+
{ "index": {} }
1295+
{ "str": "b" }
1296+
{ "index": {} }
1297+
{ "str": "c" }
1298+
{ "index": {} }
1299+
{ "str": "b" }
1300+
{ "index": {} }
1301+
{ "str": "c" }
1302+
{ "index": {} }
1303+
{ "str": "c" }
1304+
1305+
- do:
1306+
search:
1307+
index: test_1
1308+
body:
1309+
size: 0
1310+
query:
1311+
terms:
1312+
str:
1313+
- b
1314+
- c
1315+
aggs:
1316+
str_terms:
1317+
terms:
1318+
size: 2
1319+
field: str
1320+
order:
1321+
- _key : asc
1322+
- length: { aggregations.str_terms.buckets: 2 }
1323+
- match: { aggregations.str_terms.buckets.0.key: b }
1324+
- match: { aggregations.str_terms.buckets.0.doc_count: 2 }
1325+
- match: { aggregations.str_terms.buckets.1.key: c }
1326+
- match: { aggregations.str_terms.buckets.1.doc_count: 3 }

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregatorFromFilters.java

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,21 @@ protected InternalAggregation adapt(InternalAggregation delegateResult) throws I
154154
List<StringTerms.Bucket> buckets;
155155
long otherDocsCount = 0;
156156
BucketOrder reduceOrder = isKeyOrder(order) ? order : InternalOrder.key(true);
157+
/*
158+
* We default to a shardMinDocCount of 0 which means we'd keep all
159+
* hits, even those that don't have live documents or those that
160+
* don't match any documents in the top level query. This is correct
161+
* if the minDocCount is also 0, but if it is larger than 0 then we
162+
* don't need to send those buckets back to the coordinating node.
163+
* GlobalOrdinalsStringTermsAggregator doesn't collect those
164+
* buckets either. It's a good thing, too, because if you take them
165+
* into account when you sort by, say, key, you might throw away
166+
* buckets with actual docs in them.
167+
*/
168+
long minDocCount = bucketCountThresholds.getShardMinDocCount();
169+
if (minDocCount == 0 && bucketCountThresholds.getMinDocCount() > 0) {
170+
minDocCount = 1;
171+
}
157172
if (filters.getBuckets().size() > bucketCountThresholds.getShardSize()) {
158173
PriorityQueue<OrdBucket> queue = new PriorityQueue<OrdBucket>(bucketCountThresholds.getShardSize()) {
159174
private final Comparator<Bucket> comparator = order.comparator();
@@ -165,7 +180,7 @@ protected boolean lessThan(OrdBucket a, OrdBucket b) {
165180
};
166181
OrdBucket spare = null;
167182
for (InternalFilters.InternalBucket b : filters.getBuckets()) {
168-
if (b.getDocCount() < bucketCountThresholds.getShardMinDocCount()) {
183+
if (b.getDocCount() < minDocCount) {
169184
continue;
170185
}
171186
if (spare == null) {
@@ -203,7 +218,7 @@ protected boolean lessThan(OrdBucket a, OrdBucket b) {
203218
} else {
204219
buckets = new ArrayList<>(filters.getBuckets().size());
205220
for (InternalFilters.InternalBucket b : filters.getBuckets()) {
206-
if (b.getDocCount() < bucketCountThresholds.getShardMinDocCount()) {
221+
if (b.getDocCount() < minDocCount) {
207222
continue;
208223
}
209224
buckets.add(buildBucket(b));

server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorTests.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import org.apache.lucene.search.DocValuesFieldExistsQuery;
2626
import org.apache.lucene.search.IndexSearcher;
2727
import org.apache.lucene.search.MatchAllDocsQuery;
28+
import org.apache.lucene.search.Query;
29+
import org.apache.lucene.search.TermInSetQuery;
2830
import org.apache.lucene.search.TotalHits;
2931
import org.apache.lucene.store.Directory;
3032
import org.apache.lucene.util.BytesRef;
@@ -1687,6 +1689,48 @@ public void testAsSubAgg() throws IOException {
16871689
}, dft, kft);
16881690
}
16891691

1692+
public void testWithFilterAndPreciseSize() throws IOException {
1693+
KeywordFieldType kft = new KeywordFieldType("k", true, true, Collections.emptyMap());
1694+
CheckedConsumer<RandomIndexWriter, IOException> buildIndex = iw -> {
1695+
iw.addDocument(
1696+
List.of(
1697+
new Field("k", new BytesRef("a"), KeywordFieldMapper.Defaults.FIELD_TYPE),
1698+
new SortedSetDocValuesField("k", new BytesRef("a"))
1699+
)
1700+
);
1701+
iw.addDocument(
1702+
List.of(
1703+
new Field("k", new BytesRef("b"), KeywordFieldMapper.Defaults.FIELD_TYPE),
1704+
new SortedSetDocValuesField("k", new BytesRef("b"))
1705+
)
1706+
);
1707+
iw.addDocument(
1708+
List.of(
1709+
new Field("k", new BytesRef("c"), KeywordFieldMapper.Defaults.FIELD_TYPE),
1710+
new SortedSetDocValuesField("k", new BytesRef("c"))
1711+
)
1712+
);
1713+
};
1714+
TermsAggregationBuilder builder = new TermsAggregationBuilder("k").field("k");
1715+
/*
1716+
* There was a bug where we would accidentally send buckets with 0
1717+
* docs in them back to the coordinating node which would take up a
1718+
* slot that a bucket with docs in it deserves. Combination of
1719+
* ordering by bucket, the precise size, and the top level query
1720+
* would trigger that bug.
1721+
*/
1722+
builder.size(2).order(BucketOrder.key(true));
1723+
Query topLevel = new TermInSetQuery("k", new BytesRef[] {new BytesRef("b"), new BytesRef("c")});
1724+
testCase(builder, topLevel, buildIndex, (StringTerms terms) -> {
1725+
assertThat(terms.getBuckets().stream().map(StringTerms.Bucket::getKey).collect(toList()), equalTo(List.of("b", "c")));
1726+
}, kft);
1727+
withAggregator(builder, topLevel, buildIndex, (searcher, terms) -> {
1728+
Map<String, Object> info = new HashMap<>();
1729+
terms.collectDebugInfo(info::put);
1730+
assertThat(info, hasEntry("delegate", "FiltersAggregator.FilterByFilter"));
1731+
}, kft);
1732+
}
1733+
16901734
private final SeqNoFieldMapper.SequenceIDFields sequenceIDFields = SeqNoFieldMapper.SequenceIDFields.emptySeqID();
16911735
private List<Document> generateDocsWithNested(String id, int value, int[] nestedValues) {
16921736
List<Document> documents = new ArrayList<>();

0 commit comments

Comments
 (0)