Fix bug with terms' min_doc_count (#62130) (#62177)

nik9000 · web-flow · commit 1104d6546502 · 2020-09-09T13:04:51.000-04:00
The `global_ordinals` implementation of `terms` had a bug when `min_doc_count: 0` that'd cause sub-aggregations to have array index out of bounds exceptions. Ooops. My fault. This fixes the bug by assigning ordinals to those buckets. Closes #62084
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/20_terms.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/20_terms.yml
@@ -1001,3 +1001,130 @@ setup:
   - match: { profile.shards.0.aggregations.0.breakdown.collect_count: 4 }
   - match: { profile.shards.0.aggregations.0.debug.result_strategy: long_terms }
   - match: { profile.shards.0.aggregations.0.debug.total_buckets: 2 }
+
+---
+"min_doc_count":
+  - skip:
+      version: " - 7.99.99"
+      reason: broken in 7.9.1, not yet backported
+
+  - do:
+      bulk:
+        index: test_1
+        refresh: true
+        body: |
+          { "index": {} }
+          { "str": "sheep", "number": 1 }
+          { "index": {} }
+          { "str": "sheep", "number": 3 }
+          { "index": {} }
+          { "str": "cow", "number": 1 }
+          { "index": {} }
+          { "str": "pig", "number": 1 }
+
+  - do:
+      search:
+        index: test_1
+        body:
+          size: 0
+          query:
+            simple_query_string:
+              fields: [str]
+              query: sheep cow
+              minimum_should_match: 1
+          aggs:
+            str_terms:
+              terms:
+                field: str
+                min_doc_count: 2
+              aggs:
+                max_number:
+                  max:
+                    field: number
+  - length: { aggregations.str_terms.buckets: 1 }
+  - match: { aggregations.str_terms.buckets.0.key: sheep }
+  - match: { aggregations.str_terms.buckets.0.doc_count: 2 }
+  - match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
+
+  - do:
+      search:
+        index: test_1
+        body:
+          size: 0
+          query:
+            simple_query_string:
+              fields: [str]
+              query: sheep cow
+              minimum_should_match: 1
+          aggs:
+            str_terms:
+              terms:
+                field: str
+                min_doc_count: 1
+              aggs:
+                max_number:
+                  max:
+                    field: number
+  - length: { aggregations.str_terms.buckets: 2 }
+  - match: { aggregations.str_terms.buckets.0.key: sheep }
+  - match: { aggregations.str_terms.buckets.0.doc_count: 2 }
+  - match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
+  - match: { aggregations.str_terms.buckets.1.key: cow }
+  - match: { aggregations.str_terms.buckets.1.doc_count: 1 }
+  - match: { aggregations.str_terms.buckets.1.max_number.value: 1 }
+
+  - do:
+      search:
+        index: test_1
+        body:
+          size: 0
+          query:
+            simple_query_string:
+              fields: [str]
+              query: sheep cow
+              minimum_should_match: 1
+          aggs:
+            str_terms:
+              terms:
+                field: str
+              aggs:
+                max_number:
+                  max:
+                    field: number
+  - length: { aggregations.str_terms.buckets: 2 }
+  - match: { aggregations.str_terms.buckets.0.key: sheep }
+  - match: { aggregations.str_terms.buckets.0.doc_count: 2 }
+  - match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
+  - match: { aggregations.str_terms.buckets.1.key: cow }
+  - match: { aggregations.str_terms.buckets.1.doc_count: 1 }
+  - match: { aggregations.str_terms.buckets.1.max_number.value: 1 }
+
+  - do:
+      search:
+        index: test_1
+        body:
+          size: 0
+          query:
+            simple_query_string:
+              fields: [str]
+              query: sheep cow
+              minimum_should_match: 1
+          aggs:
+            str_terms:
+              terms:
+                field: str
+                min_doc_count: 0
+              aggs:
+                max_number:
+                  max:
+                    field: number
+  - length: { aggregations.str_terms.buckets: 3 }
+  - match: { aggregations.str_terms.buckets.0.key: sheep }
+  - match: { aggregations.str_terms.buckets.0.doc_count: 2 }
+  - match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
+  - match: { aggregations.str_terms.buckets.1.key: cow }
+  - match: { aggregations.str_terms.buckets.1.doc_count: 1 }
+  - match: { aggregations.str_terms.buckets.1.max_number.value: 1.0 }
+  - match: { aggregations.str_terms.buckets.2.key: pig }
+  - match: { aggregations.str_terms.buckets.2.doc_count: 0 }
+  - match: { aggregations.str_terms.buckets.2.max_number.value: null }
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java
@@ -513,8 +513,21 @@ void forEach(long owningBucketOrd, BucketInfoConsumer consumer) throws IOExcepti
                     if (false == acceptedGlobalOrdinals.test(globalOrd)) {
                         continue;
                     }
-                    long bucketOrd = bucketOrds.find(owningBucketOrd, globalOrd);
-                    long docCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
+                    /*
+                     * Use `add` instead of `find` here to assign an ordinal
+                     * even if the global ord wasn't found so we can build
+                     * sub-aggregations without trouble even though we haven't
+                     * hit any documents for them. This is wasteful, but
+                     * settings minDocCount == 0 is wasteful in general.....
+                     */
+                    long bucketOrd = bucketOrds.add(owningBucketOrd, globalOrd);
+                    long docCount;
+                    if (bucketOrd < 0) {
+                        bucketOrd = -1 - bucketOrd;
+                        docCount = bucketDocCount(bucketOrd);
+                    } else {
+                        docCount = 0;
+                    }
                     consumer.accept(globalOrd, bucketOrd, docCount);
                 }
             } else {