Skip to content

Commit c08daf2

Browse files
authored
Build global ordinals terms bucket from matching ordinals (#30166)
The global ordinals terms aggregator has an option to remap global ordinals to dense ordinals that match the request. This mode is automatically picked when the terms aggregator is a child of another bucket aggregator or when it needs to defer buckets to an aggregation that is used in the ordering of the terms. However, when building the final buckets, this aggregator loops over all possible global ordinals rather than using the hash map that was built to remap the ordinals. For fields with high cardinality this is highly inefficient and can lead to slow responses even when the number of terms that match the query is low. This change fixes this performance issue by using the hash table of matching ordinals to perform the pruning of the final buckets for the terms and significant_terms aggregations. I ran a simple benchmark with 1M documents containing 0 to 10 keywords randomly selected among 1M unique terms. This field is used to perform a multi-level terms aggregation, using Rally to collect the response times. The aggregation below is an example of a two-level terms aggregation that was used to perform the benchmark: ``` "aggregations":{ "1":{ "terms":{ "field":"keyword" }, "aggregations":{ "2":{ "terms":{ "field":"keyword" } } } } } ``` | Levels of aggregation | 50th percentile ms (master) | 50th percentile ms (patch) | | --- | --- | --- | | 2 | 640.41ms | 577.499ms | | 3 | 2239.66ms | 600.154ms | | 4 | 14141.2ms | 703.512ms | Closes #30117
1 parent 707ba28 commit c08daf2

File tree

2 files changed

+32
-16
lines changed

2 files changed

+32
-16
lines changed

server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,8 @@
2020

2121
import org.apache.lucene.index.IndexReader;
2222
import org.apache.lucene.index.LeafReaderContext;
23-
import org.apache.lucene.index.SortedSetDocValues;
2423
import org.apache.lucene.util.BytesRef;
2524
import org.elasticsearch.common.lease.Releasables;
26-
import org.elasticsearch.common.util.LongHash;
2725
import org.elasticsearch.search.DocValueFormat;
2826
import org.elasticsearch.search.aggregations.Aggregator;
2927
import org.elasticsearch.search.aggregations.AggregatorFactories;
@@ -103,11 +101,22 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws
103101

104102
BucketSignificancePriorityQueue<SignificantStringTerms.Bucket> ordered = new BucketSignificancePriorityQueue<>(size);
105103
SignificantStringTerms.Bucket spare = null;
106-
for (long globalTermOrd = 0; globalTermOrd < valueCount; ++globalTermOrd) {
107-
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalTermOrd)) {
104+
final boolean needsFullScan = bucketOrds == null || bucketCountThresholds.getMinDocCount() == 0;
105+
final long maxId = needsFullScan ? valueCount : bucketOrds.size();
106+
for (long ord = 0; ord < maxId; ord++) {
107+
final long globalOrd;
108+
final long bucketOrd;
109+
if (needsFullScan) {
110+
bucketOrd = bucketOrds == null ? ord : bucketOrds.find(ord);
111+
globalOrd = ord;
112+
} else {
113+
assert bucketOrds != null;
114+
bucketOrd = ord;
115+
globalOrd = bucketOrds.get(ord);
116+
}
117+
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalOrd)) {
108118
continue;
109119
}
110-
final long bucketOrd = getBucketOrd(globalTermOrd);
111120
final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
112121
if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
113122
continue;
@@ -120,7 +129,7 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws
120129
spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format);
121130
}
122131
spare.bucketOrd = bucketOrd;
123-
copy(lookupGlobalOrd.apply(globalTermOrd), spare.termBytes);
132+
copy(lookupGlobalOrd.apply(globalOrd), spare.termBytes);
124133
spare.subsetDf = bucketDocCount;
125134
spare.subsetSize = subsetSize;
126135
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
7171
protected final long valueCount;
7272
protected final GlobalOrdLookupFunction lookupGlobalOrd;
7373

74-
private final LongHash bucketOrds;
74+
protected final LongHash bucketOrds;
7575

7676
public interface GlobalOrdLookupFunction {
7777
BytesRef apply(long ord) throws IOException;
@@ -107,10 +107,6 @@ boolean remapGlobalOrds() {
107107
return bucketOrds != null;
108108
}
109109

110-
protected final long getBucketOrd(long globalOrd) {
111-
return bucketOrds == null ? globalOrd : bucketOrds.find(globalOrd);
112-
}
113-
114110
private void collectGlobalOrd(int doc, long globalOrd, LeafBucketCollector sub) throws IOException {
115111
if (bucketOrds == null) {
116112
collectExistingBucket(sub, doc, globalOrd);
@@ -188,17 +184,28 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOE
188184
long otherDocCount = 0;
189185
BucketPriorityQueue<OrdBucket> ordered = new BucketPriorityQueue<>(size, order.comparator(this));
190186
OrdBucket spare = new OrdBucket(-1, 0, null, showTermDocCountError, 0);
191-
for (long globalTermOrd = 0; globalTermOrd < valueCount; ++globalTermOrd) {
192-
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalTermOrd)) {
187+
final boolean needsFullScan = bucketOrds == null || bucketCountThresholds.getMinDocCount() == 0;
188+
final long maxId = needsFullScan ? valueCount : bucketOrds.size();
189+
for (long ord = 0; ord < maxId; ord++) {
190+
final long globalOrd;
191+
final long bucketOrd;
192+
if (needsFullScan) {
193+
bucketOrd = bucketOrds == null ? ord : bucketOrds.find(ord);
194+
globalOrd = ord;
195+
} else {
196+
assert bucketOrds != null;
197+
bucketOrd = ord;
198+
globalOrd = bucketOrds.get(ord);
199+
}
200+
if (includeExclude != null && !acceptedGlobalOrdinals.get(globalOrd)) {
193201
continue;
194202
}
195-
final long bucketOrd = getBucketOrd(globalTermOrd);
196203
final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
197204
if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
198205
continue;
199206
}
200207
otherDocCount += bucketDocCount;
201-
spare.globalOrd = globalTermOrd;
208+
spare.globalOrd = globalOrd;
202209
spare.bucketOrd = bucketOrd;
203210
spare.docCount = bucketDocCount;
204211
if (bucketCountThresholds.getShardMinDocCount() <= spare.docCount) {
@@ -378,7 +385,7 @@ private void mapSegmentCountsToGlobalCounts(LongUnaryOperator mapping) throws IO
378385
}
379386
final long ord = i - 1; // remember we do +1 when counting
380387
final long globalOrd = mapping.applyAsLong(ord);
381-
long bucketOrd = getBucketOrd(globalOrd);
388+
long bucketOrd = bucketOrds == null ? globalOrd : bucketOrds.find(globalOrd);
382389
incrementBucketDocCount(bucketOrd, inc);
383390
}
384391
}

0 commit comments

Comments (0)