Commit 91813ea

Save memory when significant_text is not on top (#58145)
This merges the aggregator for `significant_text` into `significant_terms`, applying the optimization built in #55873 to save memory when the aggregation is not on top. The `significant_text` aggregation is pretty memory intensive all on its own and this doesn't particularly help with that, but it'll help with the memory usage of any sub-aggregations.
Parent: 1bc2560
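
The heart of the change is a refactoring pattern: rather than each terms-style aggregator fetching its own values and duplicating the bucket-collection loop, MapStringTermsAggregator now owns the loop and delegates value fetching to a pluggable CollectorSource (see the diffs below). Here is a minimal, self-contained sketch of that shape in plain Java; every name in it is hypothetical, and none of it is Elasticsearch code:

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;

public class CollectorSourceSketch {
    // The aggregator no longer knows where terms come from; it only buckets them.
    interface TermSource {
        void collect(int doc, Consumer<String> consumer);
    }

    // Source 1: terms read from (simulated) doc values, as plain `terms` would.
    static class DocValuesSource implements TermSource {
        private final List<List<String>> docValues;
        DocValuesSource(List<List<String>> docValues) { this.docValues = docValues; }
        @Override public void collect(int doc, Consumer<String> consumer) {
            docValues.get(doc).forEach(consumer);
        }
    }

    // Source 2: terms produced by re-analyzing stored text, as significant_text does.
    static class ReanalyzeSource implements TermSource {
        private final List<String> storedText;
        ReanalyzeSource(List<String> storedText) { this.storedText = storedText; }
        @Override public void collect(int doc, Consumer<String> consumer) {
            Arrays.stream(storedText.get(doc).split("\\s+")).forEach(consumer);
        }
    }

    // One shared bucketing body; before the merge, each source implied its own
    // aggregator class with a duplicated version of this loop.
    static class SharedAggregator {
        private final TermSource source;
        final Map<String, Long> counts = new HashMap<>();
        SharedAggregator(TermSource source) { this.source = source; }
        void collect(int doc) {
            source.collect(doc, term -> counts.merge(term, 1L, Long::sum));
        }
    }

    public static void main(String[] args) {
        SharedAggregator agg = new SharedAggregator(new ReanalyzeSource(List.of("foo bar foo")));
        agg.collect(0);
        System.out.println(agg.counts); // e.g. {bar=1, foo=2}; "foo" counted twice
    }
}

The payoff is the one the commit message names: terms-like and significant_text-like sources now funnel into a single aggregator, so memory optimizations applied there (such as the not-on-top optimization from #55873) benefit both.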

File tree

9 files changed: +353 additions, -423 deletions

server/src/main/java/org/apache/lucene/analysis/miscellaneous/DuplicateByteSequenceSpotter.java

Lines changed: 3 additions & 7 deletions
@@ -25,20 +25,18 @@
  * A Trie structure for analysing byte streams for duplicate sequences. Bytes
  * from a stream are added one at a time using the addByte method and the number
  * of times it has been seen as part of a sequence is returned.
- *
+ * <p>
  * The minimum required length for a duplicate sequence detected is 6 bytes.
- *
+ * <p>
  * The design goals are to maximize speed of lookup while minimizing the space
  * required to do so. This has led to a hybrid solution for representing the
  * bytes that make up a sequence in the trie.
- *
+ * <p>
  * If we have 6 bytes in sequence e.g. abcdef then they are represented as
  * object nodes in the tree as follows:
  * <p>
  * (a)-(b)-(c)-(def as an int)
  * <p>
- *
- *
  * {@link RootTreeNode} objects are used for the first two levels of the tree
  * (representing bytes a and b in the example sequence). The combinations of
  * objects at these 2 levels are few so internally these objects allocate an
@@ -61,11 +59,9 @@
  * reached
  * <li>halting any growth of the tree
  * </ol>
- *
  * Tests on real-world-text show that the size of the tree is a multiple of the
  * input text where that multiplier varies between 10 and 5 times as the content
  * size increased from 10 to 100 megabytes of content.
- *
  */
 public class DuplicateByteSequenceSpotter {
     public static final int TREE_DEPTH = 6;

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

Lines changed: 99 additions & 34 deletions
@@ -47,22 +47,23 @@
 import java.util.function.BiConsumer;
 import java.util.function.Function;
 import java.util.function.Supplier;
+import java.util.function.LongConsumer;

 /**
  * An aggregator of string values that hashes the strings on the fly rather
  * than up front like the {@link GlobalOrdinalsStringTermsAggregator}.
  */
 public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
+    private final CollectorSource collectorSource;
     private final ResultStrategy<?, ?> resultStrategy;
-    private final ValuesSource valuesSource;
     private final BytesKeyedBucketOrds bucketOrds;
     private final IncludeExclude.StringFilter includeExclude;

     public MapStringTermsAggregator(
         String name,
         AggregatorFactories factories,
+        CollectorSource collectorSource,
         Function<MapStringTermsAggregator, ResultStrategy<?, ?>> resultStrategy,
-        ValuesSource valuesSource,
         BucketOrder order,
         DocValueFormat format,
         BucketCountThresholds bucketCountThresholds,
@@ -75,56 +76,39 @@ public MapStringTermsAggregator(
         Map<String, Object> metadata
     ) throws IOException {
         super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
+        this.collectorSource = collectorSource;
         this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
-        this.valuesSource = valuesSource;
         this.includeExclude = includeExclude;
         bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), collectsFromSingleBucket);
     }

     @Override
     public ScoreMode scoreMode() {
-        if (valuesSource != null && valuesSource.needsScores()) {
+        if (collectorSource.needsScores()) {
             return ScoreMode.COMPLETE;
         }
         return super.scoreMode();
     }

     @Override
-    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
-                                                final LeafBucketCollector sub) throws IOException {
-        SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
-        return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, values) {
-            final BytesRefBuilder previous = new BytesRefBuilder();
-
-            @Override
-            public void collect(int doc, long owningBucketOrd) throws IOException {
-                if (false == values.advanceExact(doc)) {
-                    return;
-                }
-                int valuesCount = values.docValueCount();
-
-                // SortedBinaryDocValues don't guarantee uniqueness so we
-                // need to take care of dups
-                previous.clear();
-                for (int i = 0; i < valuesCount; ++i) {
-                    final BytesRef bytes = values.nextValue();
-                    if (includeExclude != null && false == includeExclude.accept(bytes)) {
-                        continue;
-                    }
-                    if (i > 0 && previous.get().equals(bytes)) {
-                        continue;
-                    }
+    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
+        return resultStrategy.wrapCollector(
+            collectorSource.getLeafCollector(
+                includeExclude,
+                ctx,
+                sub,
+                this::addRequestCircuitBreakerBytes,
+                (s, doc, owningBucketOrd, bytes) -> {
                     long bucketOrdinal = bucketOrds.add(owningBucketOrd, bytes);
                     if (bucketOrdinal < 0) { // already seen
                         bucketOrdinal = -1 - bucketOrdinal;
-                        collectExistingBucket(sub, doc, bucketOrdinal);
+                        collectExistingBucket(s, doc, bucketOrdinal);
                     } else {
-                        collectBucket(sub, doc, bucketOrdinal);
+                        collectBucket(s, doc, bucketOrdinal);
                     }
-                    previous.copyBytes(bytes);
                 }
-            }
-        });
+            )
+        );
     }

     @Override
@@ -146,7 +130,82 @@ public void collectDebugInfo(BiConsumer<String, Object> add) {

     @Override
     public void doClose() {
-        Releasables.close(bucketOrds, resultStrategy);
+        Releasables.close(collectorSource, resultStrategy, bucketOrds);
+    }
+
+    /**
+     * Abstaction on top of building collectors to fetch values.
+     */
+    public interface CollectorSource extends Releasable {
+        boolean needsScores();
+
+        LeafBucketCollector getLeafCollector(
+            IncludeExclude.StringFilter includeExclude,
+            LeafReaderContext ctx,
+            LeafBucketCollector sub,
+            LongConsumer addRequestCircuitBreakerBytes,
+            CollectConsumer consumer
+        ) throws IOException;
+    }
+    @FunctionalInterface
+    public interface CollectConsumer {
+        void accept(LeafBucketCollector sub, int doc, long owningBucketOrd, BytesRef bytes) throws IOException;
+    }
+
+    /**
+     * Fetch values from a {@link ValuesSource}.
+     */
+    public static class ValuesSourceCollectorSource implements CollectorSource {
+        private final ValuesSource valuesSource;
+
+        public ValuesSourceCollectorSource(ValuesSource valuesSource) {
+            this.valuesSource = valuesSource;
+        }
+
+        @Override
+        public boolean needsScores() {
+            return valuesSource.needsScores();
+        }
+
+        @Override
+        public LeafBucketCollector getLeafCollector(
+            IncludeExclude.StringFilter includeExclude,
+            LeafReaderContext ctx,
+            LeafBucketCollector sub,
+            LongConsumer addRequestCircuitBreakerBytes,
+            CollectConsumer consumer
+        ) throws IOException {
+            SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
+            return new LeafBucketCollectorBase(sub, values) {
+                final BytesRefBuilder previous = new BytesRefBuilder();
+
+                @Override
+                public void collect(int doc, long owningBucketOrd) throws IOException {
+                    if (false == values.advanceExact(doc)) {
+                        return;
+                    }
+                    int valuesCount = values.docValueCount();
+
+                    // SortedBinaryDocValues don't guarantee uniqueness so we
+                    // need to take care of dups
+                    previous.clear();
+                    for (int i = 0; i < valuesCount; ++i) {
+                        BytesRef bytes = values.nextValue();
+                        if (includeExclude != null && false == includeExclude.accept(bytes)) {
+                            continue;
+                        }
+                        if (i > 0 && previous.get().equals(bytes)) {
+                            continue;
                        }
+                        previous.copyBytes(bytes);
+                        consumer.accept(sub, doc, owningBucketOrd, bytes);
+                    }
+                }
+            };
+        }
+
+        @Override
+        public void close() {}
     }

     /**
@@ -270,6 +329,12 @@ private InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws
      * Builds results for the standard {@code terms} aggregation.
      */
     class StandardTermsResults extends ResultStrategy<StringTerms, StringTerms.Bucket> {
+        private final ValuesSource valuesSource;
+
+        StandardTermsResults(ValuesSource valuesSource) {
+            this.valuesSource = valuesSource;
+        }
+
         @Override
         String describe() {
             return "terms";

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificanceLookup.java

Lines changed: 11 additions & 7 deletions
@@ -36,10 +36,11 @@
 import org.elasticsearch.common.util.BytesRefHash;
 import org.elasticsearch.common.util.LongArray;
 import org.elasticsearch.common.util.LongHash;
+import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
-import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;

 import java.io.IOException;

@@ -62,14 +63,17 @@ interface BackgroundFrequencyForLong extends Releasable {
     }

     private final QueryShardContext context;
-    private final ValuesSourceConfig config;
+    private final MappedFieldType fieldType;
+    private final DocValueFormat format;
     private final Query backgroundFilter;
     private final int supersetNumDocs;
     private TermsEnum termsEnum;

-    SignificanceLookup(QueryShardContext context, ValuesSourceConfig config, QueryBuilder backgroundFilter) throws IOException {
+    SignificanceLookup(QueryShardContext context, MappedFieldType fieldType, DocValueFormat format, QueryBuilder backgroundFilter)
+        throws IOException {
         this.context = context;
-        this.config = config;
+        this.fieldType = fieldType;
+        this.format = format;
         this.backgroundFilter = backgroundFilter == null ? null : backgroundFilter.toQuery(context);
         /*
          * We need to use a superset size that includes deleted docs or we
@@ -129,7 +133,7 @@ public void close() {
      * Get the background frequency of a {@link BytesRef} term.
      */
     private long getBackgroundFrequency(BytesRef term) throws IOException {
-        return getBackgroundFrequency(config.fieldContext().fieldType().termQuery(config.format().format(term).toString(), context));
+        return getBackgroundFrequency(fieldType.termQuery(format.format(term).toString(), context));
     }

     /**
@@ -174,7 +178,7 @@ public void close() {
      * Get the background frequency of a {@code long} term.
      */
     private long getBackgroundFrequency(long term) throws IOException {
-        return getBackgroundFrequency(config.fieldContext().fieldType().termQuery(config.format().format(term).toString(), context));
+        return getBackgroundFrequency(fieldType.termQuery(format.format(term).toString(), context));
     }

     private long getBackgroundFrequency(Query query) throws IOException {
@@ -201,7 +205,7 @@ private TermsEnum getTermsEnum(String field) throws IOException {
             return termsEnum;
         }
         IndexReader reader = context.getIndexReader();
-        termsEnum = new FilterableTermsEnum(reader, config.fieldContext().field(), PostingsEnum.NONE, backgroundFilter);
+        termsEnum = new FilterableTermsEnum(reader, fieldType.name(), PostingsEnum.NONE, backgroundFilter);
         return termsEnum;
     }
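
The constructor change above is the point of this file: SignificanceLookup previously took a whole ValuesSourceConfig but only ever used the field type (to build term queries and name the field for the terms enum) and the doc-value format (to render raw terms back to strings). Narrowing the signature to exactly those two pieces lets callers that never build a ValuesSourceConfig (for example, a path that re-analyzes stored text) construct the lookup as well. A hedged sketch of the two call shapes follows; the wrapper class and method names are hypothetical, it assumes the sketch lives in the same package as SignificanceLookup (whose constructor in the diff is package-private), and DocValueFormat.RAW is the stock pass-through format:

import java.io.IOException;

import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;

class SignificanceLookupCallShapes {
    // Call shape 1: unpack a ValuesSourceConfig, exactly as the
    // SignificantTermsAggregatorFactory diff below does.
    static SignificanceLookup fromConfig(QueryShardContext context, ValuesSourceConfig config,
                                         QueryBuilder backgroundFilter) throws IOException {
        return new SignificanceLookup(context, config.fieldContext().fieldType(), config.format(), backgroundFilter);
    }

    // Call shape 2 (hypothetical): a caller that has only a field type in hand,
    // e.g. one that re-analyzes stored text rather than reading doc values.
    static SignificanceLookup fromFieldType(QueryShardContext context, MappedFieldType fieldType,
                                            QueryBuilder backgroundFilter) throws IOException {
        return new SignificanceLookup(context, fieldType, DocValueFormat.RAW, backgroundFilter);
    }
}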

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java

Lines changed: 7 additions & 2 deletions
@@ -227,7 +227,12 @@ protected Aggregator doCreateInternal(SearchContext searchContext,
             bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize()));
         }

-        SignificanceLookup lookup = new SignificanceLookup(queryShardContext, config, backgroundFilter);
+        SignificanceLookup lookup = new SignificanceLookup(
+            queryShardContext,
+            config.fieldContext().fieldType(),
+            config.format(),
+            backgroundFilter
+        );

         return sigTermsAggregatorSupplier.build(name, factories, config.getValuesSource(), config.format(),
             bucketCountThresholds, includeExclude, executionHint, searchContext, parent,
@@ -256,8 +261,8 @@ Aggregator create(String name,
             return new MapStringTermsAggregator(
                 name,
                 factories,
+                new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSource),
                 a -> a.new SignificantTermsResults(lookup, significanceHeuristic, collectsFromSingleBucket),
-                valuesSource,
                 null,
                 format,
                 bucketCountThresholds,
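
One idiom in the hunk above deserves a note: the factory passes a -> a.new SignificantTermsResults(...) rather than a ready-made object because ResultStrategy implementations are inner classes of the aggregator and must be bound to the instance that is still being constructed; the resultStrategy.apply(this) line in the MapStringTermsAggregator diff is the other half of that handshake. A self-contained sketch of the idiom, with hypothetical names:

import java.util.function.Function;

public class InnerClassFactorySketch {
    static class Aggregator {
        final ResultStrategy strategy;

        Aggregator(Function<Aggregator, ResultStrategy> strategyFactory) {
            // Mirrors `resultStrategy.apply(this)` in the diff: the strategy
            // is created against the aggregator instance under construction.
            this.strategy = strategyFactory.apply(this);
        }

        // Non-static inner class: every instance is bound to an Aggregator.
        class ResultStrategy {
            String describe() {
                return "bound to " + Aggregator.this;
            }
        }
    }

    public static void main(String[] args) {
        // `a.new ResultStrategy()` is Java's qualified inner-class creation:
        // the new object is owned by `a`, just like `a.new SignificantTermsResults(...)`.
        Aggregator agg = new Aggregator(a -> a.new ResultStrategy());
        System.out.println(agg.strategy.describe());
    }
}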
