Skip to content

Commit 51916aa

Browse files
[7.x] Allow mixing set-based and regexp-based include and exclude (#63325) (#64014)
* Allow mixing set-based and regexp-based include and exclude (#63325)
Co-authored-by: Hugo Chargois <[email protected]>
1 parent 97bbbe1 commit 51916aa

File tree

5 files changed

+301 -197 lines changed

docs/reference/aggregations/bucket/terms-aggregation.asciidoc

+2
Original file line number | Diff line number | Diff line change
@@ -599,6 +599,8 @@ expire then we may be missing accounts of interest and have set our numbers too
599599
Ultimately this is a balancing act between managing the Elasticsearch resources required to process a single request and the volume
600600
of requests that the client application must issue to complete a task.
601601

602+
WARNING: Partitions cannot be used together with an `exclude` parameter.
603+
602604
==== Multi-field terms aggregation
603605

604606
The `terms` aggregation does not support collecting terms from multiple fields

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java

+121 -123
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@
3636
import org.apache.lucene.util.automaton.Operations;
3737
import org.apache.lucene.util.automaton.RegExp;
3838
import org.elasticsearch.ElasticsearchParseException;
39+
import org.elasticsearch.Version;
3940
import org.elasticsearch.common.ParseField;
4041
import org.elasticsearch.common.io.stream.StreamInput;
4142
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -78,17 +79,8 @@ public static IncludeExclude merge(IncludeExclude include, IncludeExclude exclud
7879
if (include.isPartitionBased()) {
7980
throw new IllegalArgumentException("Cannot specify any excludes when using a partition-based include");
8081
}
81-
String includeMethod = include.isRegexBased() ? "regex" : "set";
82-
String excludeMethod = exclude.isRegexBased() ? "regex" : "set";
83-
if (includeMethod.equals(excludeMethod) == false) {
84-
throw new IllegalArgumentException("Cannot mix a " + includeMethod + "-based include with a "
85-
+ excludeMethod + "-based method");
86-
}
87-
if (include.isRegexBased()) {
88-
return new IncludeExclude(include.include, exclude.exclude);
89-
} else {
90-
return new IncludeExclude(include.includeValues, exclude.excludeValues);
91-
}
82+
83+
return new IncludeExclude(include.include, exclude.exclude, include.includeValues, exclude.excludeValues);
9284
}
9385

9486
public static IncludeExclude parseInclude(XContentParser parser) throws IOException {
@@ -196,46 +188,39 @@ public boolean accept(BytesRef value) {
196188
}
197189
}
198190

199-
static class AutomatonBackedStringFilter extends StringFilter {
191+
class SetAndRegexStringFilter extends StringFilter {
200192

201193
private final ByteRunAutomaton runAutomaton;
202-
203-
private AutomatonBackedStringFilter(Automaton automaton) {
204-
this.runAutomaton = new ByteRunAutomaton(automaton);
205-
}
206-
207-
/**
208-
* Returns whether the given value is accepted based on the {@code include} &amp; {@code exclude} patterns.
209-
*/
210-
@Override
211-
public boolean accept(BytesRef value) {
212-
return runAutomaton.run(value.bytes, value.offset, value.length);
213-
}
214-
}
215-
216-
static class TermListBackedStringFilter extends StringFilter {
217-
218194
private final Set<BytesRef> valids;
219195
private final Set<BytesRef> invalids;
220196

221-
TermListBackedStringFilter(Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
222-
this.valids = includeValues;
223-
this.invalids = excludeValues;
197+
private SetAndRegexStringFilter(DocValueFormat format) {
198+
Automaton automaton = toAutomaton();
199+
this.runAutomaton = automaton == null ? null : new ByteRunAutomaton(automaton);
200+
this.valids = parseForDocValues(includeValues, format);
201+
this.invalids = parseForDocValues(excludeValues, format);
224202
}
225203

226204
/**
227-
* Returns whether the given value is accepted based on the
228-
* {@code include} &amp; {@code exclude} sets.
205+
* Returns whether the given value is accepted based on the {@code includeValues} &amp; {@code excludeValues}
206+
* sets, as well as the {@code include} &amp; {@code exclude} patterns.
229207
*/
230208
@Override
231209
public boolean accept(BytesRef value) {
232-
return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value)));
210+
if (valids != null && valids.contains(value) == false) {
211+
return false;
212+
}
213+
214+
if (runAutomaton != null && runAutomaton.run(value.bytes, value.offset, value.length) == false) {
215+
return false;
216+
}
217+
218+
return invalids == null || invalids.contains(value) == false;
233219
}
234220
}
235221

236222
public abstract static class OrdinalsFilter extends Filter {
237223
public abstract LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException;
238-
239224
}
240225

241226
class PartitionedOrdinalsFilter extends OrdinalsFilter {
@@ -258,59 +243,64 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
258243
}
259244
}
260245

261-
static class AutomatonBackedOrdinalsFilter extends OrdinalsFilter {
246+
class SetAndRegexOrdinalsFilter extends OrdinalsFilter {
262247

263248
private final CompiledAutomaton compiled;
249+
private final SortedSet<BytesRef> valids;
250+
private final SortedSet<BytesRef> invalids;
264251

265-
private AutomatonBackedOrdinalsFilter(Automaton automaton) {
266-
this.compiled = new CompiledAutomaton(automaton);
252+
private SetAndRegexOrdinalsFilter(DocValueFormat format) {
253+
Automaton automaton = toAutomaton();
254+
this.compiled = automaton == null ? null : new CompiledAutomaton(automaton);
255+
this.valids = parseForDocValues(includeValues, format);
256+
this.invalids = parseForDocValues(excludeValues, format);
267257
}
268258

269259
/**
270-
* Computes which global ordinals are accepted by this IncludeExclude instance.
271-
*
260+
* Computes which global ordinals are accepted by this IncludeExclude instance, based on the combination of
261+
* the {@code includeValues} &amp; {@code excludeValues} sets, as well as the {@code include} &amp;
262+
* {@code exclude} patterns.
272263
*/
273264
@Override
274265
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
275-
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
276-
TermsEnum globalTermsEnum;
277-
Terms globalTerms = new DocValuesTerms(globalOrdinals);
278-
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
279-
globalTermsEnum = compiled.getTermsEnum(globalTerms);
280-
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
281-
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
282-
}
283-
return acceptedGlobalOrdinals;
284-
}
285-
286-
}
287-
288-
static class TermListBackedOrdinalsFilter extends OrdinalsFilter {
289-
290-
private final SortedSet<BytesRef> includeValues;
291-
private final SortedSet<BytesRef> excludeValues;
292-
293-
TermListBackedOrdinalsFilter(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
294-
this.includeValues = includeValues;
295-
this.excludeValues = excludeValues;
296-
}
297-
298-
@Override
299-
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
300-
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
301-
if (includeValues != null) {
302-
for (BytesRef term : includeValues) {
266+
LongBitSet acceptedGlobalOrdinals = null;
267+
if (valids != null) {
268+
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
269+
for (BytesRef term : valids) {
303270
long ord = globalOrdinals.lookupTerm(term);
304271
if (ord >= 0) {
305272
acceptedGlobalOrdinals.set(ord);
306273
}
307274
}
308-
} else if (acceptedGlobalOrdinals.length() > 0) {
309-
// default to all terms being acceptable
310-
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
311275
}
312-
if (excludeValues != null) {
313-
for (BytesRef term : excludeValues) {
276+
277+
if (compiled != null) {
278+
LongBitSet automatonGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
279+
TermsEnum globalTermsEnum;
280+
Terms globalTerms = new DocValuesTerms(globalOrdinals);
281+
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
282+
globalTermsEnum = compiled.getTermsEnum(globalTerms);
283+
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
284+
automatonGlobalOrdinals.set(globalTermsEnum.ord());
285+
}
286+
287+
if (acceptedGlobalOrdinals == null) {
288+
acceptedGlobalOrdinals = automatonGlobalOrdinals;
289+
} else {
290+
acceptedGlobalOrdinals.and(automatonGlobalOrdinals);
291+
}
292+
}
293+
294+
if (acceptedGlobalOrdinals == null) {
295+
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
296+
if (acceptedGlobalOrdinals.length() > 0) {
297+
// default to all terms being acceptable
298+
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
299+
}
300+
}
301+
302+
if (invalids != null) {
303+
for (BytesRef term : invalids) {
314304
long ord = globalOrdinals.lookupTerm(term);
315305
if (ord >= 0) {
316306
acceptedGlobalOrdinals.clear(ord);
@@ -319,9 +309,9 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
319309
}
320310
return acceptedGlobalOrdinals;
321311
}
322-
323312
}
324313

314+
325315
private final RegExp include, exclude;
326316
private final SortedSet<BytesRef> includeValues, excludeValues;
327317
private final int incZeroBasedPartition;
@@ -332,17 +322,36 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
332322
* @param exclude The regular expression pattern for the terms to be excluded
333323
*/
334324
public IncludeExclude(RegExp include, RegExp exclude) {
335-
if (include == null && exclude == null) {
325+
this(include, exclude, null, null);
326+
}
327+
328+
public IncludeExclude(RegExp include, RegExp exclude, SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
329+
if (include == null && exclude == null && includeValues == null && excludeValues == null) {
330+
throw new IllegalArgumentException();
331+
}
332+
if (include != null && includeValues != null) {
333+
throw new IllegalArgumentException();
334+
}
335+
if (exclude != null && excludeValues != null) {
336336
throw new IllegalArgumentException();
337337
}
338338
this.include = include;
339339
this.exclude = exclude;
340-
this.includeValues = null;
341-
this.excludeValues = null;
340+
this.includeValues = includeValues;
341+
this.excludeValues = excludeValues;
342342
this.incZeroBasedPartition = 0;
343343
this.incNumPartitions = 0;
344344
}
345345

346+
public IncludeExclude(String include, String exclude, String[] includeValues, String[] excludeValues) {
347+
this(
348+
include == null ? null : new RegExp(include),
349+
exclude == null ? null : new RegExp(exclude),
350+
convertToBytesRefSet(includeValues),
351+
convertToBytesRefSet(excludeValues)
352+
);
353+
}
354+
346355
public IncludeExclude(String include, String exclude) {
347356
this(include == null ? null : new RegExp(include), exclude == null ? null : new RegExp(exclude));
348357
}
@@ -352,15 +361,7 @@ public IncludeExclude(String include, String exclude) {
352361
* @param excludeValues The terms to be excluded
353362
*/
354363
public IncludeExclude(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
355-
if (includeValues == null && excludeValues == null) {
356-
throw new IllegalArgumentException();
357-
}
358-
this.include = null;
359-
this.exclude = null;
360-
this.incZeroBasedPartition = 0;
361-
this.incNumPartitions = 0;
362-
this.includeValues = includeValues;
363-
this.excludeValues = excludeValues;
364+
this(null, null, includeValues, excludeValues);
364365
}
365366

366367
public IncludeExclude(String[] includeValues, String[] excludeValues) {
@@ -395,18 +396,21 @@ public IncludeExclude(int partition, int numPartitions) {
395396
*/
396397
public IncludeExclude(StreamInput in) throws IOException {
397398
if (in.readBoolean()) {
398-
includeValues = null;
399-
excludeValues = null;
400-
incZeroBasedPartition = 0;
401-
incNumPartitions = 0;
402399
String includeString = in.readOptionalString();
403400
include = includeString == null ? null : new RegExp(includeString);
404401
String excludeString = in.readOptionalString();
405402
exclude = excludeString == null ? null : new RegExp(excludeString);
406-
return;
403+
if (in.getVersion().before(Version.V_7_11_0)) {
404+
incZeroBasedPartition = 0;
405+
incNumPartitions = 0;
406+
includeValues = null;
407+
excludeValues = null;
408+
return;
409+
}
410+
} else {
411+
include = null;
412+
exclude = null;
407413
}
408-
include = null;
409-
exclude = null;
410414
if (in.readBoolean()) {
411415
int size = in.readVInt();
412416
includeValues = new TreeSet<>();
@@ -436,26 +440,28 @@ public void writeTo(StreamOutput out) throws IOException {
436440
if (regexBased) {
437441
out.writeOptionalString(include == null ? null : include.getOriginalString());
438442
out.writeOptionalString(exclude == null ? null : exclude.getOriginalString());
439-
} else {
440-
boolean hasIncludes = includeValues != null;
441-
out.writeBoolean(hasIncludes);
442-
if (hasIncludes) {
443-
out.writeVInt(includeValues.size());
444-
for (BytesRef value : includeValues) {
445-
out.writeBytesRef(value);
446-
}
443+
if (out.getVersion().before(Version.V_7_11_0)) {
444+
return;
447445
}
448-
boolean hasExcludes = excludeValues != null;
449-
out.writeBoolean(hasExcludes);
450-
if (hasExcludes) {
451-
out.writeVInt(excludeValues.size());
452-
for (BytesRef value : excludeValues) {
453-
out.writeBytesRef(value);
454-
}
446+
}
447+
boolean hasIncludes = includeValues != null;
448+
out.writeBoolean(hasIncludes);
449+
if (hasIncludes) {
450+
out.writeVInt(includeValues.size());
451+
for (BytesRef value : includeValues) {
452+
out.writeBytesRef(value);
455453
}
456-
out.writeVInt(incNumPartitions);
457-
out.writeVInt(incZeroBasedPartition);
458454
}
455+
boolean hasExcludes = excludeValues != null;
456+
out.writeBoolean(hasExcludes);
457+
if (hasExcludes) {
458+
out.writeVInt(excludeValues.size());
459+
for (BytesRef value : excludeValues) {
460+
out.writeBytesRef(value);
461+
}
462+
}
463+
out.writeVInt(incNumPartitions);
464+
out.writeVInt(incZeroBasedPartition);
459465
}
460466

461467
private static SortedSet<BytesRef> convertToBytesRefSet(String[] values) {
@@ -573,29 +579,25 @@ public boolean isPartitionBased() {
573579

574580
private Automaton toAutomaton() {
575581
Automaton a = null;
582+
if (include == null && exclude == null) {
583+
return a;
584+
}
576585
if (include != null) {
577586
a = include.toAutomaton();
578-
} else if (includeValues != null) {
579-
a = Automata.makeStringUnion(includeValues);
580587
} else {
581588
a = Automata.makeAnyString();
582589
}
583590
if (exclude != null) {
584591
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
585-
} else if (excludeValues != null) {
586-
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
587592
}
588593
return a;
589594
}
590595

591596
public StringFilter convertToStringFilter(DocValueFormat format) {
592-
if (isRegexBased()) {
593-
return new AutomatonBackedStringFilter(toAutomaton());
594-
}
595597
if (isPartitionBased()){
596598
return new PartitionedStringFilter();
597599
}
598-
return new TermListBackedStringFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
600+
return new SetAndRegexStringFilter(format);
599601
}
600602

601603
private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUserFormattedValues, DocValueFormat format) {
@@ -612,15 +614,11 @@ private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUser
612614
}
613615

614616
public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {
615-
616-
if (isRegexBased()) {
617-
return new AutomatonBackedOrdinalsFilter(toAutomaton());
618-
}
619617
if (isPartitionBased()){
620618
return new PartitionedOrdinalsFilter();
621619
}
622620

623-
return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
621+
return new SetAndRegexOrdinalsFilter(format);
624622
}
625623

626624
public LongFilter convertToLongFilter(DocValueFormat format) {

0 commit comments

Comments
 (0)