Skip to content

Commit 85f5382

Browse files
authored
Fix more query extraction bugs. (#29388)
I found the following bugs: - The 6.0 logic for conjunctions didn't work when there were only `match_all` queries in MUST/FILTER clauses as they didn't propagate the `matchAllDocs` flag. - Some queries still had the same issue as `BooleanQuery` used to have with duplicate terms (see #28353), eg. `MultiPhraseQuery`. Closes #29376
1 parent ae2a9f7 commit 85f5382

File tree

3 files changed

+205
-100
lines changed

3 files changed

+205
-100
lines changed

modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java

Lines changed: 94 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.apache.lucene.search.SynonymQuery;
3939
import org.apache.lucene.search.TermInSetQuery;
4040
import org.apache.lucene.search.TermQuery;
41+
import org.apache.lucene.search.BooleanClause.Occur;
4142
import org.apache.lucene.search.spans.SpanFirstQuery;
4243
import org.apache.lucene.search.spans.SpanNearQuery;
4344
import org.apache.lucene.search.spans.SpanNotQuery;
@@ -235,20 +236,18 @@ private static BiFunction<Query, Version, Result> multiPhraseQuery() {
235236
return new Result(true, Collections.emptySet(), 0);
236237
}
237238

238-
if (version.onOrAfter(Version.V_6_1_0)) {
239-
Set<QueryExtraction> extractions = new HashSet<>();
240-
for (Term[] termArr : terms) {
241-
extractions.addAll(Arrays.stream(termArr).map(QueryExtraction::new).collect(toSet()));
242-
}
243-
return new Result(false, extractions, terms.length);
244-
} else {
245-
Set<QueryExtraction> bestTermArr = null;
246-
for (Term[] termArr : terms) {
247-
Set<QueryExtraction> queryExtractions = Arrays.stream(termArr).map(QueryExtraction::new).collect(toSet());
248-
bestTermArr = selectBestExtraction(bestTermArr, queryExtractions);
239+
// This query has the same problem as boolean queries when it comes to duplicated terms
240+
// So to keep things simple, we just rewrite to a boolean query
241+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
242+
for (Term[] termArr : terms) {
243+
BooleanQuery.Builder subBuilder = new BooleanQuery.Builder();
244+
for (Term term : termArr) {
245+
subBuilder.add(new TermQuery(term), Occur.SHOULD);
249246
}
250-
return new Result(false, bestTermArr, 1);
247+
builder.add(subBuilder.build(), Occur.FILTER);
251248
}
249+
// Make sure to unverify the result
250+
return booleanQuery().apply(builder.build(), version).unverify();
252251
};
253252
}
254253

@@ -263,41 +262,35 @@ private static BiFunction<Query, Version, Result> spanNearQuery() {
263262
return (query, version) -> {
264263
SpanNearQuery spanNearQuery = (SpanNearQuery) query;
265264
if (version.onOrAfter(Version.V_6_1_0)) {
266-
Set<Result> results = Arrays.stream(spanNearQuery.getClauses()).map(clause -> analyze(clause, version)).collect(toSet());
267-
int msm = 0;
268-
Set<QueryExtraction> extractions = new HashSet<>();
269-
Set<String> seenRangeFields = new HashSet<>();
270-
for (Result result : results) {
271-
QueryExtraction[] t = result.extractions.toArray(new QueryExtraction[1]);
272-
if (result.extractions.size() == 1 && t[0].range != null) {
273-
if (seenRangeFields.add(t[0].range.fieldName)) {
274-
msm += 1;
275-
}
276-
} else {
277-
msm += result.minimumShouldMatch;
278-
}
279-
extractions.addAll(result.extractions);
265+
// This has the same problem as boolean queries when it comes to duplicated clauses
266+
// so we rewrite to a boolean query to keep things simple.
267+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
268+
for (SpanQuery clause : spanNearQuery.getClauses()) {
269+
builder.add(clause, Occur.FILTER);
280270
}
281-
return new Result(false, extractions, msm);
271+
// make sure to unverify the result
272+
return booleanQuery().apply(builder.build(), version).unverify();
282273
} else {
283-
Set<QueryExtraction> bestClauses = null;
274+
Result bestClause = null;
284275
for (SpanQuery clause : spanNearQuery.getClauses()) {
285276
Result temp = analyze(clause, version);
286-
bestClauses = selectBestExtraction(temp.extractions, bestClauses);
277+
bestClause = selectBestResult(temp, bestClause);
287278
}
288-
return new Result(false, bestClauses, 1);
279+
return bestClause;
289280
}
290281
};
291282
}
292283

293284
private static BiFunction<Query, Version, Result> spanOrQuery() {
294285
return (query, version) -> {
295-
Set<QueryExtraction> terms = new HashSet<>();
296286
SpanOrQuery spanOrQuery = (SpanOrQuery) query;
287+
// handle it like a boolean query to not dulplicate eg. logic
288+
// about duplicated terms
289+
BooleanQuery.Builder builder = new BooleanQuery.Builder();
297290
for (SpanQuery clause : spanOrQuery.getClauses()) {
298-
terms.addAll(analyze(clause, version).extractions);
291+
builder.add(clause, Occur.SHOULD);
299292
}
300-
return new Result(false, terms, Math.min(1, terms.size()));
293+
return booleanQuery().apply(builder.build(), version);
301294
};
302295
}
303296

@@ -423,9 +416,13 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
423416
}
424417
}
425418
} else {
426-
Set<QueryExtraction> bestClause = null;
419+
Result bestClause = null;
427420
UnsupportedQueryException uqe = null;
421+
boolean hasProhibitedClauses = false;
428422
for (BooleanClause clause : clauses) {
423+
if (clause.isProhibited()) {
424+
hasProhibitedClauses = true;
425+
}
429426
if (clause.isRequired() == false) {
430427
// skip must_not clauses, we don't need to remember the things that do *not* match...
431428
// skip should clauses, this bq has must clauses, so we don't need to remember should clauses,
@@ -440,17 +437,20 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
440437
uqe = e;
441438
continue;
442439
}
443-
bestClause = selectBestExtraction(temp.extractions, bestClause);
440+
bestClause = selectBestResult(temp, bestClause);
444441
}
445442
if (bestClause != null) {
446-
return new Result(false, bestClause, 1);
443+
if (hasProhibitedClauses || minimumShouldMatch > 0) {
444+
bestClause = bestClause.unverify();
445+
}
446+
return bestClause;
447447
} else {
448448
if (uqe != null) {
449449
// we're unable to select the best clause and an exception occurred, so we bail
450450
throw uqe;
451451
} else {
452452
// We didn't find a clause and no exception occurred, so this bq only contained MatchNoDocsQueries,
453-
return new Result(true, Collections.emptySet(), 1);
453+
return new Result(true, Collections.emptySet(), 0);
454454
}
455455
}
456456
}
@@ -616,51 +616,69 @@ static class DisjunctionClause {
616616
}
617617
}
618618

619-
static Set<QueryExtraction> selectBestExtraction(Set<QueryExtraction> extractions1, Set<QueryExtraction> extractions2) {
620-
assert extractions1 != null || extractions2 != null;
621-
if (extractions1 == null) {
622-
return extractions2;
623-
} else if (extractions2 == null) {
624-
return extractions1;
619+
/**
620+
* Return an extraction for the conjunction of {@code result1} and {@code result2}
621+
* by picking up clauses that look most restrictive and making it unverified if
622+
* the other clause is not null and doesn't match all documents. This is used by
623+
* 6.0.0 indices which didn't use the terms_set query.
624+
*/
625+
static Result selectBestResult(Result result1, Result result2) {
626+
assert result1 != null || result2 != null;
627+
if (result1 == null) {
628+
return result2;
629+
} else if (result2 == null) {
630+
return result1;
631+
} else if (result1.matchAllDocs) { // conjunction with match_all
632+
Result result = result2;
633+
if (result1.verified == false) {
634+
result = result.unverify();
635+
}
636+
return result;
637+
} else if (result2.matchAllDocs) { // conjunction with match_all
638+
Result result = result1;
639+
if (result2.verified == false) {
640+
result = result.unverify();
641+
}
642+
return result;
625643
} else {
626644
// Prefer term based extractions over range based extractions:
627645
boolean onlyRangeBasedExtractions = true;
628-
for (QueryExtraction clause : extractions1) {
646+
for (QueryExtraction clause : result1.extractions) {
629647
if (clause.term != null) {
630648
onlyRangeBasedExtractions = false;
631649
break;
632650
}
633651
}
634-
for (QueryExtraction clause : extractions2) {
652+
for (QueryExtraction clause : result2.extractions) {
635653
if (clause.term != null) {
636654
onlyRangeBasedExtractions = false;
637655
break;
638656
}
639657
}
640658

641659
if (onlyRangeBasedExtractions) {
642-
BytesRef extraction1SmallestRange = smallestRange(extractions1);
643-
BytesRef extraction2SmallestRange = smallestRange(extractions2);
660+
BytesRef extraction1SmallestRange = smallestRange(result1.extractions);
661+
BytesRef extraction2SmallestRange = smallestRange(result2.extractions);
644662
if (extraction1SmallestRange == null) {
645-
return extractions2;
663+
return result2.unverify();
646664
} else if (extraction2SmallestRange == null) {
647-
return extractions1;
665+
return result1.unverify();
648666
}
649667

650668
// Keep the clause with smallest range, this is likely to be the rarest.
651669
if (extraction1SmallestRange.compareTo(extraction2SmallestRange) <= 0) {
652-
return extractions1;
670+
return result1.unverify();
653671
} else {
654-
return extractions2;
672+
return result2.unverify();
655673
}
656674
} else {
657-
int extraction1ShortestTerm = minTermLength(extractions1);
658-
int extraction2ShortestTerm = minTermLength(extractions2);
675+
int extraction1ShortestTerm = minTermLength(result1.extractions);
676+
int extraction2ShortestTerm = minTermLength(result2.extractions);
659677
// keep the clause with longest terms, this likely to be rarest.
660678
if (extraction1ShortestTerm >= extraction2ShortestTerm) {
661-
return extractions1;
679+
return result1.unverify();
662680
} else {
663-
return extractions2;
681+
return result2.unverify();
664682
}
665683
}
666684
}
@@ -695,31 +713,46 @@ private static BytesRef smallestRange(Set<QueryExtraction> terms) {
695713
return min;
696714
}
697715

716+
/**
717+
* Query extraction result. A result is a candidate for a given document either if:
718+
* - `matchAllDocs` is true
719+
* - `extractions` and the document have `minimumShouldMatch` terms in common
720+
* Further more, the match doesn't need to be verified if `verified` is true, checking
721+
* `matchAllDocs` and `extractions` is enough.
722+
*/
698723
static class Result {
699724

700725
final Set<QueryExtraction> extractions;
701726
final boolean verified;
702727
final int minimumShouldMatch;
703728
final boolean matchAllDocs;
704729

705-
Result(boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
730+
private Result(boolean matchAllDocs, boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
706731
if (minimumShouldMatch > extractions.size()) {
707732
throw new IllegalArgumentException("minimumShouldMatch can't be greater than the number of extractions: "
708733
+ minimumShouldMatch + " > " + extractions.size());
709734
}
735+
this.matchAllDocs = matchAllDocs;
710736
this.extractions = extractions;
711737
this.verified = verified;
712738
this.minimumShouldMatch = minimumShouldMatch;
713-
this.matchAllDocs = false;
739+
}
740+
741+
Result(boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
742+
this(false, verified, extractions, minimumShouldMatch);
714743
}
715744

716745
Result(boolean matchAllDocs, boolean verified) {
717-
this.extractions = Collections.emptySet();
718-
this.verified = verified;
719-
this.minimumShouldMatch = 0;
720-
this.matchAllDocs = matchAllDocs;
746+
this(matchAllDocs, verified, Collections.emptySet(), 0);
721747
}
722748

749+
Result unverify() {
750+
if (verified) {
751+
return new Result(matchAllDocs, false, extractions, minimumShouldMatch);
752+
} else {
753+
return this;
754+
}
755+
}
723756
}
724757

725758
static class QueryExtraction {

0 commit comments

Comments
 (0)