Skip to content

Commit 1b16e74

Browse files
committed
Improved percolator's random candidate query duel test and
fixed bugs that were exposed by this: * Duplicates query leafs were not detected in a multi level boolean query * Tracking fields for numeric range queries did not work properly. * The sorting that was used to find the less restrictive clauses in disjunction query did not work too.
1 parent 7f21126 commit 1b16e74

File tree

3 files changed

+198
-98
lines changed

3 files changed

+198
-98
lines changed

modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java

+59-44
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,15 @@
5454
import java.util.ArrayList;
5555
import java.util.Arrays;
5656
import java.util.Collections;
57+
import java.util.Comparator;
5758
import java.util.HashMap;
5859
import java.util.HashSet;
5960
import java.util.List;
6061
import java.util.Map;
6162
import java.util.Objects;
6263
import java.util.Set;
6364
import java.util.function.BiFunction;
65+
import java.util.stream.Collectors;
6466

6567
import static java.util.stream.Collectors.toSet;
6668

@@ -366,36 +368,37 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
366368
Set<QueryExtraction> extractions = new HashSet<>();
367369
Set<String> seenRangeFields = new HashSet<>();
368370
for (Result result : results) {
369-
QueryExtraction[] t = result.extractions.toArray(new QueryExtraction[1]);
370-
if (result.extractions.size() == 1 && t[0].range != null) {
371-
// In case of range queries each extraction does not simply increment the minimum_should_match
372-
// for that percolator query like for a term based extraction, so that can lead to more false
373-
// positives for percolator queries with range queries than term based queries.
374-
// The is because the way number fields are extracted from the document to be percolated.
375-
// Per field a single range is extracted and if a percolator query has two or more range queries
376-
// on the same field than the the minimum should match can be higher than clauses in the CoveringQuery.
377-
// Therefore right now the minimum should match is incremented once per number field when processing
378-
// the percolator query at index time.
379-
if (seenRangeFields.add(t[0].range.fieldName)) {
380-
msm += 1;
381-
}
382-
} else {
383-
// In case that there are duplicate query extractions we need to be careful with incrementing msm,
384-
// because that could lead to valid matches not becoming candidate matches:
385-
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
386-
// doc: field: val1 val2 val3
387-
// So lets be protective and decrease the msm:
388-
int resultMsm = result.minimumShouldMatch;
389-
for (QueryExtraction queryExtraction : result.extractions) {
390-
if (extractions.contains(queryExtraction)) {
391-
// To protect against negative msm:
392-
// (sub results could consist out of disjunction and conjunction and
393-
// then we do not know which extraction contributed to msm)
394-
resultMsm = Math.max(0, resultMsm - 1);
371+
// In case that there are duplicate query extractions we need to be careful with incrementing msm,
372+
// because that could lead to valid matches not becoming candidate matches:
373+
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
374+
// doc: field: val1 val2 val3
375+
// So lets be protective and decrease the msm:
376+
int resultMsm = result.minimumShouldMatch;
377+
for (QueryExtraction queryExtraction : result.extractions) {
378+
if (queryExtraction.range != null) {
379+
// In case of range queries each extraction does not simply increment the minimum_should_match
380+
// for that percolator query like for a term based extraction, so that can lead to more false
381+
// positives for percolator queries with range queries than term based queries.
382+
// The is because the way number fields are extracted from the document to be percolated.
383+
// Per field a single range is extracted and if a percolator query has two or more range queries
384+
// on the same field than the the minimum should match can be higher than clauses in the CoveringQuery.
385+
// Therefore right now the minimum should match is incremented once per number field when processing
386+
// the percolator query at index time.
387+
if (seenRangeFields.add(queryExtraction.range.fieldName)) {
388+
resultMsm = 1;
389+
} else {
390+
resultMsm = 0;
395391
}
396392
}
397-
msm += resultMsm;
393+
394+
if (extractions.contains(queryExtraction)) {
395+
// To protect against negative msm:
396+
// (sub results could consist out of disjunction and conjunction and
397+
// then we do not know which extraction contributed to msm)
398+
resultMsm = Math.max(0, resultMsm - 1);
399+
}
398400
}
401+
msm += resultMsm;
399402
verified &= result.verified;
400403
matchAllDocs &= result.matchAllDocs;
401404
extractions.addAll(result.extractions);
@@ -518,8 +521,7 @@ private static BiFunction<Query, Version, Result> toParentBlockJoinQuery() {
518521
private static Result handleDisjunction(List<Query> disjunctions, int requiredShouldClauses, boolean otherClauses,
519522
Version version) {
520523
// Keep track of the msm for each clause:
521-
int[] msmPerClause = new int[disjunctions.size()];
522-
String[] rangeFieldNames = new String[disjunctions.size()];
524+
List<DisjunctionClause> clauses = new ArrayList<>(disjunctions.size());
523525
boolean verified = otherClauses == false;
524526
if (version.before(Version.V_6_1_0)) {
525527
verified &= requiredShouldClauses <= 1;
@@ -535,17 +537,14 @@ private static Result handleDisjunction(List<Query> disjunctions, int requiredSh
535537
}
536538
int resultMsm = subResult.minimumShouldMatch;
537539
for (QueryExtraction extraction : subResult.extractions) {
538-
if (terms.contains(extraction)) {
539-
resultMsm = Math.max(1, resultMsm - 1);
540+
if (terms.add(extraction) == false) {
541+
resultMsm = Math.max(0, resultMsm - 1);
540542
}
541543
}
542-
msmPerClause[i] = resultMsm;
543-
terms.addAll(subResult.extractions);
544-
545-
QueryExtraction[] t = subResult.extractions.toArray(new QueryExtraction[1]);
546-
if (subResult.extractions.size() == 1 && t[0].range != null) {
547-
rangeFieldNames[i] = t[0].range.fieldName;
548-
}
544+
clauses.add(new DisjunctionClause(resultMsm, subResult.extractions.stream()
545+
.filter(extraction -> extraction.range != null)
546+
.map(extraction -> extraction.range.fieldName)
547+
.collect(toSet())));
549548
}
550549
boolean matchAllDocs = numMatchAllClauses > 0 && numMatchAllClauses >= requiredShouldClauses;
551550

@@ -554,15 +553,20 @@ private static Result handleDisjunction(List<Query> disjunctions, int requiredSh
554553
Set<String> seenRangeFields = new HashSet<>();
555554
// Figure out what the combined msm is for this disjunction:
556555
// (sum the lowest required clauses, otherwise we're too strict and queries may not match)
557-
Arrays.sort(msmPerClause);
558-
int limit = Math.min(msmPerClause.length, Math.max(1, requiredShouldClauses));
556+
clauses = clauses.stream()
557+
.filter(o -> o.msm > 0)
558+
.sorted(Comparator.comparingInt(o -> o.msm))
559+
.collect(Collectors.toList());
560+
int limit = Math.min(clauses.size(), Math.max(1, requiredShouldClauses));
559561
for (int i = 0; i < limit; i++) {
560-
if (rangeFieldNames[i] != null) {
561-
if (seenRangeFields.add(rangeFieldNames[i])) {
562-
msm += 1;
562+
if (clauses.get(i).rangeFieldNames.isEmpty() == false) {
563+
for (String rangeField: clauses.get(i).rangeFieldNames) {
564+
if (seenRangeFields.add(rangeField)) {
565+
msm += 1;
566+
}
563567
}
564568
} else {
565-
msm += msmPerClause[i];
569+
msm += clauses.get(i).msm;
566570
}
567571
}
568572
} else {
@@ -575,6 +579,17 @@ private static Result handleDisjunction(List<Query> disjunctions, int requiredSh
575579
}
576580
}
577581

582+
static class DisjunctionClause {
583+
584+
final int msm;
585+
final Set<String> rangeFieldNames;
586+
587+
DisjunctionClause(int msm, Set<String> rangeFieldNames) {
588+
this.msm = msm;
589+
this.rangeFieldNames = rangeFieldNames;
590+
}
591+
}
592+
578593
static Set<QueryExtraction> selectBestExtraction(Set<QueryExtraction> extractions1, Set<QueryExtraction> extractions2) {
579594
assert extractions1 != null || extractions2 != null;
580595
if (extractions1 == null) {

0 commit comments

Comments
 (0)