38
38
import org .apache .lucene .search .SynonymQuery ;
39
39
import org .apache .lucene .search .TermInSetQuery ;
40
40
import org .apache .lucene .search .TermQuery ;
41
+ import org .apache .lucene .search .BooleanClause .Occur ;
41
42
import org .apache .lucene .search .spans .SpanFirstQuery ;
42
43
import org .apache .lucene .search .spans .SpanNearQuery ;
43
44
import org .apache .lucene .search .spans .SpanNotQuery ;
@@ -235,20 +236,18 @@ private static BiFunction<Query, Version, Result> multiPhraseQuery() {
235
236
return new Result (true , Collections .emptySet (), 0 );
236
237
}
237
238
238
- if (version .onOrAfter (Version .V_6_1_0 )) {
239
- Set <QueryExtraction > extractions = new HashSet <>();
240
- for (Term [] termArr : terms ) {
241
- extractions .addAll (Arrays .stream (termArr ).map (QueryExtraction ::new ).collect (toSet ()));
242
- }
243
- return new Result (false , extractions , terms .length );
244
- } else {
245
- Set <QueryExtraction > bestTermArr = null ;
246
- for (Term [] termArr : terms ) {
247
- Set <QueryExtraction > queryExtractions = Arrays .stream (termArr ).map (QueryExtraction ::new ).collect (toSet ());
248
- bestTermArr = selectBestExtraction (bestTermArr , queryExtractions );
239
+ // This query has the same problem as boolean queries when it comes to duplicated terms
240
+ // So to keep things simple, we just rewrite to a boolean query
241
+ BooleanQuery .Builder builder = new BooleanQuery .Builder ();
242
+ for (Term [] termArr : terms ) {
243
+ BooleanQuery .Builder subBuilder = new BooleanQuery .Builder ();
244
+ for (Term term : termArr ) {
245
+ subBuilder .add (new TermQuery (term ), Occur .SHOULD );
249
246
}
250
- return new Result ( false , bestTermArr , 1 );
247
+ builder . add ( subBuilder . build (), Occur . FILTER );
251
248
}
249
+ // Make sure to unverify the result
250
+ return booleanQuery ().apply (builder .build (), version ).unverify ();
252
251
};
253
252
}
254
253
@@ -263,41 +262,35 @@ private static BiFunction<Query, Version, Result> spanNearQuery() {
263
262
return (query , version ) -> {
264
263
SpanNearQuery spanNearQuery = (SpanNearQuery ) query ;
265
264
if (version .onOrAfter (Version .V_6_1_0 )) {
266
- Set <Result > results = Arrays .stream (spanNearQuery .getClauses ()).map (clause -> analyze (clause , version )).collect (toSet ());
267
- int msm = 0 ;
268
- Set <QueryExtraction > extractions = new HashSet <>();
269
- Set <String > seenRangeFields = new HashSet <>();
270
- for (Result result : results ) {
271
- QueryExtraction [] t = result .extractions .toArray (new QueryExtraction [1 ]);
272
- if (result .extractions .size () == 1 && t [0 ].range != null ) {
273
- if (seenRangeFields .add (t [0 ].range .fieldName )) {
274
- msm += 1 ;
275
- }
276
- } else {
277
- msm += result .minimumShouldMatch ;
278
- }
279
- extractions .addAll (result .extractions );
265
+ // This has the same problem as boolean queries when it comes to duplicated clauses
266
+ // so we rewrite to a boolean query to keep things simple.
267
+ BooleanQuery .Builder builder = new BooleanQuery .Builder ();
268
+ for (SpanQuery clause : spanNearQuery .getClauses ()) {
269
+ builder .add (clause , Occur .FILTER );
280
270
}
281
- return new Result (false , extractions , msm );
271
+ // make sure to unverify the result
272
+ return booleanQuery ().apply (builder .build (), version ).unverify ();
282
273
} else {
283
- Set < QueryExtraction > bestClauses = null ;
274
+ Result bestClause = null ;
284
275
for (SpanQuery clause : spanNearQuery .getClauses ()) {
285
276
Result temp = analyze (clause , version );
286
- bestClauses = selectBestExtraction (temp . extractions , bestClauses );
277
+ bestClause = selectBestResult (temp , bestClause );
287
278
}
288
- return new Result ( false , bestClauses , 1 ) ;
279
+ return bestClause ;
289
280
}
290
281
};
291
282
}
292
283
293
284
private static BiFunction <Query , Version , Result > spanOrQuery () {
294
285
return (query , version ) -> {
295
- Set <QueryExtraction > terms = new HashSet <>();
296
286
SpanOrQuery spanOrQuery = (SpanOrQuery ) query ;
287
+ // handle it like a boolean query to not duplicate e.g. logic
288
+ // about duplicated terms
289
+ BooleanQuery .Builder builder = new BooleanQuery .Builder ();
297
290
for (SpanQuery clause : spanOrQuery .getClauses ()) {
298
- terms . addAll ( analyze ( clause , version ). extractions );
291
+ builder . add ( clause , Occur . SHOULD );
299
292
}
300
- return new Result ( false , terms , Math . min ( 1 , terms . size ()) );
293
+ return booleanQuery (). apply ( builder . build (), version );
301
294
};
302
295
}
303
296
@@ -423,9 +416,13 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
423
416
}
424
417
}
425
418
} else {
426
- Set < QueryExtraction > bestClause = null ;
419
+ Result bestClause = null ;
427
420
UnsupportedQueryException uqe = null ;
421
+ boolean hasProhibitedClauses = false ;
428
422
for (BooleanClause clause : clauses ) {
423
+ if (clause .isProhibited ()) {
424
+ hasProhibitedClauses = true ;
425
+ }
429
426
if (clause .isRequired () == false ) {
430
427
// skip must_not clauses, we don't need to remember the things that do *not* match...
431
428
// skip should clauses, this bq has must clauses, so we don't need to remember should clauses,
@@ -440,17 +437,20 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
440
437
uqe = e ;
441
438
continue ;
442
439
}
443
- bestClause = selectBestExtraction (temp . extractions , bestClause );
440
+ bestClause = selectBestResult (temp , bestClause );
444
441
}
445
442
if (bestClause != null ) {
446
- return new Result (false , bestClause , 1 );
443
+ if (hasProhibitedClauses || minimumShouldMatch > 0 ) {
444
+ bestClause = bestClause .unverify ();
445
+ }
446
+ return bestClause ;
447
447
} else {
448
448
if (uqe != null ) {
449
449
// we're unable to select the best clause and an exception occurred, so we bail
450
450
throw uqe ;
451
451
} else {
452
452
// We didn't find a clause and no exception occurred, so this bq only contained MatchNoDocsQueries,
453
- return new Result (true , Collections .emptySet (), 1 );
453
+ return new Result (true , Collections .emptySet (), 0 );
454
454
}
455
455
}
456
456
}
@@ -616,51 +616,69 @@ static class DisjunctionClause {
616
616
}
617
617
}
618
618
619
- static Set <QueryExtraction > selectBestExtraction (Set <QueryExtraction > extractions1 , Set <QueryExtraction > extractions2 ) {
620
- assert extractions1 != null || extractions2 != null ;
621
- if (extractions1 == null ) {
622
- return extractions2 ;
623
- } else if (extractions2 == null ) {
624
- return extractions1 ;
619
+ /**
620
+ * Return an extraction for the conjunction of {@code result1} and {@code result2}
621
+ * by picking up clauses that look most restrictive and making it unverified if
622
+ * the other clause is not null and doesn't match all documents. This is used by
623
+ * 6.0.0 indices which didn't use the terms_set query.
624
+ */
625
+ static Result selectBestResult (Result result1 , Result result2 ) {
626
+ assert result1 != null || result2 != null ;
627
+ if (result1 == null ) {
628
+ return result2 ;
629
+ } else if (result2 == null ) {
630
+ return result1 ;
631
+ } else if (result1 .matchAllDocs ) { // conjunction with match_all
632
+ Result result = result2 ;
633
+ if (result1 .verified == false ) {
634
+ result = result .unverify ();
635
+ }
636
+ return result ;
637
+ } else if (result2 .matchAllDocs ) { // conjunction with match_all
638
+ Result result = result1 ;
639
+ if (result2 .verified == false ) {
640
+ result = result .unverify ();
641
+ }
642
+ return result ;
625
643
} else {
626
644
// Prefer term based extractions over range based extractions:
627
645
boolean onlyRangeBasedExtractions = true ;
628
- for (QueryExtraction clause : extractions1 ) {
646
+ for (QueryExtraction clause : result1 . extractions ) {
629
647
if (clause .term != null ) {
630
648
onlyRangeBasedExtractions = false ;
631
649
break ;
632
650
}
633
651
}
634
- for (QueryExtraction clause : extractions2 ) {
652
+ for (QueryExtraction clause : result2 . extractions ) {
635
653
if (clause .term != null ) {
636
654
onlyRangeBasedExtractions = false ;
637
655
break ;
638
656
}
639
657
}
640
658
641
659
if (onlyRangeBasedExtractions ) {
642
- BytesRef extraction1SmallestRange = smallestRange (extractions1 );
643
- BytesRef extraction2SmallestRange = smallestRange (extractions2 );
660
+ BytesRef extraction1SmallestRange = smallestRange (result1 . extractions );
661
+ BytesRef extraction2SmallestRange = smallestRange (result2 . extractions );
644
662
if (extraction1SmallestRange == null ) {
645
- return extractions2 ;
663
+ return result2 . unverify () ;
646
664
} else if (extraction2SmallestRange == null ) {
647
- return extractions1 ;
665
+ return result1 . unverify () ;
648
666
}
649
667
650
668
// Keep the clause with smallest range, this is likely to be the rarest.
651
669
if (extraction1SmallestRange .compareTo (extraction2SmallestRange ) <= 0 ) {
652
- return extractions1 ;
670
+ return result1 . unverify () ;
653
671
} else {
654
- return extractions2 ;
672
+ return result2 . unverify () ;
655
673
}
656
674
} else {
657
- int extraction1ShortestTerm = minTermLength (extractions1 );
658
- int extraction2ShortestTerm = minTermLength (extractions2 );
675
+ int extraction1ShortestTerm = minTermLength (result1 . extractions );
676
+ int extraction2ShortestTerm = minTermLength (result2 . extractions );
659
677
// keep the clause with longest terms, this likely to be rarest.
660
678
if (extraction1ShortestTerm >= extraction2ShortestTerm ) {
661
- return extractions1 ;
679
+ return result1 . unverify () ;
662
680
} else {
663
- return extractions2 ;
681
+ return result2 . unverify () ;
664
682
}
665
683
}
666
684
}
@@ -695,31 +713,46 @@ private static BytesRef smallestRange(Set<QueryExtraction> terms) {
695
713
return min ;
696
714
}
697
715
716
+ /**
717
+ * Query extraction result. A result is a candidate for a given document either if:
718
+ * - `matchAllDocs` is true
719
+ * - `extractions` and the document have `minimumShouldMatch` terms in common
720
+ * Furthermore, the match doesn't need to be verified if `verified` is true, checking
721
+ * `matchAllDocs` and `extractions` is enough.
722
+ */
698
723
static class Result {
699
724
700
725
final Set <QueryExtraction > extractions ;
701
726
final boolean verified ;
702
727
final int minimumShouldMatch ;
703
728
final boolean matchAllDocs ;
704
729
705
- Result (boolean verified , Set <QueryExtraction > extractions , int minimumShouldMatch ) {
730
+ private Result (boolean matchAllDocs , boolean verified , Set <QueryExtraction > extractions , int minimumShouldMatch ) {
706
731
if (minimumShouldMatch > extractions .size ()) {
707
732
throw new IllegalArgumentException ("minimumShouldMatch can't be greater than the number of extractions: "
708
733
+ minimumShouldMatch + " > " + extractions .size ());
709
734
}
735
+ this .matchAllDocs = matchAllDocs ;
710
736
this .extractions = extractions ;
711
737
this .verified = verified ;
712
738
this .minimumShouldMatch = minimumShouldMatch ;
713
- this .matchAllDocs = false ;
739
+ }
740
+
741
+ Result (boolean verified , Set <QueryExtraction > extractions , int minimumShouldMatch ) {
742
+ this (false , verified , extractions , minimumShouldMatch );
714
743
}
715
744
716
745
Result (boolean matchAllDocs , boolean verified ) {
717
- this .extractions = Collections .emptySet ();
718
- this .verified = verified ;
719
- this .minimumShouldMatch = 0 ;
720
- this .matchAllDocs = matchAllDocs ;
746
+ this (matchAllDocs , verified , Collections .emptySet (), 0 );
721
747
}
722
748
749
+ Result unverify () {
750
+ if (verified ) {
751
+ return new Result (matchAllDocs , false , extractions , minimumShouldMatch );
752
+ } else {
753
+ return this ;
754
+ }
755
+ }
723
756
}
724
757
725
758
static class QueryExtraction {
0 commit comments