Commit 3c29734

Skip optimization if the index has duplicate data (#43121)
Skip the sort optimization if 50% or more of the index's data share the same value. When an index contains many docs with the same value, the sort optimization doesn't make sense: DistanceFeatureQuery produces the same score for all of these docs, so Lucene falls back to the secondary sort to break ties, which can end up slower than regular sorting.
1 parent 1a9deae commit 3c29734

2 files changed: +152 -39 lines
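
Why a median check detects duplicates: if a single value covers at least 50% of the documents, it necessarily occupies the middle of the sorted values, so it is the median; estimating the median's count is therefore a cheap majority test. Below is a minimal, self-contained sketch of just the decision rule, under the simplifying assumption of exact counts over an in-memory array (the class and method names are illustrative; the commit itself derives approximate counts from BKD-tree metadata, as shown in the diffs that follow).

import java.util.HashMap;
import java.util.Map;

class DuplicateDataSketch {
    // The gating rule this commit adds: skip the DistanceFeatureQuery rewrite
    // when one value covers at least half of the documents, because that value
    // would produce identical scores and force a slow tie-break sort.
    static boolean hasDuplicateData(long[] values) {
        Map<Long, Long> counts = new HashMap<>();
        long maxCount = 0;
        for (long v : values) {
            maxCount = Math.max(maxCount, counts.merge(v, 1L, Long::sum));
        }
        return maxCount >= values.length / 2;
    }

    public static void main(String[] args) {
        long[] skewed = new long[1000];
        for (int i = 0; i < skewed.length; i++) {
            skewed[i] = i < 600 ? 42L : i; // 60% of the docs share the value 42
        }
        System.out.println(hasDuplicateData(skewed)); // true -> keep the regular sort
    }
}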

server/src/main/java/org/elasticsearch/search/query/QueryPhase.java (+78 -1)

@@ -387,8 +387,12 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
             ((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE));
         if (missingValuesAccordingToSort == false) return null;

+        int docCount = PointValues.getDocCount(reader, fieldName);
+        // it is not worth running the optimization on a small index
+        if (docCount <= 512) return null;
+
         // check for multiple values
-        if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; // TODO: handle multiple values
+        if (PointValues.size(reader, fieldName) != docCount) return null; // TODO: handle multiple values

         // check if the optimization makes sense with the track_total_hits setting
         if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) {
@@ -408,6 +412,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
         if (minValue == maxValue) {
             rewrittenQuery = new DocValuesFieldExistsQuery(fieldName);
         } else {
+            if (indexFieldHasDuplicateData(reader, fieldName)) return null;
             long origin = (sortField.getReverse()) ? maxValue : minValue;
             long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow
             if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1)
@@ -469,5 +474,77 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
         return true;
     }

+    /**
+     * Returns true if more than 50% of the data in the index have the same value.
+     * The evaluation is an approximation based on finding the median value and estimating its count.
+     * Returns true if the estimated count of the median value is greater than or equal to half of the total doc count.
+     */
+    static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
+        long globalDocCount = 0;
+        long globalMedianCount = 0;
+        for (LeafReaderContext lrc : reader.leaves()) {
+            PointValues pointValues = lrc.reader().getPointValues(field);
+            if (pointValues == null) continue;
+            int docCount = pointValues.getDocCount();
+            if (docCount <= 512) { // skip small segments, as the median estimation doesn't work well on them
+                continue;
+            }
+            assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
+            globalDocCount += docCount;
+            long medianValue = estimateMedianValue(pointValues);
+            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
+            globalMedianCount += medianCount;
+        }
+        return (globalMedianCount >= globalDocCount / 2);
+    }
+
+    static long estimateMedianValue(PointValues pointValues) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
+        while (minValue < maxValue) {
+            long avgValue = Math.floorDiv(minValue + maxValue, 2);
+            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
+            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+            if (countLeft >= countRight) {
+                maxValue = avgValue;
+            } else {
+                minValue = avgValue + 1;
+            }
+        }
+        return maxValue;
+    }
+
+    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+        final byte[] minValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
+        final byte[] maxValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0);
+
+        PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
+            @Override
+            public void grow(int count) {}
+
+            @Override
+            public void visit(int docID) {}
+
+            @Override
+            public void visit(int docID, byte[] packedValue) {}
+
+            @Override
+            public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0 ||
+                        Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
+                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
+                }
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0 ||
+                        Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
+                    return PointValues.Relation.CELL_CROSSES_QUERY;
+                }
+                return PointValues.Relation.CELL_INSIDE_QUERY;
+            }
+        };
+        return pointValues.estimatePointCount(visitor);
+    }
+
     private static class TimeExceededException extends RuntimeException {}
 }
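
Worth noting about the helpers above: PointValues.estimatePointCount(IntersectVisitor) never visits individual documents; only the visitor's compare() callback runs while walking BKD tree nodes, so the duplicate-data check costs a handful of tree traversals per segment. The following is a hedged, self-contained sketch of the same kind of range estimate against an ordinary index (assumes a Lucene 8.x classpath; the field name "timestamp" and the class name are illustrative, not from the commit).

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class PointCountEstimateSketch {
    public static void main(String[] args) throws IOException {
        try (Directory dir = new ByteBuffersDirectory();
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
            for (long i = 0; i < 10_000; i++) {
                Document doc = new Document();
                doc.add(new LongPoint("timestamp", i % 100)); // heavily duplicated values
                writer.addDocument(doc);
            }
            writer.commit();
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                for (LeafReaderContext leaf : reader.leaves()) {
                    PointValues points = leaf.reader().getPointValues("timestamp");
                    if (points == null) continue;
                    // Estimate how many points fall into [0, 49]. Only compare()
                    // runs; no document is ever visited, so this is cheap.
                    long estimate = points.estimatePointCount(rangeVisitor(0L, 49L));
                    System.out.println("estimated " + estimate + " of " + points.size() + " points");
                }
            }
        }
    }

    // Same visitor shape as QueryPhase.estimatePointCount in the diff above.
    static PointValues.IntersectVisitor rangeVisitor(long min, long max) {
        byte[] minBytes = new byte[Long.BYTES];
        byte[] maxBytes = new byte[Long.BYTES];
        LongPoint.encodeDimension(min, minBytes, 0);
        LongPoint.encodeDimension(max, maxBytes, 0);
        return new PointValues.IntersectVisitor() {
            @Override public void grow(int count) {}
            @Override public void visit(int docID) {}
            @Override public void visit(int docID, byte[] packedValue) {}
            @Override public PointValues.Relation compare(byte[] minPacked, byte[] maxPacked) {
                if (Arrays.compareUnsigned(minPacked, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0
                        || Arrays.compareUnsigned(maxPacked, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0) {
                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
                }
                if (Arrays.compareUnsigned(minPacked, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0
                        || Arrays.compareUnsigned(maxPacked, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0) {
                    return PointValues.Relation.CELL_CROSSES_QUERY;
                }
                return PointValues.Relation.CELL_INSIDE_QUERY;
            }
        };
    }
}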

server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java (+74 -38)

@@ -31,6 +31,7 @@
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NoMergePolicy;
@@ -65,8 +66,13 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -88,10 +94,14 @@
 import java.util.Collections;
 import java.util.List;

+import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -652,9 +662,9 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = scaledRandomIntBetween(50, 100);
+        final int numDocs = 4000;
         Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
             Document doc = new Document();
             long longValue = randomLongBetween(-10000000L, 10000000L);
@@ -708,6 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }

+    public void testIndexHasDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long) (valuesCount * 0.6);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                assertEquals(expectedMedianValue, medianValue);
+                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount / 2))); // assert that the index has duplicate data
+                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+            }
+        }
+    }
+
+    public void testIndexHasNotDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long) (valuesCount * 0.35);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                // can't make any assertion about the values of medianValue and medianCount,
+                // as BKDReader::estimatePointCount can be really off for non-duplicate data
+                assertThat(medianCount, lessThan((long) (valuesCount / 2))); // assert that the index does NOT have duplicate data
+            }
+        }
+    }

     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);
@@ -760,42 +832,6 @@ public void testMaxScoreQueryVisitor() {
         }
     }

-    public void testNumericLongSortOptimizationDocsHaveTheSameValue() throws Exception {
-        final String fieldNameLong = "long-field";
-        MappedFieldType fieldTypeLong = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
-        MapperService mapperService = mock(MapperService.class);
-        when(mapperService.fullName(fieldNameLong)).thenReturn(fieldTypeLong);
-        TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
-        when(searchContext.mapperService()).thenReturn(mapperService);
-
-        final int numDocs = scaledRandomIntBetween(5, 10);
-        long longValue = randomLongBetween(-10000000L, 10000000L); // all docs have the same value
-        Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
-        for (int i = 0; i < numDocs; ++i) {
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldNameLong, longValue));
-            doc.add(new NumericDocValuesField(fieldNameLong, longValue));
-            writer.addDocument(doc);
-        }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        IndexSearcher searcher = getAssertingSortOptimizedSearcher(reader, 1);
-
-        final SortField sortFieldLong = new SortField(fieldNameLong, SortField.Type.LONG);
-        sortFieldLong.setMissingValue(Long.MAX_VALUE);
-        final Sort longSort = new Sort(sortFieldLong);
-        SortAndFormats sortAndFormats = new SortAndFormats(longSort, new DocValueFormat[]{DocValueFormat.RAW});
-        searchContext.sort(sortAndFormats);
-        searchContext.parsedQuery(new ParsedQuery(new MatchAllDocsQuery()));
-        searchContext.setTask(new SearchTask(123L, "", "", "", null, Collections.emptyMap()));
-        searchContext.setSize(10);
-        QueryPhase.execute(searchContext, searcher, checkCancelled -> {});
-        assertSortResults(searchContext.queryResult().topDocs().topDocs, (long) numDocs, false);
-        reader.close();
-        dir.close();
-    }
-
     // used to check that numeric long or date sort optimization was run
     private static IndexSearcher getAssertingSortOptimizedSearcher(IndexReader reader, int queryType) {
         return new IndexSearcher(reader) {