Commit 3c29734

Skip optimization if the index has duplicate data (#43121)
Skip the sort optimization if 50% or more of the index's data share the same value. When an index contains many docs with the same value, the sort optimization doesn't make sense: DistanceFeatureQuery produces the same score for all of these docs, so Lucene falls back to the secondary sort to break ties, which can end up slower than regular sorting.
1 parent 1a9deae commit 3c29734

2 files changed: +152 -39 lines
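
Why a median check detects duplicates: if a single value covers at least 50% of the documents, it necessarily occupies the middle of the sorted values, so it is the median; estimating the median's count is therefore a cheap majority test. Below is a minimal, self-contained sketch of just the decision rule, under the simplifying assumption of exact counts over an in-memory array (the class and method names are illustrative; the commit itself derives approximate counts from BKD-tree metadata, as shown in the diffs that follow).

import java.util.HashMap;
import java.util.Map;

class DuplicateDataSketch {
    // The gating rule this commit adds: skip the DistanceFeatureQuery rewrite
    // when one value covers at least half of the documents, because that value
    // would produce identical scores and force a slow tie-break sort.
    static boolean hasDuplicateData(long[] values) {
        Map<Long, Long> counts = new HashMap<>();
        long maxCount = 0;
        for (long v : values) {
            maxCount = Math.max(maxCount, counts.merge(v, 1L, Long::sum));
        }
        return maxCount >= values.length / 2;
    }

    public static void main(String[] args) {
        long[] skewed = new long[1000];
        for (int i = 0; i < skewed.length; i++) {
            skewed[i] = i < 600 ? 42L : i; // 60% of the docs share the value 42
        }
        System.out.println(hasDuplicateData(skewed)); // true -> keep the regular sort
    }
}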

server/src/main/java/org/elasticsearch/search/query/QueryPhase.java (+78 -1)

@@ -387,8 +387,12 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
             ((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE));
         if (missingValuesAccordingToSort == false) return null;

+        int docCount = PointValues.getDocCount(reader, fieldName);
+        // it is not worth running the optimization on a small index
+        if (docCount <= 512) return null;
+
         // check for multiple values
-        if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; // TODO: handle multiple values
+        if (PointValues.size(reader, fieldName) != docCount) return null; // TODO: handle multiple values

         // check if the optimization makes sense with the track_total_hits setting
         if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) {
@@ -408,6 +412,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
         if (minValue == maxValue) {
             rewrittenQuery = new DocValuesFieldExistsQuery(fieldName);
         } else {
+            if (indexFieldHasDuplicateData(reader, fieldName)) return null;
             long origin = (sortField.getReverse()) ? maxValue : minValue;
             long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow
             if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1)
@@ -469,5 +474,77 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
         return true;
     }

+    /**
+     * Returns true if more than 50% of the data in the index have the same value.
+     * The evaluation is an approximation based on finding the median value and estimating its count.
+     * Returns true if the estimated count of the median value is greater than or equal to half of the total doc count.
+     */
+    static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
+        long globalDocCount = 0;
+        long globalMedianCount = 0;
+        for (LeafReaderContext lrc : reader.leaves()) {
+            PointValues pointValues = lrc.reader().getPointValues(field);
+            if (pointValues == null) continue;
+            int docCount = pointValues.getDocCount();
+            if (docCount <= 512) { // skip small segments, as the median estimation doesn't work well on them
+                continue;
+            }
+            assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
+            globalDocCount += docCount;
+            long medianValue = estimateMedianValue(pointValues);
+            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
+            globalMedianCount += medianCount;
+        }
+        return (globalMedianCount >= globalDocCount / 2);
+    }
+
+    static long estimateMedianValue(PointValues pointValues) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
+        while (minValue < maxValue) {
+            long avgValue = Math.floorDiv(minValue + maxValue, 2);
+            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
+            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+            if (countLeft >= countRight) {
+                maxValue = avgValue;
+            } else {
+                minValue = avgValue + 1;
+            }
+        }
+        return maxValue;
+    }
+
+    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+        final byte[] minValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
+        final byte[] maxValueAsBytes = new byte[Long.BYTES];
+        LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0);
+
+        PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
+            @Override
+            public void grow(int count) {}
+
+            @Override
+            public void visit(int docID) {}
+
+            @Override
+            public void visit(int docID, byte[] packedValue) {}
+
+            @Override
+            public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0 ||
+                        Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
+                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
+                }
+                if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0 ||
+                        Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
+                    return PointValues.Relation.CELL_CROSSES_QUERY;
+                }
+                return PointValues.Relation.CELL_INSIDE_QUERY;
+            }
+        };
+        return pointValues.estimatePointCount(visitor);
+    }
+
     private static class TimeExceededException extends RuntimeException {}
 }
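
Worth noting about the helpers above: PointValues.estimatePointCount(IntersectVisitor) never visits individual documents; only the visitor's compare() callback runs while walking BKD tree nodes, so the duplicate-data check costs a handful of tree traversals per segment. The following is a hedged, self-contained sketch of the same kind of range estimate against an ordinary index (assumes a Lucene 8.x classpath; the field name "timestamp" and the class name are illustrative, not from the commit).

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class PointCountEstimateSketch {
    public static void main(String[] args) throws IOException {
        try (Directory dir = new ByteBuffersDirectory();
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
            for (long i = 0; i < 10_000; i++) {
                Document doc = new Document();
                doc.add(new LongPoint("timestamp", i % 100)); // heavily duplicated values
                writer.addDocument(doc);
            }
            writer.commit();
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                for (LeafReaderContext leaf : reader.leaves()) {
                    PointValues points = leaf.reader().getPointValues("timestamp");
                    if (points == null) continue;
                    // Estimate how many points fall into [0, 49]. Only compare()
                    // runs; no document is ever visited, so this is cheap.
                    long estimate = points.estimatePointCount(rangeVisitor(0L, 49L));
                    System.out.println("estimated " + estimate + " of " + points.size() + " points");
                }
            }
        }
    }

    // Same visitor shape as QueryPhase.estimatePointCount in the diff above.
    static PointValues.IntersectVisitor rangeVisitor(long min, long max) {
        byte[] minBytes = new byte[Long.BYTES];
        byte[] maxBytes = new byte[Long.BYTES];
        LongPoint.encodeDimension(min, minBytes, 0);
        LongPoint.encodeDimension(max, maxBytes, 0);
        return new PointValues.IntersectVisitor() {
            @Override public void grow(int count) {}
            @Override public void visit(int docID) {}
            @Override public void visit(int docID, byte[] packedValue) {}
            @Override public PointValues.Relation compare(byte[] minPacked, byte[] maxPacked) {
                if (Arrays.compareUnsigned(minPacked, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0
                        || Arrays.compareUnsigned(maxPacked, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0) {
                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
                }
                if (Arrays.compareUnsigned(minPacked, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0
                        || Arrays.compareUnsigned(maxPacked, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0) {
                    return PointValues.Relation.CELL_CROSSES_QUERY;
                }
                return PointValues.Relation.CELL_INSIDE_QUERY;
            }
        };
    }
}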

server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java (+74 -38)

@@ -31,6 +31,7 @@
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NoMergePolicy;
@@ -65,8 +66,13 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -88,10 +94,14 @@
 import java.util.Collections;
 import java.util.List;

+import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -652,9 +662,9 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);

-        final int numDocs = scaledRandomIntBetween(50, 100);
+        final int numDocs = 4000;
         Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
             Document doc = new Document();
             long longValue = randomLongBetween(-10000000L, 10000000L);
@@ -708,6 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }

+    public void testIndexHasDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long) (valuesCount * 0.6);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                assertEquals(expectedMedianValue, medianValue);
+                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount / 2))); // assert that the index has duplicate data
+                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+            }
+        }
+    }
+
+    public void testIndexHasNotDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long) (valuesCount * 0.35);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                // can't make any assertion about the values of medianValue and medianCount,
+                // as BKDReader::estimatePointCount can be really off for non-duplicate data
+                assertThat(medianCount, lessThan((long) (valuesCount / 2))); // assert that the index does NOT have duplicate data
+            }
+        }
+    }

     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);
@@ -760,42 +832,6 @@ public void testMaxScoreQueryVisitor() {
         }
     }

-    public void testNumericLongSortOptimizationDocsHaveTheSameValue() throws Exception {
-        final String fieldNameLong = "long-field";
-        MappedFieldType fieldTypeLong = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
-        MapperService mapperService = mock(MapperService.class);
-        when(mapperService.fullName(fieldNameLong)).thenReturn(fieldTypeLong);
-        TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
-        when(searchContext.mapperService()).thenReturn(mapperService);
-
-        final int numDocs = scaledRandomIntBetween(5, 10);
-        long longValue = randomLongBetween(-10000000L, 10000000L); // all docs have the same value
-        Directory dir = newDirectory();
-        RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
-        for (int i = 0; i < numDocs; ++i) {
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldNameLong, longValue));
-            doc.add(new NumericDocValuesField(fieldNameLong, longValue));
-            writer.addDocument(doc);
-        }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        IndexSearcher searcher = getAssertingSortOptimizedSearcher(reader, 1);
-
-        final SortField sortFieldLong = new SortField(fieldNameLong, SortField.Type.LONG);
-        sortFieldLong.setMissingValue(Long.MAX_VALUE);
-        final Sort longSort = new Sort(sortFieldLong);
-        SortAndFormats sortAndFormats = new SortAndFormats(longSort, new DocValueFormat[]{DocValueFormat.RAW});
-        searchContext.sort(sortAndFormats);
-        searchContext.parsedQuery(new ParsedQuery(new MatchAllDocsQuery()));
-        searchContext.setTask(new SearchTask(123L, "", "", "", null, Collections.emptyMap()));
-        searchContext.setSize(10);
-        QueryPhase.execute(searchContext, searcher, checkCancelled -> {});
-        assertSortResults(searchContext.queryResult().topDocs().topDocs, (long) numDocs, false);
-        reader.close();
-        dir.close();
-    }
-
     // used to check that numeric long or date sort optimization was run
     private static IndexSearcher getAssertingSortOptimizedSearcher(IndexReader reader, int queryType) {
         return new IndexSearcher(reader) {