Commit 9c71827

Add test for finding duplicate data in BKD-tree
This makes it possible to control the number of points in a leaf node.
1 parent e13761b commit 9c71827
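
The tests added here build the BKD tree with Lucene's BKDWriter directly instead of indexing documents through an IndexWriter; that is what makes the leaf size controllable. For orientation, here is the constructor call the tests use, annotated. The parameter names are my reading of the Lucene 8.x BKDWriter signature, so treat them as assumptions; the positional values are the tests' own.

// Sketch: the BKDWriter call used in the tests below, with assumed parameter names.
BKDWriter w = new BKDWriter(
    valuesCount,         // maxDoc
    dir,                 // tempDir for offline-sort files
    "tmp",               // tempFileNamePrefix
    1,                   // numDataDims: a single long dimension
    1,                   // numIndexDims
    8,                   // bytesPerDim = Long.BYTES
    maxPointsInLeafNode, // the leaf-size knob the commit message refers to (40 in the tests)
    1,                   // maxMBSortInHeap
    valuesCount          // totalPointCount
);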

File tree

2 files changed: +80 −56 lines

server/src/main/java/org/elasticsearch/search/query/QueryPhase.java

+11 −24

@@ -487,40 +487,36 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
         long globalMedianCount = 0;
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
+            if (pointValues == null) continue;
             int docCount = pointValues.getDocCount();
             if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
                 continue;
             }
             globalDocCount += docCount;
-            byte[] minValueAsBytes = pointValues.getMinPackedValue();
-            byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
-            long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
-            long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
-            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, docCount/2);
+            long medianValue = estimateMedianValue(pointValues);
+            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
             globalMedianCount += medianCount;
         }
         return (globalMedianCount >= globalDocCount/2);
     }
 
-    private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
+    static long estimateMedianValue(PointValues pointValues) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
         while (minValue < maxValue) {
             long avgValue = Math.floorDiv(minValue + maxValue, 2);
             long countLeft = estimatePointCount(pointValues, minValue, avgValue);
-            if (countLeft >= threshold) {
+            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+            if (countLeft >= countRight) {
                 maxValue = avgValue;
-                threshold = countLeft/2;
             } else {
-                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
                 minValue = avgValue + 1;
-                threshold = countRight/2;
             }
         }
-        // maxValue is the approximate median value, estimate its count
-        long medianCount = estimatePointCount(pointValues, maxValue, maxValue);
-        return medianCount;
+        return maxValue;
     }
 
-    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
         final byte[] maxValueAsBytes = new byte[Long.BYTES];
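
The hunk above replaces the threshold-based search with a plain bisection: at each step the value range is halved and the half that the tree estimates to hold more points is kept, so the loop converges on an approximate median; a field is then treated as duplicate-heavy when that single value accounts for at least half of all points. A self-contained sketch of the same loop, with an exact range count over a sorted array standing in for the PointValues-based estimatePointCount (the class, helper names, and example data are mine):

import java.util.Arrays;

// Standalone illustration, not Elasticsearch code: the same bisection as
// estimateMedianValue above, but with exact counts instead of BKD estimates.
public class MedianBisection {

    // Counts values v with min <= v <= max; plays the role of estimatePointCount.
    static long countInRange(long[] sorted, long min, long max) {
        return lowerBound(sorted, max + 1) - lowerBound(sorted, min);
    }

    // Index of the first element >= key.
    static int lowerBound(long[] sorted, long key) {
        int i = Arrays.binarySearch(sorted, key);
        if (i < 0) return -i - 1;
        while (i > 0 && sorted[i - 1] == key) i--;
        return i;
    }

    static long estimateMedianValue(long[] sorted) {
        long minValue = sorted[0];
        long maxValue = sorted[sorted.length - 1];
        while (minValue < maxValue) {
            long avgValue = Math.floorDiv(minValue + maxValue, 2);
            long countLeft = countInRange(sorted, minValue, avgValue);
            long countRight = countInRange(sorted, avgValue + 1, maxValue);
            if (countLeft >= countRight) {
                maxValue = avgValue;      // the median is in the left half
            } else {
                minValue = avgValue + 1;  // the median is in the right half
            }
        }
        return maxValue;
    }

    public static void main(String[] args) {
        long[] values = {3, 7, 7, 7, 7, 7, 9, 15, 42, 100};
        long median = estimateMedianValue(values);
        long medianCount = countInRange(values, median, median);
        System.out.println("median=" + median + ", count=" + medianCount); // median=7, count=5
        // 5 of 10 points carry the median value, so the duplicate-data check fires:
        System.out.println("hasDuplicateData=" + (medianCount >= values.length / 2));
    }
}

With estimates instead of exact counts the loop still terminates (the range shrinks every iteration) but may land near, rather than exactly on, the median; that is why the production check only asks whether the estimated count at that value reaches half of all points.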
@@ -534,16 +530,7 @@ public void grow(int count) {}
             public void visit(int docID) {}
 
             @Override
-            public void visit(int docID, byte[] packedValue) {
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
-                    // Doc's value is too low, in this dimension
-                    return;
-                }
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
-                    // Doc's value is too high, in this dimension
-                    return;
-                }
-            }
+            public void visit(int docID, byte[] packedValue) {}
 
             @Override
             public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
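
The second hunk empties visit(int docID, byte[] packedValue). This visitor is only ever handed to PointValues.estimatePointCount, which prunes and counts cells through compare() alone and never visits individual points, so the per-point range checks were dead code. A rough, hypothetical sketch of that traversal (the real logic lives in Lucene's BKDReader; the Cell type and the halving of crossing leaves are my simplifications):

import org.apache.lucene.index.PointValues;

// Hypothetical tree node, for illustration only.
class Cell {
    byte[] minPackedValue, maxPackedValue;
    long pointCount;
    Cell left, right;   // null for leaf cells
    boolean isLeaf() { return left == null; }
}

class EstimateSketch {
    // Assumed shape of the estimatePointCount traversal: only compare() is consulted.
    static long estimate(Cell cell, PointValues.IntersectVisitor visitor) {
        switch (visitor.compare(cell.minPackedValue, cell.maxPackedValue)) {
            case CELL_OUTSIDE_QUERY:
                return 0;                         // no point in this cell can match
            case CELL_INSIDE_QUERY:
                return cell.pointCount;           // every point matches, no visiting needed
            default:                              // CELL_CROSSES_QUERY
                return cell.isLeaf()
                    ? (cell.pointCount + 1) / 2   // approximate a crossing leaf as half full
                    : estimate(cell.left, visitor) + estimate(cell.right, visitor);
        }
    }
}

The half-a-leaf guess on crossing cells is also where the estimation error comes from, which the new tests bound below.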

server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java

+69 −32

@@ -66,8 +66,13 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -89,11 +94,14 @@
 import java.util.Collections;
 import java.util.List;
 
-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -654,7 +662,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);
 
-        final int numDocs = 1000;
+        final int numDocs = 4000;
         Directory dir = newDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
@@ -710,39 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }
 
-    public void testIndexFieldHasDuplicateData() throws IOException {
-        final int numDocs = 10000;
-        final int threshold1 = numDocs * 60 / 100;
-        final int threshold2 = numDocs * 40 / 100;
-        final int threshold3 = numDocs * 5 / 100;
-
-        final String fieldName = "duplicateField";
-        final String fieldName2 = "notMuchDuplicateField";
-        final String fieldName3 = "notDuplicateField";
-
-        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        long value, value2, value3;
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int i = 0; i < numDocs; ++i) {
-            value = i < threshold1 ? duplicateValue : i;
-            value2 = i < threshold2 ? duplicateValue : i;
-            value3 = i < threshold3 ? duplicateValue : i;
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldName, value));
-            doc.add(new LongPoint(fieldName2, value2));
-            doc.add(new LongPoint(fieldName3, value3));
-            writer.addDocument(doc);
+    public void testIndexHasDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.6);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                assertEquals(expectedMedianValue, medianValue);
+                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); // assert that the index has duplicate data
+                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+            }
         }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        assertTrue(indexFieldHasDuplicateData(reader, fieldName));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
-        reader.close();
-        dir.close();
     }
 
+    public void testIndexHasNotDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.35);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                // can't make any assertion about the values of medianValue and medianCount,
+                // as BKDReader::estimatePointCount can be really off for non-duplicate data
+                assertThat(medianCount, lessThan((long) (valuesCount/2))); // assert that the index does NOT have duplicate data
+            }
+        }
+    }
 
     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);
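
A quick sanity check on the tolerances in testIndexHasDuplicateData, using the test's own constants (the leaf arithmetic is mine, not from the source): 5000 values at 40 points per leaf give roughly 5000 / 40 = 125 leaves, and the 0.6 × 5000 = 3000 duplicated points occupy about 75 of them. Only the cells at the edges of that run cross the query range [medianValue, medianValue], so the estimate should stray from 3000 by at most a few leaves' worth of points, comfortably inside the asserted [0.75 × 3000, 1.25 × 3000] = [2250, 3750] window, while still clearing the 5000 / 2 = 2500 duplicate-data threshold.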
