Commit 9c71827

Add test for finding duplicate data in BKD-tree
This makes it possible to control the number of points in a leaf node.
1 parent e13761b commit 9c71827
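
The tests added here build the BKD tree with Lucene's BKDWriter directly instead of indexing documents through an IndexWriter; that is what makes the leaf size controllable. For orientation, here is the constructor call the tests use, annotated. The parameter names are my reading of the Lucene 8.x BKDWriter signature, so treat them as assumptions; the positional values are the tests' own.

// Sketch: the BKDWriter call used in the tests below, with assumed parameter names.
BKDWriter w = new BKDWriter(
    valuesCount,         // maxDoc
    dir,                 // tempDir for offline-sort files
    "tmp",               // tempFileNamePrefix
    1,                   // numDataDims: a single long dimension
    1,                   // numIndexDims
    8,                   // bytesPerDim = Long.BYTES
    maxPointsInLeafNode, // the leaf-size knob the commit message refers to (40 in the tests)
    1,                   // maxMBSortInHeap
    valuesCount          // totalPointCount
);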

File tree

2 files changed: +80 −56 lines

server/src/main/java/org/elasticsearch/search/query/QueryPhase.java

+11 −24

@@ -487,40 +487,36 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
         long globalMedianCount = 0;
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
+            if (pointValues == null) continue;
             int docCount = pointValues.getDocCount();
             if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
                 continue;
             }
             globalDocCount += docCount;
-            byte[] minValueAsBytes = pointValues.getMinPackedValue();
-            byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
-            long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
-            long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
-            long medianCount = estimateMedianCount(pointValues, minValue, maxValue, docCount/2);
+            long medianValue = estimateMedianValue(pointValues);
+            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
             globalMedianCount += medianCount;
         }
         return (globalMedianCount >= globalDocCount/2);
     }
 
-    private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
+    static long estimateMedianValue(PointValues pointValues) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
         while (minValue < maxValue) {
             long avgValue = Math.floorDiv(minValue + maxValue, 2);
             long countLeft = estimatePointCount(pointValues, minValue, avgValue);
-            if (countLeft >= threshold) {
+            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+            if (countLeft >= countRight) {
                 maxValue = avgValue;
-                threshold = countLeft/2;
             } else {
-                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
                 minValue = avgValue + 1;
-                threshold = countRight/2;
             }
         }
-        // maxValue is the approximate median value, estimate its count
-        long medianCount = estimatePointCount(pointValues, maxValue, maxValue);
-        return medianCount;
+        return maxValue;
     }
 
-    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
         final byte[] maxValueAsBytes = new byte[Long.BYTES];
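
The hunk above replaces the threshold-based search with a plain bisection: at each step the value range is halved and the half that the tree estimates to hold more points is kept, so the loop converges on an approximate median; a field is then treated as duplicate-heavy when that single value accounts for at least half of all points. A self-contained sketch of the same loop, with an exact range count over a sorted array standing in for the PointValues-based estimatePointCount (the class, helper names, and example data are mine):

import java.util.Arrays;

// Standalone illustration, not Elasticsearch code: the same bisection as
// estimateMedianValue above, but with exact counts instead of BKD estimates.
public class MedianBisection {

    // Counts values v with min <= v <= max; plays the role of estimatePointCount.
    static long countInRange(long[] sorted, long min, long max) {
        return lowerBound(sorted, max + 1) - lowerBound(sorted, min);
    }

    // Index of the first element >= key.
    static int lowerBound(long[] sorted, long key) {
        int i = Arrays.binarySearch(sorted, key);
        if (i < 0) return -i - 1;
        while (i > 0 && sorted[i - 1] == key) i--;
        return i;
    }

    static long estimateMedianValue(long[] sorted) {
        long minValue = sorted[0];
        long maxValue = sorted[sorted.length - 1];
        while (minValue < maxValue) {
            long avgValue = Math.floorDiv(minValue + maxValue, 2);
            long countLeft = countInRange(sorted, minValue, avgValue);
            long countRight = countInRange(sorted, avgValue + 1, maxValue);
            if (countLeft >= countRight) {
                maxValue = avgValue;      // the median is in the left half
            } else {
                minValue = avgValue + 1;  // the median is in the right half
            }
        }
        return maxValue;
    }

    public static void main(String[] args) {
        long[] values = {3, 7, 7, 7, 7, 7, 9, 15, 42, 100};
        long median = estimateMedianValue(values);
        long medianCount = countInRange(values, median, median);
        System.out.println("median=" + median + ", count=" + medianCount); // median=7, count=5
        // 5 of 10 points carry the median value, so the duplicate-data check fires:
        System.out.println("hasDuplicateData=" + (medianCount >= values.length / 2));
    }
}

With estimates instead of exact counts the loop still terminates (the range shrinks every iteration) but may land near, rather than exactly on, the median; that is why the production check only asks whether the estimated count at that value reaches half of all points.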
@@ -534,16 +530,7 @@ public void grow(int count) {}
             public void visit(int docID) {}
 
             @Override
-            public void visit(int docID, byte[] packedValue) {
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
-                    // Doc's value is too low, in this dimension
-                    return;
-                }
-                if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
-                    // Doc's value is too high, in this dimension
-                    return;
-                }
-            }
+            public void visit(int docID, byte[] packedValue) {}
 
             @Override
             public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
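
The second hunk empties visit(int docID, byte[] packedValue). This visitor is only ever handed to PointValues.estimatePointCount, which prunes and counts cells through compare() alone and never visits individual points, so the per-point range checks were dead code. A rough, hypothetical sketch of that traversal (the real logic lives in Lucene's BKDReader; the Cell type and the halving of crossing leaves are my simplifications):

import org.apache.lucene.index.PointValues;

// Hypothetical tree node, for illustration only.
class Cell {
    byte[] minPackedValue, maxPackedValue;
    long pointCount;
    Cell left, right;   // null for leaf cells
    boolean isLeaf() { return left == null; }
}

class EstimateSketch {
    // Assumed shape of the estimatePointCount traversal: only compare() is consulted.
    static long estimate(Cell cell, PointValues.IntersectVisitor visitor) {
        switch (visitor.compare(cell.minPackedValue, cell.maxPackedValue)) {
            case CELL_OUTSIDE_QUERY:
                return 0;                         // no point in this cell can match
            case CELL_INSIDE_QUERY:
                return cell.pointCount;           // every point matches, no visiting needed
            default:                              // CELL_CROSSES_QUERY
                return cell.isLeaf()
                    ? (cell.pointCount + 1) / 2   // approximate a crossing leaf as half full
                    : estimate(cell.left, visitor) + estimate(cell.right, visitor);
        }
    }
}

The half-a-leaf guess on crossing cells is also where the estimation error comes from, which the new tests bound below.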

server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java

+69 −32

@@ -66,8 +66,13 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -89,11 +94,14 @@
 import java.util.Collections;
 import java.util.List;
 
-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -654,7 +662,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);
 
-        final int numDocs = 1000;
+        final int numDocs = 4000;
         Directory dir = newDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
@@ -710,39 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }
 
-    public void testIndexFieldHasDuplicateData() throws IOException {
-        final int numDocs = 10000;
-        final int threshold1 = numDocs * 60 / 100;
-        final int threshold2 = numDocs * 40 / 100;
-        final int threshold3 = numDocs * 5 / 100;
-
-        final String fieldName = "duplicateField";
-        final String fieldName2 = "notMuchDuplicateField";
-        final String fieldName3 = "notDuplicateField";
-
-        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        long value, value2, value3;
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int i = 0; i < numDocs; ++i) {
-            value = i < threshold1 ? duplicateValue : i;
-            value2 = i < threshold2 ? duplicateValue : i;
-            value3 = i < threshold3 ? duplicateValue : i;
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldName, value));
-            doc.add(new LongPoint(fieldName2, value2));
-            doc.add(new LongPoint(fieldName3, value3));
-            writer.addDocument(doc);
+    public void testIndexHasDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.6);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                assertEquals(expectedMedianValue, medianValue);
+                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); // assert that the index has duplicate data
+                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+            }
         }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        assertTrue(indexFieldHasDuplicateData(reader, fieldName));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
-        reader.close();
-        dir.close();
     }
 
+    public void testIndexHasNotDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long)(valuesCount * 0.35);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                // can't make any assertion about the values of medianValue and medianCount,
+                // as BKDReader::estimatePointCount can be really off for non-duplicate data
+                assertThat(medianCount, lessThan((long) (valuesCount/2))); // assert that the index does NOT have duplicate data
+            }
+        }
+    }
 
     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);
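
A quick sanity check on the tolerances in testIndexHasDuplicateData, using the test's own constants (the leaf arithmetic is mine, not from the source): 5000 values at 40 points per leaf give roughly 5000 / 40 = 125 leaves, and the 0.6 × 5000 = 3000 duplicated points occupy about 75 of them. Only the cells at the edges of that run cross the query range [medianValue, medianValue], so the estimate should stray from 3000 by at most a few leaves' worth of points, comfortably inside the asserted [0.75 × 3000, 1.25 × 3000] = [2250, 3750] window, while still clearing the 5000 / 2 = 2500 duplicate-data threshold.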
