@@ -66,8 +66,13 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -89,11 +94,14 @@
 import java.util.Collections;
 import java.util.List;
 
-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -654,7 +662,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
         when(searchContext.mapperService()).thenReturn(mapperService);
 
-        final int numDocs = 1000;
+        final int numDocs = 4000;
         Directory dir = newDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
         for (int i = 0; i < numDocs; ++i) {
@@ -710,39 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
         dir.close();
     }
 
-    public void testIndexFieldHasDuplicateData() throws IOException {
-        final int numDocs = 10000;
-        final int threshold1 = numDocs * 60 / 100;
-        final int threshold2 = numDocs * 40 / 100;
-        final int threshold3 = numDocs * 5 / 100;
-
-        final String fieldName = "duplicateField";
-        final String fieldName2 = "notMuchDuplicateField";
-        final String fieldName3 = "notDuplicateField";
-
-        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        long value, value2, value3;
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int i = 0; i < numDocs; ++i) {
-            value = i < threshold1 ? duplicateValue : i;
-            value2 = i < threshold2 ? duplicateValue : i;
-            value3 = i < threshold3 ? duplicateValue : i;
-            Document doc = new Document();
-            doc.add(new LongPoint(fieldName, value));
-            doc.add(new LongPoint(fieldName2, value2));
-            doc.add(new LongPoint(fieldName3, value3));
-            writer.addDocument(doc);
+    public void testIndexHasDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long) (valuesCount * 0.6);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                assertEquals(expectedMedianValue, medianValue);
+                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount / 2))); // assert that the index has duplicate data
+                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+            }
         }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        assertTrue(indexFieldHasDuplicateData(reader, fieldName));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
-        assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
-        reader.close();
-        dir.close();
     }
 
+    public void testIndexHasNotDuplicateData() throws IOException {
+        int valuesCount = 5000;
+        int maxPointsInLeafNode = 40;
+        long expectedMedianCount = (long) (valuesCount * 0.35);
+        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < valuesCount; docId++) {
+                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                long medianValue = estimateMedianValue(r);
+                long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+                // can't make any assertion about the values of medianValue and medianCount,
+                // as BKDReader::estimatePointCount can be really off for non-duplicate data
+                assertThat(medianCount, lessThan((long) (valuesCount / 2))); // assert that the index does NOT have duplicate data
+            }
+        }
+    }
 
     public void testMaxScoreQueryVisitor() {
         BitSetProducer producer = context -> new FixedBitSet(1);
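
Note on the helpers under test: estimateMedianValue and estimatePointCount are the QueryPhase methods that replace indexFieldHasDuplicateData in this change, and their bodies are not part of this diff. The sketch below is a hypothetical illustration of how such helpers could be written for a single-dimension long field, assuming Lucene's PointValues#estimatePointCount(IntersectVisitor) API and Java 9+ Arrays.compareUnsigned; it is not the code merged by this PR. The idea: binary-search the value range, at each step keeping the half that the BKD tree estimates to hold more points, until the range collapses to a single value, the approximate median.

    import java.io.IOException;
    import java.util.Arrays;

    import org.apache.lucene.document.LongPoint;
    import org.apache.lucene.index.PointValues;

    // Hypothetical sketch, not the exact implementation from this PR.
    final class MedianEstimation {
        private MedianEstimation() {}

        static long estimateMedianValue(PointValues pointValues) throws IOException {
            long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
            long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
            while (minValue < maxValue) {
                // overflow-safe floor midpoint of the current range
                long avgValue = (minValue & maxValue) + ((minValue ^ maxValue) >> 1);
                long countLeft = estimatePointCount(pointValues, minValue, avgValue);
                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
                if (countLeft >= countRight) {
                    maxValue = avgValue;     // median estimated to lie in the left half
                } else {
                    minValue = avgValue + 1; // median estimated to lie in the right half
                }
            }
            return maxValue;
        }

        static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
            final byte[] minBytes = new byte[Long.BYTES];
            LongPoint.encodeDimension(minValue, minBytes, 0);
            final byte[] maxBytes = new byte[Long.BYTES];
            LongPoint.encodeDimension(maxValue, maxBytes, 0);
            // estimatePointCount(visitor) only calls compare() on tree cells;
            // the visit() callbacks are never invoked, so they stay empty.
            PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
                @Override
                public void visit(int docID) {}

                @Override
                public void visit(int docID, byte[] packedValue) {}

                @Override
                public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
                    // LongPoint uses a sortable encoding, so unsigned byte order matches long order
                    if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0
                            || Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0) {
                        return PointValues.Relation.CELL_OUTSIDE_QUERY;
                    }
                    if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minBytes, 0, Long.BYTES) < 0
                            || Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxBytes, 0, Long.BYTES) > 0) {
                        return PointValues.Relation.CELL_CROSSES_QUERY;
                    }
                    return PointValues.Relation.CELL_INSIDE_QUERY;
                }
            };
            return pointValues.estimatePointCount(visitor);
        }
    }

Under those assumptions, the duplicate-data check the tests encode reduces to estimatePointCount(reader, median, median) >= valuesCount / 2: testIndexHasDuplicateData writes 60% duplicates and expects the estimate to clear that bar, within a 0.75x to 1.25x tolerance because BKD counts are estimated at leaf granularity (maxPointsInLeafNode = 40), while testIndexHasNotDuplicateData writes 35% duplicates and expects the estimate to stay below it.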