From e496e0d93ef925b17e70ecdda1531d4087cb632f Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 21 Mar 2025 13:57:45 +0100 Subject: [PATCH 01/43] First step optimizing tsdb doc values codec merging. The doc values codec iterates several times over the doc values instance that needs to be written to disk. When merging with index sorting enabled, this is much more expensive, because each time the doc values instance is iterated an expensive doc id sort is performed (in order to return the doc ids in index sort order). There are several reasons why the doc values instance is iterated multiple times: * To compute the stats (number of values, number of docs with a value) required for writing values to disk. * To write the bitset that indicates which documents have a value (indexed DISI, jump table). * To write the actual values to disk. * To write the addresses to disk (in case docs have multiple values). This applies to numeric doc values, but also to the ordinals of sorted (set) doc values. This PR addresses the first reason the doc values instance needs to be iterated. The optimization is applied only when merging, and only when the segments being merged also use the es87 doc values format, have the same codec version, and contain no deletes. 
--- .../tsdb/TSDBDocValuesMergeBenchmark.java | 176 +++++++++++++ .../codec/tsdb/DocValuesConsumerUtil.java | 248 ++++++++++++++++++ .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 192 +++++++++++++- .../codec/tsdb/ES87TSDBDocValuesFormat.java | 29 +- .../codec/tsdb/ES87TSDBDocValuesProducer.java | 18 +- ...ValuesFormatVariableSkipIntervalTests.java | 4 +- 6 files changed, 648 insertions(+), 19 deletions(-) create mode 100644 benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java new file mode 100644 index 0000000000000..86ffa07368cbe --- /dev/null +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -0,0 +1,176 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.benchmark.index.codec.tsdb; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.profile.AsyncProfiler; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.Random; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.SampleTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) 
+@State(Scope.Benchmark) +@Fork(1) +@Threads(1) +@Warmup(iterations = 0) +@Measurement(iterations = 1) +public class TSDBDocValuesMergeBenchmark { + + @Param("13431204") + private int nDocs; + + @Param("1000") + private int deltaTime; + + @Param("42") + private int seed; + + private static final String TIMESTAMP_FIELD = "@timestamp"; + private static final String HOSTNAME_FIELD = "host.name"; + private static final long BASE_TIMESTAMP = 1704067200000L; + + private IndexWriter indexWriterWithoutOptimizedMerge; + private IndexWriter indexWriterWithOptimizedMerge; + private ExecutorService executorService; + + public static void main(String[] args) throws RunnerException { + final Options options = new OptionsBuilder().include(TSDBDocValuesMergeBenchmark.class.getSimpleName()) + .addProfiler(AsyncProfiler.class) + .build(); + + new Runner(options).run(); + } + + @Setup(Level.Trial) + public void setup() throws IOException { + executorService = Executors.newSingleThreadExecutor(); + + final Directory tempDirectoryWithoutDocValuesSkipper = FSDirectory.open(Files.createTempDirectory("temp1-")); + final Directory tempDirectoryWithDocValuesSkipper = FSDirectory.open(Files.createTempDirectory("temp2-")); + + indexWriterWithoutOptimizedMerge = createIndex(tempDirectoryWithoutDocValuesSkipper, false); + indexWriterWithOptimizedMerge = createIndex(tempDirectoryWithDocValuesSkipper, true); + } + + private IndexWriter createIndex(final Directory directory, final boolean optimizedMergeEnabled) + throws IOException { + + final IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); + // NOTE: index sort config matching LogsDB's sort order + config.setIndexSort( + new Sort( + new SortField(HOSTNAME_FIELD, SortField.Type.STRING, false), + new SortedNumericSortField(TIMESTAMP_FIELD, SortField.Type.LONG, true) + ) + ); + ES87TSDBDocValuesFormat docValuesFormat = new ES87TSDBDocValuesFormat(4096, optimizedMergeEnabled); + config.setCodec(new Lucene101Codec() { + + 
@Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return docValuesFormat; + } + }); + + long counter1 = 0; + long counter2 = 10_000_000; + long[] gauge1Values = new long[] {2, 4, 6, 8, 10, 12, 14, 16}; + long[] gauge2Values = new long[] {-2, -4, -6, -8, -10, -12, -14, -16}; + int numHosts = 1000; + + final Random random = new Random(seed); + IndexWriter indexWriter = new IndexWriter(directory, config); + for (int i = 0; i < nDocs; i++) { + final Document doc = new Document(); + + final int batchIndex = i / numHosts; + final String hostName = "host-" + batchIndex; + // Slightly vary the timestamp in each document + final long timestamp = BASE_TIMESTAMP + ((i % numHosts) * deltaTime) + random.nextInt(0, deltaTime); + + doc.add(new SortedDocValuesField(HOSTNAME_FIELD, new BytesRef(hostName))); + doc.add(new SortedNumericDocValuesField(TIMESTAMP_FIELD, timestamp)); + doc.add(new SortedNumericDocValuesField("counter_1", counter1++)); + doc.add(new SortedNumericDocValuesField("counter_2", counter2++)); + doc.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); + doc.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); + + indexWriter.addDocument(doc); + } + indexWriter.commit(); + return indexWriter; + } + + @Benchmark + public void forceMergeWithoutOptimizedMerge() throws IOException { + forceMerge(indexWriterWithoutOptimizedMerge); + } + + @Benchmark + public void forceMergeWithOptimizedMerge() throws IOException { + forceMerge(indexWriterWithOptimizedMerge); + } + + private void forceMerge(final IndexWriter indexWriter) throws IOException { + indexWriter.forceMerge(1); + } + + @TearDown(Level.Trial) + public void tearDown() { + if (executorService != null) { + executorService.shutdown(); + try { + if (executorService.awaitTermination(30, TimeUnit.SECONDS) == false) { + executorService.shutdownNow(); + } + } catch (InterruptedException e) { + 
executorService.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java new file mode 100644 index 0000000000000..6935d45a62c09 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -0,0 +1,248 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.tsdb; + +import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; +import java.util.List; + +class DocValuesConsumerUtil { + + static SortedNumericDocValues mergeSortedNumericValues(List subs, boolean indexIsSorted) + throws IOException { + long cost = 0; + for (SortedNumericDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + + return new SortedNumericDocValues() { + private int docID = -1; + private SortedNumericDocValuesSub current; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = 
current.mappedDocID; + } + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public long nextValue() throws IOException { + return current.values.nextValue(); + } + + @Override + public int docValueCount() { + return current.values.docValueCount(); + } + + }; + } + + static class SortedNumericDocValuesSub extends DocIDMerger.Sub { + + final SortedNumericDocValues values; + int docID = -1; + + SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return docID = values.nextDoc(); + } + } + + static NumericDocValues mergeNumericValues(List subs, boolean indexIsSorted) + throws IOException { + long cost = 0; + for (NumericDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + + return new NumericDocValues() { + private int docID = -1; + private NumericDocValuesSub current; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = current.mappedDocID; + } + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public long longValue() throws IOException { + return 
current.values.longValue(); + } + + }; + } + + static class NumericDocValuesSub extends DocIDMerger.Sub { + + final NumericDocValues values; + int docID = -1; + + NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return docID = values.nextDoc(); + } + } + + static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted) + throws IOException { + long cost = 0; + for (SortedDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + + return new SortedDocValues() { + private int docID = -1; + private SortedDocValuesSub current; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = current.mappedDocID; + } + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public int ordValue() throws IOException { + return current.values.ordValue(); + } + + @Override + public BytesRef lookupOrd(int ord) throws IOException { + return current.values.lookupOrd(ord); + } + + @Override + public int getValueCount() { + return current.values.getValueCount(); + } + }; + } + + static class SortedDocValuesSub extends DocIDMerger.Sub { + + final SortedDocValues values; + int docID = -1; + + SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws 
IOException { + return docID = values.nextDoc(); + } + } + +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index dc73428a07c7c..57738383db500 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -16,9 +16,11 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipIndexType; +import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.MergeState; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; @@ -45,7 +47,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.function.BiFunction; +import static org.elasticsearch.index.codec.tsdb.DocValuesConsumerUtil.mergeSortedNumericValues; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.SKIP_INDEX_LEVEL_SHIFT; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.SKIP_INDEX_MAX_LEVEL; @@ -55,11 +59,13 @@ final class ES87TSDBDocValuesConsumer extends DocValuesConsumer { IndexOutput data, meta; final int maxDoc; + final boolean enableOptimizedMerge; private byte[] termsDictBuffer; private final int skipIndexIntervalSize; ES87TSDBDocValuesConsumer( SegmentWriteState state, + boolean enableOptimizedMerge, int skipIndexIntervalSize, String dataCodec, String dataExtension, @@ -89,6 +95,7 @@ final class ES87TSDBDocValuesConsumer 
extends DocValuesConsumer { ); maxDoc = state.segmentInfo.maxDoc(); this.skipIndexIntervalSize = skipIndexIntervalSize; + this.enableOptimizedMerge = enableOptimizedMerge; success = true; } finally { if (success == false) { @@ -118,11 +125,17 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon int numDocsWithValue = 0; long numValues = 0; - SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); - for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { - numDocsWithValue++; - final int count = values.docValueCount(); - numValues += count; + SortedNumericDocValues values; + if (valuesProducer instanceof TsdbDocValuesProducer tsdbDocValuesProducer) { + numDocsWithValue = tsdbDocValuesProducer.mergeStats.sumNumDocsWithField; + numValues = tsdbDocValuesProducer.mergeStats.sumNumValues; + } else { + values = valuesProducer.getSortedNumeric(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + numDocsWithValue++; + final int count = values.docValueCount(); + numValues += count; + } } if (numDocsWithValue == 0) { // meta[-2, 0]: No documents with values @@ -209,6 +222,42 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon return new long[] { numDocsWithValue, numValues }; } + @Override + public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { + var entry = es87TSDBDocValuesProducer.numerics.get(s); + return new FieldEntry(entry.docsWithFieldOffset, entry.numValues, -1); + }); + if (result.supported() == false) { + super.mergeNumericField(mergeFieldInfo, mergeState); + return; + } + addNumericField(mergeFieldInfo, new TsdbDocValuesProducer(result) { + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws 
IOException { + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + NumericDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { + values = docValuesProducer.getNumeric(readerFieldInfo); + } + } + if (values != null) { + subs.add(new DocValuesConsumerUtil.NumericDocValuesSub(mergeState.docMaps[i], values)); + } + } + + return DocValuesConsumerUtil.mergeNumericValues(subs, mergeState.needsIndexSort); + } + + }); + } + @Override public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { meta.writeInt(field.number); @@ -284,6 +333,42 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th doAddSortedField(field, valuesProducer, false); } + @Override + public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { + var entry = es87TSDBDocValuesProducer.sorted.get(s); + return new FieldEntry(entry.ordsEntry.docsWithFieldOffset, entry.ordsEntry.numValues, -1); + }); + if (result.supported() == false) { + super.mergeSortedField(mergeFieldInfo, mergeState); + return; + } + doAddSortedField(mergeFieldInfo, new TsdbDocValuesProducer(result) { + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedDocValues values = null; + DocValuesProducer 
docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { + values = docValuesProducer.getSorted(readerFieldInfo); + } + } + if (values != null) { + subs.add(new DocValuesConsumerUtil.SortedDocValuesSub(mergeState.docMaps[i], values)); + } + } + + return DocValuesConsumerUtil.mergeSortedValues(subs, mergeState.needsIndexSort); + } + + }, false); + } + private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer, boolean addTypeByte) throws IOException { DocValuesProducer producer = new EmptyDocValuesProducer() { @Override @@ -519,6 +604,42 @@ private void writeSortedNumericField(FieldInfo field, DocValuesProducer valuesPr } } + @Override + public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { + var entry = es87TSDBDocValuesProducer.sortedNumerics.get(s); + return new FieldEntry(entry.docsWithFieldOffset, entry.numValues, entry.numDocsWithField); + }); + if (result.supported() == false) { + super.mergeSortedNumericField(mergeFieldInfo, mergeState); + return; + } + addSortedNumericField(mergeFieldInfo, new TsdbDocValuesProducer(result) { + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedNumericDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if 
(readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { + values = docValuesProducer.getSortedNumeric(readerFieldInfo); + } + } + if (values != null) { + subs.add(new DocValuesConsumerUtil.SortedNumericDocValuesSub(mergeState.docMaps[i], values)); + } + } + + return mergeSortedNumericValues(subs, mergeState.needsIndexSort); + } + + }); + } + private static boolean isSingleValued(SortedSetDocValues values) throws IOException { if (DocValues.unwrapSingleton(values) != null) { return true; @@ -785,4 +906,65 @@ private static int getLevels(int index, int size) { return 1; } + abstract static class TsdbDocValuesProducer extends EmptyDocValuesProducer { + + private final MergeStats mergeStats; + + TsdbDocValuesProducer(MergeStats mergeStats) { + this.mergeStats = mergeStats; + } + + } + + record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} + + record FieldEntry(long docsWithFieldOffset, long numValues, int numDocsWithField) {} + + static MergeStats compatibleWithOptimizedMerge( + boolean optimizedMergeEnabled, + FieldInfo mergeFieldInfo, + MergeState mergeState, + BiFunction function + ) { + if (optimizedMergeEnabled == false + || mergeState.needsIndexSort == false + || mergeFieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { + return new MergeStats(false, -1, -1); + } + + long sumNumValues = 0; + int sumNumDocsWithField = 0; + + for (DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { + if (docValuesProducer instanceof ES87TSDBDocValuesProducer tsdbProducer) { + if (tsdbProducer.version != ES87TSDBDocValuesFormat.VERSION_CURRENT) { + return new MergeStats(false, -1, -1); + } + + var entry = function.apply(tsdbProducer, mergeFieldInfo.name); + assert entry != null; + // TODO: support also fields with offsets + if (entry.docsWithFieldOffset != -1) { + return new MergeStats(false, -1, -1); + } + sumNumValues += entry.numValues; + sumNumDocsWithField += 
entry.numDocsWithField; + } else { + return new MergeStats(false, -1, -1); + } + } + + if (Math.toIntExact(sumNumValues) != sumNumDocsWithField) { + return new MergeStats(false, -1, -1); + } + // Documents marked as deleted should be rare. Maybe in the case of noop operation? + for (int i = 0; i < mergeState.liveDocs.length; i++) { + if (mergeState.liveDocs[i] != null) { + return new MergeStats(false, -1, -1); + } + } + + return new MergeStats(true, sumNumValues, sumNumDocsWithField); + } + } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java index 496c41b42869a..a2b5a77bbd0ac 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java @@ -13,6 +13,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.elasticsearch.common.util.FeatureFlag; import java.io.IOException; @@ -75,25 +76,47 @@ public class ES87TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesF } } + // Default for escape hatch: + static final boolean OPTIMIZED_MERGE_ENABLE_DEFAULT; + static final FeatureFlag TSDB_DOC_VALUES_OPTIMIZED_MERGE = new FeatureFlag("tsdb_doc_values_optimized_merge"); + static final String OPTIMIZED_MERGE_ENABLED_NAME = ES87TSDBDocValuesConsumer.class.getName() + ".enableOptimizedMerge"; + + static { + boolean optimizedMergeDefault = TSDB_DOC_VALUES_OPTIMIZED_MERGE.isEnabled(); + OPTIMIZED_MERGE_ENABLE_DEFAULT = Boolean.parseBoolean( + System.getProperty(OPTIMIZED_MERGE_ENABLED_NAME, Boolean.toString(optimizedMergeDefault)) + ); + } + private final int skipIndexIntervalSize; + private final boolean enableOptimizedMerge; /** Default constructor. 
*/ public ES87TSDBDocValuesFormat() { - this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE); + this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, OPTIMIZED_MERGE_ENABLE_DEFAULT); } /** Doc values fields format with specified skipIndexIntervalSize. */ - public ES87TSDBDocValuesFormat(int skipIndexIntervalSize) { + public ES87TSDBDocValuesFormat(int skipIndexIntervalSize, boolean enableOptimizedMerge) { super(CODEC_NAME); if (skipIndexIntervalSize < 2) { throw new IllegalArgumentException("skipIndexIntervalSize must be > 1, got [" + skipIndexIntervalSize + "]"); } this.skipIndexIntervalSize = skipIndexIntervalSize; + this.enableOptimizedMerge = enableOptimizedMerge; } @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return new ES87TSDBDocValuesConsumer(state, skipIndexIntervalSize, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new ES87TSDBDocValuesConsumer( + state, + enableOptimizedMerge, + skipIndexIntervalSize, + DATA_CODEC, + DATA_EXTENSION, + META_CODEC, + META_EXTENSION + ); } @Override diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java index 8a8095ecf6d21..54c7d83421a36 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java @@ -51,15 +51,15 @@ import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT; public class ES87TSDBDocValuesProducer extends DocValuesProducer { - private final Map numerics; + final Map numerics; private final Map binaries; - private final Map sorted; - private final Map sortedSets; - private final Map sortedNumerics; + final Map sorted; + final Map sortedSets; + final Map sortedNumerics; private final Map skippers; private final IndexInput data; private final int maxDoc; - private 
final int version; + final int version; private final boolean merging; ES87TSDBDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) @@ -1416,7 +1416,7 @@ private void set() { private record DocValuesSkipperEntry(long offset, long length, long minValue, long maxValue, int docCount, int maxDocId) {} - private static class NumericEntry { + static class NumericEntry { long docsWithFieldOffset; long docsWithFieldLength; short jumpTableEntryCount; @@ -1444,19 +1444,19 @@ private static class BinaryEntry { DirectMonotonicReader.Meta addressesMeta; } - private static class SortedNumericEntry extends NumericEntry { + static class SortedNumericEntry extends NumericEntry { int numDocsWithField; DirectMonotonicReader.Meta addressesMeta; long addressesOffset; long addressesLength; } - private static class SortedEntry { + static class SortedEntry { NumericEntry ordsEntry; TermsDictEntry termsDictEntry; } - private static class SortedSetEntry { + static class SortedSetEntry { SortedEntry singleValueEntry; SortedNumericEntry ordsEntry; TermsDictEntry termsDictEntry; diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java index 099b59808ef4a..8a4a5c59d1a9c 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java @@ -32,13 +32,13 @@ public class ES87TSDBDocValuesFormatVariableSkipIntervalTests extends BaseDocVal @Override protected Codec getCodec() { // small interval size to test with many intervals - return TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(random().nextInt(4, 16))); + return TestUtil.alwaysDocValuesFormat(new 
ES87TSDBDocValuesFormat(random().nextInt(4, 16), false)); } public void testSkipIndexIntervalSize() { IllegalArgumentException ex = expectThrows( IllegalArgumentException.class, - () -> new ES87TSDBDocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2)) + () -> new ES87TSDBDocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2), false) ); assertTrue(ex.getMessage().contains("skipIndexIntervalSize must be > 1")); } From 9bd2907813a19c3ffb34c2e30f34141cb74d158c Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 21 Mar 2025 13:10:43 +0000 Subject: [PATCH 02/43] [CI] Auto commit changes from spotless --- .../index/codec/tsdb/TSDBDocValuesMergeBenchmark.java | 7 +++---- .../index/codec/tsdb/DocValuesConsumerUtil.java | 9 +++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index 86ffa07368cbe..b2a1ea492fced 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -96,8 +96,7 @@ public void setup() throws IOException { indexWriterWithOptimizedMerge = createIndex(tempDirectoryWithDocValuesSkipper, true); } - private IndexWriter createIndex(final Directory directory, final boolean optimizedMergeEnabled) - throws IOException { + private IndexWriter createIndex(final Directory directory, final boolean optimizedMergeEnabled) throws IOException { final IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); // NOTE: index sort config matching LogsDB's sort order @@ -118,8 +117,8 @@ public DocValuesFormat getDocValuesFormatForField(String field) { long counter1 = 0; long counter2 = 10_000_000; - long[] gauge1Values = new long[] {2, 4, 6, 8, 10, 12, 14, 16}; 
- long[] gauge2Values = new long[] {-2, -4, -6, -8, -10, -12, -14, -16}; + long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; + long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; int numHosts = 1000; final Random random = new Random(seed); diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index 6935d45a62c09..ee3afc76a3aa9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -21,8 +21,7 @@ class DocValuesConsumerUtil { - static SortedNumericDocValues mergeSortedNumericValues(List subs, boolean indexIsSorted) - throws IOException { + static SortedNumericDocValues mergeSortedNumericValues(List subs, boolean indexIsSorted) throws IOException { long cost = 0; for (SortedNumericDocValuesSub sub : subs) { cost += sub.values.cost(); @@ -96,8 +95,7 @@ public int nextDoc() throws IOException { } } - static NumericDocValues mergeNumericValues(List subs, boolean indexIsSorted) - throws IOException { + static NumericDocValues mergeNumericValues(List subs, boolean indexIsSorted) throws IOException { long cost = 0; for (NumericDocValuesSub sub : subs) { cost += sub.values.cost(); @@ -166,8 +164,7 @@ public int nextDoc() throws IOException { } } - static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted) - throws IOException { + static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted) throws IOException { long cost = 0; for (SortedDocValuesSub sub : subs) { cost += sub.values.cost(); From 65d97e53a2c104b0b5469b7ef94c5e5ddf2f4d69 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 21 Mar 2025 16:37:45 +0100 Subject: [PATCH 03/43] actually use OrdinalMap when merging sorted and sorted dv fixed sorted set dv added unit test with index 
sorting --- .../codec/tsdb/DocValuesConsumerUtil.java | 281 +++++++++++++++++- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 143 +++++---- .../tsdb/ES87TSDBDocValuesFormatTests.java | 61 ++++ 3 files changed, 406 insertions(+), 79 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index ee3afc76a3aa9..be078fa61d633 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -9,18 +9,98 @@ package org.elasticsearch.index.codec.tsdb; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.DocValuesSkipIndexType; +import org.apache.lucene.index.EmptyDocValuesProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.OrdinalMap; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongValues; import java.io.IOException; import java.util.List; +import java.util.function.BiFunction; +/** + * Contains mainly forked code from {@link org.apache.lucene.codecs.DocValuesConsumer}. 
+ */ class DocValuesConsumerUtil { + static final MergeStats UNSUPPORTED = new MergeStats(false, -1, -1); + + abstract static class TsdbDocValuesProducer extends EmptyDocValuesProducer { + + final MergeStats mergeStats; + + TsdbDocValuesProducer(MergeStats mergeStats) { + this.mergeStats = mergeStats; + } + + } + + record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} + + record FieldEntry(long docsWithFieldOffset, long numValues, int numDocsWithField) {} + + static MergeStats compatibleWithOptimizedMerge( + boolean optimizedMergeEnabled, + FieldInfo mergeFieldInfo, + MergeState mergeState, + BiFunction function + ) { + if (optimizedMergeEnabled == false + || mergeState.needsIndexSort == false + || mergeFieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { + return UNSUPPORTED; + } + + long sumNumValues = 0; + int sumNumDocsWithField = 0; + + for (DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { + if (docValuesProducer instanceof ES87TSDBDocValuesProducer tsdbProducer) { + if (tsdbProducer.version != ES87TSDBDocValuesFormat.VERSION_CURRENT) { + return UNSUPPORTED; + } + + var entry = function.apply(tsdbProducer, mergeFieldInfo.name); + assert entry != null; + // TODO: support also fields with offsets + if (entry.docsWithFieldOffset != -1) { + return UNSUPPORTED; + } + sumNumValues += entry.numValues; + sumNumDocsWithField += entry.numDocsWithField; + } else { + return UNSUPPORTED; + } + } + + if (Math.toIntExact(sumNumValues) != sumNumDocsWithField) { + return UNSUPPORTED; + } + // Documents marked as deleted should be rare. Maybe in the case of noop operation? 
+ for (int i = 0; i < mergeState.liveDocs.length; i++) { + if (mergeState.liveDocs[i] != null) { + return UNSUPPORTED; + } + } + + return new MergeStats(true, sumNumValues, sumNumDocsWithField); + } + static SortedNumericDocValues mergeSortedNumericValues(List subs, boolean indexIsSorted) throws IOException { long cost = 0; for (SortedNumericDocValuesSub sub : subs) { @@ -164,7 +244,7 @@ public int nextDoc() throws IOException { } } - static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted) throws IOException { + static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted, OrdinalMap map) throws IOException { long cost = 0; for (SortedDocValuesSub sub : subs) { cost += sub.values.cost(); @@ -210,25 +290,38 @@ public long cost() { @Override public int ordValue() throws IOException { - return current.values.ordValue(); + int subOrd = current.values.ordValue(); + assert subOrd != -1; + return (int) current.map.get(subOrd); } @Override public BytesRef lookupOrd(int ord) throws IOException { - return current.values.lookupOrd(ord); + int segmentNumber = map.getFirstSegmentNumber(ord); + int segmentOrd = (int) map.getFirstSegmentOrd(ord); + return subs.get(segmentNumber).values.lookupOrd(segmentOrd); } @Override public int getValueCount() { - return current.values.getValueCount(); + return (int) map.getValueCount(); + } + + @Override + public TermsEnum termsEnum() throws IOException { + TermsEnum[] termsEnurmSubs = new TermsEnum[subs.size()]; + for (int sub = 0; sub < termsEnurmSubs.length; ++sub) { + termsEnurmSubs[sub] = subs.get(sub).values.termsEnum(); + } + return new MergedTermsEnum(map, termsEnurmSubs); } }; } static class SortedDocValuesSub extends DocIDMerger.Sub { + LongValues map; final SortedDocValues values; - int docID = -1; SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values) { super(docMap); @@ -238,7 +331,183 @@ static class SortedDocValuesSub extends DocIDMerger.Sub { @Override public int nextDoc() throws 
IOException { - return docID = values.nextDoc(); + return values.nextDoc(); + } + } + + static SortedSetDocValues mergeSortedSetValues(List subs, boolean indexIsSorted, OrdinalMap map) + throws IOException { + long cost = 0; + for (SortedSetDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + + return new SortedSetDocValues() { + private int docID = -1; + private SortedSetDocValuesSub current; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = current.mappedDocID; + } + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public long nextOrd() throws IOException { + long subOrd = current.values.nextOrd(); + return current.map.get(subOrd); + } + + @Override + public int docValueCount() { + return current.values.docValueCount(); + } + + @Override + public BytesRef lookupOrd(long ord) throws IOException { + int segmentNumber = map.getFirstSegmentNumber(ord); + int segmentOrd = (int) map.getFirstSegmentOrd(ord); + return subs.get(segmentNumber).values.lookupOrd(segmentOrd); + } + + @Override + public long getValueCount() { + return map.getValueCount(); + } + + @Override + public TermsEnum termsEnum() throws IOException { + TermsEnum[] termsEnurmSubs = new TermsEnum[subs.size()]; + for (int sub = 0; sub < termsEnurmSubs.length; ++sub) { + termsEnurmSubs[sub] = subs.get(sub).values.termsEnum(); + } + return new MergedTermsEnum(map, termsEnurmSubs); + } + }; + } + + static class SortedSetDocValuesSub extends 
DocIDMerger.Sub { + + LongValues map; + final SortedSetDocValues values; + + SortedSetDocValuesSub(MergeState.DocMap docMap, SortedSetDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + } + + static class MergedTermsEnum extends BaseTermsEnum { + + private final TermsEnum[] subs; + private final OrdinalMap ordinalMap; + private final long valueCount; + private long ord = -1; + private BytesRef term; + + MergedTermsEnum(OrdinalMap ordinalMap, TermsEnum[] subs) { + this.ordinalMap = ordinalMap; + this.subs = subs; + this.valueCount = ordinalMap.getValueCount(); + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public long ord() throws IOException { + return ord; + } + + @Override + public BytesRef next() throws IOException { + if (++ord >= valueCount) { + return null; + } + final int subNum = ordinalMap.getFirstSegmentNumber(ord); + final TermsEnum sub = subs[subNum]; + final long subOrd = ordinalMap.getFirstSegmentOrd(ord); + do { + term = sub.next(); + } while (sub.ord() < subOrd); + assert sub.ord() == subOrd; + return term; + } + + @Override + public AttributeSource attributes() { + throw new UnsupportedOperationException(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public ImpactsEnum impacts(int flags) 
throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public TermState termState() throws IOException { + throw new UnsupportedOperationException(); } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 57738383db500..82d93ff8d5c0a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.OrdinalMap; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; @@ -47,8 +48,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.function.BiFunction; +import static org.elasticsearch.index.codec.tsdb.DocValuesConsumerUtil.compatibleWithOptimizedMerge; import static org.elasticsearch.index.codec.tsdb.DocValuesConsumerUtil.mergeSortedNumericValues; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.SKIP_INDEX_LEVEL_SHIFT; @@ -126,9 +127,9 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon long numValues = 0; SortedNumericDocValues values; - if (valuesProducer instanceof TsdbDocValuesProducer tsdbDocValuesProducer) { - numDocsWithValue = tsdbDocValuesProducer.mergeStats.sumNumDocsWithField; - numValues = tsdbDocValuesProducer.mergeStats.sumNumValues; + if (valuesProducer instanceof DocValuesConsumerUtil.TsdbDocValuesProducer tsdbDocValuesProducer) { + numDocsWithValue = 
tsdbDocValuesProducer.mergeStats.sumNumDocsWithField(); + numValues = tsdbDocValuesProducer.mergeStats.sumNumValues(); } else { values = valuesProducer.getSortedNumeric(field); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { @@ -226,13 +227,13 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { var entry = es87TSDBDocValuesProducer.numerics.get(s); - return new FieldEntry(entry.docsWithFieldOffset, entry.numValues, -1); + return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, entry.numValues, -1); }); if (result.supported() == false) { super.mergeNumericField(mergeFieldInfo, mergeState); return; } - addNumericField(mergeFieldInfo, new TsdbDocValuesProducer(result) { + addNumericField(mergeFieldInfo, new DocValuesConsumerUtil.TsdbDocValuesProducer(result) { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { @@ -337,18 +338,21 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { var entry = es87TSDBDocValuesProducer.sorted.get(s); - return new FieldEntry(entry.ordsEntry.docsWithFieldOffset, entry.ordsEntry.numValues, -1); + return new DocValuesConsumerUtil.FieldEntry(entry.ordsEntry.docsWithFieldOffset, entry.ordsEntry.numValues, -1); }); if (result.supported() == false) { super.mergeSortedField(mergeFieldInfo, mergeState); return; } - doAddSortedField(mergeFieldInfo, new TsdbDocValuesProducer(result) { + addSortedField(mergeFieldInfo, new 
DocValuesConsumerUtil.TsdbDocValuesProducer(result) { @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { List subs = new ArrayList<>(); assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + + TermsEnum[] liveTerms = new TermsEnum[subs.size()]; + long[] weights = new long[liveTerms.length]; for (int i = 0; i < mergeState.docValuesProducers.length; i++) { SortedDocValues values = null; DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; @@ -363,10 +367,14 @@ public SortedDocValues getSorted(FieldInfo field) throws IOException { } } - return DocValuesConsumerUtil.mergeSortedValues(subs, mergeState.needsIndexSort); + final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); + for (int i = 0; i < subs.size(); i++) { + subs.get(i).map = map.getGlobalOrds(i); + } + return DocValuesConsumerUtil.mergeSortedValues(subs, mergeState.needsIndexSort, map); } - }, false); + }); } private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer, boolean addTypeByte) throws IOException { @@ -608,13 +616,13 @@ private void writeSortedNumericField(FieldInfo field, DocValuesProducer valuesPr public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { var entry = es87TSDBDocValuesProducer.sortedNumerics.get(s); - return new FieldEntry(entry.docsWithFieldOffset, entry.numValues, entry.numDocsWithField); + return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, entry.numValues, entry.numDocsWithField); }); if (result.supported() == false) { super.mergeSortedNumericField(mergeFieldInfo, mergeState); return; } - addSortedNumericField(mergeFieldInfo, new TsdbDocValuesProducer(result) { + addSortedNumericField(mergeFieldInfo, new DocValuesConsumerUtil.TsdbDocValuesProducer(result) { @Override 
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { @@ -656,6 +664,56 @@ private static boolean isSingleValued(SortedSetDocValues values) throws IOExcept return true; } + @Override + public void mergeSortedSetField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { + var entry = es87TSDBDocValuesProducer.sortedSets.get(s); + return new DocValuesConsumerUtil.FieldEntry( + entry.ordsEntry.docsWithFieldOffset, + entry.ordsEntry.numValues, + entry.ordsEntry.numDocsWithField + ); + }); + if (result.supported() == false) { + super.mergeSortedSetField(mergeFieldInfo, mergeState); + return; + } + addSortedSetField(mergeFieldInfo, new DocValuesConsumerUtil.TsdbDocValuesProducer(result) { + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + + TermsEnum[] liveTerms = new TermsEnum[subs.size()]; + long[] weights = new long[liveTerms.length]; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedSetDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { + values = docValuesProducer.getSortedSet(readerFieldInfo); + } + liveTerms[i] = values.termsEnum(); + weights[i] = values.getValueCount(); + } + if (values != null) { + subs.add(new DocValuesConsumerUtil.SortedSetDocValuesSub(mergeState.docMaps[i], values)); + } + } + + final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); + for (int i = 0; i < subs.size(); i++) { + subs.get(i).map = 
map.getGlobalOrds(i); + } + + return DocValuesConsumerUtil.mergeSortedSetValues(subs, mergeState.needsIndexSort, map); + } + + }); + } + @Override public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { meta.writeInt(field.number); @@ -906,65 +964,4 @@ private static int getLevels(int index, int size) { return 1; } - abstract static class TsdbDocValuesProducer extends EmptyDocValuesProducer { - - private final MergeStats mergeStats; - - TsdbDocValuesProducer(MergeStats mergeStats) { - this.mergeStats = mergeStats; - } - - } - - record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} - - record FieldEntry(long docsWithFieldOffset, long numValues, int numDocsWithField) {} - - static MergeStats compatibleWithOptimizedMerge( - boolean optimizedMergeEnabled, - FieldInfo mergeFieldInfo, - MergeState mergeState, - BiFunction function - ) { - if (optimizedMergeEnabled == false - || mergeState.needsIndexSort == false - || mergeFieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { - return new MergeStats(false, -1, -1); - } - - long sumNumValues = 0; - int sumNumDocsWithField = 0; - - for (DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { - if (docValuesProducer instanceof ES87TSDBDocValuesProducer tsdbProducer) { - if (tsdbProducer.version != ES87TSDBDocValuesFormat.VERSION_CURRENT) { - return new MergeStats(false, -1, -1); - } - - var entry = function.apply(tsdbProducer, mergeFieldInfo.name); - assert entry != null; - // TODO: support also fields with offsets - if (entry.docsWithFieldOffset != -1) { - return new MergeStats(false, -1, -1); - } - sumNumValues += entry.numValues; - sumNumDocsWithField += entry.numDocsWithField; - } else { - return new MergeStats(false, -1, -1); - } - } - - if (Math.toIntExact(sumNumValues) != sumNumDocsWithField) { - return new MergeStats(false, -1, -1); - } - // Documents marked as deleted should be rare. 
Maybe in the case of noop operation? - for (int i = 0; i < mergeState.liveDocs.length; i++) { - if (mergeState.liveDocs[i] != null) { - return new MergeStats(false, -1, -1); - } - } - - return new MergeStats(true, sumNumValues, sumNumDocsWithField); - } - } diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 12a17f5c263a8..481d541515e92 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -13,6 +13,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; @@ -27,6 +28,9 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFields; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase; @@ -59,6 +63,7 @@ public void testSortedDocValuesSingleUniqueValue() throws IOException { try (Directory directory = newDirectory()) { Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setCodec(getCodec()); conf.setMergePolicy(newLogMergePolicy()); try (RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf)) { for (int i = 0; i < NUM_DOCS; i++) { @@ -95,6 +100,7 @@ public void 
testSortedSetDocValuesSingleUniqueValue() throws IOException { try (Directory directory = newDirectory()) { Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setCodec(getCodec()); conf.setMergePolicy(newLogMergePolicy()); try (RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf)) { for (int i = 0; i < NUM_DOCS; i++) { @@ -132,6 +138,7 @@ public void testSortedSetDocValuesSingleUniqueValue() throws IOException { public void testOneDocManyValues() throws Exception { IndexWriterConfig config = new IndexWriterConfig(); + config.setCodec(getCodec()); try (Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, config)) { int numValues = 128 + random().nextInt(1024); // > 2^7 to require two blocks Document d = new Document(); @@ -159,6 +166,7 @@ public void testManyDocsWithManyValues() throws Exception { final Map sortedNumbers = new HashMap<>(); // key -> numbers try (Directory directory = newDirectory()) { IndexWriterConfig conf = newIndexWriterConfig(); + conf.setCodec(getCodec()); try (RandomIndexWriter writer = new RandomIndexWriter(random(), directory, conf)) { for (int i = 0; i < numDocs; i++) { Document doc = new Document(); @@ -242,4 +250,57 @@ public void testManyDocsWithManyValues() throws Exception { } } } + + public void testForceMerge() throws Exception { + String timestampField = "@timestamp"; + String hostnameField = "host.name"; + long baseTimestamp = 1704067200000L; + + IndexWriterConfig config = new IndexWriterConfig(); + config.setIndexSort( + new Sort( + new SortField(hostnameField, SortField.Type.STRING, false), + new SortedNumericSortField(timestampField, SortField.Type.LONG, true) + ) + ); + config.setCodec(getCodec()); + try (Directory dir = newDirectory(); IndexWriter iw = new IndexWriter(dir, config)) { + long counter1 = 0; + long counter2 = 10_000_000; + long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; + long[] gauge2Values = 
new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; + int numHosts = 1000; + + int numDocs = 256 + random().nextInt(1024); + for (int i = 0; i < numDocs; i++) { + Document d = new Document(); + + final int batchIndex = i / numHosts; + final String hostName = "host-" + batchIndex; + final long timestamp = baseTimestamp + (1000L * i); + + d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName))); + d.add(new NumericDocValuesField(timestampField, timestamp)); + d.add(new SortedNumericDocValuesField("counter_1", counter1++)); + d.add(new SortedNumericDocValuesField("counter_2", counter2++)); + d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); + d.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); + + iw.addDocument(d); + } + + iw.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(iw)) { + assertEquals(1, reader.leaves().size()); + assertEquals(numDocs, reader.maxDoc()); + var leaf = reader.leaves().get(0).reader(); + var numericDocValues = leaf.getNumericDocValues(timestampField); + for (int i = 0; i < numDocs; i++) { + assertEquals(i, numericDocValues.nextDoc()); + assertEquals(baseTimestamp + (1000L * i), numericDocValues.longValue()); + } + } + } + } } From 7369a22d2c17bae019b205bb0aa43e0e6947de63 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 21 Mar 2025 17:13:35 +0100 Subject: [PATCH 04/43] fix test --- .../tsdb/ES87TSDBDocValuesFormatTests.java | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 481d541515e92..099b63dda5ea8 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java 
@@ -13,7 +13,6 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; @@ -23,6 +22,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; @@ -256,7 +256,7 @@ public void testForceMerge() throws Exception { String hostnameField = "host.name"; long baseTimestamp = 1704067200000L; - IndexWriterConfig config = new IndexWriterConfig(); + var config = new IndexWriterConfig(); config.setIndexSort( new Sort( new SortField(hostnameField, SortField.Type.STRING, false), @@ -264,41 +264,49 @@ public void testForceMerge() throws Exception { ) ); config.setCodec(getCodec()); - try (Directory dir = newDirectory(); IndexWriter iw = new IndexWriter(dir, config)) { + try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) { long counter1 = 0; long counter2 = 10_000_000; long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; - int numHosts = 1000; + int numHosts = 10; int numDocs = 256 + random().nextInt(1024); for (int i = 0; i < numDocs; i++) { - Document d = new Document(); + var d = new Document(); - final int batchIndex = i / numHosts; - final String hostName = "host-" + batchIndex; - final long timestamp = baseTimestamp + (1000L * i); + int batchIndex = i / numHosts; + String hostName = String.format("host-%03d", batchIndex); + long timestamp = baseTimestamp + (1000L * i); d.add(new 
SortedDocValuesField(hostnameField, new BytesRef(hostName))); - d.add(new NumericDocValuesField(timestampField, timestamp)); + d.add(new SortedNumericDocValuesField(timestampField, timestamp)); d.add(new SortedNumericDocValuesField("counter_1", counter1++)); d.add(new SortedNumericDocValuesField("counter_2", counter2++)); d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); d.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); iw.addDocument(d); + if (i % 100 == 0) { + iw.commit(); + } } + iw.commit(); iw.forceMerge(1); - try (DirectoryReader reader = DirectoryReader.open(iw)) { + try (var reader = DirectoryReader.open(iw)) { assertEquals(1, reader.leaves().size()); assertEquals(numDocs, reader.maxDoc()); var leaf = reader.leaves().get(0).reader(); - var numericDocValues = leaf.getNumericDocValues(timestampField); + var sortedDocValues = leaf.getSortedDocValues(hostnameField); + assertNotNull(sortedDocValues); for (int i = 0; i < numDocs; i++) { - assertEquals(i, numericDocValues.nextDoc()); - assertEquals(baseTimestamp + (1000L * i), numericDocValues.longValue()); + assertEquals(i, sortedDocValues.nextDoc()); + int batchIndex = i / numHosts; + assertEquals(batchIndex, sortedDocValues.ordValue()); + String expectedHostName = String.format("host-%03d", batchIndex); + assertEquals(expectedHostName, sortedDocValues.lookupOrd(sortedDocValues.ordValue()).utf8ToString()); } } } From 3b7822d07b8f524d7033d19aeb777bd34a25052e Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 21 Mar 2025 16:20:47 +0000 Subject: [PATCH 05/43] [CI] Auto commit changes from spotless --- .../index/codec/tsdb/ES87TSDBDocValuesFormatTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 099b63dda5ea8..29083e473b4cb 100644 
--- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -22,7 +22,6 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; From ce4b326b2d4b67dbdaaf43a23bd5f1f1d07a3a93 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 21 Mar 2025 17:41:57 +0100 Subject: [PATCH 06/43] fix test (2) --- .../index/codec/tsdb/ES87TSDBDocValuesFormatTests.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 29083e473b4cb..18af7a15b3b2c 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -42,6 +42,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import static org.hamcrest.Matchers.equalTo; @@ -275,7 +276,7 @@ public void testForceMerge() throws Exception { var d = new Document(); int batchIndex = i / numHosts; - String hostName = String.format("host-%03d", batchIndex); + String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex); long timestamp = baseTimestamp + (1000L * i); d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName))); @@ -304,7 +305,7 @@ public void testForceMerge() throws Exception { assertEquals(i, sortedDocValues.nextDoc()); int batchIndex = i / numHosts; 
assertEquals(batchIndex, sortedDocValues.ordValue()); - String expectedHostName = String.format("host-%03d", batchIndex); + String expectedHostName = String.format(Locale.ROOT, "host-%03d", batchIndex); assertEquals(expectedHostName, sortedDocValues.lookupOrd(sortedDocValues.ordValue()).utf8ToString()); } } From 486ea207e740db8d4a4523eeaab02c7dc043775b Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 21 Mar 2025 19:05:01 +0100 Subject: [PATCH 07/43] fix lots of stuff --- .../tsdb/TSDBDocValuesMergeBenchmark.java | 6 ++ .../codec/tsdb/DocValuesConsumerUtil.java | 29 +++---- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 80 +++++++++++++------ .../codec/tsdb/ES87TSDBDocValuesFormat.java | 1 + .../codec/tsdb/ES87TSDBDocValuesProducer.java | 73 ++++++++++++++--- .../tsdb/ES87TSDBDocValuesFormatTests.java | 5 ++ 6 files changed, 141 insertions(+), 53 deletions(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index b2a1ea492fced..a6f3affa90f34 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -15,6 +15,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.Sort; @@ -120,6 +121,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) { long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; int numHosts = 1000; + String[]
tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; final Random random = new Random(seed); IndexWriter indexWriter = new IndexWriter(directory, config); @@ -137,6 +139,10 @@ public DocValuesFormat getDocValuesFormatForField(String field) { doc.add(new SortedNumericDocValuesField("counter_2", counter2++)); doc.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); doc.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); + int numTags = 3; + for (int j = 0; j < numTags; j++) { + doc.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j]))); + } indexWriter.addDocument(doc); } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index be078fa61d633..95dd33456abcb 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -28,10 +28,10 @@ import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; +import org.elasticsearch.core.CheckedFunction; import java.io.IOException; import java.util.List; -import java.util.function.BiFunction; /** * Contains mainly forked code from {@link org.apache.lucene.codecs.DocValuesConsumer}. 
@@ -58,8 +58,8 @@ static MergeStats compatibleWithOptimizedMerge( boolean optimizedMergeEnabled, FieldInfo mergeFieldInfo, MergeState mergeState, - BiFunction function - ) { + CheckedFunction getEntryFunction + ) throws IOException { if (optimizedMergeEnabled == false || mergeState.needsIndexSort == false || mergeFieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { @@ -70,27 +70,16 @@ static MergeStats compatibleWithOptimizedMerge( int sumNumDocsWithField = 0; for (DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { - if (docValuesProducer instanceof ES87TSDBDocValuesProducer tsdbProducer) { - if (tsdbProducer.version != ES87TSDBDocValuesFormat.VERSION_CURRENT) { - return UNSUPPORTED; - } - - var entry = function.apply(tsdbProducer, mergeFieldInfo.name); - assert entry != null; - // TODO: support also fields with offsets - if (entry.docsWithFieldOffset != -1) { - return UNSUPPORTED; - } - sumNumValues += entry.numValues; - sumNumDocsWithField += entry.numDocsWithField; - } else { + // TODO bring back codec version check? (per field doc values producer sits between ES87TSDBDocValuesConsumer) + var entry = getEntryFunction.apply(docValuesProducer); + if (entry == null) { return UNSUPPORTED; } - } - if (Math.toIntExact(sumNumValues) != sumNumDocsWithField) { - return UNSUPPORTED; + sumNumValues += entry.numValues; + sumNumDocsWithField += entry.numDocsWithField; } + // Documents marked as deleted should be rare. Maybe in the case of noop operation? 
for (int i = 0; i < mergeState.liveDocs.length; i++) { if (mergeState.liveDocs[i] != null) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 82d93ff8d5c0a..3c86e7390d29b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -225,9 +225,14 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon @Override public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { - var entry = es87TSDBDocValuesProducer.numerics.get(s); - return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, entry.numValues, -1); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (docValuesProducer) -> { + var numeric = docValuesProducer.getNumeric(mergeFieldInfo); + if (numeric instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumericDocValues) { + var entry = baseNumericDocValues.entry; + return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, entry.numValues, -1); + } else { + return null; + } }); if (result.supported() == false) { super.mergeNumericField(mergeFieldInfo, mergeState); @@ -336,9 +341,14 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th @Override public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { - var entry = es87TSDBDocValuesProducer.sorted.get(s); - return new 
DocValuesConsumerUtil.FieldEntry(entry.ordsEntry.docsWithFieldOffset, entry.ordsEntry.numValues, -1); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (docValuesProducer) -> { + var sorted = docValuesProducer.getSorted(mergeFieldInfo); + if (sorted instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSortedDocValues) { + var entry = baseSortedDocValues.entry; + return new DocValuesConsumerUtil.FieldEntry(entry.ordsEntry.docsWithFieldOffset, entry.ordsEntry.numValues, -1); + } else { + return null; + } }); if (result.supported() == false) { super.mergeSortedField(mergeFieldInfo, mergeState); @@ -351,7 +361,7 @@ public SortedDocValues getSorted(FieldInfo field) throws IOException { List subs = new ArrayList<>(); assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - TermsEnum[] liveTerms = new TermsEnum[subs.size()]; + TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; long[] weights = new long[liveTerms.length]; for (int i = 0; i < mergeState.docValuesProducers.length; i++) { SortedDocValues values = null; @@ -362,9 +372,13 @@ public SortedDocValues getSorted(FieldInfo field) throws IOException { values = docValuesProducer.getSorted(readerFieldInfo); } } - if (values != null) { - subs.add(new DocValuesConsumerUtil.SortedDocValuesSub(mergeState.docMaps[i], values)); + if (values == null) { + values = DocValues.emptySorted(); } + + liveTerms[i] = values.termsEnum(); + weights[i] = values.getValueCount(); + subs.add(new DocValuesConsumerUtil.SortedDocValuesSub(mergeState.docMaps[i], values)); } final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); @@ -614,9 +628,14 @@ private void writeSortedNumericField(FieldInfo field, DocValuesProducer valuesPr @Override public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, 
mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { - var entry = es87TSDBDocValuesProducer.sortedNumerics.get(s); - return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, entry.numValues, entry.numDocsWithField); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (docValuesProducer) -> { + var sortedNumeric = docValuesProducer.getSortedNumeric(mergeFieldInfo); + if (sortedNumeric instanceof ES87TSDBDocValuesProducer.BaseSortedNumericDocValues baseSortedNumericDocValues) { + var entry = baseSortedNumericDocValues.entry; + return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, entry.numValues, entry.numDocsWithField); + } else { + return null; + } }); if (result.supported() == false) { super.mergeSortedNumericField(mergeFieldInfo, mergeState); @@ -653,6 +672,13 @@ private static boolean isSingleValued(SortedSetDocValues values) throws IOExcept return true; } + if (values instanceof ES87TSDBDocValuesProducer.BaseSortedSetDocValues baseSortedSet) { + var entry = baseSortedSet.entry; + if (entry.ordsEntry.numValues == entry.ordsEntry.numDocsWithField) { + return true; + } + } + assert values.docID() == -1; for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { int docValueCount = values.docValueCount(); @@ -666,13 +692,18 @@ private static boolean isSingleValued(SortedSetDocValues values) throws IOExcept @Override public void mergeSortedSetField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (es87TSDBDocValuesProducer, s) -> { - var entry = es87TSDBDocValuesProducer.sortedSets.get(s); - return new DocValuesConsumerUtil.FieldEntry( - entry.ordsEntry.docsWithFieldOffset, - entry.ordsEntry.numValues, - entry.ordsEntry.numDocsWithField - ); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, 
mergeState, (docValuesProducer) -> { + var sortedSet = docValuesProducer.getSortedSet(mergeFieldInfo); + if (sortedSet instanceof ES87TSDBDocValuesProducer.BaseSortedSetDocValues baseSortedSet) { + var entry = baseSortedSet.entry; + return new DocValuesConsumerUtil.FieldEntry( + entry.ordsEntry.docsWithFieldOffset, + entry.ordsEntry.numValues, + entry.ordsEntry.numDocsWithField + ); + } else { + return null; + } }); if (result.supported() == false) { super.mergeSortedSetField(mergeFieldInfo, mergeState); @@ -685,7 +716,7 @@ public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { List subs = new ArrayList<>(); assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - TermsEnum[] liveTerms = new TermsEnum[subs.size()]; + TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; long[] weights = new long[liveTerms.length]; for (int i = 0; i < mergeState.docValuesProducers.length; i++) { SortedSetDocValues values = null; @@ -695,12 +726,13 @@ public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { values = docValuesProducer.getSortedSet(readerFieldInfo); } - liveTerms[i] = values.termsEnum(); - weights[i] = values.getValueCount(); } - if (values != null) { - subs.add(new DocValuesConsumerUtil.SortedSetDocValuesSub(mergeState.docMaps[i], values)); + if (values == null) { + values = DocValues.emptySortedSet(); } + liveTerms[i] = values.termsEnum(); + weights[i] = values.getValueCount(); + subs.add(new DocValuesConsumerUtil.SortedSetDocValuesSub(mergeState.docMaps[i], values)); } final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java index a2b5a77bbd0ac..ca8fc2e9774ee 100644 --- 
a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java @@ -89,6 +89,7 @@ public class ES87TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesF } private final int skipIndexIntervalSize; + // TODO: remove escape hatch? It is useful now when testing/benchmarking, but the optimized merge logic currently does too many scary things. private final boolean enableOptimizedMerge; /** Default constructor. */ diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java index 54c7d83421a36..758a18667498f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java @@ -360,7 +360,7 @@ public long cost() { }; } - private abstract class BaseSortedDocValues extends SortedDocValues { + abstract class BaseSortedDocValues extends SortedDocValues { final SortedEntry entry; final TermsEnum termsEnum; @@ -396,7 +396,7 @@ public TermsEnum termsEnum() throws IOException { } } - private abstract static class BaseSortedSetDocValues extends SortedSetDocValues { + abstract static class BaseSortedSetDocValues extends SortedSetDocValues { final SortedSetEntry entry; final IndexInput data; @@ -1032,7 +1032,7 @@ private NumericDocValues getNumeric(NumericEntry entry, long maxOrd) throws IOEx // Special case for maxOrd 1, no need to read blocks and use ordinal 0 as only value if (entry.docsWithFieldOffset == -1) { // Special case when all docs have a value - return new NumericDocValues() { + return new BaseNumericDocValues(entry) { private final int maxDoc = ES87TSDBDocValuesProducer.this.maxDoc; private int doc = -1; @@ -1081,7 +1081,7 @@ public long cost() { entry.denseRankPower, entry.numValues ); - return new
NumericDocValues() { + return new BaseNumericDocValues(entry) { @Override public int advance(int target) throws IOException { @@ -1126,7 +1126,7 @@ public long longValue() { final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1; if (entry.docsWithFieldOffset == -1) { // dense - return new NumericDocValues() { + return new BaseNumericDocValues(entry) { private final int maxDoc = ES87TSDBDocValuesProducer.this.maxDoc; private int doc = -1; @@ -1193,7 +1193,7 @@ public long longValue() throws IOException { entry.denseRankPower, entry.numValues ); - return new NumericDocValues() { + return new BaseNumericDocValues(entry) { private final TSDBDocValuesEncoder decoder = new TSDBDocValuesEncoder(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE); private long currentBlockIndex = -1; @@ -1248,6 +1248,15 @@ public long longValue() throws IOException { } } + abstract static class BaseNumericDocValues extends NumericDocValues { + + final NumericEntry entry; + + BaseNumericDocValues(NumericEntry entry) { + this.entry = entry; + } + } + private NumericValues getValues(NumericEntry entry, final long maxOrd) throws IOException { assert entry.numValues > 0; final RandomAccessInput indexSlice = data.randomAccessSlice(entry.indexOffset, entry.indexLength); @@ -1284,7 +1293,44 @@ long advance(long index) throws IOException { private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry, long maxOrd) throws IOException { if (entry.numValues == entry.numDocsWithField) { - return DocValues.singleton(getNumeric(entry, maxOrd)); + var numeric = getNumeric(entry, maxOrd); + return new BaseSortedNumericDocValues(entry) { + + @Override + public long nextValue() throws IOException { + return numeric.longValue(); + } + + @Override + public int docValueCount() { + return 1; + } + + @Override + public boolean advanceExact(int target) throws IOException { + return numeric.advanceExact(target); + } + + @Override + public int docID() { + return numeric.docID(); + } + + 
@Override + public int nextDoc() throws IOException { + return numeric.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return numeric.advance(target); + } + + @Override + public long cost() { + return numeric.cost(); + } + }; } final RandomAccessInput addressesInput = data.randomAccessSlice(entry.addressesOffset, entry.addressesLength); @@ -1294,7 +1340,7 @@ private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry, long m if (entry.docsWithFieldOffset == -1) { // dense - return new SortedNumericDocValues() { + return new BaseSortedNumericDocValues(entry) { int doc = -1; long start, end; @@ -1355,7 +1401,7 @@ public int docValueCount() { entry.denseRankPower, entry.numDocsWithField ); - return new SortedNumericDocValues() { + return new BaseSortedNumericDocValues(entry) { boolean set; long start, end; @@ -1414,6 +1460,15 @@ private void set() { } } + abstract static class BaseSortedNumericDocValues extends SortedNumericDocValues { + + final SortedNumericEntry entry; + + BaseSortedNumericDocValues(SortedNumericEntry entry) { + this.entry = entry; + } + } + private record DocValuesSkipperEntry(long offset, long length, long minValue, long maxValue, int docCount, int maxDocId) {} static class NumericEntry { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 18af7a15b3b2c..75cb328d96bf3 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -269,6 +269,7 @@ public void testForceMerge() throws Exception { long counter2 = 10_000_000; long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; + String[] tags = new String[] { "tag_1", "tag_2", "tag_3", 
"tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; int numHosts = 10; int numDocs = 256 + random().nextInt(1024); @@ -285,6 +286,10 @@ public void testForceMerge() throws Exception { d.add(new SortedNumericDocValuesField("counter_2", counter2++)); d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); d.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); + int numTags = 3; + for (int j = 0; j < numTags; j++) { + d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j]))); + } iw.addDocument(d); if (i % 100 == 0) { From 984513a5412977a4ec653a54897475391051680d Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Mon, 24 Mar 2025 11:50:43 +0100 Subject: [PATCH 08/43] iter --- .../codec/tsdb/ES87TSDBDocValuesProducer.java | 65 +++++++++--------- .../tsdb/ES87TSDBDocValuesFormatTests.java | 67 +++++++++++++++++-- 2 files changed, 95 insertions(+), 37 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java index 758a18667498f..f7e192bdccfaf 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java @@ -1294,43 +1294,48 @@ long advance(long index) throws IOException { private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry, long maxOrd) throws IOException { if (entry.numValues == entry.numDocsWithField) { var numeric = getNumeric(entry, maxOrd); - return new BaseSortedNumericDocValues(entry) { + if (merging) { + return new BaseSortedNumericDocValues(entry) { - @Override - public long nextValue() throws IOException { - return numeric.longValue(); - } + @Override + public long nextValue() throws IOException { + return numeric.longValue(); + } - @Override - public int docValueCount() { - return 1; - } + 
@Override + public int docValueCount() { + return 1; + } - @Override - public boolean advanceExact(int target) throws IOException { - return numeric.advanceExact(target); - } + @Override + public boolean advanceExact(int target) throws IOException { + return numeric.advanceExact(target); + } - @Override - public int docID() { - return numeric.docID(); - } + @Override + public int docID() { + return numeric.docID(); + } - @Override - public int nextDoc() throws IOException { - return numeric.nextDoc(); - } + @Override + public int nextDoc() throws IOException { + return numeric.nextDoc(); + } - @Override - public int advance(int target) throws IOException { - return numeric.advance(target); - } + @Override + public int advance(int target) throws IOException { + return numeric.advance(target); + } - @Override - public long cost() { - return numeric.cost(); - } - }; + @Override + public long cost() { + return numeric.cost(); + } + }; + } else { + // Required, because otherwise the search / compute engine can't optimize for the case where each document has exactly one value: + return DocValues.singleton(numeric); + } } final RandomAccessInput addressesInput = data.randomAccessSlice(entry.addressesOffset, entry.addressesLength); diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 75cb328d96bf3..36626f3f20bcb 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -13,11 +13,13 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import
org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -281,12 +283,13 @@ public void testForceMerge() throws Exception { long timestamp = baseTimestamp + (1000L * i); d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName))); + // Index sorting doesn't work with NumericDocValuesField: d.add(new SortedNumericDocValuesField(timestampField, timestamp)); - d.add(new SortedNumericDocValuesField("counter_1", counter1++)); + d.add(new NumericDocValuesField("counter_1", counter1++)); d.add(new SortedNumericDocValuesField("counter_2", counter2++)); d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); d.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); - int numTags = 3; + int numTags = 1 + random().nextInt(8); for (int j = 0; j < numTags; j++) { d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j]))); } @@ -300,18 +303,68 @@ public void testForceMerge() throws Exception { iw.forceMerge(1); + // For asserting using binary search later on: + Arrays.sort(gauge2Values); + try (var reader = DirectoryReader.open(iw)) { assertEquals(1, reader.leaves().size()); assertEquals(numDocs, reader.maxDoc()); var leaf = reader.leaves().get(0).reader(); - var sortedDocValues = leaf.getSortedDocValues(hostnameField); - assertNotNull(sortedDocValues); + var hostNameDV = leaf.getSortedDocValues(hostnameField); + assertNotNull(hostNameDV); + var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField)); + assertNotNull(timestampDV); + var counterOneDV = leaf.getNumericDocValues("counter_1"); + assertNotNull(counterOneDV); + var counterTwoDV = leaf.getSortedNumericDocValues("counter_2"); + 
assertNotNull(counterTwoDV); + var gaugeOneDV = leaf.getSortedNumericDocValues("gauge_1"); + assertNotNull(gaugeOneDV); + var gaugeTwoDV = leaf.getSortedNumericDocValues("gauge_2"); + assertNotNull(gaugeTwoDV); + var tagsDV = leaf.getSortedSetDocValues("tags"); + assertNotNull(tagsDV); for (int i = 0; i < numDocs; i++) { - assertEquals(i, sortedDocValues.nextDoc()); + assertEquals(i, hostNameDV.nextDoc()); int batchIndex = i / numHosts; - assertEquals(batchIndex, sortedDocValues.ordValue()); + assertEquals(batchIndex, hostNameDV.ordValue()); String expectedHostName = String.format(Locale.ROOT, "host-%03d", batchIndex); - assertEquals(expectedHostName, sortedDocValues.lookupOrd(sortedDocValues.ordValue()).utf8ToString()); + assertEquals(expectedHostName, hostNameDV.lookupOrd(hostNameDV.ordValue()).utf8ToString()); + + assertEquals(i, timestampDV.nextDoc()); + long timestamp = timestampDV.longValue(); + long lowerBound = baseTimestamp; + long upperBound = baseTimestamp + (1000L * numDocs); + assertTrue( + "unexpected timestamp [" + timestamp + "], expected between [" + lowerBound + "] and [" + upperBound + "]", + timestamp >= lowerBound && timestamp < upperBound + ); + + assertEquals(i, counterOneDV.nextDoc()); + long counterOneValue = counterOneDV.longValue(); + assertTrue("unexpected counter [" + counterOneValue + "]", counterOneValue >= 0 && counterOneValue < counter1); + + assertEquals(i, counterTwoDV.nextDoc()); + assertEquals(1, counterTwoDV.docValueCount()); + long counterTwoValue = counterTwoDV.nextValue(); + assertTrue("unexpected counter [" + counterTwoValue + "]", counterTwoValue > 0 && counterTwoValue <= counter2); + + assertEquals(i, gaugeOneDV.nextDoc()); + assertEquals(1, gaugeOneDV.docValueCount()); + long gaugeOneValue = gaugeOneDV.nextValue(); + assertTrue("unexpected gauge [" + gaugeOneValue + "]", Arrays.binarySearch(gauge1Values, gaugeOneValue) >= 0); + + assertEquals(i, gaugeTwoDV.nextDoc()); + assertEquals(1, gaugeTwoDV.docValueCount()); + long 
gaugeTwoValue = gaugeTwoDV.nextValue(); + assertTrue("unexpected gauge [" + gaugeTwoValue + "]", Arrays.binarySearch(gauge2Values, gaugeTwoValue) >= 0); + + assertEquals(i, tagsDV.nextDoc()); + for (int j = 0; j < tagsDV.docValueCount(); j++) { + long ordinal = tagsDV.nextOrd(); + String actualTag = tagsDV.lookupOrd(ordinal).utf8ToString(); + assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); + } } } } From 3b53705d53f67d8b2912e8304ac15db54d84fc6e Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Mon, 24 Mar 2025 13:48:06 +0100 Subject: [PATCH 09/43] iter test --- .../index/codec/tsdb/ES87TSDBDocValuesFormatTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 36626f3f20bcb..eff3138aaea12 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -253,7 +253,7 @@ public void testManyDocsWithManyValues() throws Exception { } } - public void testForceMerge() throws Exception { + public void testForceMergeDenseCase() throws Exception { String timestampField = "@timestamp"; String hostnameField = "host.name"; long baseTimestamp = 1704067200000L; @@ -272,9 +272,9 @@ public void testForceMerge() throws Exception { long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; - int numHosts = 10; int numDocs = 256 + random().nextInt(1024); + int numHosts = numDocs / 20; for (int i = 0; i < numDocs; i++) { var d = new Document(); From 9fb38b6be8ab722724412e80a8e78db6055744cc Mon Sep 17 00:00:00 
2001 From: Martijn van Groningen Date: Mon, 24 Mar 2025 14:38:43 +0100 Subject: [PATCH 10/43] moving code around --- .../codec/tsdb/DocValuesConsumerUtil.java | 178 +++++++++++++++--- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 141 ++------------ 2 files changed, 165 insertions(+), 154 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index 95dd33456abcb..8d178fbc81ae7 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -12,7 +12,9 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipIndexType; +import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.ImpactsEnum; @@ -28,9 +30,11 @@ import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.packed.PackedInts; import org.elasticsearch.core.CheckedFunction; import java.io.IOException; +import java.util.ArrayList; import java.util.List; /** @@ -90,18 +94,44 @@ static MergeStats compatibleWithOptimizedMerge( return new MergeStats(true, sumNumValues, sumNumDocsWithField); } - static SortedNumericDocValues mergeSortedNumericValues(List subs, boolean indexIsSorted) throws IOException { + static DocValuesProducer mergeNumericProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { + return new TsdbDocValuesProducer(mergeStats) { + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + List subs = 
new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + NumericDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { + values = docValuesProducer.getNumeric(readerFieldInfo); + } + } + if (values != null) { + subs.add(new NumericDocValuesSub(mergeState.docMaps[i], values)); + } + } + + return mergeNumericValues(subs, mergeState.needsIndexSort); + } + }; + } + + static NumericDocValues mergeNumericValues(List subs, boolean indexIsSorted) throws IOException { long cost = 0; - for (SortedNumericDocValuesSub sub : subs) { + for (NumericDocValuesSub sub : subs) { cost += sub.values.cost(); } final long finalCost = cost; - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - return new SortedNumericDocValues() { + return new NumericDocValues() { private int docID = -1; - private SortedNumericDocValuesSub current; + private NumericDocValuesSub current; @Override public int docID() { @@ -135,24 +165,19 @@ public long cost() { } @Override - public long nextValue() throws IOException { - return current.values.nextValue(); - } - - @Override - public int docValueCount() { - return current.values.docValueCount(); + public long longValue() throws IOException { + return current.values.longValue(); } }; } - static class SortedNumericDocValuesSub extends DocIDMerger.Sub { + static class NumericDocValuesSub extends DocIDMerger.Sub { - final SortedNumericDocValues values; + final NumericDocValues values; int docID = -1; - SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { + 
NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { super(docMap); this.values = values; assert values.docID() == -1; @@ -164,18 +189,43 @@ public int nextDoc() throws IOException { } } - static NumericDocValues mergeNumericValues(List subs, boolean indexIsSorted) throws IOException { + static DocValuesProducer mergeSortedNumericProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { + return new TsdbDocValuesProducer(mergeStats) { + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedNumericDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { + values = docValuesProducer.getSortedNumeric(readerFieldInfo); + } + } + if (values != null) { + subs.add(new SortedNumericDocValuesSub(mergeState.docMaps[i], values)); + } + } + return mergeSortedNumericValues(subs, mergeState.needsIndexSort); + } + }; + } + + static SortedNumericDocValues mergeSortedNumericValues(List subs, boolean indexIsSorted) throws IOException { long cost = 0; - for (NumericDocValuesSub sub : subs) { + for (SortedNumericDocValuesSub sub : subs) { cost += sub.values.cost(); } final long finalCost = cost; - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - return new NumericDocValues() { + return new SortedNumericDocValues() { private int docID = -1; - private NumericDocValuesSub current; + private SortedNumericDocValuesSub current; @Override public int docID() { @@ -209,19 
+259,24 @@ public long cost() { } @Override - public long longValue() throws IOException { - return current.values.longValue(); + public long nextValue() throws IOException { + return current.values.nextValue(); + } + + @Override + public int docValueCount() { + return current.values.docValueCount(); } }; } - static class NumericDocValuesSub extends DocIDMerger.Sub { + static class SortedNumericDocValuesSub extends DocIDMerger.Sub { - final NumericDocValues values; + final SortedNumericDocValues values; int docID = -1; - NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { + SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { super(docMap); this.values = values; assert values.docID() == -1; @@ -233,6 +288,43 @@ public int nextDoc() throws IOException { } } + static DocValuesProducer mergeSortedProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { + return new TsdbDocValuesProducer(mergeStats) { + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + + TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; + long[] weights = new long[liveTerms.length]; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { + values = docValuesProducer.getSorted(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySorted(); + } + + liveTerms[i] = values.termsEnum(); + weights[i] = values.getValueCount(); + subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values)); + } + + final 
OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); + for (int i = 0; i < subs.size(); i++) { + subs.get(i).map = map.getGlobalOrds(i); + } + return mergeSortedValues(subs, mergeState.needsIndexSort, map); + } + }; + } + static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted, OrdinalMap map) throws IOException { long cost = 0; for (SortedDocValuesSub sub : subs) { @@ -324,6 +416,42 @@ public int nextDoc() throws IOException { } } + static DocValuesProducer mergeSortedSetProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { + return new TsdbDocValuesProducer(mergeStats) { + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + + TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; + long[] weights = new long[liveTerms.length]; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedSetDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { + values = docValuesProducer.getSortedSet(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySortedSet(); + } + liveTerms[i] = values.termsEnum(); + weights[i] = values.getValueCount(); + subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values)); + } + + final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); + for (int i = 0; i < subs.size(); i++) { + subs.get(i).map = map.getGlobalOrds(i); + } + return mergeSortedSetValues(subs, mergeState.needsIndexSort, map); + } + }; + } + static SortedSetDocValues mergeSortedSetValues(List subs, boolean 
indexIsSorted, OrdinalMap map) throws IOException { long cost = 0; diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 3c86e7390d29b..738b3f75010da 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -16,13 +16,11 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipIndexType; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.OrdinalMap; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; @@ -50,7 +48,6 @@ import java.util.List; import static org.elasticsearch.index.codec.tsdb.DocValuesConsumerUtil.compatibleWithOptimizedMerge; -import static org.elasticsearch.index.codec.tsdb.DocValuesConsumerUtil.mergeSortedNumericValues; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.SKIP_INDEX_LEVEL_SHIFT; import static org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat.SKIP_INDEX_MAX_LEVEL; @@ -234,34 +231,11 @@ public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) t return null; } }); - if (result.supported() == false) { + if (result.supported()) { + addNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeNumericProducer(result, mergeFieldInfo, mergeState)); + } else { 
super.mergeNumericField(mergeFieldInfo, mergeState); - return; } - addNumericField(mergeFieldInfo, new DocValuesConsumerUtil.TsdbDocValuesProducer(result) { - - @Override - public NumericDocValues getNumeric(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - NumericDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { - values = docValuesProducer.getNumeric(readerFieldInfo); - } - } - if (values != null) { - subs.add(new DocValuesConsumerUtil.NumericDocValuesSub(mergeState.docMaps[i], values)); - } - } - - return DocValuesConsumerUtil.mergeNumericValues(subs, mergeState.needsIndexSort); - } - - }); } @Override @@ -350,45 +324,11 @@ public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) th return null; } }); - if (result.supported() == false) { + if (result.supported()) { + addSortedField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedProducer(result, mergeFieldInfo, mergeState)); + } else { super.mergeSortedField(mergeFieldInfo, mergeState); - return; } - addSortedField(mergeFieldInfo, new DocValuesConsumerUtil.TsdbDocValuesProducer(result) { - - @Override - public SortedDocValues getSorted(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - - TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; - long[] weights = new long[liveTerms.length]; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedDocValues values = null; - DocValuesProducer docValuesProducer = 
mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { - values = docValuesProducer.getSorted(readerFieldInfo); - } - } - if (values == null) { - values = DocValues.emptySorted(); - } - - liveTerms[i] = values.termsEnum(); - weights[i] = values.getValueCount(); - subs.add(new DocValuesConsumerUtil.SortedDocValuesSub(mergeState.docMaps[i], values)); - } - - final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); - for (int i = 0; i < subs.size(); i++) { - subs.get(i).map = map.getGlobalOrds(i); - } - return DocValuesConsumerUtil.mergeSortedValues(subs, mergeState.needsIndexSort, map); - } - - }); } private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer, boolean addTypeByte) throws IOException { @@ -637,34 +577,11 @@ public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeSt return null; } }); - if (result.supported() == false) { + if (result.supported()) { + addSortedNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedNumericProducer(result, mergeFieldInfo, mergeState)); + } else { super.mergeSortedNumericField(mergeFieldInfo, mergeState); - return; } - addSortedNumericField(mergeFieldInfo, new DocValuesConsumerUtil.TsdbDocValuesProducer(result) { - - @Override - public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedNumericDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && 
readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { - values = docValuesProducer.getSortedNumeric(readerFieldInfo); - } - } - if (values != null) { - subs.add(new DocValuesConsumerUtil.SortedNumericDocValuesSub(mergeState.docMaps[i], values)); - } - } - - return mergeSortedNumericValues(subs, mergeState.needsIndexSort); - } - - }); } private static boolean isSingleValued(SortedSetDocValues values) throws IOException { @@ -705,45 +622,11 @@ public void mergeSortedSetField(FieldInfo mergeFieldInfo, MergeState mergeState) return null; } }); - if (result.supported() == false) { + if (result.supported()) { + addSortedSetField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedSetProducer(result, mergeFieldInfo, mergeState)); + } else { super.mergeSortedSetField(mergeFieldInfo, mergeState); - return; } - addSortedSetField(mergeFieldInfo, new DocValuesConsumerUtil.TsdbDocValuesProducer(result) { - - @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - - TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; - long[] weights = new long[liveTerms.length]; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedSetDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { - values = docValuesProducer.getSortedSet(readerFieldInfo); - } - } - if (values == null) { - values = DocValues.emptySortedSet(); - } - liveTerms[i] = values.termsEnum(); - weights[i] = values.getValueCount(); - subs.add(new DocValuesConsumerUtil.SortedSetDocValuesSub(mergeState.docMaps[i], values)); - } - - final OrdinalMap map = 
OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); - for (int i = 0; i < subs.size(); i++) { - subs.get(i).map = map.getGlobalOrds(i); - } - - return DocValuesConsumerUtil.mergeSortedSetValues(subs, mergeState.needsIndexSort, map); - } - - }); } @Override From 1e0e2f86bb0f4e2ed1c15fe3f20228d5ef88cdb3 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 10:05:05 +0100 Subject: [PATCH 11/43] benchmark iter --- .../index/codec/tsdb/TSDBDocValuesMergeBenchmark.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index a6f3affa90f34..554bb042720df 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -61,7 +61,7 @@ @Measurement(iterations = 1) public class TSDBDocValuesMergeBenchmark { - @Param("13431204") + @Param("26431204") private int nDocs; @Param("1000") @@ -139,7 +139,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) { doc.add(new SortedNumericDocValuesField("counter_2", counter2++)); doc.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); doc.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); - int numTags = 3; + int numTags = tags.length % (i + 1); for (int j = 0; j < numTags; j++) { doc.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j]))); } From 1ec6308e73b1ee0dd0f14515ff5b199fbe6172d4 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 15:17:54 +0100 Subject: [PATCH 12/43] Check for deleted docs before getting doc value instances. 
--- .../index/codec/tsdb/DocValuesConsumerUtil.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index 8d178fbc81ae7..cf8b4dba6349f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -70,6 +70,13 @@ static MergeStats compatibleWithOptimizedMerge( return UNSUPPORTED; } + // Documents marked as deleted should be rare. Maybe in the case of noop operation? + for (int i = 0; i < mergeState.liveDocs.length; i++) { + if (mergeState.liveDocs[i] != null) { + return UNSUPPORTED; + } + } + long sumNumValues = 0; int sumNumDocsWithField = 0; @@ -84,13 +91,6 @@ static MergeStats compatibleWithOptimizedMerge( sumNumDocsWithField += entry.numDocsWithField; } - // Documents marked as deleted should be rare. Maybe in the case of noop operation? 
- for (int i = 0; i < mergeState.liveDocs.length; i++) { - if (mergeState.liveDocs[i] != null) { - return UNSUPPORTED; - } - } - return new MergeStats(true, sumNumValues, sumNumDocsWithField); } From 5e7cc1199c76c769cff18ed3807f0e2ac6dee089 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 15:39:53 +0100 Subject: [PATCH 13/43] remove doc value skipper check --- .../index/codec/tsdb/DocValuesConsumerUtil.java | 8 ++------ .../index/codec/tsdb/ES87TSDBDocValuesConsumer.java | 8 ++++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index cf8b4dba6349f..7913bc9800d86 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -13,7 +13,6 @@ import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocValues; -import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; @@ -38,7 +37,7 @@ import java.util.List; /** - * Contains mainly forked code from {@link org.apache.lucene.codecs.DocValuesConsumer}. + * Mostly contains forked code from {@link org.apache.lucene.codecs.DocValuesConsumer}. 
*/ class DocValuesConsumerUtil { @@ -60,13 +59,10 @@ record FieldEntry(long docsWithFieldOffset, long numValues, int numDocsWithField static MergeStats compatibleWithOptimizedMerge( boolean optimizedMergeEnabled, - FieldInfo mergeFieldInfo, MergeState mergeState, CheckedFunction getEntryFunction ) throws IOException { - if (optimizedMergeEnabled == false - || mergeState.needsIndexSort == false - || mergeFieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { + if (optimizedMergeEnabled == false || mergeState.needsIndexSort == false) { return UNSUPPORTED; } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 738b3f75010da..ed06c775acab2 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -222,7 +222,7 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon @Override public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (docValuesProducer) -> { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { var numeric = docValuesProducer.getNumeric(mergeFieldInfo); if (numeric instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumericDocValues) { var entry = baseNumericDocValues.entry; @@ -315,7 +315,7 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th @Override public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (docValuesProducer) -> { + var result = 
compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { var sorted = docValuesProducer.getSorted(mergeFieldInfo); if (sorted instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSortedDocValues) { var entry = baseSortedDocValues.entry; @@ -568,7 +568,7 @@ private void writeSortedNumericField(FieldInfo field, DocValuesProducer valuesPr @Override public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (docValuesProducer) -> { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { var sortedNumeric = docValuesProducer.getSortedNumeric(mergeFieldInfo); if (sortedNumeric instanceof ES87TSDBDocValuesProducer.BaseSortedNumericDocValues baseSortedNumericDocValues) { var entry = baseSortedNumericDocValues.entry; @@ -609,7 +609,7 @@ private static boolean isSingleValued(SortedSetDocValues values) throws IOExcept @Override public void mergeSortedSetField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeFieldInfo, mergeState, (docValuesProducer) -> { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { var sortedSet = docValuesProducer.getSortedSet(mergeFieldInfo); if (sortedSet instanceof ES87TSDBDocValuesProducer.BaseSortedSetDocValues baseSortedSet) { var entry = baseSortedSet.entry; From 744a6658ebab86a8620d2a2922f136bcd84e12a3 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 16:11:48 +0100 Subject: [PATCH 14/43] Remove getEntryFunction lambda and delegate to doc value instance directly in compatibleWithOptimizedMerge(...) method.
--- .../codec/tsdb/DocValuesConsumerUtil.java | 79 ++++++++++++++++--- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 45 ++--------- 2 files changed, 72 insertions(+), 52 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index 7913bc9800d86..edf486ed0029a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -30,7 +30,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; import org.apache.lucene.util.packed.PackedInts; -import org.elasticsearch.core.CheckedFunction; import java.io.IOException; import java.util.ArrayList; @@ -55,12 +54,11 @@ abstract static class TsdbDocValuesProducer extends EmptyDocValuesProducer { record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} - record FieldEntry(long docsWithFieldOffset, long numValues, int numDocsWithField) {} - static MergeStats compatibleWithOptimizedMerge( boolean optimizedMergeEnabled, MergeState mergeState, - CheckedFunction getEntryFunction + FieldInfo fieldInfo, + DocValuesType docValuesType ) throws IOException { if (optimizedMergeEnabled == false || mergeState.needsIndexSort == false) { return UNSUPPORTED; @@ -76,15 +74,72 @@ static MergeStats compatibleWithOptimizedMerge( long sumNumValues = 0; int sumNumDocsWithField = 0; - for (DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { - // TODO bring back codec version check? (per field doc values producer sits between ES87TSDBDocValuesConsumer) - var entry = getEntryFunction.apply(docValuesProducer); - if (entry == null) { - return UNSUPPORTED; + // TODO bring back codec version check? 
(per field doc values producer sits between ES87TSDBDocValuesConsumer) + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + switch (docValuesType) { + case NUMERIC -> { + var numeric = docValuesProducer.getNumeric(fieldInfo); + if (numeric instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumericDocValues) { + var entry = baseNumericDocValues.entry; + sumNumValues += entry.numValues; + // In this case the numDocsWithField doesn't get recorded in meta: + int numDocsWithField; + if (entry.docsWithFieldOffset == -2) { + numDocsWithField = 0; + } else if (entry.docsWithFieldOffset == -1) { + numDocsWithField = mergeState.maxDocs[i]; + } else { + // Value doesn't matter in this case: + numDocsWithField = mergeState.maxDocs[i] - 1; + } + sumNumDocsWithField += numDocsWithField; + } else { + return UNSUPPORTED; + } + } + case SORTED_NUMERIC -> { + var sortedNumeric = docValuesProducer.getSortedNumeric(fieldInfo); + if (sortedNumeric instanceof ES87TSDBDocValuesProducer.BaseSortedNumericDocValues baseSortedNumericDocValues) { + var entry = baseSortedNumericDocValues.entry; + sumNumValues += entry.numValues; + sumNumDocsWithField += entry.numDocsWithField; + } else { + return UNSUPPORTED; + } + } + case SORTED -> { + var sorted = docValuesProducer.getSorted(fieldInfo); + if (sorted instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSortedDocValues) { + var entry = baseSortedDocValues.entry; + sumNumValues += entry.ordsEntry.numValues; + // In this case the numDocsWithField doesn't get recorded in meta: + int numDocsWithField; + if (entry.ordsEntry.docsWithFieldOffset == -2) { + numDocsWithField = 0; + } else if (entry.ordsEntry.docsWithFieldOffset == -1) { + numDocsWithField = mergeState.maxDocs[i]; + } else { + // Value doesn't matter in this case: + numDocsWithField = mergeState.maxDocs[i] - 1; + } + sumNumDocsWithField += numDocsWithField; + } else { + return 
UNSUPPORTED; + } + } + case SORTED_SET -> { + var sortedSet = docValuesProducer.getSortedSet(fieldInfo); + if (sortedSet instanceof ES87TSDBDocValuesProducer.BaseSortedSetDocValues baseSortedSet) { + var entry = baseSortedSet.entry; + sumNumValues += entry.ordsEntry.numValues; + sumNumDocsWithField += entry.ordsEntry.numDocsWithField; + } else { + return UNSUPPORTED; + } + } + default -> throw new IllegalStateException("unexpected doc values producer type: " + docValuesType); } - - sumNumValues += entry.numValues; - sumNumDocsWithField += entry.numDocsWithField; } return new MergeStats(true, sumNumValues, sumNumDocsWithField); diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index ed06c775acab2..395cdf0d3828b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -16,6 +16,7 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipIndexType; +import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; @@ -222,15 +223,7 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon @Override public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { - var numeric = docValuesProducer.getNumeric(mergeFieldInfo); - if (numeric instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumericDocValues) { - var entry = baseNumericDocValues.entry; - return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, 
entry.numValues, -1); - } else { - return null; - } - }); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.NUMERIC); if (result.supported()) { addNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeNumericProducer(result, mergeFieldInfo, mergeState)); } else { @@ -315,15 +308,7 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th @Override public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { - var sorted = docValuesProducer.getSorted(mergeFieldInfo); - if (sorted instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSortedDocValues) { - var entry = baseSortedDocValues.entry; - return new DocValuesConsumerUtil.FieldEntry(entry.ordsEntry.docsWithFieldOffset, entry.ordsEntry.numValues, -1); - } else { - return null; - } - }); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.SORTED); if (result.supported()) { addSortedField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedProducer(result, mergeFieldInfo, mergeState)); } else { @@ -568,15 +553,7 @@ private void writeSortedNumericField(FieldInfo field, DocValuesProducer valuesPr @Override public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { - var sortedNumeric = docValuesProducer.getSortedNumeric(mergeFieldInfo); - if (sortedNumeric instanceof ES87TSDBDocValuesProducer.BaseSortedNumericDocValues baseSortedNumericDocValues) { - var entry = baseSortedNumericDocValues.entry; - return new DocValuesConsumerUtil.FieldEntry(entry.docsWithFieldOffset, entry.numValues, entry.numDocsWithField); - } else { - return null; - } - }); + var result = 
compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.SORTED_NUMERIC); if (result.supported()) { addSortedNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedNumericProducer(result, mergeFieldInfo, mergeState)); } else { @@ -609,19 +586,7 @@ private static boolean isSingleValued(SortedSetDocValues values) throws IOExcept @Override public void mergeSortedSetField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, (docValuesProducer) -> { - var sortedSet = docValuesProducer.getSortedSet(mergeFieldInfo); - if (sortedSet instanceof ES87TSDBDocValuesProducer.BaseSortedSetDocValues baseSortedSet) { - var entry = baseSortedSet.entry; - return new DocValuesConsumerUtil.FieldEntry( - entry.ordsEntry.docsWithFieldOffset, - entry.ordsEntry.numValues, - entry.ordsEntry.numDocsWithField - ); - } else { - return null; - } - }); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.SORTED_SET); if (result.supported()) { addSortedSetField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedSetProducer(result, mergeFieldInfo, mergeState)); } else { From 176fac7b0856693ba438795e27131b60c63fb531 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 16:31:21 +0100 Subject: [PATCH 15/43] lower doc count in benchmark --- .../benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index 554bb042720df..5dc7ed47d7b80 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ 
b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -61,7 +61,7 @@ @Measurement(iterations = 1) public class TSDBDocValuesMergeBenchmark { - @Param("26431204") + @Param("20431204") private int nDocs; @Param("1000") From ec998a3b5029163092b9491d46b38fe0c6485208 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 20:32:07 +0100 Subject: [PATCH 16/43] added node setting to control whether optimized merge is enabled. --- .../tsdb/TSDBDocValuesMergeBenchmark.java | 2 +- .../index/codec/CodecService.java | 7 ++++++ .../index/codec/PerFieldFormatSupplier.java | 4 +++- .../codec/tsdb/ES87TSDBDocValuesFormat.java | 22 +++++++------------ .../codec/tsdb/DocValuesCodecDuelTests.java | 2 +- .../tsdb/ES87TSDBDocValuesFormatTests.java | 2 +- ...ValuesFormatVariableSkipIntervalTests.java | 4 ++-- 7 files changed, 23 insertions(+), 20 deletions(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index 5dc7ed47d7b80..d01a355d46721 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -107,7 +107,7 @@ private IndexWriter createIndex(final Directory directory, final boolean optimiz new SortedNumericSortField(TIMESTAMP_FIELD, SortField.Type.LONG, true) ) ); - ES87TSDBDocValuesFormat docValuesFormat = new ES87TSDBDocValuesFormat(4096, optimizedMergeEnabled); + ES87TSDBDocValuesFormat docValuesFormat = new ES87TSDBDocValuesFormat(optimizedMergeEnabled); config.setCodec(new Lucene101Codec() { @Override diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java index 
06949a967eccd..71e8a2c66dfe9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -13,6 +13,7 @@ import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.core.Nullable; @@ -32,6 +33,12 @@ public class CodecService implements CodecProvider { public static final FeatureFlag ZSTD_STORED_FIELDS_FEATURE_FLAG = new FeatureFlag("zstd_stored_fields"); + public static final FeatureFlag TSDB_DOC_VALUES_OPTIMIZED_MERGE = new FeatureFlag("tsdb_doc_values_codec_optimized_merge"); + public static Setting TSDB_DOC_VALUES_OPTIMIZED_MERGE_SETTING = Setting.boolSetting( + "indices.time_series.doc_values_codec_optimized_merge", + TSDB_DOC_VALUES_OPTIMIZED_MERGE.isEnabled(), + Setting.Property.NodeScope + ); private final Map codecs; diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java index 12c7d68d89293..e217f66dd16c8 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java @@ -34,15 +34,17 @@ public class PerFieldFormatSupplier { private static final DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat(); private static final KnnVectorsFormat knnVectorsFormat = new Lucene99HnswVectorsFormat(); - private static final ES87TSDBDocValuesFormat tsdbDocValuesFormat = new ES87TSDBDocValuesFormat(); private static final ES812PostingsFormat es812PostingsFormat = new ES812PostingsFormat(); private static final PostingsFormat completionPostingsFormat = 
PostingsFormat.forName("Completion101"); + private final ES87TSDBDocValuesFormat tsdbDocValuesFormat; private final ES87BloomFilterPostingsFormat bloomFilterPostingsFormat; private final MapperService mapperService; public PerFieldFormatSupplier(MapperService mapperService, BigArrays bigArrays) { this.mapperService = mapperService; + var nodeSettings = mapperService.getIndexSettings().getNodeSettings(); + this.tsdbDocValuesFormat = new ES87TSDBDocValuesFormat(CodecService.TSDB_DOC_VALUES_OPTIMIZED_MERGE_SETTING.get(nodeSettings)); this.bloomFilterPostingsFormat = new ES87BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField); } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java index ca8fc2e9774ee..8de2f69950375 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java @@ -13,10 +13,13 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.util.FeatureFlag; import java.io.IOException; +import static org.elasticsearch.index.codec.CodecService.TSDB_DOC_VALUES_OPTIMIZED_MERGE; + public class ES87TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesFormat { static final int NUMERIC_BLOCK_SHIFT = 7; @@ -76,25 +79,16 @@ public class ES87TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesF } } - // Default for escape hatch: - static final boolean OPTIMIZED_MERGE_ENABLE_DEFAULT; - static final FeatureFlag TSDB_DOC_VALUES_OPTIMIZED_MERGE = new FeatureFlag("tsdb_doc_values_optimized_merge"); - static final String OPTIMIZED_MERGE_ENABLED_NAME = ES87TSDBDocValuesConsumer.class.getName() + 
".enableOptimizedMerge"; - - static { - boolean optimizedMergeDefault = TSDB_DOC_VALUES_OPTIMIZED_MERGE.isEnabled(); - OPTIMIZED_MERGE_ENABLE_DEFAULT = Boolean.parseBoolean( - System.getProperty(OPTIMIZED_MERGE_ENABLED_NAME, Boolean.toString(optimizedMergeDefault)) - ); - } - private final int skipIndexIntervalSize; - // TODO: remove escape hatch? Is useful now when testing/benchmarking, but current optimized merge logic currently do too scary things. private final boolean enableOptimizedMerge; /** Default constructor. */ public ES87TSDBDocValuesFormat() { - this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, OPTIMIZED_MERGE_ENABLE_DEFAULT); + this(TSDB_DOC_VALUES_OPTIMIZED_MERGE.isEnabled()); + } + + public ES87TSDBDocValuesFormat(boolean enableOptimizedMerge) { + this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, enableOptimizedMerge); } /** Doc values fields format with specified skipIndexIntervalSize. */ diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java index dddbdcc517f93..864be20dbb4b8 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java @@ -48,7 +48,7 @@ public void testDuel() throws IOException { baselineConfig.setMergePolicy(mergePolicy); baselineConfig.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat())); var contenderConf = newIndexWriterConfig(); - contenderConf.setCodec(TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat())); + contenderConf.setCodec(TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(true))); contenderConf.setMergePolicy(mergePolicy); try ( diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 
eff3138aaea12..753e9ef198bf9 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -54,7 +54,7 @@ public class ES87TSDBDocValuesFormatTests extends BaseDocValuesFormatTestCase { private static final int NUM_DOCS = 10; - private final Codec codec = TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat()); + private final Codec codec = TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(true)); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java index 8a4a5c59d1a9c..0696776fe42fd 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java @@ -32,13 +32,13 @@ public class ES87TSDBDocValuesFormatVariableSkipIntervalTests extends BaseDocVal @Override protected Codec getCodec() { // small interval size to test with many intervals - return TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(random().nextInt(4, 16), false)); + return TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(random().nextInt(4, 16), true)); } public void testSkipIndexIntervalSize() { IllegalArgumentException ex = expectThrows( IllegalArgumentException.class, - () -> new ES87TSDBDocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2), false) + () -> new ES87TSDBDocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2), true) ); assertTrue(ex.getMessage().contains("skipIndexIntervalSize must be > 1")); } From 066b778a95ac044b5d731536833ed1a0f0c4a83e Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 20:34:02 +0100 
Subject: [PATCH 17/43] Update docs/changelog/125403.yaml --- docs/changelog/125403.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/125403.yaml diff --git a/docs/changelog/125403.yaml b/docs/changelog/125403.yaml new file mode 100644 index 0000000000000..d953dae4db4fe --- /dev/null +++ b/docs/changelog/125403.yaml @@ -0,0 +1,5 @@ +pr: 125403 +summary: First step optimizing tsdb doc values codec merging +area: Codec +type: enhancement +issues: [] From 722f85e9ad7c5a3f49e7ca6f8d3f0f1a26b4dee2 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Tue, 25 Mar 2025 19:42:45 +0000 Subject: [PATCH 18/43] [CI] Auto commit changes from spotless --- .../elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java index 8de2f69950375..13fde99c09d70 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java @@ -13,8 +13,6 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.elasticsearch.common.settings.Setting; -import org.elasticsearch.common.util.FeatureFlag; import java.io.IOException; From 2bb9867a75e2dc01e3c229b71aa9fc620aed913b Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 20:47:21 +0100 Subject: [PATCH 19/43] register node setting --- .../org/elasticsearch/common/settings/ClusterSettings.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 9cee0872f3f7d..ecb8df7b196d6 
100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -89,6 +89,7 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexingPressure; import org.elasticsearch.index.MergePolicyConfig; +import org.elasticsearch.index.codec.CodecService; import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.shard.IndexingStatsSettings; import org.elasticsearch.indices.IndexingMemoryController; @@ -635,6 +636,7 @@ public void apply(Settings value, Settings current, Settings previous) { ShardsAvailabilityHealthIndicatorService.REPLICA_UNASSIGNED_BUFFER_TIME, DataStream.isFailureStoreFeatureFlagEnabled() ? DataStreamFailureStoreSettings.DATA_STREAM_FAILURE_STORED_ENABLED_SETTING : null, IndexingStatsSettings.RECENT_WRITE_LOAD_HALF_LIFE_SETTING, - TransportGetAllocationStatsAction.CACHE_TTL_SETTING + TransportGetAllocationStatsAction.CACHE_TTL_SETTING, + CodecService.TSDB_DOC_VALUES_OPTIMIZED_MERGE_SETTING ).filter(Objects::nonNull).collect(toSet()); } From 5bcc62c0fd3c474e19a6237363dd3c7a977b7c48 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 25 Mar 2025 21:38:20 +0100 Subject: [PATCH 20/43] fix npe --- .../org/elasticsearch/index/codec/PerFieldFormatSupplier.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java index e217f66dd16c8..38c19635e42e3 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java @@ -14,6 +14,7 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import 
org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexSettings; @@ -43,7 +44,7 @@ public class PerFieldFormatSupplier { public PerFieldFormatSupplier(MapperService mapperService, BigArrays bigArrays) { this.mapperService = mapperService; - var nodeSettings = mapperService.getIndexSettings().getNodeSettings(); + var nodeSettings = mapperService != null ? mapperService.getIndexSettings().getNodeSettings() : Settings.EMPTY; this.tsdbDocValuesFormat = new ES87TSDBDocValuesFormat(CodecService.TSDB_DOC_VALUES_OPTIMIZED_MERGE_SETTING.get(nodeSettings)); this.bloomFilterPostingsFormat = new ES87BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField); } From 27efdd24500cf51832c5b4909d2077c93e7da7b7 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Wed, 26 Mar 2025 07:51:49 +0100 Subject: [PATCH 21/43] iter --- .../codec/tsdb/DocValuesConsumerUtil.java | 77 +++++++++++-------- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 9 +-- 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index edf486ed0029a..db7ba2ec8b4e6 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -54,12 +54,8 @@ abstract static class TsdbDocValuesProducer extends EmptyDocValuesProducer { record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} - static MergeStats compatibleWithOptimizedMerge( - boolean optimizedMergeEnabled, - MergeState mergeState, - FieldInfo fieldInfo, - DocValuesType docValuesType - ) throws IOException { + static MergeStats 
compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, MergeState mergeState, FieldInfo fieldInfo) + throws IOException { if (optimizedMergeEnabled == false || mergeState.needsIndexSort == false) { return UNSUPPORTED; } @@ -77,24 +73,15 @@ static MergeStats compatibleWithOptimizedMerge( // TODO bring back codec version check? (per field doc values producer sits between ES87TSDBDocValuesConsumer) for (int i = 0; i < mergeState.docValuesProducers.length; i++) { DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - switch (docValuesType) { + switch (fieldInfo.getDocValuesType()) { case NUMERIC -> { var numeric = docValuesProducer.getNumeric(fieldInfo); - if (numeric instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumericDocValues) { - var entry = baseNumericDocValues.entry; + if (numeric instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumeric) { + var entry = baseNumeric.entry; sumNumValues += entry.numValues; - // In this case the numDocsWithField doesn't get recorded in meta: - int numDocsWithField; - if (entry.docsWithFieldOffset == -2) { - numDocsWithField = 0; - } else if (entry.docsWithFieldOffset == -1) { - numDocsWithField = mergeState.maxDocs[i]; - } else { - // Value doesn't matter in this case: - numDocsWithField = mergeState.maxDocs[i] - 1; - } + int numDocsWithField = getNumDocsWithField(entry, mergeState.maxDocs[i]); sumNumDocsWithField += numDocsWithField; - } else { + } else if (numeric != null) { return UNSUPPORTED; } } @@ -105,7 +92,16 @@ static MergeStats compatibleWithOptimizedMerge( sumNumValues += entry.numValues; sumNumDocsWithField += entry.numDocsWithField; } else { - return UNSUPPORTED; + var singleton = DocValues.unwrapSingleton(sortedNumeric); + if (singleton instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumeric) { + var entry = baseNumeric.entry; + sumNumValues += entry.numValues; + // In this case the numDocsWithField doesn't get recorded in meta: + int numDocsWithField 
= getNumDocsWithField(entry, mergeState.maxDocs[i]); + sumNumDocsWithField += numDocsWithField; + } else if (sortedNumeric != null) { + return UNSUPPORTED; + } } } case SORTED -> { @@ -113,18 +109,10 @@ static MergeStats compatibleWithOptimizedMerge( if (sorted instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSortedDocValues) { var entry = baseSortedDocValues.entry; sumNumValues += entry.ordsEntry.numValues; - // In this case the numDocsWithField doesn't get recorded in meta: - int numDocsWithField; - if (entry.ordsEntry.docsWithFieldOffset == -2) { - numDocsWithField = 0; - } else if (entry.ordsEntry.docsWithFieldOffset == -1) { - numDocsWithField = mergeState.maxDocs[i]; - } else { - // Value doesn't matter in this case: - numDocsWithField = mergeState.maxDocs[i] - 1; - } + // In this case the numDocsWithField doesn't get recorded in meta:v + int numDocsWithField = getNumDocsWithField(entry.ordsEntry, mergeState.maxDocs[i]); sumNumDocsWithField += numDocsWithField; - } else { + } else if (sorted != null) { return UNSUPPORTED; } } @@ -135,16 +123,37 @@ static MergeStats compatibleWithOptimizedMerge( sumNumValues += entry.ordsEntry.numValues; sumNumDocsWithField += entry.ordsEntry.numDocsWithField; } else { - return UNSUPPORTED; + var singleton = DocValues.unwrapSingleton(sortedSet); + if (singleton instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSorted) { + var entry = baseSorted.entry; + sumNumValues += entry.ordsEntry.numValues; + // In this case the numDocsWithField doesn't get recorded in meta: + int numDocsWithField = getNumDocsWithField(entry.ordsEntry, mergeState.maxDocs[i]); + sumNumDocsWithField += numDocsWithField; + } else if (sortedSet != null) { + return UNSUPPORTED; + } } } - default -> throw new IllegalStateException("unexpected doc values producer type: " + docValuesType); + default -> throw new IllegalStateException("unexpected doc values producer type: " + fieldInfo.getDocValuesType()); } } return new MergeStats(true, 
sumNumValues, sumNumDocsWithField); } + private static int getNumDocsWithField(ES87TSDBDocValuesProducer.NumericEntry entry, int maxDoc) { + // In this case the numDocsWithField doesn't get recorded in meta: + if (entry.docsWithFieldOffset == -2) { + return 0; + } else if (entry.docsWithFieldOffset == -1) { + return maxDoc; + } else { + // numDocsWithField doesn't matter in this case: + return 1; + } + } + static DocValuesProducer mergeNumericProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { return new TsdbDocValuesProducer(mergeStats) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 395cdf0d3828b..7d033d96d7c96 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -16,7 +16,6 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipIndexType; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; @@ -223,7 +222,7 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon @Override public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.NUMERIC); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { addNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeNumericProducer(result, mergeFieldInfo, mergeState)); } else { @@ -308,7 +307,7 @@ public void addSortedField(FieldInfo field, 
DocValuesProducer valuesProducer) th @Override public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.SORTED); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { addSortedField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedProducer(result, mergeFieldInfo, mergeState)); } else { @@ -553,7 +552,7 @@ private void writeSortedNumericField(FieldInfo field, DocValuesProducer valuesPr @Override public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.SORTED_NUMERIC); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { addSortedNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedNumericProducer(result, mergeFieldInfo, mergeState)); } else { @@ -586,7 +585,7 @@ private static boolean isSingleValued(SortedSetDocValues values) throws IOExcept @Override public void mergeSortedSetField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { - var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo, DocValuesType.SORTED_SET); + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { addSortedSetField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedSetProducer(result, mergeFieldInfo, mergeState)); } else { From 646c566ba815a0dcf7c7b6efe7bf154774b49341 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Wed, 26 Mar 2025 07:52:09 +0100 Subject: [PATCH 22/43] Revert node setting for jvm env variable. 
--- .../tsdb/TSDBDocValuesMergeBenchmark.java | 2 +- .../common/settings/ClusterSettings.java | 4 +--- .../index/codec/CodecService.java | 7 ------ .../index/codec/PerFieldFormatSupplier.java | 5 +---- .../codec/tsdb/ES87TSDBDocValuesFormat.java | 22 +++++++++++++------ .../codec/tsdb/DocValuesCodecDuelTests.java | 2 +- .../tsdb/ES87TSDBDocValuesFormatTests.java | 2 +- ...ValuesFormatVariableSkipIntervalTests.java | 4 ++-- 8 files changed, 22 insertions(+), 26 deletions(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index d01a355d46721..5dc7ed47d7b80 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -107,7 +107,7 @@ private IndexWriter createIndex(final Directory directory, final boolean optimiz new SortedNumericSortField(TIMESTAMP_FIELD, SortField.Type.LONG, true) ) ); - ES87TSDBDocValuesFormat docValuesFormat = new ES87TSDBDocValuesFormat(optimizedMergeEnabled); + ES87TSDBDocValuesFormat docValuesFormat = new ES87TSDBDocValuesFormat(4096, optimizedMergeEnabled); config.setCodec(new Lucene101Codec() { @Override diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index ecb8df7b196d6..9cee0872f3f7d 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -89,7 +89,6 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexingPressure; import org.elasticsearch.index.MergePolicyConfig; -import org.elasticsearch.index.codec.CodecService; import 
org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.shard.IndexingStatsSettings; import org.elasticsearch.indices.IndexingMemoryController; @@ -636,7 +635,6 @@ public void apply(Settings value, Settings current, Settings previous) { ShardsAvailabilityHealthIndicatorService.REPLICA_UNASSIGNED_BUFFER_TIME, DataStream.isFailureStoreFeatureFlagEnabled() ? DataStreamFailureStoreSettings.DATA_STREAM_FAILURE_STORED_ENABLED_SETTING : null, IndexingStatsSettings.RECENT_WRITE_LOAD_HALF_LIFE_SETTING, - TransportGetAllocationStatsAction.CACHE_TTL_SETTING, - CodecService.TSDB_DOC_VALUES_OPTIMIZED_MERGE_SETTING + TransportGetAllocationStatsAction.CACHE_TTL_SETTING ).filter(Objects::nonNull).collect(toSet()); } diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java index 71e8a2c66dfe9..06949a967eccd 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -13,7 +13,6 @@ import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.lucene101.Lucene101Codec; -import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.core.Nullable; @@ -33,12 +32,6 @@ public class CodecService implements CodecProvider { public static final FeatureFlag ZSTD_STORED_FIELDS_FEATURE_FLAG = new FeatureFlag("zstd_stored_fields"); - public static final FeatureFlag TSDB_DOC_VALUES_OPTIMIZED_MERGE = new FeatureFlag("tsdb_doc_values_codec_optimized_merge"); - public static Setting TSDB_DOC_VALUES_OPTIMIZED_MERGE_SETTING = Setting.boolSetting( - "indices.time_series.doc_values_codec_optimized_merge", - TSDB_DOC_VALUES_OPTIMIZED_MERGE.isEnabled(), - Setting.Property.NodeScope - ); private final Map 
codecs; diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java index 38c19635e42e3..12c7d68d89293 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java @@ -14,7 +14,6 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; -import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexSettings; @@ -35,17 +34,15 @@ public class PerFieldFormatSupplier { private static final DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat(); private static final KnnVectorsFormat knnVectorsFormat = new Lucene99HnswVectorsFormat(); + private static final ES87TSDBDocValuesFormat tsdbDocValuesFormat = new ES87TSDBDocValuesFormat(); private static final ES812PostingsFormat es812PostingsFormat = new ES812PostingsFormat(); private static final PostingsFormat completionPostingsFormat = PostingsFormat.forName("Completion101"); - private final ES87TSDBDocValuesFormat tsdbDocValuesFormat; private final ES87BloomFilterPostingsFormat bloomFilterPostingsFormat; private final MapperService mapperService; public PerFieldFormatSupplier(MapperService mapperService, BigArrays bigArrays) { this.mapperService = mapperService; - var nodeSettings = mapperService != null ? 
mapperService.getIndexSettings().getNodeSettings() : Settings.EMPTY; - this.tsdbDocValuesFormat = new ES87TSDBDocValuesFormat(CodecService.TSDB_DOC_VALUES_OPTIMIZED_MERGE_SETTING.get(nodeSettings)); this.bloomFilterPostingsFormat = new ES87BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField); } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java index 13fde99c09d70..ca8fc2e9774ee 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java @@ -13,11 +13,10 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.elasticsearch.common.util.FeatureFlag; import java.io.IOException; -import static org.elasticsearch.index.codec.CodecService.TSDB_DOC_VALUES_OPTIMIZED_MERGE; - public class ES87TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesFormat { static final int NUMERIC_BLOCK_SHIFT = 7; @@ -77,16 +76,25 @@ public class ES87TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesF } } + // Default for escape hatch: + static final boolean OPTIMIZED_MERGE_ENABLE_DEFAULT; + static final FeatureFlag TSDB_DOC_VALUES_OPTIMIZED_MERGE = new FeatureFlag("tsdb_doc_values_optimized_merge"); + static final String OPTIMIZED_MERGE_ENABLED_NAME = ES87TSDBDocValuesConsumer.class.getName() + ".enableOptimizedMerge"; + + static { + boolean optimizedMergeDefault = TSDB_DOC_VALUES_OPTIMIZED_MERGE.isEnabled(); + OPTIMIZED_MERGE_ENABLE_DEFAULT = Boolean.parseBoolean( + System.getProperty(OPTIMIZED_MERGE_ENABLED_NAME, Boolean.toString(optimizedMergeDefault)) + ); + } + private final int skipIndexIntervalSize; + // TODO: remove escape hatch? 
Is useful now when testing/benchmarking, but current optimized merge logic currently do too scary things. private final boolean enableOptimizedMerge; /** Default constructor. */ public ES87TSDBDocValuesFormat() { - this(TSDB_DOC_VALUES_OPTIMIZED_MERGE.isEnabled()); - } - - public ES87TSDBDocValuesFormat(boolean enableOptimizedMerge) { - this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, enableOptimizedMerge); + this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE, OPTIMIZED_MERGE_ENABLE_DEFAULT); } /** Doc values fields format with specified skipIndexIntervalSize. */ diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java index 864be20dbb4b8..dddbdcc517f93 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java @@ -48,7 +48,7 @@ public void testDuel() throws IOException { baselineConfig.setMergePolicy(mergePolicy); baselineConfig.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat())); var contenderConf = newIndexWriterConfig(); - contenderConf.setCodec(TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(true))); + contenderConf.setCodec(TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat())); contenderConf.setMergePolicy(mergePolicy); try ( diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 753e9ef198bf9..eff3138aaea12 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -54,7 +54,7 @@ public class ES87TSDBDocValuesFormatTests extends BaseDocValuesFormatTestCase { private static final int NUM_DOCS = 10; - 
private final Codec codec = TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(true)); + private final Codec codec = TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat()); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java index 0696776fe42fd..8a4a5c59d1a9c 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatVariableSkipIntervalTests.java @@ -32,13 +32,13 @@ public class ES87TSDBDocValuesFormatVariableSkipIntervalTests extends BaseDocVal @Override protected Codec getCodec() { // small interval size to test with many intervals - return TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(random().nextInt(4, 16), true)); + return TestUtil.alwaysDocValuesFormat(new ES87TSDBDocValuesFormat(random().nextInt(4, 16), false)); } public void testSkipIndexIntervalSize() { IllegalArgumentException ex = expectThrows( IllegalArgumentException.class, - () -> new ES87TSDBDocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2), true) + () -> new ES87TSDBDocValuesFormat(random().nextInt(Integer.MIN_VALUE, 2), false) ); assertTrue(ex.getMessage().contains("skipIndexIntervalSize must be > 1")); } From 71201c8516c701ae952bc88f04f2113c59ebdc11 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Mar 2025 11:13:54 +0100 Subject: [PATCH 23/43] more tests --- .../tsdb/ES87TSDBDocValuesFormatTests.java | 106 ++++++++++++++++-- 1 file changed, 98 insertions(+), 8 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 
eff3138aaea12..0f5e16e005c9c 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.LogByteSizeMergePolicy; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; @@ -38,6 +39,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.cluster.metadata.DataStream; import java.io.IOException; import java.util.ArrayList; @@ -258,14 +260,7 @@ public void testForceMergeDenseCase() throws Exception { String hostnameField = "host.name"; long baseTimestamp = 1704067200000L; - var config = new IndexWriterConfig(); - config.setIndexSort( - new Sort( - new SortField(hostnameField, SortField.Type.STRING, false), - new SortedNumericSortField(timestampField, SortField.Type.LONG, true) - ) - ); - config.setCodec(getCodec()); + var config = getTimeSeriesIndexWriterConfig(hostnameField, timestampField); try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) { long counter1 = 0; long counter2 = 10_000_000; @@ -369,4 +364,99 @@ public void testForceMergeDenseCase() throws Exception { } } } + + public void testWithNoValueMultiValue() throws Exception { + String timestampField = "@timestamp"; + String hostnameField = "host.name"; + long baseTimestamp = 1704067200000L; + + var config = getTimeSeriesIndexWriterConfig(hostnameField, timestampField); + try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) { + int numRounds = 4 + random().nextInt(28); + int numDocsPerRound = 8 + random().nextInt(56); + long[] 
gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; + String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; + { + long timestamp = baseTimestamp; + for (int i = 0; i < numRounds; i++) { + int r = random().nextInt(10); + for (int j = 0; j < numDocsPerRound; j++) { + var d = new Document(); + String hostName = String.format(Locale.ROOT, "host-%03d", i); + d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName))); + // Index sorting doesn't work with NumericDocValuesField: + d.add(new SortedNumericDocValuesField(timestampField, timestamp++)); + + if (r % 10 == 5) { + // sometimes no values + } else if (r % 10 > 5) { + // often multiple values: + int numValues = 2 + random().nextInt(4); + for (int k = 0; k < numValues; k++) { + d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[(j + k) % gauge1Values.length])); + d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[(j + k) % tags.length]))); + } + } else { + // otherwise single value: + d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[j % gauge1Values.length])); + d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j % tags.length]))); + } + + iw.addDocument(d); + } + iw.commit(); + } + iw.forceMerge(1); + } + + int numDocs = numRounds * numDocsPerRound; + try (var reader = DirectoryReader.open(iw)) { + assertEquals(1, reader.leaves().size()); + assertEquals(numDocs, reader.maxDoc()); + var leaf = reader.leaves().get(0).reader(); + var hostNameDV = leaf.getSortedDocValues(hostnameField); + assertNotNull(hostNameDV); + var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField)); + assertNotNull(timestampDV); + var gaugeOneDV = leaf.getSortedNumericDocValues("gauge_1"); + assertNotNull(gaugeOneDV); + for (int i = 0; i < numDocs; i++) { + assertEquals(i, hostNameDV.nextDoc()); + int round = i / numDocsPerRound; + String expectedHostName = String.format(Locale.ROOT, "host-%03d", 
round); + String actualHostName = hostNameDV.lookupOrd(hostNameDV.ordValue()).utf8ToString(); + assertEquals(expectedHostName, actualHostName); + + assertEquals(i, timestampDV.nextDoc()); + long timestamp = timestampDV.longValue(); + long lowerBound = baseTimestamp; + long upperBound = baseTimestamp + numDocs; + assertTrue( + "unexpected timestamp [" + timestamp + "], expected between [" + lowerBound + "] and [" + upperBound + "]", + timestamp >= lowerBound && timestamp < upperBound + ); + if (gaugeOneDV.advanceExact(i)) { + for (int j = 0; j < gaugeOneDV.docValueCount(); j++) { + long value = gaugeOneDV.nextValue(); + assertTrue("unexpected gauge [" + value + "]", Arrays.binarySearch(gauge1Values, value) >= 0); + } + } + } + } + } + } + + private IndexWriterConfig getTimeSeriesIndexWriterConfig(String hostnameField, String timestampField) { + var config = new IndexWriterConfig(); + config.setIndexSort( + new Sort( + new SortField(hostnameField, SortField.Type.STRING, false), + new SortedNumericSortField(timestampField, SortField.Type.LONG, true) + ) + ); + config.setLeafSorter(DataStream.TIMESERIES_LEAF_READERS_SORTER); + config.setMergePolicy(new LogByteSizeMergePolicy()); + config.setCodec(getCodec()); + return config; + } } From e6fe87a0284bc9edfea1aea8e1e85902a2f72c33 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Mar 2025 11:17:15 +0100 Subject: [PATCH 24/43] iter --- .../index/codec/tsdb/ES87TSDBDocValuesFormatTests.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 0f5e16e005c9c..24b4cdba06162 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -420,6 +420,8 @@ public void 
testWithNoValueMultiValue() throws Exception { assertNotNull(timestampDV); var gaugeOneDV = leaf.getSortedNumericDocValues("gauge_1"); assertNotNull(gaugeOneDV); + var tagsDV = leaf.getSortedSetDocValues("tags"); + assertNotNull(tagsDV); for (int i = 0; i < numDocs; i++) { assertEquals(i, hostNameDV.nextDoc()); int round = i / numDocsPerRound; @@ -441,6 +443,13 @@ public void testWithNoValueMultiValue() throws Exception { assertTrue("unexpected gauge [" + value + "]", Arrays.binarySearch(gauge1Values, value) >= 0); } } + if (tagsDV.advanceExact(i)) { + for (int j = 0; j < tagsDV.docValueCount(); j++) { + long ordinal = tagsDV.nextOrd(); + String actualTag = tagsDV.lookupOrd(ordinal).utf8ToString(); + assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); + } + } } } } From 020bae786e596e6890fecababee04b423084b65f Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Mar 2025 11:38:31 +0100 Subject: [PATCH 25/43] remove unused field --- .../index/codec/tsdb/DocValuesConsumerUtil.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index db7ba2ec8b4e6..d78fc414d4415 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -235,7 +235,6 @@ public long longValue() throws IOException { static class NumericDocValuesSub extends DocIDMerger.Sub { final NumericDocValues values; - int docID = -1; NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { super(docMap); @@ -245,7 +244,7 @@ static class NumericDocValuesSub extends DocIDMerger.Sub { @Override public int nextDoc() throws IOException { - return docID = values.nextDoc(); + return values.nextDoc(); } } @@ -334,7 +333,6 @@ public int 
docValueCount() { static class SortedNumericDocValuesSub extends DocIDMerger.Sub { final SortedNumericDocValues values; - int docID = -1; SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { super(docMap); @@ -344,7 +342,7 @@ static class SortedNumericDocValuesSub extends DocIDMerger.Sub { @Override public int nextDoc() throws IOException { - return docID = values.nextDoc(); + return values.nextDoc(); } } From d8b3c1589db45345a11935922b9296aa466c8ad1 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 27 Mar 2025 12:17:11 +0100 Subject: [PATCH 26/43] fixed bug --- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 6 +++++ .../tsdb/ES87TSDBDocValuesFormatTests.java | 23 +++++++++---------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 7d033d96d7c96..817afe0f11a7f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -176,7 +176,13 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon final TSDBDocValuesEncoder encoder = new TSDBDocValuesEncoder(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE); values = valuesProducer.getSortedNumeric(field); final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1; + + // Reset and recompute. The value gathered from TsdbDocValuesProducer may not be accurate if one of the leaves was singleton + // This could cause failures when writing addresses in writeSortedNumericField(...) 
+ numDocsWithValue = 0; + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + numDocsWithValue++; final int count = values.docValueCount(); for (int i = 0; i < count; ++i) { buffer[bufferSize++] = values.nextValue(); diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index 24b4cdba06162..89b6b214f346a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -369,11 +369,11 @@ public void testWithNoValueMultiValue() throws Exception { String timestampField = "@timestamp"; String hostnameField = "host.name"; long baseTimestamp = 1704067200000L; + int numRounds = 32 + random().nextInt(32); + int numDocsPerRound = 64 + random().nextInt(64); var config = getTimeSeriesIndexWriterConfig(hostnameField, timestampField); try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) { - int numRounds = 4 + random().nextInt(28); - int numDocsPerRound = 8 + random().nextInt(56); long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; { @@ -382,7 +382,9 @@ public void testWithNoValueMultiValue() throws Exception { int r = random().nextInt(10); for (int j = 0; j < numDocsPerRound; j++) { var d = new Document(); - String hostName = String.format(Locale.ROOT, "host-%03d", i); + // host in reverse, otherwise merging will detect that segments are already ordered and will use sequential docid + // merger: + String hostName = String.format(Locale.ROOT, "host-%03d", numRounds - i); d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName))); // Index sorting doesn't work with NumericDocValuesField: d.add(new 
SortedNumericDocValuesField(timestampField, timestamp++)); @@ -390,18 +392,17 @@ public void testWithNoValueMultiValue() throws Exception { if (r % 10 == 5) { // sometimes no values } else if (r % 10 > 5) { - // often multiple values: + // often single value: + d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[j % gauge1Values.length])); + d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j % tags.length]))); + } else { + // otherwise multiple values: int numValues = 2 + random().nextInt(4); for (int k = 0; k < numValues; k++) { d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[(j + k) % gauge1Values.length])); d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[(j + k) % tags.length]))); } - } else { - // otherwise single value: - d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[j % gauge1Values.length])); - d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j % tags.length]))); } - iw.addDocument(d); } iw.commit(); @@ -424,10 +425,8 @@ public void testWithNoValueMultiValue() throws Exception { assertNotNull(tagsDV); for (int i = 0; i < numDocs; i++) { assertEquals(i, hostNameDV.nextDoc()); - int round = i / numDocsPerRound; - String expectedHostName = String.format(Locale.ROOT, "host-%03d", round); String actualHostName = hostNameDV.lookupOrd(hostNameDV.ordValue()).utf8ToString(); - assertEquals(expectedHostName, actualHostName); + assertTrue("unexpected host name:" + actualHostName, actualHostName.startsWith("host-")); assertEquals(i, timestampDV.nextDoc()); long timestamp = timestampDV.longValue(); From 9f96da197ca5d8a8c35edf673b074b51dc57c797 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 29 Mar 2025 20:53:54 +0100 Subject: [PATCH 27/43] Make it really work: * Detect when MergeStats#sumNumDocsWithField() cannot be used. * Use `TsdbDocValuesProducer` instead of EmptyDocValuesProducer where possible in ES87TSDBDocValuesConsumer * Fix TsdbDocValuesProducer#isSingleValued(...) 
--- .../codec/tsdb/DocValuesConsumerUtil.java | 23 +++---- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 38 +++++------ .../codec/tsdb/TsdbDocValuesProducer.java | 66 +++++++++++++++++++ 3 files changed, 93 insertions(+), 34 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/codec/tsdb/TsdbDocValuesProducer.java diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index d78fc414d4415..6d80d1a8d862b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -14,7 +14,6 @@ import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesType; -import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.MergeState; @@ -40,19 +39,9 @@ */ class DocValuesConsumerUtil { - static final MergeStats UNSUPPORTED = new MergeStats(false, -1, -1); + static final MergeStats UNSUPPORTED = new MergeStats(false, -1, -1, false); - abstract static class TsdbDocValuesProducer extends EmptyDocValuesProducer { - - final MergeStats mergeStats; - - TsdbDocValuesProducer(MergeStats mergeStats) { - this.mergeStats = mergeStats; - } - - } - - record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} + record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField, boolean sumNumDocsWithFieldAccurate) {} static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, MergeState mergeState, FieldInfo fieldInfo) throws IOException { @@ -69,6 +58,8 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me long sumNumValues = 0; int sumNumDocsWithField = 0; + // TODO: 
move numDocsWithField field from SortedNumericEntry to NumericEntry and always store it. + boolean sumNumDocsWithFieldAccurate = true; // TODO bring back codec version check? (per field doc values producer sits between ES87TSDBDocValuesConsumer) for (int i = 0; i < mergeState.docValuesProducers.length; i++) { @@ -81,6 +72,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me sumNumValues += entry.numValues; int numDocsWithField = getNumDocsWithField(entry, mergeState.maxDocs[i]); sumNumDocsWithField += numDocsWithField; + sumNumDocsWithFieldAccurate = false; } else if (numeric != null) { return UNSUPPORTED; } @@ -99,6 +91,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me // In this case the numDocsWithField doesn't get recorded in meta: int numDocsWithField = getNumDocsWithField(entry, mergeState.maxDocs[i]); sumNumDocsWithField += numDocsWithField; + sumNumDocsWithFieldAccurate = false; } else if (sortedNumeric != null) { return UNSUPPORTED; } @@ -112,6 +105,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me // In this case the numDocsWithField doesn't get recorded in meta:v int numDocsWithField = getNumDocsWithField(entry.ordsEntry, mergeState.maxDocs[i]); sumNumDocsWithField += numDocsWithField; + sumNumDocsWithFieldAccurate = false; } else if (sorted != null) { return UNSUPPORTED; } @@ -130,6 +124,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me // In this case the numDocsWithField doesn't get recorded in meta: int numDocsWithField = getNumDocsWithField(entry.ordsEntry, mergeState.maxDocs[i]); sumNumDocsWithField += numDocsWithField; + sumNumDocsWithFieldAccurate = false; } else if (sortedSet != null) { return UNSUPPORTED; } @@ -139,7 +134,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me } } - return new MergeStats(true, sumNumValues, sumNumDocsWithField); + return new MergeStats(true, 
sumNumValues, sumNumDocsWithField, sumNumDocsWithFieldAccurate); } private static int getNumDocsWithField(ES87TSDBDocValuesProducer.NumericEntry entry, int maxDoc) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 817afe0f11a7f..2c70944892490 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -16,7 +16,6 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipIndexType; -import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.MergeState; @@ -106,7 +105,7 @@ final class ES87TSDBDocValuesConsumer extends DocValuesConsumer { public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { meta.writeInt(field.number); meta.writeByte(ES87TSDBDocValuesFormat.NUMERIC); - DocValuesProducer producer = new EmptyDocValuesProducer() { + var producer = new TsdbDocValuesProducer(valuesProducer) { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { return DocValues.singleton(valuesProducer.getNumeric(field)); @@ -119,14 +118,15 @@ public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOExcepti writeField(field, producer, -1); } - private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, long maxOrd) throws IOException { + private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer, long maxOrd) throws IOException { int numDocsWithValue = 0; long numValues = 0; SortedNumericDocValues values; - if (valuesProducer instanceof DocValuesConsumerUtil.TsdbDocValuesProducer 
tsdbDocValuesProducer) { - numDocsWithValue = tsdbDocValuesProducer.mergeStats.sumNumDocsWithField(); - numValues = tsdbDocValuesProducer.mergeStats.sumNumValues(); + boolean computeEvenIfSupported = valuesProducer.mergeStats.sumNumDocsWithFieldAccurate() == false && maxOrd == 1; + if (valuesProducer.mergeStats.supported() && computeEvenIfSupported == false) { + numDocsWithValue = valuesProducer.mergeStats.sumNumDocsWithField(); + numValues = valuesProducer.mergeStats.sumNumValues(); } else { values = valuesProducer.getSortedNumeric(field); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { @@ -322,7 +322,7 @@ public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) th } private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer, boolean addTypeByte) throws IOException { - DocValuesProducer producer = new EmptyDocValuesProducer() { + var producer = new TsdbDocValuesProducer(valuesProducer) { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { SortedDocValues sorted = valuesProducer.getSorted(field); @@ -517,10 +517,10 @@ private void writeTermsIndex(SortedSetDocValues values) throws IOException { public void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { meta.writeInt(field.number); meta.writeByte(ES87TSDBDocValuesFormat.SORTED_NUMERIC); - writeSortedNumericField(field, valuesProducer, -1); + writeSortedNumericField(field, new TsdbDocValuesProducer(valuesProducer), -1); } - private void writeSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer, long maxOrd) throws IOException { + private void writeSortedNumericField(FieldInfo field, TsdbDocValuesProducer valuesProducer, long maxOrd) throws IOException { if (field.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { writeSkipIndex(field, valuesProducer); } @@ -566,16 +566,14 @@ public void 
mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeSt } } - private static boolean isSingleValued(SortedSetDocValues values) throws IOException { - if (DocValues.unwrapSingleton(values) != null) { - return true; + private static boolean isSingleValued(FieldInfo field, TsdbDocValuesProducer producer) throws IOException { + if (producer.mergeStats.supported() && producer.mergeStats.sumNumDocsWithFieldAccurate()) { + return producer.mergeStats.sumNumValues() == producer.mergeStats.sumNumDocsWithField(); } - if (values instanceof ES87TSDBDocValuesProducer.BaseSortedSetDocValues baseSortedSet) { - var entry = baseSortedSet.entry; - if (entry.ordsEntry.numValues == entry.ordsEntry.numDocsWithField) { - return true; - } + var values = producer.getSortedSet(field); + if (DocValues.unwrapSingleton(values) != null) { + return true; } assert values.docID() == -1; @@ -604,8 +602,8 @@ public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) meta.writeInt(field.number); meta.writeByte(SORTED_SET); - if (isSingleValued(valuesProducer.getSortedSet(field))) { - doAddSortedField(field, new EmptyDocValuesProducer() { + if (isSingleValued(field, new TsdbDocValuesProducer(valuesProducer))) { + doAddSortedField(field, new TsdbDocValuesProducer(valuesProducer) { @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { return SortedSetSelector.wrap(valuesProducer.getSortedSet(field), SortedSetSelector.Type.MIN); @@ -616,7 +614,7 @@ public SortedDocValues getSorted(FieldInfo field) throws IOException { SortedSetDocValues values = valuesProducer.getSortedSet(field); long maxOrd = values.getValueCount(); - writeSortedNumericField(field, new EmptyDocValuesProducer() { + writeSortedNumericField(field, new TsdbDocValuesProducer(valuesProducer) { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { SortedSetDocValues values = valuesProducer.getSortedSet(field); diff --git 
a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TsdbDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TsdbDocValuesProducer.java new file mode 100644 index 0000000000000..8838793d06cf9 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TsdbDocValuesProducer.java @@ -0,0 +1,66 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.tsdb; + +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.EmptyDocValuesProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; + +import java.io.IOException; + +class TsdbDocValuesProducer extends EmptyDocValuesProducer { + + final DocValuesConsumerUtil.MergeStats mergeStats; + final DocValuesProducer actual; + + TsdbDocValuesProducer(DocValuesConsumerUtil.MergeStats mergeStats) { + this.mergeStats = mergeStats; + this.actual = null; + } + + TsdbDocValuesProducer(DocValuesProducer valuesProducer) { + if (valuesProducer instanceof TsdbDocValuesProducer tsdb) { + mergeStats = tsdb.mergeStats; + } else { + mergeStats = DocValuesConsumerUtil.UNSUPPORTED; + } + this.actual = valuesProducer; + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + if (actual != null) { + return actual.getSorted(field); + } else { + return super.getSorted(field); + } + } + + @Override + public SortedSetDocValues 
getSortedSet(FieldInfo field) throws IOException { + if (actual != null) { + return actual.getSortedSet(field); + } else { + return super.getSortedSet(field); + } + } + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + if (actual != null) { + return actual.getSortedNumeric(field); + } else { + return super.getSortedNumeric(field); + } + } +} From 8edfc39e8779fdf7e6cc69a1bc75483e52bfe839 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Mon, 31 Mar 2025 09:21:50 +0200 Subject: [PATCH 28/43] Store numDocsWithField statistic on NumericEntry instead of SortedNumericEntry. --- .../codec/tsdb/DocValuesConsumerUtil.java | 39 +++------------- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 12 ++--- .../codec/tsdb/ES87TSDBDocValuesFormat.java | 4 +- .../codec/tsdb/ES87TSDBDocValuesProducer.java | 45 ++++++++++--------- 4 files changed, 38 insertions(+), 62 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java index 6d80d1a8d862b..e3bb69361bfd6 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesConsumerUtil.java @@ -39,9 +39,9 @@ */ class DocValuesConsumerUtil { - static final MergeStats UNSUPPORTED = new MergeStats(false, -1, -1, false); + static final MergeStats UNSUPPORTED = new MergeStats(false, -1, -1); - record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField, boolean sumNumDocsWithFieldAccurate) {} + record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, MergeState mergeState, FieldInfo fieldInfo) throws IOException { @@ -58,8 +58,6 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me long 
sumNumValues = 0; int sumNumDocsWithField = 0; - // TODO: move numDocsWithField field from SortedNumericEntry to NumericEntry and always store it. - boolean sumNumDocsWithFieldAccurate = true; // TODO bring back codec version check? (per field doc values producer sits between ES87TSDBDocValuesConsumer) for (int i = 0; i < mergeState.docValuesProducers.length; i++) { @@ -70,9 +68,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me if (numeric instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumeric) { var entry = baseNumeric.entry; sumNumValues += entry.numValues; - int numDocsWithField = getNumDocsWithField(entry, mergeState.maxDocs[i]); - sumNumDocsWithField += numDocsWithField; - sumNumDocsWithFieldAccurate = false; + sumNumDocsWithField += entry.numDocsWithField; } else if (numeric != null) { return UNSUPPORTED; } @@ -88,10 +84,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me if (singleton instanceof ES87TSDBDocValuesProducer.BaseNumericDocValues baseNumeric) { var entry = baseNumeric.entry; sumNumValues += entry.numValues; - // In this case the numDocsWithField doesn't get recorded in meta: - int numDocsWithField = getNumDocsWithField(entry, mergeState.maxDocs[i]); - sumNumDocsWithField += numDocsWithField; - sumNumDocsWithFieldAccurate = false; + sumNumDocsWithField += entry.numDocsWithField; } else if (sortedNumeric != null) { return UNSUPPORTED; } @@ -102,10 +95,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me if (sorted instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSortedDocValues) { var entry = baseSortedDocValues.entry; sumNumValues += entry.ordsEntry.numValues; - // In this case the numDocsWithField doesn't get recorded in meta:v - int numDocsWithField = getNumDocsWithField(entry.ordsEntry, mergeState.maxDocs[i]); - sumNumDocsWithField += numDocsWithField; - sumNumDocsWithFieldAccurate = false; + sumNumDocsWithField += 
entry.ordsEntry.numDocsWithField; } else if (sorted != null) { return UNSUPPORTED; } @@ -121,10 +111,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me if (singleton instanceof ES87TSDBDocValuesProducer.BaseSortedDocValues baseSorted) { var entry = baseSorted.entry; sumNumValues += entry.ordsEntry.numValues; - // In this case the numDocsWithField doesn't get recorded in meta: - int numDocsWithField = getNumDocsWithField(entry.ordsEntry, mergeState.maxDocs[i]); - sumNumDocsWithField += numDocsWithField; - sumNumDocsWithFieldAccurate = false; + sumNumDocsWithField += entry.ordsEntry.numDocsWithField; } else if (sortedSet != null) { return UNSUPPORTED; } @@ -134,19 +121,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me } } - return new MergeStats(true, sumNumValues, sumNumDocsWithField, sumNumDocsWithFieldAccurate); - } - - private static int getNumDocsWithField(ES87TSDBDocValuesProducer.NumericEntry entry, int maxDoc) { - // In this case the numDocsWithField doesn't get recorded in meta: - if (entry.docsWithFieldOffset == -2) { - return 0; - } else if (entry.docsWithFieldOffset == -1) { - return maxDoc; - } else { - // numDocsWithField doesn't matter in this case: - return 1; - } + return new MergeStats(true, sumNumValues, sumNumDocsWithField); } static DocValuesProducer mergeNumericProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 2c70944892490..c6296d96d81e9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -123,8 +123,7 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer, long numValues = 0; 
SortedNumericDocValues values; - boolean computeEvenIfSupported = valuesProducer.mergeStats.sumNumDocsWithFieldAccurate() == false && maxOrd == 1; - if (valuesProducer.mergeStats.supported() && computeEvenIfSupported == false) { + if (valuesProducer.mergeStats.supported()) { numDocsWithValue = valuesProducer.mergeStats.sumNumDocsWithField(); numValues = valuesProducer.mergeStats.sumNumValues(); } else { @@ -156,6 +155,7 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer, meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); } meta.writeLong(numValues); + meta.writeInt(numDocsWithValue); if (numValues > 0) { // Special case for maxOrd of 1, signal -1 that no blocks will be written @@ -177,12 +177,7 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer, values = valuesProducer.getSortedNumeric(field); final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1; - // Reset and recompute. The value gathered from TsdbDocValuesProducer may not be accurate if one of the leaves was singleton - // This could cause failures when writing addresses in writeSortedNumericField(...) 
- numDocsWithValue = 0; - for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { - numDocsWithValue++; final int count = values.docValueCount(); for (int i = 0; i < count; ++i) { buffer[bufferSize++] = values.nextValue(); @@ -532,7 +527,6 @@ private void writeSortedNumericField(FieldInfo field, TsdbDocValuesProducer valu long numValues = stats[1]; assert numValues >= numDocsWithField; - meta.writeInt(numDocsWithField); if (numValues > numDocsWithField) { long start = data.getFilePointer(); meta.writeLong(start); @@ -567,7 +561,7 @@ public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeSt } private static boolean isSingleValued(FieldInfo field, TsdbDocValuesProducer producer) throws IOException { - if (producer.mergeStats.supported() && producer.mergeStats.sumNumDocsWithFieldAccurate()) { + if (producer.mergeStats.supported()) { return producer.mergeStats.sumNumValues() == producer.mergeStats.sumNumDocsWithField(); } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java index ca8fc2e9774ee..298a567001a64 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormat.java @@ -29,7 +29,9 @@ public class ES87TSDBDocValuesFormat extends org.apache.lucene.codecs.DocValuesF static final String META_CODEC = "ES87TSDBDocValuesMetadata"; static final String META_EXTENSION = "dvm"; static final int VERSION_START = 0; - static final int VERSION_CURRENT = VERSION_START; + // Move numDocsWithField from SortedNumericEntry to NumericEntry + static final int VERSION_META_MOVE_META_ENTRY = 1; + static final int VERSION_CURRENT = VERSION_META_MOVE_META_ENTRY; static final byte NUMERIC = 0; static final byte BINARY = 1; static final byte SORTED = 2; diff --git 
a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java index 31966326cce5f..6b36899de3db7 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesProducer.java @@ -88,7 +88,7 @@ public class ES87TSDBDocValuesProducer extends DocValuesProducer { state.segmentSuffix ); - readFields(in, state.fieldInfos); + readFields(in, state.fieldInfos, version); } catch (Throwable exception) { priorE = exception; @@ -860,7 +860,7 @@ public void close() throws IOException { data.close(); } - private void readFields(IndexInput meta, FieldInfos infos) throws IOException { + private void readFields(IndexInput meta, FieldInfos infos, int version) throws IOException { for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { FieldInfo info = infos.fieldInfo(fieldNumber); if (info == null) { @@ -871,24 +871,24 @@ private void readFields(IndexInput meta, FieldInfos infos) throws IOException { skippers.put(info.number, readDocValueSkipperMeta(meta)); } if (type == ES87TSDBDocValuesFormat.NUMERIC) { - numerics.put(info.number, readNumeric(meta)); + numerics.put(info.number, readNumeric(meta, version)); } else if (type == ES87TSDBDocValuesFormat.BINARY) { binaries.put(info.number, readBinary(meta)); } else if (type == ES87TSDBDocValuesFormat.SORTED) { - sorted.put(info.number, readSorted(meta)); + sorted.put(info.number, readSorted(meta, version)); } else if (type == ES87TSDBDocValuesFormat.SORTED_SET) { - sortedSets.put(info.number, readSortedSet(meta)); + sortedSets.put(info.number, readSortedSet(meta, version)); } else if (type == ES87TSDBDocValuesFormat.SORTED_NUMERIC) { - sortedNumerics.put(info.number, readSortedNumeric(meta)); + sortedNumerics.put(info.number, readSortedNumeric(meta, version)); } else { throw 
new CorruptIndexException("invalid type: " + type, meta); } } } - private static NumericEntry readNumeric(IndexInput meta) throws IOException { + private static NumericEntry readNumeric(IndexInput meta, int version) throws IOException { NumericEntry entry = new NumericEntry(); - readNumeric(meta, entry); + readNumeric(meta, entry, version); return entry; } @@ -903,12 +903,15 @@ private static DocValuesSkipperEntry readDocValueSkipperMeta(IndexInput meta) th return new DocValuesSkipperEntry(offset, length, minValue, maxValue, docCount, maxDocID); } - private static void readNumeric(IndexInput meta, NumericEntry entry) throws IOException { + private static void readNumeric(IndexInput meta, NumericEntry entry, int version) throws IOException { entry.docsWithFieldOffset = meta.readLong(); entry.docsWithFieldLength = meta.readLong(); entry.jumpTableEntryCount = meta.readShort(); entry.denseRankPower = meta.readByte(); entry.numValues = meta.readLong(); + if (version >= ES87TSDBDocValuesFormat.VERSION_META_MOVE_META_ENTRY) { + entry.numDocsWithField = meta.readInt(); + } if (entry.numValues > 0) { final int indexBlockShift = meta.readInt(); // Special case, -1 means there are no blocks, so no need to load the metadata for it @@ -951,15 +954,17 @@ private BinaryEntry readBinary(IndexInput meta) throws IOException { return entry; } - private static SortedNumericEntry readSortedNumeric(IndexInput meta) throws IOException { + private static SortedNumericEntry readSortedNumeric(IndexInput meta, int version) throws IOException { SortedNumericEntry entry = new SortedNumericEntry(); - readSortedNumeric(meta, entry); + readSortedNumeric(meta, entry, version); return entry; } - private static SortedNumericEntry readSortedNumeric(IndexInput meta, SortedNumericEntry entry) throws IOException { - readNumeric(meta, entry); - entry.numDocsWithField = meta.readInt(); + private static SortedNumericEntry readSortedNumeric(IndexInput meta, SortedNumericEntry entry, int version) throws 
IOException { + readNumeric(meta, entry, version); + if (version < ES87TSDBDocValuesFormat.VERSION_META_MOVE_META_ENTRY) { + entry.numDocsWithField = meta.readInt(); + } if (entry.numDocsWithField != entry.numValues) { entry.addressesOffset = meta.readLong(); final int blockShift = meta.readVInt(); @@ -969,21 +974,21 @@ private static SortedNumericEntry readSortedNumeric(IndexInput meta, SortedNumer return entry; } - private SortedEntry readSorted(IndexInput meta) throws IOException { + private SortedEntry readSorted(IndexInput meta, int version) throws IOException { SortedEntry entry = new SortedEntry(); entry.ordsEntry = new NumericEntry(); - readNumeric(meta, entry.ordsEntry); + readNumeric(meta, entry.ordsEntry, version); entry.termsDictEntry = new TermsDictEntry(); readTermDict(meta, entry.termsDictEntry); return entry; } - private SortedSetEntry readSortedSet(IndexInput meta) throws IOException { + private SortedSetEntry readSortedSet(IndexInput meta, int version) throws IOException { SortedSetEntry entry = new SortedSetEntry(); byte multiValued = meta.readByte(); switch (multiValued) { case 0: // singlevalued - entry.singleValueEntry = readSorted(meta); + entry.singleValueEntry = readSorted(meta, version); return entry; case 1: // multivalued break; @@ -991,7 +996,7 @@ private SortedSetEntry readSortedSet(IndexInput meta) throws IOException { throw new CorruptIndexException("Invalid multiValued flag: " + multiValued, meta); } entry.ordsEntry = new SortedNumericEntry(); - readSortedNumeric(meta, entry.ordsEntry); + readSortedNumeric(meta, entry.ordsEntry, version); entry.termsDictEntry = new TermsDictEntry(); readTermDict(meta, entry.termsDictEntry); return entry; @@ -1481,6 +1486,7 @@ static class NumericEntry { short jumpTableEntryCount; byte denseRankPower; long numValues; + int numDocsWithField; long indexOffset; long indexLength; DirectMonotonicReader.Meta indexMeta; @@ -1504,7 +1510,6 @@ private static class BinaryEntry { } static class 
SortedNumericEntry extends NumericEntry { - int numDocsWithField; DirectMonotonicReader.Meta addressesMeta; long addressesOffset; long addressesLength; From 638ae1344ae2a04485b6c85930f1b360d73a14e0 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Wed, 2 Apr 2025 14:54:37 +0200 Subject: [PATCH 29/43] iter --- .../tsdb/TSDBDocValuesMergeBenchmark.java | 3 +- .../codec/tsdb/ES87TSDBDocValuesConsumer.java | 1 - .../es819/ES819TSDBDocValuesFormatTests.java | 137 ++++++++++++++++++ 3 files changed, 139 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index 5dc7ed47d7b80..38a3fda8ae46b 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -25,6 +25,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat; +import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -107,7 +108,7 @@ private IndexWriter createIndex(final Directory directory, final boolean optimiz new SortedNumericSortField(TIMESTAMP_FIELD, SortField.Type.LONG, true) ) ); - ES87TSDBDocValuesFormat docValuesFormat = new ES87TSDBDocValuesFormat(4096, optimizedMergeEnabled); + ES819TSDBDocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(4096, optimizedMergeEnabled); config.setCodec(new Lucene101Codec() { @Override diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java 
b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java index 05fa201a31b4f..dc73428a07c7c 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesConsumer.java @@ -165,7 +165,6 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon final TSDBDocValuesEncoder encoder = new TSDBDocValuesEncoder(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE); values = valuesProducer.getSortedNumeric(field); final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1; - for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { final int count = values.docValueCount(); for (int i = 0; i < count; ++i) { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java index cf0a398310757..44242810969b9 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java @@ -150,6 +150,143 @@ public void testForceMergeDenseCase() throws Exception { } } + public void testForceMergeSparseCase() throws Exception { + String timestampField = "@timestamp"; + String hostnameField = "host.name"; + long baseTimestamp = 1704067200000L; + + var config = getTimeSeriesIndexWriterConfig(hostnameField, timestampField); + try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) { + long counter1 = 0; + long counter2 = 10_000_000; + long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; + long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; + String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; 
+ + int numDocs = 256 + random().nextInt(1024); + int numHosts = numDocs / 20; + for (int i = 0; i < numDocs; i++) { + var d = new Document(); + + int batchIndex = i / numHosts; + String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex); + long timestamp = baseTimestamp + (1000L * i); + + d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName))); + // Index sorting doesn't work with NumericDocValuesField: + d.add(new SortedNumericDocValuesField(timestampField, timestamp)); + + if (random().nextBoolean()) { + d.add(new NumericDocValuesField("counter_1", counter1++)); + } + if (random().nextBoolean()) { + d.add(new SortedNumericDocValuesField("counter_2", counter2++)); + } + if (random().nextBoolean()) { + d.add(new SortedNumericDocValuesField("gauge_1", gauge1Values[i % gauge1Values.length])); + } + if (random().nextBoolean()) { + d.add(new SortedNumericDocValuesField("gauge_2", gauge2Values[i % gauge1Values.length])); + } + if (random().nextBoolean()) { + int numTags = 1 + random().nextInt(8); + for (int j = 0; j < numTags; j++) { + d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j]))); + } + } + if (random().nextBoolean()) { + int randomIndex = random().nextInt(tags.length); + d.add(new SortedDocValuesField("other_tag", new BytesRef(tags[randomIndex]))); + } + + iw.addDocument(d); + if (i % 100 == 0) { + iw.commit(); + } + } + iw.commit(); + + iw.forceMerge(1); + + // For asserting using binary search later on: + Arrays.sort(gauge2Values); + + try (var reader = DirectoryReader.open(iw)) { + assertEquals(1, reader.leaves().size()); + assertEquals(numDocs, reader.maxDoc()); + var leaf = reader.leaves().get(0).reader(); + var hostNameDV = leaf.getSortedDocValues(hostnameField); + assertNotNull(hostNameDV); + var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField)); + assertNotNull(timestampDV); + var counterOneDV = leaf.getNumericDocValues("counter_1"); + assertNotNull(counterOneDV); + var 
counterTwoDV = leaf.getSortedNumericDocValues("counter_2"); + assertNotNull(counterTwoDV); + var gaugeOneDV = leaf.getSortedNumericDocValues("gauge_1"); + assertNotNull(gaugeOneDV); + var gaugeTwoDV = leaf.getSortedNumericDocValues("gauge_2"); + assertNotNull(gaugeTwoDV); + var tagsDV = leaf.getSortedSetDocValues("tags"); + assertNotNull(tagsDV); + var otherTagDV = leaf.getSortedDocValues("other_tag"); + assertNotNull(otherTagDV); + for (int i = 0; i < numDocs; i++) { + assertEquals(i, hostNameDV.nextDoc()); + int batchIndex = i / numHosts; + assertEquals(batchIndex, hostNameDV.ordValue()); + String expectedHostName = String.format(Locale.ROOT, "host-%03d", batchIndex); + assertEquals(expectedHostName, hostNameDV.lookupOrd(hostNameDV.ordValue()).utf8ToString()); + + assertEquals(i, timestampDV.nextDoc()); + long timestamp = timestampDV.longValue(); + long lowerBound = baseTimestamp; + long upperBound = baseTimestamp + (1000L * numDocs); + assertTrue( + "unexpected timestamp [" + timestamp + "], expected between [" + lowerBound + "] and [" + upperBound + "]", + timestamp >= lowerBound && timestamp < upperBound + ); + + if (counterOneDV.advanceExact(i)) { + long counterOneValue = counterOneDV.longValue(); + assertTrue("unexpected counter [" + counterOneValue + "]", counterOneValue >= 0 && counterOneValue < counter1); + } + + if (counterTwoDV.advanceExact(i)) { + assertEquals(1, counterTwoDV.docValueCount()); + long counterTwoValue = counterTwoDV.nextValue(); + assertTrue("unexpected counter [" + counterTwoValue + "]", counterTwoValue > 0 && counterTwoValue <= counter2); + } + + if (gaugeOneDV.advanceExact(i)) { + assertEquals(1, gaugeOneDV.docValueCount()); + long gaugeOneValue = gaugeOneDV.nextValue(); + assertTrue("unexpected gauge [" + gaugeOneValue + "]", Arrays.binarySearch(gauge1Values, gaugeOneValue) >= 0); + } + + if (gaugeTwoDV.advanceExact(i)) { + assertEquals(1, gaugeTwoDV.docValueCount()); + long gaugeTwoValue = gaugeTwoDV.nextValue(); + 
assertTrue("unexpected gauge [" + gaugeTwoValue + "]", Arrays.binarySearch(gauge2Values, gaugeTwoValue) >= 0); + } + + if (tagsDV.advanceExact(i)) { + for (int j = 0; j < tagsDV.docValueCount(); j++) { + long ordinal = tagsDV.nextOrd(); + String actualTag = tagsDV.lookupOrd(ordinal).utf8ToString(); + assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); + } + } + if (otherTagDV.advanceExact(i)) { + int ordinal = otherTagDV.ordValue(); + String actualTag = otherTagDV.lookupOrd(ordinal).utf8ToString(); + assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); + } + } + } + } + } + public void testWithNoValueMultiValue() throws Exception { String timestampField = "@timestamp"; String hostnameField = "host.name"; From 7f4773b7422c2e82c0877c6078ffa4eed70e9781 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Wed, 2 Apr 2025 15:02:54 +0200 Subject: [PATCH 30/43] removed unused import --- .../benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index 38a3fda8ae46b..a7d65656c5559 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -24,7 +24,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; From 09ee20a82d56183d55c68c92af3022a381f12317 Mon Sep 17 
00:00:00 2001 From: Martijn van Groningen Date: Wed, 2 Apr 2025 15:41:51 +0200 Subject: [PATCH 31/43] cleanup --- .../tsdb/TSDBDocValuesMergeBenchmark.java | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index a7d65656c5559..6c83c88108e04 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -18,12 +18,14 @@ import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LogByteSizeMergePolicy; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -98,24 +100,7 @@ public void setup() throws IOException { } private IndexWriter createIndex(final Directory directory, final boolean optimizedMergeEnabled) throws IOException { - - final IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); - // NOTE: index sort config matching LogsDB's sort order - config.setIndexSort( - new Sort( - new SortField(HOSTNAME_FIELD, SortField.Type.STRING, false), - new SortedNumericSortField(TIMESTAMP_FIELD, SortField.Type.LONG, true) - ) - ); - ES819TSDBDocValuesFormat docValuesFormat = new 
ES819TSDBDocValuesFormat(4096, optimizedMergeEnabled); - config.setCodec(new Lucene101Codec() { - - @Override - public DocValuesFormat getDocValuesFormatForField(String field) { - return docValuesFormat; - } - }); - + final var iwc = createIndexWriterConfig(optimizedMergeEnabled); long counter1 = 0; long counter2 = 10_000_000; long[] gauge1Values = new long[] { 2, 4, 6, 8, 10, 12, 14, 16 }; @@ -124,7 +109,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) { String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; final Random random = new Random(seed); - IndexWriter indexWriter = new IndexWriter(directory, config); + IndexWriter indexWriter = new IndexWriter(directory, iwc); for (int i = 0; i < nDocs; i++) { final Document doc = new Document(); @@ -178,4 +163,26 @@ public void tearDown() { } } } + + private static IndexWriterConfig createIndexWriterConfig(boolean optimizedMergeEnabled) { + var config = new IndexWriterConfig(new StandardAnalyzer()); + // NOTE: index sort config matching LogsDB's sort order + config.setIndexSort( + new Sort( + new SortField(HOSTNAME_FIELD, SortField.Type.STRING, false), + new SortedNumericSortField(TIMESTAMP_FIELD, SortField.Type.LONG, true) + ) + ); + config.setLeafSorter(DataStream.TIMESERIES_LEAF_READERS_SORTER); + config.setMergePolicy(new LogByteSizeMergePolicy()); + var docValuesFormat = new ES819TSDBDocValuesFormat(4096, optimizedMergeEnabled); + config.setCodec(new Lucene101Codec() { + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return docValuesFormat; + } + }); + return config; + } } From 75f5a7555ad17187b846755801cab0ff33f35841 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 4 Apr 2025 10:54:20 +0200 Subject: [PATCH 32/43] fork DocValuesConsumer --- .../codec/tsdb/es819/XDocValuesConsumer.java | 1051 +++++++++++++++++ 1 file changed, 1051 insertions(+) create mode 100644 
server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java new file mode 100644 index 0000000000000..963cf72038904 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java @@ -0,0 +1,1051 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.elasticsearch.index.codec.tsdb.es819; + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.EmptyDocValuesProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FilteredTermsEnum; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.OrdinalMap; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongBitSet; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.packed.PackedInts; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +/** + * Abstract API that consumes numeric, binary and sorted docvalues. Concrete implementations of this + * actually do "something" with the docvalues (write it into the index in a specific format). + * + *

<p>The lifecycle is: + *
<ol> + *
<li>DocValuesConsumer is created by {@link DocValuesFormat#fieldsConsumer(SegmentWriteState)}. + *
<li>{@link #addNumericField}, {@link #addBinaryField}, {@link #addSortedField}, {@link + * #addSortedSetField}, or {@link #addSortedNumericField} are called for each Numeric, Binary, + * Sorted, SortedSet, or SortedNumeric docvalues field. The API is a "pull" rather than + * "push", and the implementation is free to iterate over the values multiple times ({@link + * Iterable#iterator()}). + *
<li>After all fields are added, the consumer is {@link #close}d. + *
</ol>
+ * + * @lucene.experimental + */ +public abstract class XDocValuesConsumer implements Closeable { + + /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ + protected XDocValuesConsumer() {} + + /** + * Writes numeric docvalues for a field. + * + * @param field field information + * @param valuesProducer Numeric values to write. + * @throws IOException if an I/O error occurred. + */ + public abstract void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) + throws IOException; + + /** + * Writes binary docvalues for a field. + * + * @param field field information + * @param valuesProducer Binary values to write. + * @throws IOException if an I/O error occurred. + */ + public abstract void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) + throws IOException; + + /** + * Writes pre-sorted binary docvalues for a field. + * + * @param field field information + * @param valuesProducer produces the values and ordinals to write + * @throws IOException if an I/O error occurred. + */ + public abstract void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) + throws IOException; + + /** + * Writes pre-sorted numeric docvalues for a field + * + * @param field field information + * @param valuesProducer produces the values to write + * @throws IOException if an I/O error occurred. + */ + public abstract void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) + throws IOException; + + /** + * Writes pre-sorted set docvalues for a field + * + * @param field field information + * @param valuesProducer produces the values to write + * @throws IOException if an I/O error occurred. + */ + public abstract void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) + throws IOException; + + /** + * Merges in the fields from the readers in mergeState. 
The default implementation + * calls {@link #mergeNumericField}, {@link #mergeBinaryField}, {@link #mergeSortedField}, {@link + * #mergeSortedSetField}, or {@link #mergeSortedNumericField} for each field, depending on its + * type. Implementations can override this method for more sophisticated merging (bulk-byte + * copying, etc). + */ + public void merge(MergeState mergeState) throws IOException { + for (DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { + if (docValuesProducer != null) { + docValuesProducer.checkIntegrity(); + } + } + + for (FieldInfo mergeFieldInfo : mergeState.mergeFieldInfos) { + DocValuesType type = mergeFieldInfo.getDocValuesType(); + if (type != DocValuesType.NONE) { + if (type == DocValuesType.NUMERIC) { + mergeNumericField(mergeFieldInfo, mergeState); + } else if (type == DocValuesType.BINARY) { + mergeBinaryField(mergeFieldInfo, mergeState); + } else if (type == DocValuesType.SORTED) { + mergeSortedField(mergeFieldInfo, mergeState); + } else if (type == DocValuesType.SORTED_SET) { + mergeSortedSetField(mergeFieldInfo, mergeState); + } else if (type == DocValuesType.SORTED_NUMERIC) { + mergeSortedNumericField(mergeFieldInfo, mergeState); + } else { + throw new AssertionError("type=" + type); + } + } + } + } + + /** Tracks state of one numeric sub-reader that we are merging */ + private static class NumericDocValuesSub extends DocIDMerger.Sub { + + final NumericDocValues values; + + public NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + } + + /** + * Merges the numeric docvalues from MergeState. + * + *

The default implementation calls {@link #addNumericField}, passing a DocValuesProducer that + * merges and filters deleted documents on the fly. + */ + public void mergeNumericField(final FieldInfo mergeFieldInfo, final MergeState mergeState) + throws IOException { + addNumericField( + mergeFieldInfo, + new EmptyDocValuesProducer() { + @Override + public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException { + if (fieldInfo != mergeFieldInfo) { + throw new IllegalArgumentException("wrong fieldInfo"); + } + + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + NumericDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null + && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { + values = docValuesProducer.getNumeric(readerFieldInfo); + } + } + if (values != null) { + subs.add(new NumericDocValuesSub(mergeState.docMaps[i], values)); + } + } + + return mergeNumericValues(subs, mergeState.needsIndexSort); + } + }); + } + + private static NumericDocValues mergeNumericValues( + List subs, boolean indexIsSorted) throws IOException { + long cost = 0; + for (NumericDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + + return new NumericDocValues() { + private int docID = -1; + private NumericDocValuesSub current; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = current.mappedDocID; + } + return docID; + } + + @Override + public int advance(int target) throws 
IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public long longValue() throws IOException { + return current.values.longValue(); + } + }; + } + + /** Tracks state of one binary sub-reader that we are merging */ + private static class BinaryDocValuesSub extends DocIDMerger.Sub { + + final BinaryDocValues values; + + public BinaryDocValuesSub(MergeState.DocMap docMap, BinaryDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + } + + /** + * Merges the binary docvalues from MergeState. + * + *

The default implementation calls {@link #addBinaryField}, passing a DocValuesProducer that + * merges and filters deleted documents on the fly. + */ + public void mergeBinaryField(FieldInfo mergeFieldInfo, final MergeState mergeState) + throws IOException { + addBinaryField( + mergeFieldInfo, + new EmptyDocValuesProducer() { + @Override + public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException { + if (fieldInfo != mergeFieldInfo) { + throw new IllegalArgumentException("wrong fieldInfo"); + } + + List subs = new ArrayList<>(); + + long cost = 0; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + BinaryDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null + && readerFieldInfo.getDocValuesType() == DocValuesType.BINARY) { + values = docValuesProducer.getBinary(readerFieldInfo); + } + } + if (values != null) { + cost += values.cost(); + subs.add(new BinaryDocValuesSub(mergeState.docMaps[i], values)); + } + } + + final DocIDMerger docIDMerger = + DocIDMerger.of(subs, mergeState.needsIndexSort); + final long finalCost = cost; + + return new BinaryDocValues() { + private BinaryDocValuesSub current; + private int docID = -1; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = current.mappedDocID; + } + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public BytesRef binaryValue() throws IOException { 
+ return current.values.binaryValue(); + } + }; + } + }); + } + + /** Tracks state of one sorted numeric sub-reader that we are merging */ + private static class SortedNumericDocValuesSub extends DocIDMerger.Sub { + + final SortedNumericDocValues values; + + public SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + } + + /** + * Merges the sorted docvalues from toMerge. + * + *

The default implementation calls {@link #addSortedNumericField}, passing iterables that + * filter deleted documents. + */ + public void mergeSortedNumericField(FieldInfo mergeFieldInfo, final MergeState mergeState) + throws IOException { + + addSortedNumericField( + mergeFieldInfo, + new EmptyDocValuesProducer() { + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException { + if (fieldInfo != mergeFieldInfo) { + throw new IllegalArgumentException("wrong FieldInfo"); + } + + // We must make new iterators + DocIDMerger for each iterator: + List subs = new ArrayList<>(); + long cost = 0; + boolean allSingletons = true; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + SortedNumericDocValues values = null; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null + && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { + values = docValuesProducer.getSortedNumeric(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySortedNumeric(); + } + cost += values.cost(); + if (allSingletons && DocValues.unwrapSingleton(values) == null) { + allSingletons = false; + } + subs.add(new SortedNumericDocValuesSub(mergeState.docMaps[i], values)); + } + + if (allSingletons) { + // All subs are single-valued. + // We specialize for that case since it makes it easier for codecs to optimize + // for single-valued fields. 
+ List singleValuedSubs = new ArrayList<>(); + for (SortedNumericDocValuesSub sub : subs) { + final NumericDocValues singleValuedValues = DocValues.unwrapSingleton(sub.values); + assert singleValuedValues != null; + singleValuedSubs.add(new NumericDocValuesSub(sub.docMap, singleValuedValues)); + } + return DocValues.singleton( + mergeNumericValues(singleValuedSubs, mergeState.needsIndexSort)); + } + + final long finalCost = cost; + + final DocIDMerger docIDMerger = + DocIDMerger.of(subs, mergeState.needsIndexSort); + + return new SortedNumericDocValues() { + + private int docID = -1; + private SortedNumericDocValuesSub currentSub; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + currentSub = docIDMerger.next(); + if (currentSub == null) { + docID = NO_MORE_DOCS; + } else { + docID = currentSub.mappedDocID; + } + + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docValueCount() { + return currentSub.values.docValueCount(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public long nextValue() throws IOException { + return currentSub.values.nextValue(); + } + }; + } + }); + } + + /** + * A merged {@link TermsEnum}. This helps avoid relying on the default terms enum, which calls + * {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every + * call to {@link TermsEnum#next()}. 
+ */ + private static class MergedTermsEnum extends BaseTermsEnum { + + private final TermsEnum[] subs; + private final OrdinalMap ordinalMap; + private final long valueCount; + private long ord = -1; + private BytesRef term; + + MergedTermsEnum(OrdinalMap ordinalMap, TermsEnum[] subs) { + this.ordinalMap = ordinalMap; + this.subs = subs; + this.valueCount = ordinalMap.getValueCount(); + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public long ord() throws IOException { + return ord; + } + + @Override + public BytesRef next() throws IOException { + if (++ord >= valueCount) { + return null; + } + final int subNum = ordinalMap.getFirstSegmentNumber(ord); + final TermsEnum sub = subs[subNum]; + final long subOrd = ordinalMap.getFirstSegmentOrd(ord); + do { + term = sub.next(); + } while (sub.ord() < subOrd); + assert sub.ord() == subOrd; + return term; + } + + @Override + public AttributeSource attributes() { + throw new UnsupportedOperationException(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public TermState termState() throws IOException { + throw new UnsupportedOperationException(); + } + } + + /** Tracks state of one sorted sub-reader that we are merging */ + private static class SortedDocValuesSub extends 
DocIDMerger.Sub { + + final SortedDocValues values; + final LongValues map; + + public SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values, LongValues map) { + super(docMap); + this.values = values; + this.map = map; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + } + + /** + * Merges the sorted docvalues from toMerge. + * + *

The default implementation calls {@link #addSortedField}, passing an Iterable that merges + * ordinals and values and filters deleted documents . + */ + public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) + throws IOException { + List toMerge = new ArrayList<>(); + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { + values = docValuesProducer.getSorted(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySorted(); + } + toMerge.add(values); + } + + final int numReaders = toMerge.size(); + final SortedDocValues[] dvs = toMerge.toArray(new SortedDocValues[numReaders]); + + // step 1: iterate thru each sub and mark terms still in use + TermsEnum[] liveTerms = new TermsEnum[dvs.length]; + long[] weights = new long[liveTerms.length]; + for (int sub = 0; sub < numReaders; sub++) { + SortedDocValues dv = dvs[sub]; + Bits liveDocs = mergeState.liveDocs[sub]; + if (liveDocs == null) { + liveTerms[sub] = dv.termsEnum(); + weights[sub] = dv.getValueCount(); + } else { + LongBitSet bitset = new LongBitSet(dv.getValueCount()); + int docID; + while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { + if (liveDocs.get(docID)) { + int ord = dv.ordValue(); + if (ord >= 0) { + bitset.set(ord); + } + } + } + liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); + weights[sub] = bitset.cardinality(); + } + } + + // step 2: create ordinal map (this conceptually does the "merging") + final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); + + // step 3: add field + addSortedField( + fieldInfo, + new EmptyDocValuesProducer() { + @Override + public SortedDocValues 
getSorted(FieldInfo fieldInfoIn) throws IOException { + if (fieldInfoIn != fieldInfo) { + throw new IllegalArgumentException("wrong FieldInfo"); + } + + // We must make new iterators + DocIDMerger for each iterator: + + List subs = new ArrayList<>(); + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); + if (readerFieldInfo != null + && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { + values = docValuesProducer.getSorted(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySorted(); + } + + subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); + } + + return mergeSortedValues(subs, mergeState.needsIndexSort, map); + } + }); + } + + private static SortedDocValues mergeSortedValues( + List subs, boolean indexIsSorted, OrdinalMap map) throws IOException { + long cost = 0; + for (SortedDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); + + return new SortedDocValues() { + private int docID = -1; + private SortedDocValuesSub current; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = current.mappedDocID; + } + return docID; + } + + @Override + public int ordValue() throws IOException { + int subOrd = current.values.ordValue(); + assert subOrd != -1; + return (int) current.map.get(subOrd); + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new 
UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public int getValueCount() { + return (int) map.getValueCount(); + } + + @Override + public BytesRef lookupOrd(int ord) throws IOException { + int segmentNumber = map.getFirstSegmentNumber(ord); + int segmentOrd = (int) map.getFirstSegmentOrd(ord); + return subs.get(segmentNumber).values.lookupOrd(segmentOrd); + } + + @Override + public TermsEnum termsEnum() throws IOException { + TermsEnum[] termsEnurmSubs = new TermsEnum[subs.size()]; + for (int sub = 0; sub < termsEnurmSubs.length; ++sub) { + termsEnurmSubs[sub] = subs.get(sub).values.termsEnum(); + } + return new MergedTermsEnum(map, termsEnurmSubs); + } + }; + } + + /** Tracks state of one sorted set sub-reader that we are merging */ + private static class SortedSetDocValuesSub extends DocIDMerger.Sub { + + final SortedSetDocValues values; + final LongValues map; + + public SortedSetDocValuesSub( + MergeState.DocMap docMap, SortedSetDocValues values, LongValues map) { + super(docMap); + this.values = values; + this.map = map; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + + @Override + public String toString() { + return "SortedSetDocValuesSub(mappedDocID=" + mappedDocID + " values=" + values + ")"; + } + } + + /** + * Merges the sortedset docvalues from toMerge. + * + *

The default implementation calls {@link #addSortedSetField}, passing an Iterable that merges + * ordinals and values and filters deleted documents . + */ + public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) + throws IOException { + + List toMerge = new ArrayList<>(); + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedSetDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { + values = docValuesProducer.getSortedSet(fieldInfo); + } + } + if (values == null) { + values = DocValues.emptySortedSet(); + } + toMerge.add(values); + } + + // step 1: iterate thru each sub and mark terms still in use + TermsEnum[] liveTerms = new TermsEnum[toMerge.size()]; + long[] weights = new long[liveTerms.length]; + for (int sub = 0; sub < liveTerms.length; sub++) { + SortedSetDocValues dv = toMerge.get(sub); + Bits liveDocs = mergeState.liveDocs[sub]; + if (liveDocs == null) { + liveTerms[sub] = dv.termsEnum(); + weights[sub] = dv.getValueCount(); + } else { + LongBitSet bitset = new LongBitSet(dv.getValueCount()); + int docID; + while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { + if (liveDocs.get(docID)) { + for (int i = 0; i < dv.docValueCount(); i++) { + bitset.set(dv.nextOrd()); + } + } + } + liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); + weights[sub] = bitset.cardinality(); + } + } + + // step 2: create ordinal map (this conceptually does the "merging") + final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); + + // step 3: add field + addSortedSetField( + mergeFieldInfo, + new EmptyDocValuesProducer() { + @Override + public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { + if (fieldInfo != 
mergeFieldInfo) { + throw new IllegalArgumentException("wrong FieldInfo"); + } + + // We must make new iterators + DocIDMerger for each iterator: + List subs = new ArrayList<>(); + + long cost = 0; + boolean allSingletons = true; + + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedSetDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null + && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { + values = docValuesProducer.getSortedSet(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySortedSet(); + } + cost += values.cost(); + if (allSingletons && DocValues.unwrapSingleton(values) == null) { + allSingletons = false; + } + subs.add( + new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); + } + + if (allSingletons) { + // All subs are single-valued. + // We specialize for that case since it makes it easier for codecs to optimize + // for single-valued fields. 
+ List singleValuedSubs = new ArrayList<>(); + for (SortedSetDocValuesSub sub : subs) { + final SortedDocValues singleValuedValues = DocValues.unwrapSingleton(sub.values); + assert singleValuedValues != null; + singleValuedSubs.add( + new SortedDocValuesSub(sub.docMap, singleValuedValues, sub.map)); + } + return DocValues.singleton( + mergeSortedValues(singleValuedSubs, mergeState.needsIndexSort, map)); + } + + final DocIDMerger docIDMerger = + DocIDMerger.of(subs, mergeState.needsIndexSort); + + final long finalCost = cost; + + return new SortedSetDocValues() { + private int docID = -1; + private SortedSetDocValuesSub currentSub; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + currentSub = docIDMerger.next(); + if (currentSub == null) { + docID = NO_MORE_DOCS; + } else { + docID = currentSub.mappedDocID; + } + + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long nextOrd() throws IOException { + long subOrd = currentSub.values.nextOrd(); + return currentSub.map.get(subOrd); + } + + @Override + public int docValueCount() { + return currentSub.values.docValueCount(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public BytesRef lookupOrd(long ord) throws IOException { + int segmentNumber = map.getFirstSegmentNumber(ord); + long segmentOrd = map.getFirstSegmentOrd(ord); + return toMerge.get(segmentNumber).lookupOrd(segmentOrd); + } + + @Override + public long getValueCount() { + return map.getValueCount(); + } + + @Override + public TermsEnum termsEnum() throws IOException { + TermsEnum[] subs = new TermsEnum[toMerge.size()]; + for (int sub = 0; sub < subs.length; ++sub) { + subs[sub] = toMerge.get(sub).termsEnum(); + } + return new 
MergedTermsEnum(map, subs); + } + }; + } + }); + } + + // TODO: seek-by-ord to nextSetBit + static class BitsFilteredTermsEnum extends FilteredTermsEnum { + final LongBitSet liveTerms; + + BitsFilteredTermsEnum(TermsEnum in, LongBitSet liveTerms) { + super(in, false); // <-- not passing false here wasted about 3 hours of my time!!!!!!!!!!!!! + assert liveTerms != null; + this.liveTerms = liveTerms; + } + + @Override + protected AcceptStatus accept(BytesRef term) throws IOException { + if (liveTerms.get(ord())) { + return AcceptStatus.YES; + } else { + return AcceptStatus.NO; + } + } + } + + /** Helper: returns true if the given docToValue count contains only at most one value */ + public static boolean isSingleValued(Iterable docToValueCount) { + for (Number count : docToValueCount) { + if (count.longValue() > 1) { + return false; + } + } + return true; + } + + /** Helper: returns single-valued view, using {@code missingValue} when count is zero */ + public static Iterable singletonView( + final Iterable docToValueCount, + final Iterable values, + final Number missingValue) { + assert isSingleValued(docToValueCount); + return new Iterable() { + + @Override + public Iterator iterator() { + final Iterator countIterator = docToValueCount.iterator(); + final Iterator valuesIterator = values.iterator(); + return new Iterator() { + + @Override + public boolean hasNext() { + return countIterator.hasNext(); + } + + @Override + public Number next() { + int count = countIterator.next().intValue(); + if (count == 0) { + return missingValue; + } else { + return valuesIterator.next(); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } +} From fa0c5eee53c9636025c2885561050ae69c1af4a6 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 4 Apr 2025 11:07:49 +0200 Subject: [PATCH 33/43] Remove unused code from XDocValuesConsumer and let ES819TSDBDocValuesConsumer use it. 
Additionally, remove duplicated DocValuesConsumer code from DocValuesConsumerUtil --- .../tsdb/es819/DocValuesConsumerUtil.java | 552 +----------------- .../es819/ES819TSDBDocValuesConsumer.java | 11 +- .../codec/tsdb/es819/XDocValuesConsumer.java | 294 +--------- 3 files changed, 19 insertions(+), 838 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java index fd387b68a6ee3..fb0a494bbb1ca 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java @@ -10,32 +10,14 @@ package org.elasticsearch.index.codec.tsdb.es819; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.index.BaseTermsEnum; -import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocValues; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.MergeState; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.OrdinalMap; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedNumericDocValues; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.LongValues; -import org.apache.lucene.util.packed.PackedInts; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; /** - * Mostly contains forked code from {@link org.apache.lucene.codecs.DocValuesConsumer}. + * Contains logic to determine whether optimized merge can occur. 
*/ class DocValuesConsumerUtil { @@ -124,536 +106,4 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me return new MergeStats(true, sumNumValues, sumNumDocsWithField); } - static DocValuesProducer mergeNumericProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { - return new TsdbDocValuesProducer(mergeStats) { - - @Override - public NumericDocValues getNumeric(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - NumericDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { - values = docValuesProducer.getNumeric(readerFieldInfo); - } - } - if (values != null) { - subs.add(new NumericDocValuesSub(mergeState.docMaps[i], values)); - } - } - - return mergeNumericValues(subs, mergeState.needsIndexSort); - } - }; - } - - static NumericDocValues mergeNumericValues(List subs, boolean indexIsSorted) throws IOException { - long cost = 0; - for (NumericDocValuesSub sub : subs) { - cost += sub.values.cost(); - } - final long finalCost = cost; - - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - - return new NumericDocValues() { - private int docID = -1; - private NumericDocValuesSub current; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - current = docIDMerger.next(); - if (current == null) { - docID = NO_MORE_DOCS; - } else { - docID = current.mappedDocID; - } - return docID; - } - - @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - 
public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long cost() { - return finalCost; - } - - @Override - public long longValue() throws IOException { - return current.values.longValue(); - } - - }; - } - - static class NumericDocValuesSub extends DocIDMerger.Sub { - - final NumericDocValues values; - - NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { - super(docMap); - this.values = values; - assert values.docID() == -1; - } - - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - static DocValuesProducer mergeSortedNumericProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { - return new TsdbDocValuesProducer(mergeStats) { - - @Override - public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedNumericDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { - values = docValuesProducer.getSortedNumeric(readerFieldInfo); - } - } - if (values != null) { - subs.add(new SortedNumericDocValuesSub(mergeState.docMaps[i], values)); - } - } - return mergeSortedNumericValues(subs, mergeState.needsIndexSort); - } - }; - } - - static SortedNumericDocValues mergeSortedNumericValues(List subs, boolean indexIsSorted) throws IOException { - long cost = 0; - for (SortedNumericDocValuesSub sub : subs) { - cost += sub.values.cost(); - } - final long finalCost = cost; - - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - 
- return new SortedNumericDocValues() { - private int docID = -1; - private SortedNumericDocValuesSub current; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - current = docIDMerger.next(); - if (current == null) { - docID = NO_MORE_DOCS; - } else { - docID = current.mappedDocID; - } - return docID; - } - - @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long cost() { - return finalCost; - } - - @Override - public long nextValue() throws IOException { - return current.values.nextValue(); - } - - @Override - public int docValueCount() { - return current.values.docValueCount(); - } - - }; - } - - static class SortedNumericDocValuesSub extends DocIDMerger.Sub { - - final SortedNumericDocValues values; - - SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { - super(docMap); - this.values = values; - assert values.docID() == -1; - } - - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - static DocValuesProducer mergeSortedProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { - return new TsdbDocValuesProducer(mergeStats) { - - @Override - public SortedDocValues getSorted(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - - TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; - long[] weights = new long[liveTerms.length]; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = 
mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { - values = docValuesProducer.getSorted(readerFieldInfo); - } - } - if (values == null) { - values = DocValues.emptySorted(); - } - - liveTerms[i] = values.termsEnum(); - weights[i] = values.getValueCount(); - subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values)); - } - - final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); - for (int i = 0; i < subs.size(); i++) { - subs.get(i).map = map.getGlobalOrds(i); - } - return mergeSortedValues(subs, mergeState.needsIndexSort, map); - } - }; - } - - static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted, OrdinalMap map) throws IOException { - long cost = 0; - for (SortedDocValuesSub sub : subs) { - cost += sub.values.cost(); - } - final long finalCost = cost; - - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - - return new SortedDocValues() { - private int docID = -1; - private SortedDocValuesSub current; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - current = docIDMerger.next(); - if (current == null) { - docID = NO_MORE_DOCS; - } else { - docID = current.mappedDocID; - } - return docID; - } - - @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long cost() { - return finalCost; - } - - @Override - public int ordValue() throws IOException { - int subOrd = current.values.ordValue(); - assert subOrd != -1; - return (int) current.map.get(subOrd); - } - - @Override - public BytesRef lookupOrd(int ord) throws IOException { - int segmentNumber = map.getFirstSegmentNumber(ord); - int segmentOrd = (int) 
map.getFirstSegmentOrd(ord); - return subs.get(segmentNumber).values.lookupOrd(segmentOrd); - } - - @Override - public int getValueCount() { - return (int) map.getValueCount(); - } - - @Override - public TermsEnum termsEnum() throws IOException { - TermsEnum[] termsEnurmSubs = new TermsEnum[subs.size()]; - for (int sub = 0; sub < termsEnurmSubs.length; ++sub) { - termsEnurmSubs[sub] = subs.get(sub).values.termsEnum(); - } - return new MergedTermsEnum(map, termsEnurmSubs); - } - }; - } - - static class SortedDocValuesSub extends DocIDMerger.Sub { - - LongValues map; - final SortedDocValues values; - - SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values) { - super(docMap); - this.values = values; - assert values.docID() == -1; - } - - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - static DocValuesProducer mergeSortedSetProducer(MergeStats mergeStats, FieldInfo mergeFieldInfo, MergeState mergeState) { - return new TsdbDocValuesProducer(mergeStats) { - - @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - - TermsEnum[] liveTerms = new TermsEnum[mergeState.docValuesProducers.length]; - long[] weights = new long[liveTerms.length]; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedSetDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { - values = docValuesProducer.getSortedSet(readerFieldInfo); - } - } - if (values == null) { - values = DocValues.emptySortedSet(); - } - liveTerms[i] = values.termsEnum(); - weights[i] = values.getValueCount(); - subs.add(new 
SortedSetDocValuesSub(mergeState.docMaps[i], values)); - } - - final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); - for (int i = 0; i < subs.size(); i++) { - subs.get(i).map = map.getGlobalOrds(i); - } - return mergeSortedSetValues(subs, mergeState.needsIndexSort, map); - } - }; - } - - static SortedSetDocValues mergeSortedSetValues(List subs, boolean indexIsSorted, OrdinalMap map) - throws IOException { - long cost = 0; - for (SortedSetDocValuesSub sub : subs) { - cost += sub.values.cost(); - } - final long finalCost = cost; - - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - - return new SortedSetDocValues() { - private int docID = -1; - private SortedSetDocValuesSub current; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - current = docIDMerger.next(); - if (current == null) { - docID = NO_MORE_DOCS; - } else { - docID = current.mappedDocID; - } - return docID; - } - - @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long cost() { - return finalCost; - } - - @Override - public long nextOrd() throws IOException { - long subOrd = current.values.nextOrd(); - return current.map.get(subOrd); - } - - @Override - public int docValueCount() { - return current.values.docValueCount(); - } - - @Override - public BytesRef lookupOrd(long ord) throws IOException { - int segmentNumber = map.getFirstSegmentNumber(ord); - int segmentOrd = (int) map.getFirstSegmentOrd(ord); - return subs.get(segmentNumber).values.lookupOrd(segmentOrd); - } - - @Override - public long getValueCount() { - return map.getValueCount(); - } - - @Override - public TermsEnum termsEnum() throws IOException { - TermsEnum[] termsEnurmSubs = new TermsEnum[subs.size()]; - for 
(int sub = 0; sub < termsEnurmSubs.length; ++sub) { - termsEnurmSubs[sub] = subs.get(sub).values.termsEnum(); - } - return new MergedTermsEnum(map, termsEnurmSubs); - } - }; - } - - static class SortedSetDocValuesSub extends DocIDMerger.Sub { - - LongValues map; - final SortedSetDocValues values; - - SortedSetDocValuesSub(MergeState.DocMap docMap, SortedSetDocValues values) { - super(docMap); - this.values = values; - assert values.docID() == -1; - } - - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - static class MergedTermsEnum extends BaseTermsEnum { - - private final TermsEnum[] subs; - private final OrdinalMap ordinalMap; - private final long valueCount; - private long ord = -1; - private BytesRef term; - - MergedTermsEnum(OrdinalMap ordinalMap, TermsEnum[] subs) { - this.ordinalMap = ordinalMap; - this.subs = subs; - this.valueCount = ordinalMap.getValueCount(); - } - - @Override - public BytesRef term() throws IOException { - return term; - } - - @Override - public long ord() throws IOException { - return ord; - } - - @Override - public BytesRef next() throws IOException { - if (++ord >= valueCount) { - return null; - } - final int subNum = ordinalMap.getFirstSegmentNumber(ord); - final TermsEnum sub = subs[subNum]; - final long subOrd = ordinalMap.getFirstSegmentOrd(ord); - do { - term = sub.next(); - } while (sub.ord() < subOrd); - assert sub.ord() == subOrd; - return term; - } - - @Override - public AttributeSource attributes() { - throw new UnsupportedOperationException(); - } - - @Override - public SeekStatus seekCeil(BytesRef text) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public void seekExact(long ord) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docFreq() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long totalTermFreq() throws IOException { - throw new 
UnsupportedOperationException(); - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public ImpactsEnum impacts(int flags) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public TermState termState() throws IOException { - throw new UnsupportedOperationException(); - } - } - } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java index a866fff863473..b860c0f5983c7 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java @@ -10,7 +10,6 @@ package org.elasticsearch.index.codec.tsdb.es819; import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.BinaryDocValues; @@ -53,7 +52,7 @@ import static org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat.SKIP_INDEX_MAX_LEVEL; import static org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat.SORTED_SET; -final class ES819TSDBDocValuesConsumer extends DocValuesConsumer { +final class ES819TSDBDocValuesConsumer extends XDocValuesConsumer { IndexOutput data, meta; final int maxDoc; @@ -226,7 +225,7 @@ private long[] writeField(FieldInfo field, TsdbDocValuesProducer valuesProducer, public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { - addNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeNumericProducer(result, 
mergeFieldInfo, mergeState)); + mergeNumericField(result, mergeFieldInfo, mergeState); } else { super.mergeNumericField(mergeFieldInfo, mergeState); } @@ -311,7 +310,7 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th public void mergeSortedField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { - addSortedField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedProducer(result, mergeFieldInfo, mergeState)); + mergeSortedField(result, mergeFieldInfo, mergeState); } else { super.mergeSortedField(mergeFieldInfo, mergeState); } @@ -555,7 +554,7 @@ private void writeSortedNumericField(FieldInfo field, TsdbDocValuesProducer valu public void mergeSortedNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { - addSortedNumericField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedNumericProducer(result, mergeFieldInfo, mergeState)); + mergeSortedNumericField(result, mergeFieldInfo, mergeState); } else { super.mergeSortedNumericField(mergeFieldInfo, mergeState); } @@ -586,7 +585,7 @@ private static boolean isSingleValued(FieldInfo field, TsdbDocValuesProducer pro public void mergeSortedSetField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); if (result.supported()) { - addSortedSetField(mergeFieldInfo, DocValuesConsumerUtil.mergeSortedSetProducer(result, mergeFieldInfo, mergeState)); + mergeSortedSetField(result, mergeFieldInfo, mergeState); } else { super.mergeSortedSetField(mergeFieldInfo, mergeState); } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java 
b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java index 963cf72038904..e4b2bfb1f928f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java @@ -6,33 +6,14 @@ * your election, the "Elastic License 2.0", the "GNU Affero General Public * License v3.0 only", or the "Server Side Public License, v 1". */ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ package org.elasticsearch.index.codec.tsdb.es819; -import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BaseTermsEnum; -import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesType; -import org.apache.lucene.index.EmptyDocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FilteredTermsEnum; import org.apache.lucene.index.ImpactsEnum; @@ -40,7 +21,6 @@ import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.OrdinalMap; import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; @@ -52,122 +32,23 @@ import org.apache.lucene.util.LongBitSet; import org.apache.lucene.util.LongValues; import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.index.codec.tsdb.es819.DocValuesConsumerUtil.MergeStats; -import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; -import java.util.Iterator; import java.util.List; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; /** - * Abstract API that consumes numeric, binary and sorted docvalues. Concrete implementations of this - * actually do "something" with the docvalues (write it into the index in a specific format). - * - *

The lifecycle is: - * - *

    - *
  1. DocValuesConsumer is created by {@link DocValuesFormat#fieldsConsumer(SegmentWriteState)}. - *
  2. {@link #addNumericField}, {@link #addBinaryField}, {@link #addSortedField}, {@link - * #addSortedSetField}, or {@link #addSortedNumericField} are called for each Numeric, Binary, - * Sorted, SortedSet, or SortedNumeric docvalues field. The API is a "pull" rather than - * "push", and the implementation is free to iterate over the values multiple times ({@link - * Iterable#iterator()}). - *
  3. After all fields are added, the consumer is {@link #close}d. - *
- * - * @lucene.experimental + * Forks the merging logic from {@link DocValuesConsumer} that {@link ES819TSDBDocValuesConsumer} needs. + * This class should be removed when merging logic in {@link DocValuesConsumer} becomes accessible / overridable in Lucene. */ -public abstract class XDocValuesConsumer implements Closeable { +public abstract class XDocValuesConsumer extends DocValuesConsumer { /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ protected XDocValuesConsumer() {} - /** - * Writes numeric docvalues for a field. - * - * @param field field information - * @param valuesProducer Numeric values to write. - * @throws IOException if an I/O error occurred. - */ - public abstract void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) - throws IOException; - - /** - * Writes binary docvalues for a field. - * - * @param field field information - * @param valuesProducer Binary values to write. - * @throws IOException if an I/O error occurred. - */ - public abstract void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) - throws IOException; - - /** - * Writes pre-sorted binary docvalues for a field. - * - * @param field field information - * @param valuesProducer produces the values and ordinals to write - * @throws IOException if an I/O error occurred. - */ - public abstract void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) - throws IOException; - - /** - * Writes pre-sorted numeric docvalues for a field - * - * @param field field information - * @param valuesProducer produces the values to write - * @throws IOException if an I/O error occurred. - */ - public abstract void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) - throws IOException; - - /** - * Writes pre-sorted set docvalues for a field - * - * @param field field information - * @param valuesProducer produces the values to write - * @throws IOException if an I/O error occurred. 
- */ - public abstract void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) - throws IOException; - - /** - * Merges in the fields from the readers in mergeState. The default implementation - * calls {@link #mergeNumericField}, {@link #mergeBinaryField}, {@link #mergeSortedField}, {@link - * #mergeSortedSetField}, or {@link #mergeSortedNumericField} for each field, depending on its - * type. Implementations can override this method for more sophisticated merging (bulk-byte - * copying, etc). - */ - public void merge(MergeState mergeState) throws IOException { - for (DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { - if (docValuesProducer != null) { - docValuesProducer.checkIntegrity(); - } - } - - for (FieldInfo mergeFieldInfo : mergeState.mergeFieldInfos) { - DocValuesType type = mergeFieldInfo.getDocValuesType(); - if (type != DocValuesType.NONE) { - if (type == DocValuesType.NUMERIC) { - mergeNumericField(mergeFieldInfo, mergeState); - } else if (type == DocValuesType.BINARY) { - mergeBinaryField(mergeFieldInfo, mergeState); - } else if (type == DocValuesType.SORTED) { - mergeSortedField(mergeFieldInfo, mergeState); - } else if (type == DocValuesType.SORTED_SET) { - mergeSortedSetField(mergeFieldInfo, mergeState); - } else if (type == DocValuesType.SORTED_NUMERIC) { - mergeSortedNumericField(mergeFieldInfo, mergeState); - } else { - throw new AssertionError("type=" + type); - } - } - } - } - /** Tracks state of one numeric sub-reader that we are merging */ private static class NumericDocValuesSub extends DocIDMerger.Sub { @@ -191,11 +72,11 @@ public int nextDoc() throws IOException { *

The default implementation calls {@link #addNumericField}, passing a DocValuesProducer that * merges and filters deleted documents on the fly. */ - public void mergeNumericField(final FieldInfo mergeFieldInfo, final MergeState mergeState) + public void mergeNumericField(MergeStats mergeStats, final FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { addNumericField( mergeFieldInfo, - new EmptyDocValuesProducer() { + new TsdbDocValuesProducer(mergeStats) { @Override public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException { if (fieldInfo != mergeFieldInfo) { @@ -276,107 +157,6 @@ public long longValue() throws IOException { }; } - /** Tracks state of one binary sub-reader that we are merging */ - private static class BinaryDocValuesSub extends DocIDMerger.Sub { - - final BinaryDocValues values; - - public BinaryDocValuesSub(MergeState.DocMap docMap, BinaryDocValues values) { - super(docMap); - this.values = values; - assert values.docID() == -1; - } - - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - /** - * Merges the binary docvalues from MergeState. - * - *

The default implementation calls {@link #addBinaryField}, passing a DocValuesProducer that - * merges and filters deleted documents on the fly. - */ - public void mergeBinaryField(FieldInfo mergeFieldInfo, final MergeState mergeState) - throws IOException { - addBinaryField( - mergeFieldInfo, - new EmptyDocValuesProducer() { - @Override - public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException { - if (fieldInfo != mergeFieldInfo) { - throw new IllegalArgumentException("wrong fieldInfo"); - } - - List subs = new ArrayList<>(); - - long cost = 0; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - BinaryDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null - && readerFieldInfo.getDocValuesType() == DocValuesType.BINARY) { - values = docValuesProducer.getBinary(readerFieldInfo); - } - } - if (values != null) { - cost += values.cost(); - subs.add(new BinaryDocValuesSub(mergeState.docMaps[i], values)); - } - } - - final DocIDMerger docIDMerger = - DocIDMerger.of(subs, mergeState.needsIndexSort); - final long finalCost = cost; - - return new BinaryDocValues() { - private BinaryDocValuesSub current; - private int docID = -1; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - current = docIDMerger.next(); - if (current == null) { - docID = NO_MORE_DOCS; - } else { - docID = current.mappedDocID; - } - return docID; - } - - @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long cost() { - return finalCost; - } - - @Override - public BytesRef binaryValue() throws IOException { 
- return current.values.binaryValue(); - } - }; - } - }); - } - /** Tracks state of one sorted numeric sub-reader that we are merging */ private static class SortedNumericDocValuesSub extends DocIDMerger.Sub { @@ -400,12 +180,12 @@ public int nextDoc() throws IOException { *

The default implementation calls {@link #addSortedNumericField}, passing iterables that * filter deleted documents. */ - public void mergeSortedNumericField(FieldInfo mergeFieldInfo, final MergeState mergeState) + public void mergeSortedNumericField(MergeStats mergeStats, FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { addSortedNumericField( mergeFieldInfo, - new EmptyDocValuesProducer() { + new TsdbDocValuesProducer(mergeStats) { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException { if (fieldInfo != mergeFieldInfo) { @@ -616,7 +396,7 @@ public int nextDoc() throws IOException { *

The default implementation calls {@link #addSortedField}, passing an Iterable that merges * ordinals and values and filters deleted documents . */ - public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) + public void mergeSortedField(MergeStats mergeStats, FieldInfo fieldInfo, final MergeState mergeState) throws IOException { List toMerge = new ArrayList<>(); for (int i = 0; i < mergeState.docValuesProducers.length; i++) { @@ -668,7 +448,7 @@ public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) // step 3: add field addSortedField( fieldInfo, - new EmptyDocValuesProducer() { + new TsdbDocValuesProducer(mergeStats) { @Override public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException { if (fieldInfoIn != fieldInfo) { @@ -806,7 +586,7 @@ public String toString() { *

The default implementation calls {@link #addSortedSetField}, passing an Iterable that merges * ordinals and values and filters deleted documents . */ - public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) + public void mergeSortedSetField(MergeStats mergeStats, FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { List toMerge = new ArrayList<>(); @@ -855,7 +635,7 @@ public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState merge // step 3: add field addSortedSetField( mergeFieldInfo, - new EmptyDocValuesProducer() { + new TsdbDocValuesProducer(mergeStats) { @Override public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { if (fieldInfo != mergeFieldInfo) { @@ -1000,52 +780,4 @@ protected AcceptStatus accept(BytesRef term) throws IOException { } } } - - /** Helper: returns true if the given docToValue count contains only at most one value */ - public static boolean isSingleValued(Iterable docToValueCount) { - for (Number count : docToValueCount) { - if (count.longValue() > 1) { - return false; - } - } - return true; - } - - /** Helper: returns single-valued view, using {@code missingValue} when count is zero */ - public static Iterable singletonView( - final Iterable docToValueCount, - final Iterable values, - final Number missingValue) { - assert isSingleValued(docToValueCount); - return new Iterable() { - - @Override - public Iterator iterator() { - final Iterator countIterator = docToValueCount.iterator(); - final Iterator valuesIterator = values.iterator(); - return new Iterator() { - - @Override - public boolean hasNext() { - return countIterator.hasNext(); - } - - @Override - public Number next() { - int count = countIterator.next().intValue(); - if (count == 0) { - return missingValue; - } else { - return valuesIterator.next(); - } - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } - }; - } } From 
fb9fd6dd0b097106b631081ed01ec12b73b3024d Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Fri, 4 Apr 2025 11:11:21 +0200 Subject: [PATCH 34/43] spotless and checkstyle --- .../codec/tsdb/es819/XDocValuesConsumer.java | 1260 ++++++++--------- 1 file changed, 618 insertions(+), 642 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java index e4b2bfb1f928f..af6fc2587a49a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java @@ -46,738 +46,714 @@ */ public abstract class XDocValuesConsumer extends DocValuesConsumer { - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - protected XDocValuesConsumer() {} + /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ + protected XDocValuesConsumer() {} - /** Tracks state of one numeric sub-reader that we are merging */ - private static class NumericDocValuesSub extends DocIDMerger.Sub { + /** Tracks state of one numeric sub-reader that we are merging */ + private static class NumericDocValuesSub extends DocIDMerger.Sub { - final NumericDocValues values; + final NumericDocValues values; - public NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { - super(docMap); - this.values = values; - assert values.docID() == -1; - } - - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - /** - * Merges the numeric docvalues from MergeState. - * - *

The default implementation calls {@link #addNumericField}, passing a DocValuesProducer that - * merges and filters deleted documents on the fly. - */ - public void mergeNumericField(MergeStats mergeStats, final FieldInfo mergeFieldInfo, final MergeState mergeState) - throws IOException { - addNumericField( - mergeFieldInfo, - new TsdbDocValuesProducer(mergeStats) { - @Override - public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException { - if (fieldInfo != mergeFieldInfo) { - throw new IllegalArgumentException("wrong fieldInfo"); - } - - List subs = new ArrayList<>(); - assert mergeState.docMaps.length == mergeState.docValuesProducers.length; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - NumericDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null - && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { - values = docValuesProducer.getNumeric(readerFieldInfo); - } - } - if (values != null) { - subs.add(new NumericDocValuesSub(mergeState.docMaps[i], values)); - } - } - - return mergeNumericValues(subs, mergeState.needsIndexSort); - } - }); - } + NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } - private static NumericDocValues mergeNumericValues( - List subs, boolean indexIsSorted) throws IOException { - long cost = 0; - for (NumericDocValuesSub sub : subs) { - cost += sub.values.cost(); - } - final long finalCost = cost; - - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - - return new NumericDocValues() { - private int docID = -1; - private NumericDocValuesSub current; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - current = 
docIDMerger.next(); - if (current == null) { - docID = NO_MORE_DOCS; - } else { - docID = current.mappedDocID; + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); } - return docID; - } - - @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long cost() { - return finalCost; - } - - @Override - public long longValue() throws IOException { - return current.values.longValue(); - } - }; - } - - /** Tracks state of one sorted numeric sub-reader that we are merging */ - private static class SortedNumericDocValuesSub extends DocIDMerger.Sub { - - final SortedNumericDocValues values; - - public SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { - super(docMap); - this.values = values; - assert values.docID() == -1; } - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - /** - * Merges the sorted docvalues from toMerge. - * - *

The default implementation calls {@link #addSortedNumericField}, passing iterables that - * filter deleted documents. - */ - public void mergeSortedNumericField(MergeStats mergeStats, FieldInfo mergeFieldInfo, final MergeState mergeState) - throws IOException { - - addSortedNumericField( - mergeFieldInfo, - new TsdbDocValuesProducer(mergeStats) { - @Override - public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException { - if (fieldInfo != mergeFieldInfo) { - throw new IllegalArgumentException("wrong FieldInfo"); - } + /** + * Merges the numeric docvalues from MergeState. + * + *

The default implementation calls {@link #addNumericField}, passing a DocValuesProducer that + * merges and filters deleted documents on the fly. + */ + public void mergeNumericField(MergeStats mergeStats, final FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { + addNumericField(mergeFieldInfo, new TsdbDocValuesProducer(mergeStats) { + @Override + public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException { + if (fieldInfo != mergeFieldInfo) { + throw new IllegalArgumentException("wrong fieldInfo"); + } - // We must make new iterators + DocIDMerger for each iterator: - List subs = new ArrayList<>(); - long cost = 0; - boolean allSingletons = true; - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - SortedNumericDocValues values = null; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null - && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { - values = docValuesProducer.getSortedNumeric(readerFieldInfo); + List subs = new ArrayList<>(); + assert mergeState.docMaps.length == mergeState.docValuesProducers.length; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + NumericDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.NUMERIC) { + values = docValuesProducer.getNumeric(readerFieldInfo); + } + } + if (values != null) { + subs.add(new NumericDocValuesSub(mergeState.docMaps[i], values)); + } } - } - if (values == null) { - values = DocValues.emptySortedNumeric(); - } - cost += values.cost(); - if (allSingletons && DocValues.unwrapSingleton(values) == 
null) { - allSingletons = false; - } - subs.add(new SortedNumericDocValuesSub(mergeState.docMaps[i], values)); - } - if (allSingletons) { - // All subs are single-valued. - // We specialize for that case since it makes it easier for codecs to optimize - // for single-valued fields. - List singleValuedSubs = new ArrayList<>(); - for (SortedNumericDocValuesSub sub : subs) { - final NumericDocValues singleValuedValues = DocValues.unwrapSingleton(sub.values); - assert singleValuedValues != null; - singleValuedSubs.add(new NumericDocValuesSub(sub.docMap, singleValuedValues)); - } - return DocValues.singleton( - mergeNumericValues(singleValuedSubs, mergeState.needsIndexSort)); + return mergeNumericValues(subs, mergeState.needsIndexSort); } + }); + } - final long finalCost = cost; - - final DocIDMerger docIDMerger = - DocIDMerger.of(subs, mergeState.needsIndexSort); + private static NumericDocValues mergeNumericValues(List subs, boolean indexIsSorted) throws IOException { + long cost = 0; + for (NumericDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; - return new SortedNumericDocValues() { + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - private int docID = -1; - private SortedNumericDocValuesSub currentSub; + return new NumericDocValues() { + private int docID = -1; + private NumericDocValuesSub current; - @Override - public int docID() { + @Override + public int docID() { return docID; - } + } - @Override - public int nextDoc() throws IOException { - currentSub = docIDMerger.next(); - if (currentSub == null) { - docID = NO_MORE_DOCS; + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; } else { - docID = currentSub.mappedDocID; + docID = current.mappedDocID; } - return docID; - } + } - @Override - public int advance(int target) throws IOException { + @Override + public int advance(int target) throws IOException { throw new 
UnsupportedOperationException(); - } + } - @Override - public boolean advanceExact(int target) throws IOException { + @Override + public boolean advanceExact(int target) throws IOException { throw new UnsupportedOperationException(); - } - - @Override - public int docValueCount() { - return currentSub.values.docValueCount(); - } + } - @Override - public long cost() { + @Override + public long cost() { return finalCost; - } - - @Override - public long nextValue() throws IOException { - return currentSub.values.nextValue(); - } - }; - } - }); - } - - /** - * A merged {@link TermsEnum}. This helps avoid relying on the default terms enum, which calls - * {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every - * call to {@link TermsEnum#next()}. - */ - private static class MergedTermsEnum extends BaseTermsEnum { - - private final TermsEnum[] subs; - private final OrdinalMap ordinalMap; - private final long valueCount; - private long ord = -1; - private BytesRef term; - - MergedTermsEnum(OrdinalMap ordinalMap, TermsEnum[] subs) { - this.ordinalMap = ordinalMap; - this.subs = subs; - this.valueCount = ordinalMap.getValueCount(); - } - - @Override - public BytesRef term() throws IOException { - return term; - } - - @Override - public long ord() throws IOException { - return ord; - } + } - @Override - public BytesRef next() throws IOException { - if (++ord >= valueCount) { - return null; - } - final int subNum = ordinalMap.getFirstSegmentNumber(ord); - final TermsEnum sub = subs[subNum]; - final long subOrd = ordinalMap.getFirstSegmentOrd(ord); - do { - term = sub.next(); - } while (sub.ord() < subOrd); - assert sub.ord() == subOrd; - return term; + @Override + public long longValue() throws IOException { + return current.values.longValue(); + } + }; } - @Override - public AttributeSource attributes() { - throw new UnsupportedOperationException(); - } + /** Tracks state of one sorted numeric sub-reader that we are merging */ + private 
static class SortedNumericDocValuesSub extends DocIDMerger.Sub { - @Override - public SeekStatus seekCeil(BytesRef text) throws IOException { - throw new UnsupportedOperationException(); - } + final SortedNumericDocValues values; - @Override - public void seekExact(long ord) throws IOException { - throw new UnsupportedOperationException(); - } + SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } - @Override - public int docFreq() throws IOException { - throw new UnsupportedOperationException(); + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } } - @Override - public long totalTermFreq() throws IOException { - throw new UnsupportedOperationException(); - } + /** + * Merges the sorted docvalues from toMerge. + * + *

The default implementation calls {@link #addSortedNumericField}, passing iterables that + * filter deleted documents. + */ + public void mergeSortedNumericField(MergeStats mergeStats, FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { + + addSortedNumericField(mergeFieldInfo, new TsdbDocValuesProducer(mergeStats) { + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException { + if (fieldInfo != mergeFieldInfo) { + throw new IllegalArgumentException("wrong FieldInfo"); + } - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { - throw new UnsupportedOperationException(); - } + // We must make new iterators + DocIDMerger for each iterator: + List subs = new ArrayList<>(); + long cost = 0; + boolean allSingletons = true; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + SortedNumericDocValues values = null; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) { + values = docValuesProducer.getSortedNumeric(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySortedNumeric(); + } + cost += values.cost(); + if (allSingletons && DocValues.unwrapSingleton(values) == null) { + allSingletons = false; + } + subs.add(new SortedNumericDocValuesSub(mergeState.docMaps[i], values)); + } - @Override - public ImpactsEnum impacts(int flags) throws IOException { - throw new UnsupportedOperationException(); - } + if (allSingletons) { + // All subs are single-valued. + // We specialize for that case since it makes it easier for codecs to optimize + // for single-valued fields. 
+ List singleValuedSubs = new ArrayList<>(); + for (SortedNumericDocValuesSub sub : subs) { + final NumericDocValues singleValuedValues = DocValues.unwrapSingleton(sub.values); + assert singleValuedValues != null; + singleValuedSubs.add(new NumericDocValuesSub(sub.docMap, singleValuedValues)); + } + return DocValues.singleton(mergeNumericValues(singleValuedSubs, mergeState.needsIndexSort)); + } - @Override - public TermState termState() throws IOException { - throw new UnsupportedOperationException(); + final long finalCost = cost; + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); + + return new SortedNumericDocValues() { + + private int docID = -1; + private SortedNumericDocValuesSub currentSub; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + currentSub = docIDMerger.next(); + if (currentSub == null) { + docID = NO_MORE_DOCS; + } else { + docID = currentSub.mappedDocID; + } + + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docValueCount() { + return currentSub.values.docValueCount(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public long nextValue() throws IOException { + return currentSub.values.nextValue(); + } + }; + } + }); } - } - - /** Tracks state of one sorted sub-reader that we are merging */ - private static class SortedDocValuesSub extends DocIDMerger.Sub { - final SortedDocValues values; - final LongValues map; + /** + * A merged {@link TermsEnum}. This helps avoid relying on the default terms enum, which calls + * {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every + * call to {@link TermsEnum#next()}. 
+ */ + private static class MergedTermsEnum extends BaseTermsEnum { + + private final TermsEnum[] subs; + private final OrdinalMap ordinalMap; + private final long valueCount; + private long ord = -1; + private BytesRef term; + + MergedTermsEnum(OrdinalMap ordinalMap, TermsEnum[] subs) { + this.ordinalMap = ordinalMap; + this.subs = subs; + this.valueCount = ordinalMap.getValueCount(); + } - public SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values, LongValues map) { - super(docMap); - this.values = values; - this.map = map; - assert values.docID() == -1; - } + @Override + public BytesRef term() throws IOException { + return term; + } - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); - } - } - - /** - * Merges the sorted docvalues from toMerge. - * - *

The default implementation calls {@link #addSortedField}, passing an Iterable that merges - * ordinals and values and filters deleted documents . - */ - public void mergeSortedField(MergeStats mergeStats, FieldInfo fieldInfo, final MergeState mergeState) - throws IOException { - List toMerge = new ArrayList<>(); - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); - if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { - values = docValuesProducer.getSorted(readerFieldInfo); + @Override + public long ord() throws IOException { + return ord; } - } - if (values == null) { - values = DocValues.emptySorted(); - } - toMerge.add(values); - } - final int numReaders = toMerge.size(); - final SortedDocValues[] dvs = toMerge.toArray(new SortedDocValues[numReaders]); - - // step 1: iterate thru each sub and mark terms still in use - TermsEnum[] liveTerms = new TermsEnum[dvs.length]; - long[] weights = new long[liveTerms.length]; - for (int sub = 0; sub < numReaders; sub++) { - SortedDocValues dv = dvs[sub]; - Bits liveDocs = mergeState.liveDocs[sub]; - if (liveDocs == null) { - liveTerms[sub] = dv.termsEnum(); - weights[sub] = dv.getValueCount(); - } else { - LongBitSet bitset = new LongBitSet(dv.getValueCount()); - int docID; - while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { - if (liveDocs.get(docID)) { - int ord = dv.ordValue(); - if (ord >= 0) { - bitset.set(ord); + @Override + public BytesRef next() throws IOException { + if (++ord >= valueCount) { + return null; } - } + final int subNum = ordinalMap.getFirstSegmentNumber(ord); + final TermsEnum sub = subs[subNum]; + final long subOrd = ordinalMap.getFirstSegmentOrd(ord); + do { + term = sub.next(); + } while (sub.ord() < subOrd); + assert sub.ord() 
== subOrd; + return term; } - liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); - weights[sub] = bitset.cardinality(); - } - } - // step 2: create ordinal map (this conceptually does the "merging") - final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); - - // step 3: add field - addSortedField( - fieldInfo, - new TsdbDocValuesProducer(mergeStats) { - @Override - public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException { - if (fieldInfoIn != fieldInfo) { - throw new IllegalArgumentException("wrong FieldInfo"); - } + @Override + public AttributeSource attributes() { + throw new UnsupportedOperationException(); + } - // We must make new iterators + DocIDMerger for each iterator: + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } - List subs = new ArrayList<>(); - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); - if (readerFieldInfo != null - && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { - values = docValuesProducer.getSorted(readerFieldInfo); - } - } - if (values == null) { - values = DocValues.emptySorted(); - } + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException(); + } - subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); - } + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } - return mergeSortedValues(subs, mergeState.needsIndexSort, map); - } - }); - } + @Override + public long totalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } - private static SortedDocValues mergeSortedValues( - List 
subs, boolean indexIsSorted, OrdinalMap map) throws IOException { - long cost = 0; - for (SortedDocValuesSub sub : subs) { - cost += sub.values.cost(); - } - final long finalCost = cost; - - final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - - return new SortedDocValues() { - private int docID = -1; - private SortedDocValuesSub current; - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() throws IOException { - current = docIDMerger.next(); - if (current == null) { - docID = NO_MORE_DOCS; - } else { - docID = current.mappedDocID; + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); } - return docID; - } - - @Override - public int ordValue() throws IOException { - int subOrd = current.values.ordValue(); - assert subOrd != -1; - return (int) current.map.get(subOrd); - } - - @Override - public int advance(int target) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean advanceExact(int target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long cost() { - return finalCost; - } - - @Override - public int getValueCount() { - return (int) map.getValueCount(); - } - - @Override - public BytesRef lookupOrd(int ord) throws IOException { - int segmentNumber = map.getFirstSegmentNumber(ord); - int segmentOrd = (int) map.getFirstSegmentOrd(ord); - return subs.get(segmentNumber).values.lookupOrd(segmentOrd); - } - - @Override - public TermsEnum termsEnum() throws IOException { - TermsEnum[] termsEnurmSubs = new TermsEnum[subs.size()]; - for (int sub = 0; sub < termsEnurmSubs.length; ++sub) { - termsEnurmSubs[sub] = subs.get(sub).values.termsEnum(); + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + throw new UnsupportedOperationException(); } - return new MergedTermsEnum(map, termsEnurmSubs); - } - }; - } - - /** Tracks state of one 
sorted set sub-reader that we are merging */ - private static class SortedSetDocValuesSub extends DocIDMerger.Sub { - - final SortedSetDocValues values; - final LongValues map; - - public SortedSetDocValuesSub( - MergeState.DocMap docMap, SortedSetDocValues values, LongValues map) { - super(docMap); - this.values = values; - this.map = map; - assert values.docID() == -1; - } - @Override - public int nextDoc() throws IOException { - return values.nextDoc(); + @Override + public TermState termState() throws IOException { + throw new UnsupportedOperationException(); + } } - @Override - public String toString() { - return "SortedSetDocValuesSub(mappedDocID=" + mappedDocID + " values=" + values + ")"; - } - } - - /** - * Merges the sortedset docvalues from toMerge. - * - *

The default implementation calls {@link #addSortedSetField}, passing an Iterable that merges - * ordinals and values and filters deleted documents . - */ - public void mergeSortedSetField(MergeStats mergeStats, FieldInfo mergeFieldInfo, final MergeState mergeState) - throws IOException { - - List toMerge = new ArrayList<>(); - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedSetDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { - values = docValuesProducer.getSortedSet(fieldInfo); + /** Tracks state of one sorted sub-reader that we are merging */ + private static class SortedDocValuesSub extends DocIDMerger.Sub { + + final SortedDocValues values; + final LongValues map; + + SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values, LongValues map) { + super(docMap); + this.values = values; + this.map = map; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); } - } - if (values == null) { - values = DocValues.emptySortedSet(); - } - toMerge.add(values); } - // step 1: iterate thru each sub and mark terms still in use - TermsEnum[] liveTerms = new TermsEnum[toMerge.size()]; - long[] weights = new long[liveTerms.length]; - for (int sub = 0; sub < liveTerms.length; sub++) { - SortedSetDocValues dv = toMerge.get(sub); - Bits liveDocs = mergeState.liveDocs[sub]; - if (liveDocs == null) { - liveTerms[sub] = dv.termsEnum(); - weights[sub] = dv.getValueCount(); - } else { - LongBitSet bitset = new LongBitSet(dv.getValueCount()); - int docID; - while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { - if (liveDocs.get(docID)) { - for (int i = 0; i < dv.docValueCount(); i++) { - bitset.set(dv.nextOrd()); + /** + * Merges the sorted 
docvalues from toMerge. + * + *

The default implementation calls {@link #addSortedField}, passing an Iterable that merges + * ordinals and values and filters deleted documents . + */ + public void mergeSortedField(MergeStats mergeStats, FieldInfo fieldInfo, final MergeState mergeState) throws IOException { + List toMerge = new ArrayList<>(); + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { + values = docValuesProducer.getSorted(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySorted(); } - } + toMerge.add(values); } - liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); - weights[sub] = bitset.cardinality(); - } - } - // step 2: create ordinal map (this conceptually does the "merging") - final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); - - // step 3: add field - addSortedSetField( - mergeFieldInfo, - new TsdbDocValuesProducer(mergeStats) { - @Override - public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { - if (fieldInfo != mergeFieldInfo) { - throw new IllegalArgumentException("wrong FieldInfo"); + final int numReaders = toMerge.size(); + final SortedDocValues[] dvs = toMerge.toArray(new SortedDocValues[numReaders]); + + // step 1: iterate thru each sub and mark terms still in use + TermsEnum[] liveTerms = new TermsEnum[dvs.length]; + long[] weights = new long[liveTerms.length]; + for (int sub = 0; sub < numReaders; sub++) { + SortedDocValues dv = dvs[sub]; + Bits liveDocs = mergeState.liveDocs[sub]; + if (liveDocs == null) { + liveTerms[sub] = dv.termsEnum(); + weights[sub] = dv.getValueCount(); + } else { + LongBitSet bitset = new 
LongBitSet(dv.getValueCount()); + int docID; + while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { + if (liveDocs.get(docID)) { + int ord = dv.ordValue(); + if (ord >= 0) { + bitset.set(ord); + } + } + } + liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); + weights[sub] = bitset.cardinality(); } + } - // We must make new iterators + DocIDMerger for each iterator: - List subs = new ArrayList<>(); + // step 2: create ordinal map (this conceptually does the "merging") + final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); - long cost = 0; - boolean allSingletons = true; + // step 3: add field + addSortedField(fieldInfo, new TsdbDocValuesProducer(mergeStats) { + @Override + public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException { + if (fieldInfoIn != fieldInfo) { + throw new IllegalArgumentException("wrong FieldInfo"); + } - for (int i = 0; i < mergeState.docValuesProducers.length; i++) { - SortedSetDocValues values = null; - DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - if (docValuesProducer != null) { - FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); - if (readerFieldInfo != null - && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { - values = docValuesProducer.getSortedSet(readerFieldInfo); + // We must make new iterators + DocIDMerger for each iterator: + + List subs = new ArrayList<>(); + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { + values = docValuesProducer.getSorted(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySorted(); + } + + subs.add(new 
SortedDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); } - } - if (values == null) { - values = DocValues.emptySortedSet(); - } - cost += values.cost(); - if (allSingletons && DocValues.unwrapSingleton(values) == null) { - allSingletons = false; - } - subs.add( - new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); - } - if (allSingletons) { - // All subs are single-valued. - // We specialize for that case since it makes it easier for codecs to optimize - // for single-valued fields. - List singleValuedSubs = new ArrayList<>(); - for (SortedSetDocValuesSub sub : subs) { - final SortedDocValues singleValuedValues = DocValues.unwrapSingleton(sub.values); - assert singleValuedValues != null; - singleValuedSubs.add( - new SortedDocValuesSub(sub.docMap, singleValuedValues, sub.map)); - } - return DocValues.singleton( - mergeSortedValues(singleValuedSubs, mergeState.needsIndexSort, map)); + return mergeSortedValues(subs, mergeState.needsIndexSort, map); } + }); + } - final DocIDMerger docIDMerger = - DocIDMerger.of(subs, mergeState.needsIndexSort); + private static SortedDocValues mergeSortedValues(List subs, boolean indexIsSorted, OrdinalMap map) + throws IOException { + long cost = 0; + for (SortedDocValuesSub sub : subs) { + cost += sub.values.cost(); + } + final long finalCost = cost; - final long finalCost = cost; + final DocIDMerger docIDMerger = DocIDMerger.of(subs, indexIsSorted); - return new SortedSetDocValues() { - private int docID = -1; - private SortedSetDocValuesSub currentSub; + return new SortedDocValues() { + private int docID = -1; + private SortedDocValuesSub current; - @Override - public int docID() { + @Override + public int docID() { return docID; - } + } - @Override - public int nextDoc() throws IOException { - currentSub = docIDMerger.next(); - if (currentSub == null) { - docID = NO_MORE_DOCS; + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == 
null) { + docID = NO_MORE_DOCS; } else { - docID = currentSub.mappedDocID; + docID = current.mappedDocID; } - return docID; - } + } - @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); - } + @Override + public int ordValue() throws IOException { + int subOrd = current.values.ordValue(); + assert subOrd != -1; + return (int) current.map.get(subOrd); + } - @Override - public boolean advanceExact(int target) throws IOException { + @Override + public int advance(int target) { throw new UnsupportedOperationException(); - } - - @Override - public long nextOrd() throws IOException { - long subOrd = currentSub.values.nextOrd(); - return currentSub.map.get(subOrd); - } + } - @Override - public int docValueCount() { - return currentSub.values.docValueCount(); - } + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } - @Override - public long cost() { + @Override + public long cost() { return finalCost; - } + } + + @Override + public int getValueCount() { + return (int) map.getValueCount(); + } - @Override - public BytesRef lookupOrd(long ord) throws IOException { + @Override + public BytesRef lookupOrd(int ord) throws IOException { int segmentNumber = map.getFirstSegmentNumber(ord); - long segmentOrd = map.getFirstSegmentOrd(ord); - return toMerge.get(segmentNumber).lookupOrd(segmentOrd); - } - - @Override - public long getValueCount() { - return map.getValueCount(); - } - - @Override - public TermsEnum termsEnum() throws IOException { - TermsEnum[] subs = new TermsEnum[toMerge.size()]; - for (int sub = 0; sub < subs.length; ++sub) { - subs[sub] = toMerge.get(sub).termsEnum(); + int segmentOrd = (int) map.getFirstSegmentOrd(ord); + return subs.get(segmentNumber).values.lookupOrd(segmentOrd); + } + + @Override + public TermsEnum termsEnum() throws IOException { + TermsEnum[] termsEnurmSubs = new TermsEnum[subs.size()]; + for (int sub = 0; sub < 
termsEnurmSubs.length; ++sub) { + termsEnurmSubs[sub] = subs.get(sub).values.termsEnum(); } - return new MergedTermsEnum(map, subs); - } - }; - } - }); - } + return new MergedTermsEnum(map, termsEnurmSubs); + } + }; + } + + /** Tracks state of one sorted set sub-reader that we are merging */ + private static class SortedSetDocValuesSub extends DocIDMerger.Sub { - // TODO: seek-by-ord to nextSetBit - static class BitsFilteredTermsEnum extends FilteredTermsEnum { - final LongBitSet liveTerms; + final SortedSetDocValues values; + final LongValues map; - BitsFilteredTermsEnum(TermsEnum in, LongBitSet liveTerms) { - super(in, false); // <-- not passing false here wasted about 3 hours of my time!!!!!!!!!!!!! - assert liveTerms != null; - this.liveTerms = liveTerms; + SortedSetDocValuesSub(MergeState.DocMap docMap, SortedSetDocValues values, LongValues map) { + super(docMap); + this.values = values; + this.map = map; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + + @Override + public String toString() { + return "SortedSetDocValuesSub(mappedDocID=" + mappedDocID + " values=" + values + ")"; + } } - @Override - protected AcceptStatus accept(BytesRef term) throws IOException { - if (liveTerms.get(ord())) { - return AcceptStatus.YES; - } else { - return AcceptStatus.NO; - } + /** + * Merges the sortedset docvalues from toMerge. + * + *

The default implementation calls {@link #addSortedSetField}, passing an Iterable that merges + * ordinals and values and filters deleted documents . + */ + public void mergeSortedSetField(MergeStats mergeStats, FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { + + List toMerge = new ArrayList<>(); + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedSetDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { + values = docValuesProducer.getSortedSet(fieldInfo); + } + } + if (values == null) { + values = DocValues.emptySortedSet(); + } + toMerge.add(values); + } + + // step 1: iterate thru each sub and mark terms still in use + TermsEnum[] liveTerms = new TermsEnum[toMerge.size()]; + long[] weights = new long[liveTerms.length]; + for (int sub = 0; sub < liveTerms.length; sub++) { + SortedSetDocValues dv = toMerge.get(sub); + Bits liveDocs = mergeState.liveDocs[sub]; + if (liveDocs == null) { + liveTerms[sub] = dv.termsEnum(); + weights[sub] = dv.getValueCount(); + } else { + LongBitSet bitset = new LongBitSet(dv.getValueCount()); + int docID; + while ((docID = dv.nextDoc()) != NO_MORE_DOCS) { + if (liveDocs.get(docID)) { + for (int i = 0; i < dv.docValueCount(); i++) { + bitset.set(dv.nextOrd()); + } + } + } + liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); + weights[sub] = bitset.cardinality(); + } + } + + // step 2: create ordinal map (this conceptually does the "merging") + final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT); + + // step 3: add field + addSortedSetField(mergeFieldInfo, new TsdbDocValuesProducer(mergeStats) { + @Override + public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { + if 
(fieldInfo != mergeFieldInfo) { + throw new IllegalArgumentException("wrong FieldInfo"); + } + + // We must make new iterators + DocIDMerger for each iterator: + List subs = new ArrayList<>(); + + long cost = 0; + boolean allSingletons = true; + + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + SortedSetDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) { + values = docValuesProducer.getSortedSet(readerFieldInfo); + } + } + if (values == null) { + values = DocValues.emptySortedSet(); + } + cost += values.cost(); + if (allSingletons && DocValues.unwrapSingleton(values) == null) { + allSingletons = false; + } + subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i))); + } + + if (allSingletons) { + // All subs are single-valued. + // We specialize for that case since it makes it easier for codecs to optimize + // for single-valued fields. 
+ List singleValuedSubs = new ArrayList<>(); + for (SortedSetDocValuesSub sub : subs) { + final SortedDocValues singleValuedValues = DocValues.unwrapSingleton(sub.values); + assert singleValuedValues != null; + singleValuedSubs.add(new SortedDocValuesSub(sub.docMap, singleValuedValues, sub.map)); + } + return DocValues.singleton(mergeSortedValues(singleValuedSubs, mergeState.needsIndexSort, map)); + } + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); + + final long finalCost = cost; + + return new SortedSetDocValues() { + private int docID = -1; + private SortedSetDocValuesSub currentSub; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + currentSub = docIDMerger.next(); + if (currentSub == null) { + docID = NO_MORE_DOCS; + } else { + docID = currentSub.mappedDocID; + } + + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long nextOrd() throws IOException { + long subOrd = currentSub.values.nextOrd(); + return currentSub.map.get(subOrd); + } + + @Override + public int docValueCount() { + return currentSub.values.docValueCount(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public BytesRef lookupOrd(long ord) throws IOException { + int segmentNumber = map.getFirstSegmentNumber(ord); + long segmentOrd = map.getFirstSegmentOrd(ord); + return toMerge.get(segmentNumber).lookupOrd(segmentOrd); + } + + @Override + public long getValueCount() { + return map.getValueCount(); + } + + @Override + public TermsEnum termsEnum() throws IOException { + TermsEnum[] subs = new TermsEnum[toMerge.size()]; + for (int sub = 0; sub < subs.length; ++sub) { + subs[sub] = toMerge.get(sub).termsEnum(); + } + return new 
MergedTermsEnum(map, subs); + } + }; + } + }); + } + + // TODO: seek-by-ord to nextSetBit + static class BitsFilteredTermsEnum extends FilteredTermsEnum { + final LongBitSet liveTerms; + + BitsFilteredTermsEnum(TermsEnum in, LongBitSet liveTerms) { + super(in, false); // <-- not passing false here wasted about 3 hours of my time!!!!!!!!!!!!! + assert liveTerms != null; + this.liveTerms = liveTerms; + } + + @Override + protected AcceptStatus accept(BytesRef term) throws IOException { + if (liveTerms.get(ord())) { + return AcceptStatus.YES; + } else { + return AcceptStatus.NO; + } + } } - } } From dd460e86a24c538d45dc78ecb44523d9f3bc242c Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 00:33:27 +0200 Subject: [PATCH 35/43] fork PerFieldDocValuesFormat in order to avoid tricky unwrapping in DocValuesConsumerUtil --- .../tsdb/TSDBDocValuesMergeBenchmark.java | 12 +- .../codec/Elasticsearch900Lucene101Codec.java | 3 +- .../index/codec/XPerFieldDocValuesFormat.java | 364 ++++++++++++++++++ .../index/codec/XPerFieldMergeState.java | 261 +++++++++++++ .../tsdb/es819/DocValuesConsumerUtil.java | 95 ++--- .../codec/tsdb/DocValuesCodecDuelTests.java | 21 +- .../tsdb/ES87TSDBDocValuesFormatTests.java | 18 +- .../codec/tsdb/TsdbDocValueBwcTests.java | 12 +- .../es819/ES819TSDBDocValuesFormatTests.java | 13 +- 9 files changed, 733 insertions(+), 66 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/XPerFieldMergeState.java diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index 6c83c88108e04..e3eb3405038dd 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ 
b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -11,7 +11,6 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; @@ -26,6 +25,8 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.elasticsearch.cluster.metadata.DataStream; +import org.elasticsearch.common.logging.LogConfigurator; +import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -63,6 +64,13 @@ @Measurement(iterations = 1) public class TSDBDocValuesMergeBenchmark { + static { + // For Elasticsearch900Lucene101Codec: + LogConfigurator.loadLog4jPlugins(); + LogConfigurator.configureESLogging(); + LogConfigurator.setNodeName("test"); + } + @Param("20431204") private int nDocs; @@ -176,7 +184,7 @@ private static IndexWriterConfig createIndexWriterConfig(boolean optimizedMergeE config.setLeafSorter(DataStream.TIMESERIES_LEAF_READERS_SORTER); config.setMergePolicy(new LogByteSizeMergePolicy()); var docValuesFormat = new ES819TSDBDocValuesFormat(4096, optimizedMergeEnabled); - config.setCodec(new Lucene101Codec() { + config.setCodec(new Elasticsearch900Lucene101Codec() { @Override public DocValuesFormat getDocValuesFormatForField(String field) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java index ae7fa481a1caa..596f672d2630e 100644 --- 
a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java @@ -17,7 +17,6 @@ import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; -import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; @@ -39,7 +38,7 @@ public PostingsFormat getPostingsFormatForField(String field) { }; private final DocValuesFormat defaultDVFormat; - private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + private final DocValuesFormat docValuesFormat = new XPerFieldDocValuesFormat() { @Override public DocValuesFormat getDocValuesFormatForField(String field) { return Elasticsearch900Lucene101Codec.this.getDocValuesFormatForField(field); diff --git a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java new file mode 100644 index 0000000000000..571e85a1deb63 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java @@ -0,0 +1,364 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.internal.hppc.IntObjectHashMap; +import org.apache.lucene.util.IOUtils; +import org.elasticsearch.core.SuppressForbidden; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.IdentityHashMap; +import java.util.Map; + +/** + * Fork of {@link PerFieldDocValuesFormat} to allow access FieldsReader's fields field, otherwise no changes. + */ +public abstract class XPerFieldDocValuesFormat extends DocValuesFormat { + /** Name of this {@link DocValuesFormat}. */ + public static final String PER_FIELD_NAME = "PerFieldDV40"; + + /** {@link FieldInfo} attribute name used to store the format name for each field. */ + public static final String PER_FIELD_FORMAT_KEY = XPerFieldDocValuesFormat.class.getSimpleName() + ".format"; + + /** {@link FieldInfo} attribute name used to store the segment suffix name for each field. */ + public static final String PER_FIELD_SUFFIX_KEY = XPerFieldDocValuesFormat.class.getSimpleName() + ".suffix"; + + /** Sole constructor. 
*/ + protected XPerFieldDocValuesFormat() { + super(PER_FIELD_NAME); + } + + @Override + public final DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new FieldsWriter(state); + } + + record ConsumerAndSuffix(DocValuesConsumer consumer, int suffix) implements Closeable { + @Override + public void close() throws IOException { + consumer.close(); + } + } + + @SuppressForbidden(reason = "forked from Lucene") + private class FieldsWriter extends DocValuesConsumer { + + private final Map formats = new HashMap<>(); + private final Map suffixes = new HashMap<>(); + + private final SegmentWriteState segmentWriteState; + + FieldsWriter(SegmentWriteState state) { + segmentWriteState = state; + } + + @Override + public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + getInstance(field).addNumericField(field, valuesProducer); + } + + @Override + public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + getInstance(field).addBinaryField(field, valuesProducer); + } + + @Override + public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + getInstance(field).addSortedField(field, valuesProducer); + } + + @Override + public void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + getInstance(field).addSortedNumericField(field, valuesProducer); + } + + @Override + public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + getInstance(field).addSortedSetField(field, valuesProducer); + } + + @Override + public void merge(MergeState mergeState) throws IOException { + Map> consumersToField = new IdentityHashMap<>(); + + // Group each consumer by the fields it handles + for (FieldInfo fi : mergeState.mergeFieldInfos) { + if (fi.getDocValuesType() == DocValuesType.NONE) { + continue; + } + // merge should ignore current format for 
the fields being merged + DocValuesConsumer consumer = getInstance(fi, true); + Collection fieldsForConsumer = consumersToField.get(consumer); + if (fieldsForConsumer == null) { + fieldsForConsumer = new ArrayList<>(); + consumersToField.put(consumer, fieldsForConsumer); + } + fieldsForConsumer.add(fi.name); + } + + // Delegate the merge to the appropriate consumer + for (Map.Entry> e : consumersToField.entrySet()) { + e.getKey().merge(XPerFieldMergeState.restrictFields(mergeState, e.getValue())); + } + } + + private DocValuesConsumer getInstance(FieldInfo field) throws IOException { + return getInstance(field, false); + } + + /** + * DocValuesConsumer for the given field. + * + * @param field - FieldInfo object. + * @param ignoreCurrentFormat - ignore the existing format attributes. + * @return DocValuesConsumer for the field. + * @throws IOException if there is a low-level IO error + */ + private DocValuesConsumer getInstance(FieldInfo field, boolean ignoreCurrentFormat) throws IOException { + DocValuesFormat format = null; + if (field.getDocValuesGen() != -1) { + String formatName = null; + if (ignoreCurrentFormat == false) { + formatName = field.getAttribute(PER_FIELD_FORMAT_KEY); + } + // this means the field never existed in that segment, yet is applied updates + if (formatName != null) { + format = DocValuesFormat.forName(formatName); + } + } + if (format == null) { + format = getDocValuesFormatForField(field.name); + } + if (format == null) { + throw new IllegalStateException("invalid null DocValuesFormat for field=\"" + field.name + "\""); + } + final String formatName = format.getName(); + + field.putAttribute(PER_FIELD_FORMAT_KEY, formatName); + Integer suffix = null; + + ConsumerAndSuffix consumer = formats.get(format); + if (consumer == null) { + // First time we are seeing this format; create a new instance + + if (field.getDocValuesGen() != -1) { + String suffixAtt = null; + if (ignoreCurrentFormat == false) { + suffixAtt = 
field.getAttribute(PER_FIELD_SUFFIX_KEY); + } + // even when dvGen is != -1, it can still be a new field, that never + // existed in the segment, and therefore doesn't have the recorded + // attributes yet. + if (suffixAtt != null) { + suffix = Integer.valueOf(suffixAtt); + } + } + + if (suffix == null) { + // bump the suffix + suffix = suffixes.get(formatName); + if (suffix == null) { + suffix = 0; + } else { + suffix = suffix + 1; + } + } + suffixes.put(formatName, suffix); + + final String segmentSuffix = getFullSegmentSuffix( + segmentWriteState.segmentSuffix, + getSuffix(formatName, Integer.toString(suffix)) + ); + consumer = new ConsumerAndSuffix(format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix)), suffix); + formats.put(format, consumer); + } else { + // we've already seen this format, so just grab its suffix + assert suffixes.containsKey(formatName); + suffix = consumer.suffix; + } + + field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix)); + // TODO: we should only provide the "slice" of FIS + // that this DVF actually sees ... 
+ return consumer.consumer; + } + + @Override + public void close() throws IOException { + // Close all subs + IOUtils.close(formats.values()); + } + } + + static String getSuffix(String formatName, String suffix) { + return formatName + "_" + suffix; + } + + static String getFullSegmentSuffix(String outerSegmentSuffix, String segmentSuffix) { + if (outerSegmentSuffix.length() == 0) { + return segmentSuffix; + } else { + return outerSegmentSuffix + "_" + segmentSuffix; + } + } + + @SuppressForbidden(reason = "forked from Lucene") + public static class FieldsReader extends DocValuesProducer { + + private final IntObjectHashMap fields = new IntObjectHashMap<>(); + private final Map formats = new HashMap<>(); + + // clone for merge + FieldsReader(FieldsReader other) { + Map oldToNew = new IdentityHashMap<>(); + // First clone all formats + for (Map.Entry ent : other.formats.entrySet()) { + DocValuesProducer values = ent.getValue().getMergeInstance(); + formats.put(ent.getKey(), values); + oldToNew.put(ent.getValue(), values); + } + + // Then rebuild fields: + for (IntObjectHashMap.IntObjectCursor ent : other.fields) { + DocValuesProducer producer = oldToNew.get(ent.value); + assert producer != null; + fields.put(ent.key, producer); + } + } + + FieldsReader(final SegmentReadState readState) throws IOException { + + // Init each unique format: + boolean success = false; + try { + // Read field name -> format name + for (FieldInfo fi : readState.fieldInfos) { + if (fi.getDocValuesType() != DocValuesType.NONE) { + final String fieldName = fi.name; + final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY); + if (formatName != null) { + // null formatName means the field is in fieldInfos, but has no docvalues! 
+ final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY); + if (suffix == null) { + throw new IllegalStateException("missing attribute: " + PER_FIELD_SUFFIX_KEY + " for field: " + fieldName); + } + DocValuesFormat format = DocValuesFormat.forName(formatName); + String segmentSuffix = getFullSegmentSuffix(readState.segmentSuffix, getSuffix(formatName, suffix)); + if (formats.containsKey(segmentSuffix) == false) { + formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix))); + } + fields.put(fi.number, formats.get(segmentSuffix)); + } + } + } + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(formats.values()); + } + } + } + + public DocValuesProducer getDocValuesProducer(FieldInfo field) { + return fields.get(field.number); + } + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.number); + return producer == null ? null : producer.getNumeric(field); + } + + @Override + public BinaryDocValues getBinary(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.number); + return producer == null ? null : producer.getBinary(field); + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.number); + return producer == null ? null : producer.getSorted(field); + } + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.number); + return producer == null ? null : producer.getSortedNumeric(field); + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.number); + return producer == null ? 
null : producer.getSortedSet(field); + } + + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.number); + return producer == null ? null : producer.getSkipper(field); + } + + @Override + public void close() throws IOException { + IOUtils.close(formats.values()); + } + + @Override + public void checkIntegrity() throws IOException { + for (DocValuesProducer format : formats.values()) { + format.checkIntegrity(); + } + } + + @Override + public DocValuesProducer getMergeInstance() { + return new FieldsReader(this); + } + + @Override + public String toString() { + return "PerFieldDocValues(formats=" + formats.size() + ")"; + } + } + + @Override + public final DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { + return new FieldsReader(state); + } + + /** + * Returns the doc values format that should be used for writing new segments of field + * . + * + *

The field to format mapping is written to the index, so this method is only invoked when + * writing, not when reading. + */ + public abstract DocValuesFormat getDocValuesFormatForField(String field); +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldMergeState.java b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldMergeState.java new file mode 100644 index 0000000000000..8c8b90c6b4bdd --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldMergeState.java @@ -0,0 +1,261 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.Terms; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +/** Fork of org.apache.lucene.codecs.perfield.PerFieldMergeState, because of {@link XPerFieldDocValuesFormat} */ +final class XPerFieldMergeState { + + /** + * Create a new MergeState from the given {@link MergeState} instance with restricted fields. + * + * @param fields The fields to keep in the new instance. 
+ * @return The new MergeState with restricted fields + */ + static MergeState restrictFields(MergeState in, Collection fields) { + var fieldInfos = new FieldInfos[in.fieldInfos.length]; + for (int i = 0; i < in.fieldInfos.length; i++) { + fieldInfos[i] = new FilterFieldInfos(in.fieldInfos[i], fields); + } + var fieldsProducers = new FieldsProducer[in.fieldsProducers.length]; + for (int i = 0; i < in.fieldsProducers.length; i++) { + fieldsProducers[i] = in.fieldsProducers[i] == null ? null : new FilterFieldsProducer(in.fieldsProducers[i], fields); + } + var mergeFieldInfos = new FilterFieldInfos(in.mergeFieldInfos, fields); + return new MergeState( + in.docMaps, + in.segmentInfo, + mergeFieldInfos, + in.storedFieldsReaders, + in.termVectorsReaders, + in.normsProducers, + in.docValuesProducers, + fieldInfos, + in.liveDocs, + fieldsProducers, + in.pointsReaders, + in.knnVectorsReaders, + in.maxDocs, + in.infoStream, + in.intraMergeTaskExecutor, + in.needsIndexSort + ); + } + + private static class FilterFieldInfos extends FieldInfos { + private final Set filteredNames; + private final List filtered; + + // Copy of the private fields from FieldInfos + // Renamed so as to be less confusing about which fields we're referring to + private final boolean filteredHasVectors; + private final boolean filteredHasPostings; + private final boolean filteredHasProx; + private final boolean filteredHasPayloads; + private final boolean filteredHasOffsets; + private final boolean filteredHasFreq; + private final boolean filteredHasNorms; + private final boolean filteredHasDocValues; + private final boolean filteredHasPointValues; + + FilterFieldInfos(FieldInfos src, Collection filterFields) { + // Copy all the input FieldInfo objects since the field numbering must be kept consistent + super(toArray(src)); + + boolean hasVectors = false; + boolean hasPostings = false; + boolean hasProx = false; + boolean hasPayloads = false; + boolean hasOffsets = false; + boolean hasFreq = false; + 
boolean hasNorms = false; + boolean hasDocValues = false; + boolean hasPointValues = false; + + this.filteredNames = new HashSet<>(filterFields); + this.filtered = new ArrayList<>(filterFields.size()); + for (FieldInfo fi : src) { + if (this.filteredNames.contains(fi.name)) { + this.filtered.add(fi); + hasVectors |= fi.hasTermVectors(); + hasPostings |= fi.getIndexOptions() != IndexOptions.NONE; + hasProx |= fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + hasFreq |= fi.getIndexOptions() != IndexOptions.DOCS; + hasOffsets |= fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + hasNorms |= fi.hasNorms(); + hasDocValues |= fi.getDocValuesType() != DocValuesType.NONE; + hasPayloads |= fi.hasPayloads(); + hasPointValues |= (fi.getPointDimensionCount() != 0); + } + } + + this.filteredHasVectors = hasVectors; + this.filteredHasPostings = hasPostings; + this.filteredHasProx = hasProx; + this.filteredHasPayloads = hasPayloads; + this.filteredHasOffsets = hasOffsets; + this.filteredHasFreq = hasFreq; + this.filteredHasNorms = hasNorms; + this.filteredHasDocValues = hasDocValues; + this.filteredHasPointValues = hasPointValues; + } + + private static FieldInfo[] toArray(FieldInfos src) { + FieldInfo[] res = new FieldInfo[src.size()]; + int i = 0; + for (FieldInfo fi : src) { + res[i++] = fi; + } + return res; + } + + @Override + public Iterator iterator() { + return filtered.iterator(); + } + + @Override + public boolean hasFreq() { + return filteredHasFreq; + } + + @Override + public boolean hasPostings() { + return filteredHasPostings; + } + + @Override + public boolean hasProx() { + return filteredHasProx; + } + + @Override + public boolean hasPayloads() { + return filteredHasPayloads; + } + + @Override + public boolean hasOffsets() { + return filteredHasOffsets; + } + + @Override + public boolean hasTermVectors() { + return filteredHasVectors; + } + + @Override + public boolean hasNorms() { + return 
filteredHasNorms; + } + + @Override + public boolean hasDocValues() { + return filteredHasDocValues; + } + + @Override + public boolean hasPointValues() { + return filteredHasPointValues; + } + + @Override + public int size() { + return filtered.size(); + } + + @Override + public FieldInfo fieldInfo(String fieldName) { + if (filteredNames.contains(fieldName) == false) { + // Throw IAE to be consistent with fieldInfo(int) which throws it as well on invalid numbers + throw new IllegalArgumentException( + "The field named '" + + fieldName + + "' is not accessible in the current " + + "merge context, available ones are: " + + filteredNames + ); + } + return super.fieldInfo(fieldName); + } + + @Override + public FieldInfo fieldInfo(int fieldNumber) { + FieldInfo res = super.fieldInfo(fieldNumber); + if (filteredNames.contains(res.name) == false) { + throw new IllegalArgumentException( + "The field named '" + + res.name + + "' numbered '" + + fieldNumber + + "' is not " + + "accessible in the current merge context, available ones are: " + + filteredNames + ); + } + return res; + } + } + + private static class FilterFieldsProducer extends FieldsProducer { + private final FieldsProducer in; + private final List filtered; + + FilterFieldsProducer(FieldsProducer in, Collection filterFields) { + this.in = in; + this.filtered = new ArrayList<>(filterFields); + } + + @Override + public Iterator iterator() { + return filtered.iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + if (filtered.contains(field) == false) { + throw new IllegalArgumentException( + "The field named '" + field + "' is not accessible in the current " + "merge context, available ones are: " + filtered + ); + } + return in.terms(field); + } + + @Override + public int size() { + return filtered.size(); + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public void checkIntegrity() throws IOException { + in.checkIntegrity(); + } + 
} +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java index fb0a494bbb1ca..211ca46e870a5 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java @@ -10,9 +10,9 @@ package org.elasticsearch.index.codec.tsdb.es819; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.index.DocValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; +import org.elasticsearch.index.codec.XPerFieldDocValuesFormat; import java.io.IOException; @@ -43,63 +43,50 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me for (int i = 0; i < mergeState.docValuesProducers.length; i++) { DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; - switch (fieldInfo.getDocValuesType()) { - case NUMERIC -> { - var numeric = docValuesProducer.getNumeric(fieldInfo); - // (checking instance type as serves as a version check) - if (numeric instanceof ES819TSDBDocValuesProducer.BaseNumericDocValues baseNumeric) { - var entry = baseNumeric.entry; - sumNumValues += entry.numValues; - sumNumDocsWithField += entry.numDocsWithField; - } else if (numeric != null) { - return UNSUPPORTED; - } - } - case SORTED_NUMERIC -> { - var sortedNumeric = docValuesProducer.getSortedNumeric(fieldInfo); - if (sortedNumeric instanceof ES819TSDBDocValuesProducer.BaseSortedNumericDocValues baseSortedNumericDocValues) { - var entry = baseSortedNumericDocValues.entry; - sumNumValues += entry.numValues; - sumNumDocsWithField += entry.numDocsWithField; - } else { - var singleton = DocValues.unwrapSingleton(sortedNumeric); - if (singleton instanceof ES819TSDBDocValuesProducer.BaseNumericDocValues baseNumeric) { - var entry = baseNumeric.entry; - 
sumNumValues += entry.numValues; - sumNumDocsWithField += entry.numDocsWithField; - } else if (sortedNumeric != null) { - return UNSUPPORTED; + if (docValuesProducer instanceof XPerFieldDocValuesFormat.FieldsReader perFieldReader) { + var wrapped = perFieldReader.getDocValuesProducer(fieldInfo); + if (wrapped instanceof ES819TSDBDocValuesProducer tsdbDocValuesProducer) { + switch (fieldInfo.getDocValuesType()) { + case NUMERIC -> { + var entry = tsdbDocValuesProducer.numerics.get(fieldInfo.number); + if (entry != null) { + sumNumValues += entry.numValues; + sumNumDocsWithField += entry.numDocsWithField; + } } - } - } - case SORTED -> { - var sorted = docValuesProducer.getSorted(fieldInfo); - if (sorted instanceof ES819TSDBDocValuesProducer.BaseSortedDocValues baseSortedDocValues) { - var entry = baseSortedDocValues.entry; - sumNumValues += entry.ordsEntry.numValues; - sumNumDocsWithField += entry.ordsEntry.numDocsWithField; - } else if (sorted != null) { - return UNSUPPORTED; - } - } - case SORTED_SET -> { - var sortedSet = docValuesProducer.getSortedSet(fieldInfo); - if (sortedSet instanceof ES819TSDBDocValuesProducer.BaseSortedSetDocValues baseSortedSet) { - var entry = baseSortedSet.entry; - sumNumValues += entry.ordsEntry.numValues; - sumNumDocsWithField += entry.ordsEntry.numDocsWithField; - } else { - var singleton = DocValues.unwrapSingleton(sortedSet); - if (singleton instanceof ES819TSDBDocValuesProducer.BaseSortedDocValues baseSorted) { - var entry = baseSorted.entry; - sumNumValues += entry.ordsEntry.numValues; - sumNumDocsWithField += entry.ordsEntry.numDocsWithField; - } else if (sortedSet != null) { - return UNSUPPORTED; + case SORTED_NUMERIC -> { + var entry = tsdbDocValuesProducer.sortedNumerics.get(fieldInfo.number); + if (entry != null) { + sumNumValues += entry.numValues; + sumNumDocsWithField += entry.numDocsWithField; + } + } + case SORTED -> { + var entry = tsdbDocValuesProducer.sorted.get(fieldInfo.number); + if (entry != null) { + 
sumNumValues += entry.ordsEntry.numValues; + sumNumDocsWithField += entry.ordsEntry.numDocsWithField; + } } + case SORTED_SET -> { + var entry = tsdbDocValuesProducer.sortedSets.get(fieldInfo.number); + if (entry != null) { + if (entry.singleValueEntry != null) { + sumNumValues += entry.singleValueEntry.ordsEntry.numValues; + sumNumDocsWithField += entry.singleValueEntry.ordsEntry.numDocsWithField; + } else { + sumNumValues += entry.ordsEntry.numValues; + sumNumDocsWithField += entry.ordsEntry.numDocsWithField; + } + } + } + default -> throw new IllegalStateException("unexpected doc values producer type: " + fieldInfo.getDocValuesType()); } + } else { + return UNSUPPORTED; } - default -> throw new IllegalStateException("unexpected doc values producer type: " + fieldInfo.getDocValuesType()); + } else { + return UNSUPPORTED; } } diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java index ea6d944a1271c..388660e4e0c18 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java @@ -9,6 +9,8 @@ package org.elasticsearch.index.codec.tsdb; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; @@ -24,6 +26,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import 
org.elasticsearch.test.ESTestCase; @@ -51,9 +54,21 @@ public void testDuel() throws IOException { baselineConfig.setMergePolicy(mergePolicy); baselineConfig.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat())); var contenderConf = newIndexWriterConfig(); - contenderConf.setCodec( - TestUtil.alwaysDocValuesFormat(rarely() ? new TestES87TSDBDocValuesFormat() : new ES819TSDBDocValuesFormat()) - ); + if (randomBoolean()) { + contenderConf.setMergePolicy(mergePolicy); + } + Codec codec = new Elasticsearch900Lucene101Codec() { + + final DocValuesFormat docValuesFormat = randomBoolean() + ? new ES819TSDBDocValuesFormat() + : new TestES87TSDBDocValuesFormat(); + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return docValuesFormat; + } + }; + contenderConf.setCodec(codec); contenderConf.setMergePolicy(mergePolicy); try ( diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index c6c721336dba0..a219ebb3740cc 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -12,6 +12,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedDocValuesField; @@ -33,8 +34,9 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase; import org.apache.lucene.tests.index.RandomIndexWriter; -import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.logging.LogConfigurator; +import 
org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import java.io.IOException; import java.util.ArrayList; @@ -50,6 +52,12 @@ public class ES87TSDBDocValuesFormatTests extends BaseDocValuesFormatTestCase { private static final int NUM_DOCS = 10; + static { + // For Elasticsearch900Lucene101Codec: + LogConfigurator.loadLog4jPlugins(); + LogConfigurator.configureESLogging(); + } + static class TestES87TSDBDocValuesFormat extends ES87TSDBDocValuesFormat { TestES87TSDBDocValuesFormat() { @@ -66,7 +74,13 @@ public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOExcept } } - private final Codec codec = TestUtil.alwaysDocValuesFormat(new TestES87TSDBDocValuesFormat()); + private final Codec codec = new Elasticsearch900Lucene101Codec() { + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return new TestES87TSDBDocValuesFormat(); + } + }; @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java index 32b2a90322911..c5c6df6491850 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java @@ -10,6 +10,7 @@ package org.elasticsearch.index.codec.tsdb; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.document.Document; import org.apache.lucene.document.NumericDocValuesField; @@ -31,6 +32,7 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.core.SuppressForbidden; +import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import 
org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.elasticsearch.test.ESTestCase; @@ -46,7 +48,15 @@ public class TsdbDocValueBwcTests extends ESTestCase { public void testMixedIndex() throws Exception { Codec oldCodec = TestUtil.alwaysDocValuesFormat(new TestES87TSDBDocValuesFormat()); - Codec newCodec = TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat()); + Codec newCodec = new Elasticsearch900Lucene101Codec() { + + final ES819TSDBDocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return docValuesFormat; + } + }; testMixedIndex(oldCodec, newCodec); } diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java index 44242810969b9..2e787c9e56d6d 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java @@ -10,6 +10,7 @@ package org.elasticsearch.index.codec.tsdb.es819; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; @@ -23,9 +24,9 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; -import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.elasticsearch.cluster.metadata.DataStream; +import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests; import java.util.Arrays; @@ -33,7 +34,15 @@ public class ES819TSDBDocValuesFormatTests extends 
ES87TSDBDocValuesFormatTests { - private final Codec codec = TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat()); + private final Codec codec = new Elasticsearch900Lucene101Codec() { + + final ES819TSDBDocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return docValuesFormat; + } + }; @Override protected Codec getCodec() { From c3abb0e81a5c66bc2c9181cf5cca2a9c9000926f Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 00:35:43 +0200 Subject: [PATCH 36/43] oops --- .../index/codec/tsdb/DocValuesCodecDuelTests.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java index 388660e4e0c18..f0ce28f11a51a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java @@ -54,9 +54,7 @@ public void testDuel() throws IOException { baselineConfig.setMergePolicy(mergePolicy); baselineConfig.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat())); var contenderConf = newIndexWriterConfig(); - if (randomBoolean()) { - contenderConf.setMergePolicy(mergePolicy); - } + contenderConf.setMergePolicy(mergePolicy); Codec codec = new Elasticsearch900Lucene101Codec() { final DocValuesFormat docValuesFormat = randomBoolean() From 6921dd8036747eb2faaa2785103d8880655f2d8e Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 00:39:32 +0200 Subject: [PATCH 37/43] rename codec name --- .../org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java 
b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java index 571e85a1deb63..2cd650f975a67 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java @@ -41,7 +41,7 @@ */ public abstract class XPerFieldDocValuesFormat extends DocValuesFormat { /** Name of this {@link DocValuesFormat}. */ - public static final String PER_FIELD_NAME = "PerFieldDV40"; + public static final String PER_FIELD_NAME = "ESPerFieldDV819"; /** {@link FieldInfo} attribute name used to store the format name for each field. */ public static final String PER_FIELD_FORMAT_KEY = XPerFieldDocValuesFormat.class.getSimpleName() + ".format"; From 3e60a74e239e08f4a8a3727ff72e34d1100374d2 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 00:47:08 +0200 Subject: [PATCH 38/43] removed unused exception --- .../index/codec/tsdb/es819/DocValuesConsumerUtil.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java index 211ca46e870a5..471a7dcdea280 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java @@ -14,8 +14,6 @@ import org.apache.lucene.index.MergeState; import org.elasticsearch.index.codec.XPerFieldDocValuesFormat; -import java.io.IOException; - /** * Contains logic to determine whether optimized merge can occur. 
*/ @@ -25,8 +23,7 @@ class DocValuesConsumerUtil { record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} - static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, MergeState mergeState, FieldInfo fieldInfo) - throws IOException { + static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, MergeState mergeState, FieldInfo fieldInfo) { if (optimizedMergeEnabled == false || mergeState.needsIndexSort == false) { return UNSUPPORTED; } From 5a2da252c810ed67d5decd3c8124a40d50635819 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 08:51:46 +0200 Subject: [PATCH 39/43] address bwc failures --- .../index/codec/XPerFieldDocValuesFormat.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java index 2cd650f975a67..a8a39122dd71f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java @@ -44,10 +44,13 @@ public abstract class XPerFieldDocValuesFormat extends DocValuesFormat { public static final String PER_FIELD_NAME = "ESPerFieldDV819"; /** {@link FieldInfo} attribute name used to store the format name for each field. */ - public static final String PER_FIELD_FORMAT_KEY = XPerFieldDocValuesFormat.class.getSimpleName() + ".format"; + // FORK note: usage of PerFieldDocValuesFormat is needed for bwc purposes. + // (Otherwise, we load no fields from indices that use PerFieldDocValuesFormat) + public static final String PER_FIELD_FORMAT_KEY = PerFieldDocValuesFormat.class.getSimpleName() + ".format"; /** {@link FieldInfo} attribute name used to store the segment suffix name for each field. 
*/ - public static final String PER_FIELD_SUFFIX_KEY = XPerFieldDocValuesFormat.class.getSimpleName() + ".suffix"; + // FORK note: usage of PerFieldDocValuesFormat is needed for bwc purposes. + public static final String PER_FIELD_SUFFIX_KEY = PerFieldDocValuesFormat.class.getSimpleName() + ".suffix"; /** Sole constructor. */ protected XPerFieldDocValuesFormat() { @@ -285,6 +288,7 @@ public static class FieldsReader extends DocValuesProducer { } } + // FORK note: the reason why PerFieldDocValuesFormat is forked:. public DocValuesProducer getDocValuesProducer(FieldInfo field) { return fields.get(field.number); } From 39dc98f93202e136a6097a9674a95fc385d6beb2 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 09:04:42 +0200 Subject: [PATCH 40/43] Remove BaseNumericDocValues and BaseSortedNumericDocValues --- .../es819/ES819TSDBDocValuesProducer.java | 74 ++----------------- 1 file changed, 7 insertions(+), 67 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java index eaf13964df316..22172268add5f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java @@ -1046,7 +1046,7 @@ private NumericDocValues getNumeric(NumericEntry entry, long maxOrd) throws IOEx // Special case for maxOrd 1, no need to read blocks and use ordinal 0 as only value if (entry.docsWithFieldOffset == -1) { // Special case when all docs have a value - return new BaseNumericDocValues(entry) { + return new NumericDocValues() { private final int maxDoc = ES819TSDBDocValuesProducer.this.maxDoc; private int doc = -1; @@ -1095,7 +1095,7 @@ public long cost() { entry.denseRankPower, entry.numValues ); - return new BaseNumericDocValues(entry) { + return new NumericDocValues() { 
@Override public int advance(int target) throws IOException { @@ -1140,7 +1140,7 @@ public long longValue() { final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1; if (entry.docsWithFieldOffset == -1) { // dense - return new BaseNumericDocValues(entry) { + return new NumericDocValues() { private final int maxDoc = ES819TSDBDocValuesProducer.this.maxDoc; private int doc = -1; @@ -1207,7 +1207,7 @@ public long longValue() throws IOException { entry.denseRankPower, entry.numValues ); - return new BaseNumericDocValues(entry) { + return new NumericDocValues() { private final TSDBDocValuesEncoder decoder = new TSDBDocValuesEncoder(ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE); private long currentBlockIndex = -1; @@ -1262,15 +1262,6 @@ public long longValue() throws IOException { } } - abstract static class BaseNumericDocValues extends NumericDocValues { - - final NumericEntry entry; - - BaseNumericDocValues(NumericEntry entry) { - this.entry = entry; - } - } - private NumericValues getValues(NumericEntry entry, final long maxOrd) throws IOException { assert entry.numValues > 0; final RandomAccessInput indexSlice = data.randomAccessSlice(entry.indexOffset, entry.indexLength); @@ -1307,49 +1298,7 @@ long advance(long index) throws IOException { private SortedNumericDocValues getSortedNumeric(SortedNumericEntry entry, long maxOrd) throws IOException { if (entry.numValues == entry.numDocsWithField) { - var numeric = getNumeric(entry, maxOrd); - if (merging) { - return new BaseSortedNumericDocValues(entry) { - - @Override - public long nextValue() throws IOException { - return numeric.longValue(); - } - - @Override - public int docValueCount() { - return 1; - } - - @Override - public boolean advanceExact(int target) throws IOException { - return numeric.advanceExact(target); - } - - @Override - public int docID() { - return numeric.docID(); - } - - @Override - public int nextDoc() throws IOException { - return numeric.nextDoc(); - } - - @Override - 
public int advance(int target) throws IOException { - return numeric.advance(target); - } - - @Override - public long cost() { - return numeric.cost(); - } - }; - } else { - // Required otherwise search / compute engine can't otherwise optimize for when each document has exactly one value: - return DocValues.singleton(getNumeric(entry, maxOrd)); - } + return DocValues.singleton(getNumeric(entry, maxOrd)); } final RandomAccessInput addressesInput = data.randomAccessSlice(entry.addressesOffset, entry.addressesLength); @@ -1359,7 +1308,7 @@ public long cost() { if (entry.docsWithFieldOffset == -1) { // dense - return new BaseSortedNumericDocValues(entry) { + return new SortedNumericDocValues() { int doc = -1; long start, end; @@ -1420,7 +1369,7 @@ public int docValueCount() { entry.denseRankPower, entry.numDocsWithField ); - return new BaseSortedNumericDocValues(entry) { + return new SortedNumericDocValues() { boolean set; long start, end; @@ -1479,15 +1428,6 @@ private void set() { } } - abstract static class BaseSortedNumericDocValues extends SortedNumericDocValues { - - final SortedNumericEntry entry; - - BaseSortedNumericDocValues(SortedNumericEntry entry) { - this.entry = entry; - } - } - private record DocValuesSkipperEntry(long offset, long length, long minValue, long maxValue, int docCount, int maxDocId) {} static class NumericEntry { From 5bfb3020f9749ad27c42b1ddc2873edcb9556c30 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 14:30:06 +0200 Subject: [PATCH 41/43] improve TsdbDocValueBwcTests --- .../index/codec/XPerFieldDocValuesFormat.java | 6 +- .../codec/tsdb/TsdbDocValueBwcTests.java | 153 +++++++++++------- 2 files changed, 96 insertions(+), 63 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java index a8a39122dd71f..f7b86e614499c 100644 --- 
a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java @@ -288,11 +288,15 @@ public static class FieldsReader extends DocValuesProducer { } } - // FORK note: the reason why PerFieldDocValuesFormat is forked:. + // FORK note: the reason why PerFieldDocValuesFormat is forked: public DocValuesProducer getDocValuesProducer(FieldInfo field) { return fields.get(field.number); } + public Map<String, DocValuesProducer> getFormats() { + return formats; + } + @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { DocValuesProducer producer = fields.get(field.number); diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java index c5c6df6491850..adc0cb9347510 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java @@ -32,7 +32,9 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.core.SuppressForbidden; +import org.elasticsearch.index.codec.Elasticsearch816Codec; import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.XPerFieldDocValuesFormat; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.elasticsearch.test.ESTestCase; @@ -48,9 +50,23 @@ public class TsdbDocValueBwcTests extends ESTestCase { public void testMixedIndex() throws Exception { Codec oldCodec = TestUtil.alwaysDocValuesFormat(new TestES87TSDBDocValuesFormat()); + Codec newCodec = TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat()); + testMixedIndex(oldCodec, newCodec); + } + + public void
testMixedIndex816To900Lucene101() throws Exception { + Codec oldCodec = new Elasticsearch816Codec() { + + final DocValuesFormat docValuesFormat = new TestES87TSDBDocValuesFormat(); + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return docValuesFormat; + } + }; Codec newCodec = new Elasticsearch900Lucene101Codec() { - final ES819TSDBDocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); + final DocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); @Override public DocValuesFormat getDocValuesFormatForField(String field) { @@ -111,55 +127,53 @@ void testMixedIndex(Codec oldCodec, Codec newCodec) throws IOException, NoSuchFi } } // Check documents before force merge: - try (var iw = new IndexWriter(dir, getTimeSeriesIndexWriterConfig(hostnameField, timestampField, newCodec))) { - try (var reader = DirectoryReader.open(iw)) { - assertOldDocValuesFormatVersion(reader); + try (var reader = DirectoryReader.open(dir)) { + assertOldDocValuesFormatVersion(reader); - var hostNameDV = MultiDocValues.getSortedValues(reader, hostnameField); - assertNotNull(hostNameDV); - var timestampDV = MultiDocValues.getSortedNumericValues(reader, timestampField); - assertNotNull(timestampDV); - var counterOneDV = MultiDocValues.getNumericValues(reader, "counter_1"); - if (counterOneDV == null) { - counterOneDV = DocValues.emptyNumeric(); - } - var gaugeOneDV = MultiDocValues.getSortedNumericValues(reader, "gauge_1"); - if (gaugeOneDV == null) { - gaugeOneDV = DocValues.emptySortedNumeric(); - } - var tagsDV = MultiDocValues.getSortedSetValues(reader, "tags"); - if (tagsDV == null) { - tagsDV = DocValues.emptySortedSet(); - } - for (int i = 0; i < numDocs; i++) { - assertEquals(i, hostNameDV.nextDoc()); - String actualHostName = hostNameDV.lookupOrd(hostNameDV.ordValue()).utf8ToString(); - assertTrue("unexpected host name:" + actualHostName, actualHostName.startsWith("host-")); + var hostNameDV = 
MultiDocValues.getSortedValues(reader, hostnameField); + assertNotNull(hostNameDV); + var timestampDV = MultiDocValues.getSortedNumericValues(reader, timestampField); + assertNotNull(timestampDV); + var counterOneDV = MultiDocValues.getNumericValues(reader, "counter_1"); + if (counterOneDV == null) { + counterOneDV = DocValues.emptyNumeric(); + } + var gaugeOneDV = MultiDocValues.getSortedNumericValues(reader, "gauge_1"); + if (gaugeOneDV == null) { + gaugeOneDV = DocValues.emptySortedNumeric(); + } + var tagsDV = MultiDocValues.getSortedSetValues(reader, "tags"); + if (tagsDV == null) { + tagsDV = DocValues.emptySortedSet(); + } + for (int i = 0; i < numDocs; i++) { + assertEquals(i, hostNameDV.nextDoc()); + String actualHostName = hostNameDV.lookupOrd(hostNameDV.ordValue()).utf8ToString(); + assertTrue("unexpected host name:" + actualHostName, actualHostName.startsWith("host-")); - assertEquals(i, timestampDV.nextDoc()); - long timestamp = timestampDV.nextValue(); - long lowerBound = baseTimestamp; - long upperBound = baseTimestamp + numDocs; - assertTrue( - "unexpected timestamp [" + timestamp + "], expected between [" + lowerBound + "] and [" + upperBound + "]", - timestamp >= lowerBound && timestamp < upperBound - ); - if (counterOneDV.advanceExact(i)) { - long counterOneValue = counterOneDV.longValue(); - assertTrue("unexpected counter [" + counterOneValue + "]", counterOneValue >= 0 && counterOneValue < counter1); - } - if (gaugeOneDV.advanceExact(i)) { - for (int j = 0; j < gaugeOneDV.docValueCount(); j++) { - long value = gaugeOneDV.nextValue(); - assertTrue("unexpected gauge [" + value + "]", Arrays.binarySearch(gauge1Values, value) >= 0); - } + assertEquals(i, timestampDV.nextDoc()); + long timestamp = timestampDV.nextValue(); + long lowerBound = baseTimestamp; + long upperBound = baseTimestamp + numDocs; + assertTrue( + "unexpected timestamp [" + timestamp + "], expected between [" + lowerBound + "] and [" + upperBound + "]", + timestamp >= lowerBound 
&& timestamp < upperBound + ); + if (counterOneDV.advanceExact(i)) { + long counterOneValue = counterOneDV.longValue(); + assertTrue("unexpected counter [" + counterOneValue + "]", counterOneValue >= 0 && counterOneValue < counter1); + } + if (gaugeOneDV.advanceExact(i)) { + for (int j = 0; j < gaugeOneDV.docValueCount(); j++) { + long value = gaugeOneDV.nextValue(); + assertTrue("unexpected gauge [" + value + "]", Arrays.binarySearch(gauge1Values, value) >= 0); } - if (tagsDV.advanceExact(i)) { - for (int j = 0; j < tagsDV.docValueCount(); j++) { - long ordinal = tagsDV.nextOrd(); - String actualTag = tagsDV.lookupOrd(ordinal).utf8ToString(); - assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); - } + } + if (tagsDV.advanceExact(i)) { + for (int j = 0; j < tagsDV.docValueCount(); j++) { + long ordinal = tagsDV.nextOrd(); + String actualTag = tagsDV.lookupOrd(ordinal).utf8ToString(); + assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); } } } @@ -259,6 +273,7 @@ private void assertOldDocValuesFormatVersion(DirectoryReader reader) throws NoSu var dvReader = leaf.getDocValuesReader(); var field = getFormatsFieldFromPerFieldFieldsReader(dvReader.getClass()); Map formats = (Map) field.get(dvReader); + assertThat(formats, Matchers.aMapWithSize(1)); var tsdbDvReader = (DocValuesProducer) formats.get("ES87TSDB_0"); tsdbDvReader.checkIntegrity(); assertThat(tsdbDvReader, Matchers.instanceOf(ES87TSDBDocValuesProducer.class)); @@ -267,25 +282,39 @@ private void assertOldDocValuesFormatVersion(DirectoryReader reader) throws NoSu private void assertNewDocValuesFormatVersion(DirectoryReader reader) throws NoSuchFieldException, IllegalAccessException, IOException, ClassNotFoundException { - if (System.getSecurityManager() != null) { - // With jvm version 24 entitlements are used and security manager is nog longer used. 
- // Making this assertion work with security manager requires granting the entire test codebase privileges to use - // suppressAccessChecks and suppressAccessChecks. This is undesired from a security manager perspective. - logger.info("not asserting doc values format version, because security manager is used"); - return; - } for (var leafReaderContext : reader.leaves()) { var leaf = (SegmentReader) leafReaderContext.reader(); var dvReader = leaf.getDocValuesReader(); - var field = getFormatsFieldFromPerFieldFieldsReader(dvReader.getClass()); - Map formats = (Map) field.get(dvReader); - var tsdbDvReader = (DocValuesProducer) formats.get("ES819TSDB_0"); - tsdbDvReader.checkIntegrity(); - assertThat( - tsdbDvReader, - Matchers.instanceOf(Class.forName("org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesProducer")) - ); + dvReader.checkIntegrity(); + + if (dvReader instanceof XPerFieldDocValuesFormat.FieldsReader perFieldDvReader) { + var formats = perFieldDvReader.getFormats(); + assertThat(formats, Matchers.aMapWithSize(1)); + var tsdbDvReader = formats.get("ES819TSDB_0"); + tsdbDvReader.checkIntegrity(); + assertThat( + tsdbDvReader, + Matchers.instanceOf(Class.forName("org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesProducer")) + ); + } else { + if (System.getSecurityManager() != null) { + // With jvm version 24 entitlements are used and security manager is no longer used. + // Making this assertion work with security manager requires granting the entire test codebase privileges to use + // suppressAccessChecks and suppressAccessChecks. This is undesired from a security manager perspective.
+ logger.info("not asserting doc values format version, because security manager is used"); + continue; + } + var field = getFormatsFieldFromPerFieldFieldsReader(dvReader.getClass()); + Map formats = (Map) field.get(dvReader); + assertThat(formats, Matchers.aMapWithSize(1)); + var tsdbDvReader = (DocValuesProducer) formats.get("ES819TSDB_0"); + tsdbDvReader.checkIntegrity(); + assertThat( + tsdbDvReader, + Matchers.instanceOf(Class.forName("org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesProducer")) + ); + } } } From 5e8ea4236e71553efc0493fd8fda03ec7fda56b5 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Sat, 5 Apr 2025 15:00:51 +0200 Subject: [PATCH 42/43] Assert per field format field info attributes. --- .../codec/tsdb/TsdbDocValueBwcTests.java | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java index adc0cb9347510..b04b5c0906d48 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java @@ -49,13 +49,13 @@ public class TsdbDocValueBwcTests extends ESTestCase { public void testMixedIndex() throws Exception { - Codec oldCodec = TestUtil.alwaysDocValuesFormat(new TestES87TSDBDocValuesFormat()); - Codec newCodec = TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat()); + var oldCodec = TestUtil.alwaysDocValuesFormat(new TestES87TSDBDocValuesFormat()); + var newCodec = TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat()); testMixedIndex(oldCodec, newCodec); } public void testMixedIndex816To900Lucene101() throws Exception { - Codec oldCodec = new Elasticsearch816Codec() { + var oldCodec = new Elasticsearch816Codec() { final DocValuesFormat docValuesFormat = new TestES87TSDBDocValuesFormat(); @@ -64,7 +64,7 @@ 
public DocValuesFormat getDocValuesFormatForField(String field) { return docValuesFormat; } }; - Codec newCodec = new Elasticsearch900Lucene101Codec() { + var newCodec = new Elasticsearch900Lucene101Codec() { final DocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); @@ -129,6 +129,16 @@ void testMixedIndex(Codec oldCodec, Codec newCodec) throws IOException, NoSuchFi // Check documents before force merge: try (var reader = DirectoryReader.open(dir)) { assertOldDocValuesFormatVersion(reader); + // Assert per field format field info attributes: + // (XPerFieldDocValuesFormat must produce the same attributes as PerFieldDocValuesFormat for BWC. + // Otherwise, doc values fields may disappear) + for (var leaf : reader.leaves()) { + for (var fieldInfo : leaf.reader().getFieldInfos()) { + assertThat(fieldInfo.attributes(), Matchers.aMapWithSize(2)); + assertThat(fieldInfo.attributes(), Matchers.hasEntry("PerFieldDocValuesFormat.suffix", "0")); + assertThat(fieldInfo.attributes(), Matchers.hasEntry("PerFieldDocValuesFormat.format", "ES87TSDB")); + } + } var hostNameDV = MultiDocValues.getSortedValues(reader, hostnameField); assertNotNull(hostNameDV); @@ -189,6 +199,15 @@ void testMixedIndex(Codec oldCodec, Codec newCodec) throws IOException, NoSuchFi assertEquals(numDocs, reader.maxDoc()); assertNewDocValuesFormatVersion(reader); var leaf = reader.leaves().get(0).reader(); + // Assert per field format field info attributes: + // (XPerFieldDocValuesFormat must produce the same attributes as PerFieldDocValuesFormat for BWC. 
+ // Otherwise, doc values fields may disappear) + for (var fieldInfo : leaf.getFieldInfos()) { + assertThat(fieldInfo.attributes(), Matchers.aMapWithSize(2)); + assertThat(fieldInfo.attributes(), Matchers.hasEntry("PerFieldDocValuesFormat.suffix", "0")); + assertThat(fieldInfo.attributes(), Matchers.hasEntry("PerFieldDocValuesFormat.format", "ES819TSDB")); + } + var hostNameDV = leaf.getSortedDocValues(hostnameField); assertNotNull(hostNameDV); var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField)); From a44ab595498b5c91dd971a587f8d7f86b038146c Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 8 Apr 2025 20:45:23 +0200 Subject: [PATCH 43/43] move per field dv code to dedicated package --- server/src/main/java/module-info.java | 1 + .../index/codec/Elasticsearch900Lucene101Codec.java | 1 + .../index/codec/{ => perfield}/XPerFieldDocValuesFormat.java | 2 +- .../index/codec/{ => perfield}/XPerFieldMergeState.java | 2 +- .../index/codec/tsdb/es819/DocValuesConsumerUtil.java | 2 +- .../elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java | 2 +- 6 files changed, 6 insertions(+), 4 deletions(-) rename server/src/main/java/org/elasticsearch/index/codec/{ => perfield}/XPerFieldDocValuesFormat.java (99%) rename server/src/main/java/org/elasticsearch/index/codec/{ => perfield}/XPerFieldMergeState.java (99%) diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 6a1b5bfb97685..9fa84efcd1099 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -475,4 +475,5 @@ exports org.elasticsearch.monitor.metrics; exports org.elasticsearch.plugins.internal.rewriter to org.elasticsearch.inference; exports org.elasticsearch.lucene.util.automaton; + exports org.elasticsearch.index.codec.perfield; } diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java 
b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java index 596f672d2630e..d96495fb0f615 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java @@ -19,6 +19,7 @@ import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; /** diff --git a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java b/server/src/main/java/org/elasticsearch/index/codec/perfield/XPerFieldDocValuesFormat.java similarity index 99% rename from server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java rename to server/src/main/java/org/elasticsearch/index/codec/perfield/XPerFieldDocValuesFormat.java index f7b86e614499c..2b5a5d9d45f10 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldDocValuesFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/perfield/XPerFieldDocValuesFormat.java @@ -7,7 +7,7 @@ * License v3.0 only", or the "Server Side Public License, v 1". 
*/ -package org.elasticsearch.index.codec; +package org.elasticsearch.index.codec.perfield; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; diff --git a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldMergeState.java b/server/src/main/java/org/elasticsearch/index/codec/perfield/XPerFieldMergeState.java similarity index 99% rename from server/src/main/java/org/elasticsearch/index/codec/XPerFieldMergeState.java rename to server/src/main/java/org/elasticsearch/index/codec/perfield/XPerFieldMergeState.java index 8c8b90c6b4bdd..72a8c4bc1492b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/XPerFieldMergeState.java +++ b/server/src/main/java/org/elasticsearch/index/codec/perfield/XPerFieldMergeState.java @@ -7,7 +7,7 @@ * License v3.0 only", or the "Server Side Public License, v 1". */ -package org.elasticsearch.index.codec; +package org.elasticsearch.index.codec.perfield; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.index.DocValuesType; diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java index 471a7dcdea280..d6dae9ea882f9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java @@ -12,7 +12,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; -import org.elasticsearch.index.codec.XPerFieldDocValuesFormat; +import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; /** * Contains logic to determine whether optimized merge can occur. 
diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java index b04b5c0906d48..9c41e7a80ed66 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java @@ -34,7 +34,7 @@ import org.elasticsearch.core.SuppressForbidden; import org.elasticsearch.index.codec.Elasticsearch816Codec; import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; -import org.elasticsearch.index.codec.XPerFieldDocValuesFormat; +import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.elasticsearch.test.ESTestCase;