Commit c3ff707

Store _doc_count field as custom term frequency (#65825)
A while back, Lucene introduced the ability to index custom term frequencies, i.e. users can provide a numeric value to be indexed as a term's frequency instead of having Lucene compute the frequency from the number of occurrences of the term. This PR modifies the _doc_count field so that it is stored as a Lucene custom term frequency.

A benefit of moving to custom term frequencies is that Lucene automatically computes global term statistics such as totalTermFreq, which gives us the sum of the values of the _doc_count field across an entire shard. This could in turn be useful for generalizing optimizations to rollup indices, e.g. bucket aggregations where all documents fall into the same bucket.

Relates to #64503
1 parent aaf853a commit c3ff707
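
For context, the Lucene mechanism this commit builds on can be shown with plain Lucene APIs, outside of Elasticsearch. The sketch below is illustrative only (the class SingleTokenFreqStream and the field name doc_count_demo are made up for the example, and it assumes the Lucene 8.x APIs this branch uses): a single-token TokenStream reports its frequency through TermFrequencyAttribute, the field is indexed with DOCS_AND_FREQS, and the per-document values then surface as the global totalTermFreq statistic mentioned above.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class CustomTermFreqDemo {

    /** Single-token stream whose term frequency is supplied by the caller instead of being counted. */
    static final class SingleTokenFreqStream extends TokenStream {
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final TermFrequencyAttribute freqAtt = addAttribute(TermFrequencyAttribute.class);
        private final String term;
        private final int freq;
        private boolean exhausted;

        SingleTokenFreqStream(String term, int freq) {
            this.term = term;
            this.freq = freq;
        }

        @Override
        public boolean incrementToken() {
            if (exhausted) {
                return false;
            }
            clearAttributes();
            termAtt.append(term);
            freqAtt.setTermFrequency(freq); // the custom frequency Lucene will index for this document
            exhausted = true;
            return true;
        }

        @Override
        public void reset() {
            exhausted = false;
        }
    }

    public static void main(String[] args) throws Exception {
        // Frequencies must be indexed, but positions must not be (custom term frequencies are
        // rejected when positions are enabled), hence DOCS_AND_FREQS.
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        fieldType.setOmitNorms(true);
        fieldType.freeze();

        try (Directory dir = new ByteBuffersDirectory();
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
            for (int value : new int[] {4, 5, 7}) {
                Document doc = new Document();
                doc.add(new Field("doc_count_demo", new SingleTokenFreqStream("doc_count_demo", value), fieldType));
                writer.addDocument(doc);
            }
            writer.commit();
            try (IndexReader reader = DirectoryReader.open(dir)) {
                // Lucene sums the per-document frequencies into a global statistic: 4 + 5 + 7 = 16.
                long sum = reader.totalTermFreq(new Term("doc_count_demo", "doc_count_demo"));
                System.out.println("sum of per-document values across the index: " + sum);
            }
        }
    }
}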

10 files changed, +160 -48 lines

server/src/main/java/org/elasticsearch/index/mapper/CustomTermFreqField.java

+103-0
@@ -0,0 +1,103 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+
+/**
+ * Custom field that allows storing an integer value as a term frequency in lucene.
+ */
+public final class CustomTermFreqField extends Field {
+
+    private static final FieldType FIELD_TYPE = new FieldType();
+    static {
+        FIELD_TYPE.setTokenized(false);
+        FIELD_TYPE.setOmitNorms(true);
+        FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+    }
+
+    private int fieldValue;
+
+    public CustomTermFreqField(String fieldName, CharSequence term, int fieldValue) {
+        super(fieldName, term, FIELD_TYPE);
+        this.fieldValue = fieldValue;
+    }
+
+    public void setFieldValue(int fieldValue) {
+        this.fieldValue = fieldValue;
+    }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+        CustomTermFreqTokenStream stream;
+        if (reuse instanceof CustomTermFreqTokenStream) {
+            stream = (CustomTermFreqTokenStream) reuse;
+        } else {
+            stream = new CustomTermFreqTokenStream();
+        }
+        stream.setValues((String) fieldsData, fieldValue);
+        return stream;
+    }
+
+    private static final class CustomTermFreqTokenStream extends TokenStream {
+        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+        private final TermFrequencyAttribute freqAttribute = addAttribute(TermFrequencyAttribute.class);
+        private boolean used = true;
+        private String value = null;
+        private int freq = 0;
+
+        private CustomTermFreqTokenStream() {
+        }
+
+        /** Sets the values */
+        void setValues(String value, int freq) {
+            this.value = value;
+            this.freq = freq;
+        }
+
+        @Override
+        public boolean incrementToken() {
+            if (used) {
+                return false;
+            }
+            clearAttributes();
+            termAttribute.append(value);
+            freqAttribute.setTermFrequency(freq);
+            used = true;
+            return true;
+        }
+
+        @Override
+        public void reset() {
+            used = false;
+        }
+
+        @Override
+        public void close() {
+            value = null;
+        }
+    }
+}

server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java

+16-17
@@ -18,9 +18,6 @@
  */
 package org.elasticsearch.index.mapper;
 
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.NumericDocValuesField;
-import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.Query;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentParserUtils;
@@ -42,10 +39,10 @@ public static final class DocCountFieldType extends MappedFieldType {
 
         public static final DocCountFieldType INSTANCE = new DocCountFieldType();
 
-        private static final Long defaultValue = 1L;
+        public static final int DEFAULT_VALUE = 1;
 
         public DocCountFieldType() {
-            super(NAME, false, false, true, TextSearchInfo.NONE, Collections.emptyMap());
+            super(NAME, false, false, false, TextSearchInfo.NONE, Collections.emptyMap());
         }
 
 
@@ -56,12 +53,12 @@ public String typeName() {
 
         @Override
         public String familyTypeName() {
-            return NumberFieldMapper.NumberType.LONG.typeName();
+            return NumberFieldMapper.NumberType.INTEGER.typeName();
         }
 
         @Override
         public Query existsQuery(QueryShardContext context) {
-            return new DocValuesFieldExistsQuery(NAME);
+            throw new QueryShardException(context, "Field [" + name() + "] of type [" + typeName() + "] does not support exists queries");
         }
 
         @Override
@@ -75,13 +72,13 @@ public ValueFetcher valueFetcher(QueryShardContext context, String format) {
                 throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats.");
             }
 
-            return new SourceValueFetcher(name(), context, defaultValue) {
+            return new SourceValueFetcher(name(), context, DEFAULT_VALUE) {
                 @Override
                 protected Object parseSourceValue(Object value) {
                     if ("".equals(value)) {
-                        return defaultValue;
+                        return DEFAULT_VALUE;
                     } else {
-                        return NumberFieldMapper.NumberType.objectToLong(value, false);
+                        return NumberFieldMapper.NumberType.INTEGER.parse(value, false);
                     }
                 }
             };
@@ -97,17 +94,19 @@ protected void parseCreateField(ParseContext context) throws IOException {
         XContentParser parser = context.parser();
         XContentParserUtils.ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, parser.currentToken(), parser);
 
-        long value = parser.longValue(false);
+        // Check that _doc_count is a single value and not an array
+        if (context.doc().getByKey(NAME) != null) {
+            throw new IllegalArgumentException("Arrays are not allowed for field [" + fieldType().name() + "].");
+        }
+
+        int value = parser.intValue(false);
         if (value <= 0) {
-            throw new IllegalArgumentException("Field [" + fieldType().name() + "] must be a positive integer.");
+            throw new IllegalArgumentException("Field [" + fieldType().name() + "] must be a positive integer. Value ["
+                + value + "] is not allowed.");
         }
-        final Field docCount = new NumericDocValuesField(NAME, value);
-        context.doc().add(docCount);
+        context.doc().addWithKey(NAME, new CustomTermFreqField(NAME, NAME, value));
     }
 
-    @Override
-    public void preParse(ParseContext context) { }
-
     @Override
     public DocCountFieldType fieldType() {
         return (DocCountFieldType) super.fieldType();

server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java

+1-1
@@ -87,7 +87,7 @@ public final void collectBucket(LeafBucketCollector subCollector, int doc, long
      * Same as {@link #collectBucket(LeafBucketCollector, int, long)}, but doesn't check if the docCounts needs to be re-sized.
      */
     public final void collectExistingBucket(LeafBucketCollector subCollector, int doc, long bucketOrd) throws IOException {
-        long docCount = docCountProvider.getDocCount(doc);
+        int docCount = docCountProvider.getDocCount(doc);
         if (docCounts.increment(bucketOrd, docCount) == docCount) {
             // We calculate the final number of buckets only during the reduce phase. But we still need to
             // trigger bucket consumer from time to time in order to give it a chance to check available memory and break

server/src/main/java/org/elasticsearch/search/aggregations/bucket/DocCountProvider.java

+16-8
@@ -19,9 +19,9 @@
 
 package org.elasticsearch.search.aggregations.bucket;
 
-import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
 import org.elasticsearch.index.mapper.DocCountFieldMapper;
 
 import java.io.IOException;
@@ -33,17 +33,25 @@
  */
 public class DocCountProvider {
 
-    private NumericDocValues docCountValues;
+    public static final int DEFAULT_VALUE = DocCountFieldMapper.DocCountFieldType.DEFAULT_VALUE;
 
-    public long getDocCount(int doc) throws IOException {
-        if (docCountValues != null && docCountValues.advanceExact(doc)) {
-            return docCountValues.longValue();
+    private PostingsEnum docCountPostings;
+
+    public int getDocCount(int doc) throws IOException {
+        if (docCountPostings == null) {
+            return DEFAULT_VALUE;
+        }
+        if (docCountPostings.docID() < doc) {
+            docCountPostings.advance(doc);
+        }
+        if (docCountPostings.docID() == doc) {
+            return docCountPostings.freq();
         } else {
-            return 1L;
+            return DEFAULT_VALUE;
         }
     }
 
     public void setLeafReaderContext(LeafReaderContext ctx) throws IOException {
-        docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
+        docCountPostings = ctx.reader().postings(new Term(DocCountFieldMapper.NAME, DocCountFieldMapper.NAME));
     }
 }
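
To see the read path above in isolation: a minimal sketch (plain Lucene, illustrative names, not part of this change) of how a value stored as a custom term frequency comes back per document, mirroring the advance()/freq() pattern that DocCountProvider now uses. Note that DocCountProvider keeps one PostingsEnum per segment and walks documents in order, whereas this sketch pulls a fresh PostingsEnum on each call for simplicity; returning 1 when the term is absent matches DEFAULT_VALUE, i.e. a document without _doc_count counts once.

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;

import java.io.IOException;

final class DocCountReadSketch {

    /**
     * Returns the value indexed as the term frequency of the single _doc_count term for docId,
     * or 1 (the default) when the document, or the whole segment, has no _doc_count term.
     */
    static int readDocCount(LeafReader leafReader, int docId) throws IOException {
        PostingsEnum postings = leafReader.postings(new Term("_doc_count", "_doc_count"), PostingsEnum.FREQS);
        if (postings == null) {
            return 1; // no document in this segment carries a _doc_count value
        }
        if (postings.docID() < docId) {
            postings.advance(docId); // postings are doc-id ordered, so a forward advance is enough
        }
        return postings.docID() == docId ? postings.freq() : 1;
    }
}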

server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeAggregator.java

+1-1
@@ -430,7 +430,7 @@ private LeafBucketCollector getFirstPassCollector(RoaringDocIdSet.Builder builde
             @Override
             public void collect(int doc, long bucket) throws IOException {
                 try {
-                    long docCount = docCountProvider.getDocCount(doc);
+                    int docCount = docCountProvider.getDocCount(doc);
                     if (queue.addIfCompetitive(indexSortPrefix, docCount)) {
                         if (builder != null && lastDoc != doc) {
                             builder.add(doc);

server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/SortedDocsProducer.java

+1-1
@@ -69,7 +69,7 @@ protected boolean processBucket(CompositeValuesCollectorQueue queue, LeafReaderC
             @Override
             public void collect(int doc, long bucket) throws IOException {
                 hasCollected[0] = true;
-                long docCount = docCountProvider.getDocCount(doc);
+                int docCount = docCountProvider.getDocCount(doc);
                 if (queue.addIfCompetitive(docCount)) {
                     topCompositeCollected[0]++;
                     if (adder != null && doc != lastDoc) {

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

+2-2
@@ -315,7 +315,7 @@ public void collect(int doc, long owningBucketOrd) throws IOException {
                     return;
                 }
                 int ord = singleValues.ordValue();
-                long docCount = docCountProvider.getDocCount(doc);
+                int docCount = docCountProvider.getDocCount(doc);
                 segmentDocCounts.increment(ord + 1, docCount);
             }
         });
@@ -329,7 +329,7 @@ public void collect(int doc, long owningBucketOrd) throws IOException {
                     return;
                 }
                 for (long segmentOrd = segmentOrds.nextOrd(); segmentOrd != NO_MORE_ORDS; segmentOrd = segmentOrds.nextOrd()) {
-                    long docCount = docCountProvider.getDocCount(doc);
+                    int docCount = docCountProvider.getDocCount(doc);
                     segmentDocCounts.increment(segmentOrd + 1, docCount);
                 }
             }

server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldMapperTests.java

+9-7
@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.index.mapper;
 
-import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.IndexableField;
 
 import static org.hamcrest.Matchers.containsString;
@@ -28,9 +27,6 @@ public class DocCountFieldMapperTests extends MapperServiceTestCase {
     private static final String CONTENT_TYPE = DocCountFieldMapper.CONTENT_TYPE;
     private static final String DOC_COUNT_FIELD = DocCountFieldMapper.NAME;
 
-    /**
-     * Test parsing field mapping and adding simple field
-     */
    public void testParseValue() throws Exception {
         DocumentMapper mapper = createDocumentMapper(mapping(b -> {}));
         ParsedDocument doc = mapper.parse(source(b ->
@@ -39,8 +35,7 @@ public void testParseValue() throws Exception {
         ));
 
         IndexableField field = doc.rootDoc().getField(DOC_COUNT_FIELD);
-        assertEquals(100L, field.numericValue());
-        assertEquals(DocValuesType.NUMERIC, field.fieldType().docValuesType());
+        assertEquals(DOC_COUNT_FIELD, field.stringValue());
         assertEquals(1, doc.rootDoc().getFields(DOC_COUNT_FIELD).length);
     }
 
@@ -66,6 +61,13 @@ public void testInvalidDocument_NonNumericDocCount() throws Exception {
     public void testInvalidDocument_FractionalDocCount() throws Exception {
         DocumentMapper mapper = createDocumentMapper(mapping(b -> {}));
         Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, 100.23))));
-        assertThat(e.getCause().getMessage(), containsString("100.23 cannot be converted to Long without data loss"));
+        assertThat(e.getCause().getMessage(), containsString("100.23 cannot be converted to Integer without data loss"));
+    }
+
+    public void testInvalidDocument_ArrayDocCount() throws Exception {
+        DocumentMapper mapper = createDocumentMapper(mapping(b -> {}));
+        Exception e = expectThrows(MapperParsingException.class,
+            () -> mapper.parse(source(b -> b.array(CONTENT_TYPE, 10, 20, 30))));
+        assertThat(e.getCause().getMessage(), containsString("Arrays are not allowed for field [_doc_count]."));
     }
 }

server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldTypeTests.java

+6-6
@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.index.mapper;
 
-import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.elasticsearch.index.query.QueryShardException;
 
 import java.io.IOException;
@@ -42,14 +41,15 @@ public void testRangeQuery() {
 
     public void testExistsQuery() {
         MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType();
-        assertTrue(ft.existsQuery(randomMockShardContext()) instanceof DocValuesFieldExistsQuery);
+        QueryShardException e = expectThrows(QueryShardException.class, () -> ft.existsQuery(randomMockShardContext()));
+        assertEquals("Field [_doc_count] of type [_doc_count] does not support exists queries", e.getMessage());
     }
 
     public void testFetchSourceValue() throws IOException {
         MappedFieldType fieldType = new DocCountFieldMapper.DocCountFieldType();
-        assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, 14));
-        assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, "14"));
-        assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, ""));
-        assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, null));
+        assertEquals(Arrays.asList(14), fetchSourceValue(fieldType, 14));
+        assertEquals(Arrays.asList(14), fetchSourceValue(fieldType, "14"));
+        assertEquals(Arrays.asList(1), fetchSourceValue(fieldType, ""));
+        assertEquals(Arrays.asList(1), fetchSourceValue(fieldType, null));
     }
 }

server/src/test/java/org/elasticsearch/search/aggregations/bucket/DocCountProviderTests.java

+5-5
@@ -20,12 +20,12 @@
 package org.elasticsearch.search.aggregations.bucket;
 
 import org.apache.lucene.document.IntPoint;
-import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
 import org.elasticsearch.common.CheckedConsumer;
+import org.elasticsearch.index.mapper.CustomTermFreqField;
 import org.elasticsearch.index.mapper.DocCountFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.NumberFieldMapper;
@@ -48,11 +48,11 @@ public class DocCountProviderTests extends AggregatorTestCase {
     public void testDocsWithDocCount() throws IOException {
         testAggregation(new MatchAllDocsQuery(), iw -> {
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 4),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 4),
                 new SortedNumericDocValuesField(NUMBER_FIELD, 1)
             ));
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 5),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 5),
                 new SortedNumericDocValuesField(NUMBER_FIELD, 7)
             ));
             iw.addDocument(List.of(
@@ -77,11 +77,11 @@ public void testDocsWithoutDocCount() throws IOException {
     public void testQueryFiltering() throws IOException {
         testAggregation(IntPoint.newRangeQuery(NUMBER_FIELD, 4, 5), iw -> {
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 4),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 4),
                 new IntPoint(NUMBER_FIELD, 6)
            ));
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 2),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 2),
                 new IntPoint(NUMBER_FIELD, 5)
             ));
             iw.addDocument(List.of(
