Store _doc_count field as custom term frequency #65776

Merged: 6 commits, Dec 3, 2020
org/elasticsearch/index/mapper/CustomTermFreqField.java (new file)
@@ -0,0 +1,103 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

/**
 * Custom field that allows storing an integer value as a term frequency in lucene.
 */
public final class CustomTermFreqField extends Field {

    private static final FieldType FIELD_TYPE = new FieldType();
    static {
        FIELD_TYPE.setTokenized(false);
        FIELD_TYPE.setOmitNorms(true);
        FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    private int fieldValue;

    public CustomTermFreqField(String fieldName, CharSequence term, int fieldValue) {
        super(fieldName, term, FIELD_TYPE);
        this.fieldValue = fieldValue;
    }

    public void setFieldValue(int fieldValue) {
        this.fieldValue = fieldValue;
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
        CustomTermFreqTokenStream stream;
        if (reuse instanceof CustomTermFreqTokenStream) {
            stream = (CustomTermFreqTokenStream) reuse;
        } else {
            stream = new CustomTermFreqTokenStream();
        }
        stream.setValues((String) fieldsData, fieldValue);
        return stream;
    }

    private static final class CustomTermFreqTokenStream extends TokenStream {
        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
        private final TermFrequencyAttribute freqAttribute = addAttribute(TermFrequencyAttribute.class);
        private boolean used = true;
        private String value = null;
        private int freq = 0;

        private CustomTermFreqTokenStream() {
        }

        /** Sets the values */
        void setValues(String value, int freq) {
            this.value = value;
            this.freq = freq;
        }

        @Override
        public boolean incrementToken() {
            if (used) {
                return false;
            }
            clearAttributes();
            termAttribute.append(value);
            freqAttribute.setTermFrequency(freq);
            used = true;
            return true;
        }

        @Override
        public void reset() {
            used = false;
        }

        @Override
        public void close() {
            value = null;
        }
    }
}
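
For reference, a minimal standalone sketch of the trick this field implements; it is not part of the PR, and the demo class, in-memory directory, and analyzer choice are illustrative assumptions. It indexes one document with a CustomTermFreqField and reads the integer back as the term's frequency through PostingsEnum, which is the same read path DocCountProvider uses further down:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.elasticsearch.index.mapper.CustomTermFreqField;

public class CustomTermFreqDemo {
    public static void main(String[] args) throws Exception {
        try (Directory dir = new ByteBuffersDirectory();
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            // Store the integer 42 as the term frequency of the single term "_doc_count".
            // The analyzer is irrelevant: CustomTermFreqField supplies its own token stream.
            Document doc = new Document();
            doc.add(new CustomTermFreqField("_doc_count", "_doc_count", 42));
            writer.addDocument(doc);
            writer.commit();

            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                LeafReader leaf = reader.leaves().get(0).reader();
                // Read the value back as the term's frequency; no doc values involved.
                PostingsEnum postings = leaf.postings(new Term("_doc_count", "_doc_count"), PostingsEnum.FREQS);
                postings.advance(0);
                System.out.println(postings.freq()); // prints 42
            }
        }
    }
}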
org/elasticsearch/index/mapper/DocCountFieldMapper.java

@@ -18,9 +18,6 @@
  */
 package org.elasticsearch.index.mapper;
 
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.NumericDocValuesField;
-import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.Query;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentParserUtils;
@@ -42,10 +39,10 @@ public static final class DocCountFieldType extends MappedFieldType {
 
         public static final DocCountFieldType INSTANCE = new DocCountFieldType();
 
-        private static final Long defaultValue = 1L;
+        public static final int DEFAULT_VALUE = 1;
 
         public DocCountFieldType() {
-            super(NAME, false, false, true, TextSearchInfo.NONE, Collections.emptyMap());
+            super(NAME, false, false, false, TextSearchInfo.NONE, Collections.emptyMap());
         }
 
         @Override
@@ -55,12 +52,12 @@ public String typeName() {
 
         @Override
         public String familyTypeName() {
-            return NumberFieldMapper.NumberType.LONG.typeName();
+            return NumberFieldMapper.NumberType.INTEGER.typeName();
         }
 
         @Override
         public Query existsQuery(QueryShardContext context) {
-            return new DocValuesFieldExistsQuery(NAME);
+            throw new QueryShardException(context, "Field [" + name() + "] of type [" + typeName() + "] does not support exists queries");
         }
 
         @Override
@@ -74,13 +71,13 @@ public ValueFetcher valueFetcher(QueryShardContext context, String format) {
                 throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats.");
             }
 
-            return new SourceValueFetcher(name(), context, defaultValue) {
+            return new SourceValueFetcher(name(), context, DEFAULT_VALUE) {
                 @Override
                 protected Object parseSourceValue(Object value) {
                     if ("".equals(value)) {
-                        return defaultValue;
+                        return DEFAULT_VALUE;
                     } else {
-                        return NumberFieldMapper.NumberType.objectToLong(value, false);
+                        return NumberFieldMapper.NumberType.INTEGER.parse(value, false);
                     }
                 }
             };
@@ -96,17 +93,19 @@ protected void parseCreateField(ParseContext context) throws IOException {
         XContentParser parser = context.parser();
         XContentParserUtils.ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, parser.currentToken(), parser);
 
-        long value = parser.longValue(false);
+        // Check that _doc_count is a single value and not an array
+        if (context.doc().getByKey(NAME) != null) {
+            throw new IllegalArgumentException("Arrays are not allowed for field [" + fieldType().name() + "].");
+        }
+
+        int value = parser.intValue(false);
         if (value <= 0) {
-            throw new IllegalArgumentException("Field [" + fieldType().name() + "] must be a positive integer.");
+            throw new IllegalArgumentException("Field [" + fieldType().name() + "] must be a positive integer. Value ["
+                + value + "] is not allowed.");
         }
-        final Field docCount = new NumericDocValuesField(NAME, value);
-        context.doc().add(docCount);
+        context.doc().addWithKey(NAME, new CustomTermFreqField(NAME, NAME, value));
     }
 
     @Override
     public void preParse(ParseContext context) { }
 
     @Override
     public DocCountFieldType fieldType() {
         return (DocCountFieldType) super.fieldType();
org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java

@@ -87,7 +87,7 @@ public final void collectBucket(LeafBucketCollector subCollector, int doc, long
      * Same as {@link #collectBucket(LeafBucketCollector, int, long)}, but doesn't check if the docCounts needs to be re-sized.
      */
     public final void collectExistingBucket(LeafBucketCollector subCollector, int doc, long bucketOrd) throws IOException {
-        long docCount = docCountProvider.getDocCount(doc);
+        int docCount = docCountProvider.getDocCount(doc);
         if (docCounts.increment(bucketOrd, docCount) == docCount) {
             // We calculate the final number of buckets only during the reduce phase. But we still need to
             // trigger bucket consumer from time to time in order to give it a chance to check available memory and break
org/elasticsearch/search/aggregations/bucket/DocCountProvider.java

@@ -19,9 +19,9 @@
 
 package org.elasticsearch.search.aggregations.bucket;
 
-import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
 import org.elasticsearch.index.mapper.DocCountFieldMapper;
 
 import java.io.IOException;
@@ -33,17 +33,25 @@
  */
 public class DocCountProvider {
 
-    private NumericDocValues docCountValues;
+    public static final int DEFAULT_VALUE = DocCountFieldMapper.DocCountFieldType.DEFAULT_VALUE;
 
-    public long getDocCount(int doc) throws IOException {
-        if (docCountValues != null && docCountValues.advanceExact(doc)) {
-            return docCountValues.longValue();
+    private PostingsEnum docCountPostings;
+
+    public int getDocCount(int doc) throws IOException {
+        if (docCountPostings == null) {
+            return DEFAULT_VALUE;
+        }
+        if (docCountPostings.docID() < doc) {
+            docCountPostings.advance(doc);
+        }
+        if (docCountPostings.docID() == doc) {
+            return docCountPostings.freq();
         } else {
-            return 1L;
+            return DEFAULT_VALUE;
         }
     }
 
     public void setLeafReaderContext(LeafReaderContext ctx) throws IOException {
-        docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
+        docCountPostings = ctx.reader().postings(new Term(DocCountFieldMapper.NAME, DocCountFieldMapper.NAME));
     }
 }
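
As a usage illustration of the provider above, here is a hedged sketch; the helper class and method are assumptions, not PR code. It sums the effective doc count of every document in one segment. Note that getDocCount must be called with ascending doc ids, since the underlying PostingsEnum only moves forward:

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.elasticsearch.search.aggregations.bucket.DocCountProvider;

public final class DocCountSums {
    /** Hypothetical helper: total pre-aggregated docs represented by one segment (deletions ignored for brevity). */
    public static long sumDocCounts(LeafReaderContext ctx) throws IOException {
        DocCountProvider provider = new DocCountProvider();
        provider.setLeafReaderContext(ctx); // positions a PostingsEnum on the _doc_count term
        long sum = 0;
        for (int doc = 0; doc < ctx.reader().maxDoc(); doc++) { // ascending doc ids only
            sum += provider.getDocCount(doc); // freq() of the _doc_count term, or DEFAULT_VALUE (1)
        }
        return sum;
    }
}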
org/elasticsearch/search/aggregations/bucket/composite/CompositeAggregator.java

@@ -430,7 +430,7 @@ private LeafBucketCollector getFirstPassCollector(RoaringDocIdSet.Builder builde
             @Override
             public void collect(int doc, long bucket) throws IOException {
                 try {
-                    long docCount = docCountProvider.getDocCount(doc);
+                    int docCount = docCountProvider.getDocCount(doc);
                     if (queue.addIfCompetitive(indexSortPrefix, docCount)) {
                         if (builder != null && lastDoc != doc) {
                             builder.add(doc);
org/elasticsearch/search/aggregations/bucket/composite/SortedDocsProducer.java

@@ -69,7 +69,7 @@ protected boolean processBucket(CompositeValuesCollectorQueue queue, LeafReaderC
             @Override
             public void collect(int doc, long bucket) throws IOException {
                 hasCollected[0] = true;
-                long docCount = docCountProvider.getDocCount(doc);
+                int docCount = docCountProvider.getDocCount(doc);
                 if (queue.addIfCompetitive(docCount)) {
                     topCompositeCollected[0]++;
                     if (adder != null && doc != lastDoc) {
org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

@@ -315,7 +315,7 @@ public void collect(int doc, long owningBucketOrd) throws IOException {
                     return;
                 }
                 int ord = singleValues.ordValue();
-                long docCount = docCountProvider.getDocCount(doc);
+                int docCount = docCountProvider.getDocCount(doc);
                 segmentDocCounts.increment(ord + 1, docCount);
             }
         });
@@ -329,7 +329,7 @@ public void collect(int doc, long owningBucketOrd) throws IOException {
                     return;
                 }
                 for (long segmentOrd = segmentOrds.nextOrd(); segmentOrd != NO_MORE_ORDS; segmentOrd = segmentOrds.nextOrd()) {
-                    long docCount = docCountProvider.getDocCount(doc);
+                    int docCount = docCountProvider.getDocCount(doc);
                     segmentDocCounts.increment(segmentOrd + 1, docCount);
                 }
             }
org/elasticsearch/index/mapper/DocCountFieldMapperTests.java

@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.index.mapper;
 
-import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.IndexableField;
 
 import static org.hamcrest.Matchers.containsString;
@@ -28,9 +27,6 @@ public class DocCountFieldMapperTests extends MapperServiceTestCase {
     private static final String CONTENT_TYPE = DocCountFieldMapper.CONTENT_TYPE;
     private static final String DOC_COUNT_FIELD = DocCountFieldMapper.NAME;
 
-    /**
-     * Test parsing field mapping and adding simple field
-     */
     public void testParseValue() throws Exception {
         DocumentMapper mapper = createDocumentMapper(mapping(b -> {}));
         ParsedDocument doc = mapper.parse(source(b ->
@@ -39,8 +35,7 @@ public void testParseValue() throws Exception {
         ));
 
         IndexableField field = doc.rootDoc().getField(DOC_COUNT_FIELD);
-        assertEquals(100L, field.numericValue());
-        assertEquals(DocValuesType.NUMERIC, field.fieldType().docValuesType());
+        assertEquals(DOC_COUNT_FIELD, field.stringValue());
         assertEquals(1, doc.rootDoc().getFields(DOC_COUNT_FIELD).length);
     }
 
@@ -66,6 +61,13 @@ public void testInvalidDocument_NonNumericDocCount() throws Exception {
     public void testInvalidDocument_FractionalDocCount() throws Exception {
         DocumentMapper mapper = createDocumentMapper(mapping(b -> {}));
         Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, 100.23))));
-        assertThat(e.getCause().getMessage(), containsString("100.23 cannot be converted to Long without data loss"));
+        assertThat(e.getCause().getMessage(), containsString("100.23 cannot be converted to Integer without data loss"));
     }
+
+    public void testInvalidDocument_ArrayDocCount() throws Exception {
+        DocumentMapper mapper = createDocumentMapper(mapping(b -> {}));
+        Exception e = expectThrows(MapperParsingException.class,
+            () -> mapper.parse(source(b -> b.array(CONTENT_TYPE, 10, 20, 30))));
+        assertThat(e.getCause().getMessage(), containsString("Arrays are not allowed for field [_doc_count]."));
+    }
 }
org/elasticsearch/index/mapper/DocCountFieldTypeTests.java

@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.index.mapper;
 
-import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.elasticsearch.index.query.QueryShardException;
 
 import java.io.IOException;
@@ -42,14 +41,15 @@ public void testRangeQuery() {
 
     public void testExistsQuery() {
         MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType();
-        assertTrue(ft.existsQuery(randomMockShardContext()) instanceof DocValuesFieldExistsQuery);
+        QueryShardException e = expectThrows(QueryShardException.class, () -> ft.existsQuery(randomMockShardContext()));
+        assertEquals("Field [_doc_count] of type [_doc_count] does not support exists queries", e.getMessage());
     }
 
     public void testFetchSourceValue() throws IOException {
         MappedFieldType fieldType = new DocCountFieldMapper.DocCountFieldType();
-        assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, 14));
-        assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, "14"));
-        assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, ""));
-        assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, null));
+        assertEquals(Arrays.asList(14), fetchSourceValue(fieldType, 14));
+        assertEquals(Arrays.asList(14), fetchSourceValue(fieldType, "14"));
+        assertEquals(Arrays.asList(1), fetchSourceValue(fieldType, ""));
+        assertEquals(Arrays.asList(1), fetchSourceValue(fieldType, null));
     }
 }
org/elasticsearch/search/aggregations/bucket/DocCountProviderTests.java

@@ -20,12 +20,12 @@
 package org.elasticsearch.search.aggregations.bucket;
 
 import org.apache.lucene.document.IntPoint;
-import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
 import org.elasticsearch.common.CheckedConsumer;
+import org.elasticsearch.index.mapper.CustomTermFreqField;
 import org.elasticsearch.index.mapper.DocCountFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.NumberFieldMapper;
@@ -48,11 +48,11 @@ public class DocCountProviderTests extends AggregatorTestCase {
     public void testDocsWithDocCount() throws IOException {
         testAggregation(new MatchAllDocsQuery(), iw -> {
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 4),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 4),
                 new SortedNumericDocValuesField(NUMBER_FIELD, 1)
             ));
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 5),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 5),
                 new SortedNumericDocValuesField(NUMBER_FIELD, 7)
             ));
             iw.addDocument(List.of(
@@ -77,11 +77,11 @@ public void testDocsWithoutDocCount() throws IOException {
     public void testQueryFiltering() throws IOException {
         testAggregation(IntPoint.newRangeQuery(NUMBER_FIELD, 4, 5), iw -> {
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 4),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 4),
                 new IntPoint(NUMBER_FIELD, 6)
             ));
             iw.addDocument(List.of(
-                new NumericDocValuesField(DOC_COUNT_FIELD, 2),
+                new CustomTermFreqField(DOC_COUNT_FIELD, DOC_COUNT_FIELD, 2),
                 new IntPoint(NUMBER_FIELD, 5)
            ));
            iw.addDocument(List.of(