Skip to content

Commit f53d159

Browse files
Limit analyzed text for highlighting (improvements) (#28808)
Increase the default limit of `index.highlight.max_analyzed_offset` to 1M characters instead of the previous 10K. Enhance the error message thrown when this offset is exceeded so that it includes the field name, index name and doc_id. Relates to elastic/kibana#16764
1 parent f207aac commit f53d159

File tree

8 files changed

+24
-30
lines changed

8 files changed

+24
-30
lines changed

docs/reference/index-modules.asciidoc

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ specific index module:
202202

203203
The maximum number of characters that will be analyzed for a highlight request.
204204
This setting is only applicable when highlighting is requested on a text that was indexed without offsets or term vectors.
205-
Defaults to `10000`.
205+
Defaults to `1000000`.
206206

207207
`index.max_terms_count`::
208208

docs/reference/migration/migrate_7_0/analysis.asciidoc

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,5 @@ Highlighting a text that was indexed without offsets or term vectors,
2121
requires analysis of this text in memory in real time during the search request.
2222
For large texts this analysis may take a substantial amount of time and memory.
2323
To protect against this, the maximum number of characters that will be analyzed has been
24-
limited to 10000. This default limit can be changed
24+
limited to 1000000. This default limit can be changed
2525
for a particular index with the index setting `index.highlight.max_analyzed_offset`.

docs/reference/search/request/highlighting.asciidoc

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
104104
[WARNING]
105105
Plain highlighting for large texts may require a substantial amount of time and memory.
106106
To protect against this, the maximum number of text characters that will be analyzed has been
107-
limited to 10000. This default limit can be changed
107+
limited to 1000000. This default limit can be changed
108108
for a particular index with the index setting `index.highlight.max_analyzed_offset`.
109109

110110
[[highlighting-settings]]

server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java

+1-13
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@
3737
import org.elasticsearch.common.Nullable;
3838
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
3939
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
40-
import org.elasticsearch.index.IndexSettings;
4140
import org.elasticsearch.index.search.ESToParentBlockJoinQuery;
4241

4342
import java.io.IOException;
@@ -68,7 +67,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
6867
private final BreakIterator breakIterator;
6968
private final Locale breakIteratorLocale;
7069
private final int noMatchSize;
71-
private final int maxAnalyzedOffset;
7270

7371
/**
7472
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -83,7 +81,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
8381
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
8482
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
8583
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
86-
* @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
8784
*/
8885
public CustomUnifiedHighlighter(IndexSearcher searcher,
8986
Analyzer analyzer,
@@ -92,16 +89,14 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
9289
@Nullable Locale breakIteratorLocale,
9390
@Nullable BreakIterator breakIterator,
9491
String fieldValue,
95-
int noMatchSize,
96-
int maxAnalyzedOffset) {
92+
int noMatchSize) {
9793
super(searcher, analyzer);
9894
this.offsetSource = offsetSource;
9995
this.breakIterator = breakIterator;
10096
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
10197
this.passageFormatter = passageFormatter;
10298
this.fieldValue = fieldValue;
10399
this.noMatchSize = noMatchSize;
104-
this.maxAnalyzedOffset = maxAnalyzedOffset;
105100
}
106101

107102
/**
@@ -125,13 +120,6 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
125120
@Override
126121
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
127122
int cacheCharsThreshold) throws IOException {
128-
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
129-
throw new IllegalArgumentException(
130-
"The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
131-
maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
132-
IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
133-
"For large texts, indexing with offsets or term vectors is recommended!");
134-
}
135123
// we only highlight one field, one document at a time
136124
return Collections.singletonList(new String[]{fieldValue});
137125
}

server/src/main/java/org/elasticsearch/index/IndexSettings.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,11 @@ public final class IndexSettings {
123123
* A setting describing the maximum number of characters that will be analyzed for a highlight request.
124124
* This setting is only applicable when highlighting is requested on a text that was indexed without
125125
* offsets or term vectors.
126-
* The default maximum of 10000 characters is defensive as for highlighting larger texts,
126+
* The default maximum of 1M characters is defensive as for highlighting larger texts,
127127
* indexing with offsets or term vectors is recommended.
128128
*/
129129
public static final Setting<Integer> MAX_ANALYZED_OFFSET_SETTING =
130-
Setting.intSetting("index.highlight.max_analyzed_offset", 10000, 1, Property.Dynamic, Property.IndexScope);
130+
Setting.intSetting("index.highlight.max_analyzed_offset", 1000000, 1, Property.Dynamic, Property.IndexScope);
131131

132132

133133
/**

server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java

+6-5
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,12 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
113113
String text = convertFieldValue(mapper.fieldType(), textToHighlight);
114114
if (text.length() > maxAnalyzedOffset) {
115115
throw new IllegalArgumentException(
116-
"The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
117-
maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
118-
IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
119-
"For large texts, indexing with offsets or term vectors, and highlighting with unified or " +
120-
"fvh highlighter is recommended!");
116+
"The length of [" + highlighterContext.fieldName + "] field of [" + hitContext.hit().getId() +
117+
"] doc of [" + context.indexShard().shardId().getIndexName() + "] index " +
118+
"has exceeded [" + maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " +
119+
"This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
120+
"] index level setting. " + "For large texts, indexing with offsets or term vectors, and highlighting " +
121+
"with unified or fvh highlighter is recommended!");
121122
}
122123

123124
try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {

server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java

+11-6
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,20 @@
3232
import org.apache.lucene.util.CollectionUtil;
3333
import org.elasticsearch.common.Strings;
3434
import org.elasticsearch.common.text.Text;
35+
import org.elasticsearch.index.IndexSettings;
3536
import org.elasticsearch.index.mapper.DocumentMapper;
3637
import org.elasticsearch.index.mapper.FieldMapper;
3738
import org.elasticsearch.index.mapper.KeywordFieldMapper;
3839
import org.elasticsearch.index.mapper.MappedFieldType;
39-
import org.elasticsearch.index.mapper.MapperService;
4040
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
4141
import org.elasticsearch.search.fetch.FetchSubPhase;
4242
import org.elasticsearch.search.internal.SearchContext;
4343

4444
import java.io.IOException;
4545
import java.text.BreakIterator;
4646
import java.util.ArrayList;
47-
import java.util.HashMap;
4847
import java.util.List;
4948
import java.util.Locale;
50-
import java.util.Map;
5149
import java.util.stream.Collectors;
5250

5351
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
@@ -83,21 +81,28 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
8381
final CustomUnifiedHighlighter highlighter;
8482
final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
8583
final OffsetSource offsetSource = getOffsetSource(fieldMapper.fieldType());
84+
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
85+
throw new IllegalArgumentException(
86+
"The length of [" + highlighterContext.fieldName + "] field of [" + hitContext.hit().getId() +
87+
"] doc of [" + context.indexShard().shardId().getIndexName() + "] index " + "has exceeded [" +
88+
maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " +
89+
"This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
90+
"] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
91+
}
8692
if (field.fieldOptions().numberOfFragments() == 0) {
8793
// we use a control char to separate values, which is the only char that the custom break iterator
8894
// breaks the text on, so we don't lose the distinction between the different values of a field and we
8995
// get back a snippet per value
9096
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
9197
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
92-
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(),
93-
maxAnalyzedOffset);
98+
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
9499
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
95100
} else {
96101
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
97102
BreakIterator bi = getBreakIterator(field);
98103
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
99104
field.fieldOptions().boundaryScannerLocale(), bi,
100-
fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset);
105+
fieldValue, field.fieldOptions().noMatchSize());
101106
numberOfFragments = field.fieldOptions().numberOfFragments();
102107
}
103108

server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer a
7878
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
7979
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
8080
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
81-
breakIterator, rawValue, noMatchSize, 10000);
81+
breakIterator, rawValue, noMatchSize);
8282
highlighter.setFieldMatcher((name) -> "text".equals(name));
8383
final Snippet[] snippets =
8484
highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);

0 commit comments

Comments
 (0)