Commit ad1553b

Limit analyzed text for highlighting (improvements) (#28907)
Increase the default limit of index.highlight.max_analyzed_offset to 1M characters instead of the previous 10K. Enhance the error message issued when the limit is exceeded so it includes the field name, index name, and document id. Relates to #27934, elastic/kibana#16764
1 parent 8f91a6a commit ad1553b
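
The setting named in the commit message, index.highlight.max_analyzed_offset, is configured per index. Below is a minimal sketch of how it could be supplied when creating an index through the Java API; the index name "my-index" and the 2,000,000-character value are illustrative assumptions, not part of this commit.

import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
import org.elasticsearch.common.settings.Settings;

public class HighlightOffsetSettingExample {
    public static void main(String[] args) {
        // Illustrative only: raise the per-index limit on the amount of text analyzed
        // for highlighting above the 1M-character default planned for the next major version.
        Settings settings = Settings.builder()
                .put("index.highlight.max_analyzed_offset", 2000000)
                .build();

        // The settings would be attached to an index creation (or settings update) request;
        // executing the request requires a client, which is omitted here.
        CreateIndexRequest request = new CreateIndexRequest("my-index").settings(settings);
        System.out.println(request.settings());
    }
}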

File tree: 6 files changed, +48 -49 lines

docs/reference/migration/migrate_6_0/analysis.asciidoc (+2 -2)

@@ -17,7 +17,7 @@ Highlighting a text that was indexed without offsets or term vectors,
 requires analysis of this text in memory real time during the search request.
 For large texts this analysis may take substantial amount of time and memory.
 To protect against this, the maximum number of characters that to be analyzed will be
-limited to 10000 in the next major Elastic version. For this version, by default the limit
-is not set. A deprecation warning will be issued when an analyzed text exceeds 10000.
+limited to 1000000 in the next major Elastic version. For this version, by default the limit
+is not set. A deprecation warning will be issued when an analyzed text exceeds 1000000.
 The limit can be set for a particular index with the index setting
 `index.highlight.max_analyzed_offset`.
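
The behaviour this migration note describes, and that the Java changes further down implement, is a two-tier check: when no per-index limit is set, text longer than the 1,000,000-character default planned for the next major version only triggers a deprecation warning, while an explicitly configured limit that is exceeded fails the request. Below is a minimal standalone sketch of that decision; the class, method, and messages are illustrative, not the actual Elasticsearch code.

// Illustrative sketch of the warn-vs-fail decision; not the Elasticsearch implementation.
final class AnalyzedOffsetCheck {

    // Default limit planned to be enforced in the next major version.
    private static final int DEFAULT_MAX_ANALYZED_OFFSET_7 = 1000000;

    // A maxAnalyzedOffset of -1 means index.highlight.max_analyzed_offset is not set.
    static void check(String text, int maxAnalyzedOffset) {
        if (maxAnalyzedOffset == -1 && text.length() > DEFAULT_MAX_ANALYZED_OFFSET_7) {
            // Limit not configured: warn only, so existing requests keep working in this version.
            System.err.println("DEPRECATION: analyzed text length " + text.length()
                    + " exceeds the future default of " + DEFAULT_MAX_ANALYZED_OFFSET_7);
        } else if (maxAnalyzedOffset > 0 && text.length() > maxAnalyzedOffset) {
            // Limit explicitly configured and exceeded: reject the highlighting request.
            throw new IllegalArgumentException("analyzed text length " + text.length()
                    + " exceeds index.highlight.max_analyzed_offset of " + maxAnalyzedOffset);
        }
    }
}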

docs/reference/search/request/highlighting.asciidoc (+1 -1)

@@ -107,7 +107,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
 [WARNING]
 Plain highlighting for large texts may require substantial amount of time and memory.
 To protect against this, the maximum number of text characters to be analyzed will be
-limited to 10000 in the next major Elastic version. The default limit is not set for this version,
+limited to 1000000 in the next major Elastic version. The default limit is not set for this version,
 but can be set for a particular index with the index setting `index.highlight.max_analyzed_offset`.

 [[highlighting-settings]]

server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java (+1 -26)

@@ -35,12 +35,9 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
-import org.elasticsearch.common.logging.DeprecationLogger;
-import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.lucene.all.AllTermQuery;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
-import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.search.ESToParentBlockJoinQuery;

 import java.io.IOException;
@@ -71,7 +68,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     private final BreakIterator breakIterator;
     private final Locale breakIteratorLocale;
     private final int noMatchSize;
-    private final int maxAnalyzedOffset;

     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -86,7 +82,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
      * If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
      * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
      * @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
-     * @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
      */
     public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     Analyzer analyzer,
@@ -95,16 +90,14 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    int noMatchSize,
-                                    int maxAnalyzedOffset) {
+                                    int noMatchSize) {
         super(searcher, analyzer);
         this.offsetSource = offsetSource;
         this.breakIterator = breakIterator;
         this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
         this.noMatchSize = noMatchSize;
-        this.maxAnalyzedOffset = maxAnalyzedOffset;
     }

     /**
@@ -128,24 +121,6 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
                                                    int cacheCharsThreshold) throws IOException {
-        // Issue deprecation warning if maxAnalyzedOffset is not set, and field length > default setting for 7.0
-        final int defaultMaxAnalyzedOffset7 = 10000;
-        if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset == -1) && (fieldValue.length() > defaultMaxAnalyzedOffset7)) {
-            DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(CustomUnifiedHighlighter.class));
-            deprecationLogger.deprecated(
-                "The length of text to be analyzed for highlighting [" + fieldValue.length() +
-                    "] exceeded the allowed maximum of [" + defaultMaxAnalyzedOffset7 + "] set for the next major Elastic version. " +
-                    "For large texts, indexing with offsets or term vectors is recommended!");
-        }
-        // Throw an error if maxAnalyzedOffset is explicitly set by the user, and field length > maxAnalyzedOffset
-        if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset > 0) && (fieldValue.length() > maxAnalyzedOffset)) {
-            // maxAnalyzedOffset is not set by user
-            throw new IllegalArgumentException(
-                "The length of text to be analyzed for highlighting [" + fieldValue.length() +
-                    "] exceeded the allowed maximum of [" + maxAnalyzedOffset + "]. This maximum can be set by changing the [" +
-                    IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
-                    "For large texts, indexing with offsets or term vectors is recommended!");
-        }
         // we only highlight one field, one document at a time
         return Collections.singletonList(new String[]{fieldValue});
     }

server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java (+14 -10)

@@ -55,6 +55,7 @@

 public class PlainHighlighter implements Highlighter {
     private static final String CACHE_KEY = "highlight-plain";
+    private static final DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(PlainHighlighter.class));

     @Override
     public HighlightField highlight(HighlighterContext highlighterContext) {
@@ -110,26 +111,29 @@ public HighlightField highlight(HighlighterContext highlighterContext) {

         try {
             textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
-            final int defaultMaxAnalyzedOffset7 = 10000;
+            final int maxAnalyzedOffset7 = 1000000;
             for (Object textToHighlight : textsToHighlight) {
                 String text = convertFieldValue(mapper.fieldType(), textToHighlight);

                 // Issue deprecation warning if maxAnalyzedOffset is not set, and text length > default setting for 7.0
-                if ((maxAnalyzedOffset == -1) && (text.length() > defaultMaxAnalyzedOffset7)) {
-                    DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(PlainHighlighter.class));
+                if ((maxAnalyzedOffset == -1) && (text.length() > maxAnalyzedOffset7)) {
                     deprecationLogger.deprecated(
-                        "The length of text to be analyzed for highlighting [" + text.length() + "] exceeded the allowed maximum of [" +
-                            defaultMaxAnalyzedOffset7 + "] set for the next major Elastic version. " +
-                            "For large texts, indexing with offsets or term vectors is recommended!");
+                        "The length [" + text.length()+ "] of [" + highlighterContext.fieldName + "] field of [" +
+                            hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index has " +
+                            "exceeded the allowed maximum of ["+ maxAnalyzedOffset7 + "] set for the next major Elastic version. " +
+                            "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                            "] index level setting. For large texts, indexing with offsets or term vectors is recommended!");
                 }
                 // Throw an error if maxAnalyzedOffset is explicitly set by the user, and text length > maxAnalyzedOffset
                 if ((maxAnalyzedOffset > 0) && (text.length() > maxAnalyzedOffset)) {
                     // maxAnalyzedOffset is not set by user
                     throw new IllegalArgumentException(
-                        "The length of text to be analyzed for highlighting [" + text.length() +
-                            "] exceeded the allowed maximum of [" + maxAnalyzedOffset + "]. This maximum can be set by changing the [" +
-                            IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
-                            "For large texts, indexing with offsets or term vectors is recommended!");
+                        "The length [" + text.length()+ "] of [" + highlighterContext.fieldName + "] field of [" +
+                            hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " +
+                            "has exceeded [" + maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " +
+                            "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                            "] index level setting. " + "For large texts, indexing with offsets or term vectors, and highlighting " +
+                            "with unified or fvh highlighter is recommended!");
                 }

                 try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
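
With this change, a field whose analyzed length exceeds an explicitly configured limit produces a message along these lines (the length 2000000, the field name body, the doc id 1, the index name my-index, and the limit 1000000 are illustrative values substituted into the format string above):

The length [2000000] of [body] field of [1] doc of [my-index] index has exceeded [1000000] - maximum allowed to be analyzed for highlighting. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors, and highlighting with unified or fvh highlighter is recommended!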

server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java (+29 -9)

@@ -31,28 +31,30 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.logging.DeprecationLogger;
+import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.text.Text;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
-import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
 import org.elasticsearch.search.internal.SearchContext;
+import org.elasticsearch.index.IndexSettings;

 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
-import java.util.Map;
 import java.util.stream.Collectors;

 import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;

 public class UnifiedHighlighter implements Highlighter {
+    private static final DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(UnifiedHighlighter.class));
+
     @Override
     public boolean canHighlight(FieldMapper fieldMapper) {
         return true;
@@ -67,8 +69,6 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
         Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
         CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
             field.fieldOptions().postTags()[0], encoder);
-        final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
-
         List<Snippet> snippets = new ArrayList<>();
         int numberOfFragments;
         try {
@@ -83,21 +83,41 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             final CustomUnifiedHighlighter highlighter;
             final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
             final OffsetSource offsetSource = getOffsetSource(fieldMapper.fieldType());
+
+            final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
+            // Issue a deprecation warning if maxAnalyzedOffset is not set, and field length > default setting for 7.0
+            final int maxAnalyzedOffset7 = 1000000;
+            if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset == -1) && (fieldValue.length() > maxAnalyzedOffset7)) {
+                deprecationLogger.deprecated(
+                    "The length [" + fieldValue.length() + "] of [" + highlighterContext.fieldName + "] field of [" +
+                        hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index has " +
+                        "exceeded the allowed maximum of [" + maxAnalyzedOffset7 + "] set for the next major Elastic version. " +
+                        "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                        "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
+            }
+            // Throw an error if maxAnalyzedOffset is explicitly set by the user, and field length > maxAnalyzedOffset
+            if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset > 0) && (fieldValue.length() > maxAnalyzedOffset)) {
+                throw new IllegalArgumentException(
+                    "The length [" + fieldValue.length() + "] of [" + highlighterContext.fieldName + "] field of [" +
+                        hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " +
+                        "has exceeded [" + maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " +
+                        "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                        "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
+            }
+
             if (field.fieldOptions().numberOfFragments() == 0) {
                 // we use a control char to separate values, which is the only char that the custom break iterator
                 // breaks the text on, so we don't lose the distinction between the different values of a field and we
                 // get back a snippet per value
                 CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
-                    field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(),
-                    maxAnalyzedOffset);
+                    field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
                 numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
             } else {
                 //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
                 BreakIterator bi = getBreakIterator(field);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
-                    field.fieldOptions().boundaryScannerLocale(), bi,
-                    fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset);
+                    field.fieldOptions().boundaryScannerLocale(), bi, fieldValue, field.fieldOptions().noMatchSize());
                 numberOfFragments = field.fieldOptions().numberOfFragments();
             }
server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java (+1 -1)

@@ -79,7 +79,7 @@ private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer a
         String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
         CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
             new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
-            breakIterator, rawValue, noMatchSize, -1);
+            breakIterator, rawValue, noMatchSize);
         highlighter.setFieldMatcher((name) -> "text".equals(name));
         final Snippet[] snippets =
             highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
