Skip to content

Commit f9af60b

Browse files
authored
Add query param to limit highlighting to specified length (#67325)
Add a `max_analyzed_offset` query parameter to allow users to limit the highlighting of text fields to a value less than or equal to the `index.highlight.max_analyzed_offset`, thus avoiding an exception when the length of the text field exceeds the limit. The highlighting still takes place, but stops at the length defined by the new parameter. Closes: #52155
1 parent 9d1b2a5 commit f9af60b

File tree

14 files changed

+564
-214
lines changed

14 files changed

+564
-214
lines changed

docs/reference/index-modules.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ specific index module:
223223
The maximum number of tokens that can be produced using the _analyze API.
224224
Defaults to `10000`.
225225

226+
[[index-max-analyzed-offset]]
226227
`index.highlight.max_analyzed_offset`::
227228

228229
The maximum number of characters that will be analyzed for a highlight request.

docs/reference/search/search-your-data/highlighting.asciidoc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
117117
Plain highlighting for large texts may require a substantial amount of time and memory.
118118
To protect against this, the maximum number of text characters that will be analyzed has been
119119
limited to 1000000. This default limit can be changed
120-
for a particular index with the index setting `index.highlight.max_analyzed_offset`.
120+
for a particular index with the index setting <<index-max-analyzed-offset,`index.highlight.max_analyzed_offset`>>.
121121

122122
[discrete]
123123
[[highlighting-settings]]
@@ -242,6 +242,17 @@ require_field_match:: By default, only fields that contains a query match are
242242
highlighted. Set `require_field_match` to `false` to highlight all fields.
243243
Defaults to `true`.
244244

245+
[[max-analyzed-offset]]
246+
max_analyzed_offset:: By default, the maximum number of characters
247+
analyzed for a highlight request is bounded by the value defined in the
248+
<<index-max-analyzed-offset, `index.highlight.max_analyzed_offset`>> setting,
249+
and when the number of characters exceeds this limit an error is returned. If
250+
this setting is set to a positive value, the highlighting stops at this defined
251+
maximum limit, and the rest of the text is not processed, thus not highlighted and
252+
no error is returned. The <<max-analyzed-offset, `max_analyzed_offset`>> query setting
253+
does *not* override the <<index-max-analyzed-offset, `index.highlight.max_analyzed_offset`>>
254+
which prevails when it is set to a lower value than the query setting.
255+
245256
tags_schema:: Set to `styled` to use the built-in tag schema. The `styled`
246257
schema defines the following `pre_tags` and defines `post_tags` as
247258
`</em>`.
@@ -1119,4 +1130,4 @@ using the passages's `matchStarts` and `matchEnds` information:
11191130
I'll be the <em>only</em> <em>fox</em> in the world for you.
11201131

11211132
This kind of formatted string is the final result of the highlighter returned
1122-
to the user.
1133+
to the user.

plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ protected List<Object> loadFieldValues(
5050
}
5151

5252
@Override
53-
protected Analyzer wrapAnalyzer(Analyzer analyzer) {
54-
return new AnnotatedHighlighterAnalyzer(super.wrapAnalyzer(analyzer));
53+
protected Analyzer wrapAnalyzer(Analyzer analyzer, Integer maxAnalyzedOffset) {
54+
return new AnnotatedHighlighterAnalyzer(super.wrapAnalyzer(analyzer, maxAnalyzedOffset));
5555
}
5656

5757
@Override

plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java

Lines changed: 143 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@
88

99
package org.elasticsearch.search.fetch.subphase.highlight;
1010

11+
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
12+
import static org.hamcrest.CoreMatchers.equalTo;
13+
14+
import java.net.URLEncoder;
15+
import java.text.BreakIterator;
16+
import java.util.ArrayList;
17+
import java.util.Locale;
18+
1119
import org.apache.lucene.analysis.Analyzer;
1220
import org.apache.lucene.analysis.standard.StandardAnalyzer;
1321
import org.apache.lucene.document.Document;
@@ -31,95 +39,96 @@
3139
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
3240
import org.apache.lucene.search.uhighlight.Snippet;
3341
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
42+
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
3443
import org.apache.lucene.store.Directory;
3544
import org.elasticsearch.common.Strings;
3645
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
3746
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
3847
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
3948
import org.elasticsearch.test.ESTestCase;
4049

41-
import java.net.URLEncoder;
42-
import java.text.BreakIterator;
43-
import java.util.ArrayList;
44-
import java.util.Locale;
50+
public class AnnotatedTextHighlighterTests extends ESTestCase {
4551

46-
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
47-
import static org.hamcrest.CoreMatchers.equalTo;
52+
private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
53+
Query query, Locale locale, BreakIterator breakIterator,
54+
int noMatchSize, String[] expectedPassages) throws Exception {
4855

49-
public class AnnotatedTextHighlighterTests extends ESTestCase {
56+
assertHighlightOneDoc(fieldName, markedUpInputs, query, locale, breakIterator, noMatchSize, expectedPassages,
57+
Integer.MAX_VALUE, null);
58+
}
5059

5160
private void assertHighlightOneDoc(String fieldName, String []markedUpInputs,
5261
Query query, Locale locale, BreakIterator breakIterator,
53-
int noMatchSize, String[] expectedPassages) throws Exception {
54-
55-
56-
// Annotated fields wrap the usual analyzer with one that injects extra tokens
57-
Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
58-
Directory dir = newDirectory();
59-
IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
60-
iwc.setMergePolicy(newTieredMergePolicy(random()));
61-
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
62-
FieldType ft = new FieldType(TextField.TYPE_STORED);
63-
if (randomBoolean()) {
64-
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
65-
} else {
66-
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
67-
}
68-
ft.freeze();
69-
Document doc = new Document();
70-
for (String input : markedUpInputs) {
71-
Field field = new Field(fieldName, "", ft);
72-
field.setStringValue(input);
73-
doc.add(field);
74-
}
75-
iw.addDocument(doc);
76-
DirectoryReader reader = iw.getReader();
77-
IndexSearcher searcher = newSearcher(reader);
78-
iw.close();
79-
80-
AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
81-
for (int i = 0; i < markedUpInputs.length; i++) {
82-
annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
83-
}
84-
AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
85-
hiliteAnalyzer.setAnnotations(annotations);
86-
AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
87-
passageFormatter.setAnnotations(annotations);
88-
89-
ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
90-
for (int i = 0; i < annotations.length; i++) {
91-
plainTextForHighlighter.add(annotations[i].textMinusMarkup);
92-
}
62+
int noMatchSize, String[] expectedPassages,
63+
int maxAnalyzedOffset, Integer queryMaxAnalyzedOffset) throws Exception {
9364

94-
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
95-
assertThat(topDocs.totalHits.value, equalTo(1L));
96-
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
97-
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
98-
searcher,
99-
hiliteAnalyzer,
100-
null,
101-
passageFormatter,
102-
locale,
103-
breakIterator,
104-
"index",
105-
"text",
106-
query,
107-
noMatchSize,
108-
expectedPassages.length,
109-
name -> "text".equals(name),
110-
Integer.MAX_VALUE
111-
);
112-
highlighter.setFieldMatcher((name) -> "text".equals(name));
113-
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
114-
assertEquals(expectedPassages.length, snippets.length);
115-
for (int i = 0; i < snippets.length; i++) {
116-
assertEquals(expectedPassages[i], snippets[i].getText());
65+
try (Directory dir = newDirectory()) {
66+
// Annotated fields wrap the usual analyzer with one that injects extra tokens
67+
Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
68+
IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
69+
iwc.setMergePolicy(newTieredMergePolicy(random()));
70+
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
71+
FieldType ft = new FieldType(TextField.TYPE_STORED);
72+
if (randomBoolean()) {
73+
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
74+
} else {
75+
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
76+
}
77+
ft.freeze();
78+
Document doc = new Document();
79+
for (String input : markedUpInputs) {
80+
Field field = new Field(fieldName, "", ft);
81+
field.setStringValue(input);
82+
doc.add(field);
83+
}
84+
iw.addDocument(doc);
85+
try (DirectoryReader reader = iw.getReader()) {
86+
IndexSearcher searcher = newSearcher(reader);
87+
iw.close();
88+
89+
AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
90+
for (int i = 0; i < markedUpInputs.length; i++) {
91+
annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
92+
}
93+
AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
94+
hiliteAnalyzer.setAnnotations(annotations);
95+
AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
96+
passageFormatter.setAnnotations(annotations);
97+
98+
ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
99+
for (int i = 0; i < annotations.length; i++) {
100+
plainTextForHighlighter.add(annotations[i].textMinusMarkup);
101+
}
102+
103+
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
104+
assertThat(topDocs.totalHits.value, equalTo(1L));
105+
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
106+
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
107+
searcher,
108+
hiliteAnalyzer,
109+
UnifiedHighlighter.OffsetSource.ANALYSIS,
110+
passageFormatter,
111+
locale,
112+
breakIterator,
113+
"index",
114+
"text",
115+
query,
116+
noMatchSize,
117+
expectedPassages.length,
118+
name -> "text".equals(name),
119+
maxAnalyzedOffset,
120+
queryMaxAnalyzedOffset
121+
);
122+
highlighter.setFieldMatcher((name) -> "text".equals(name));
123+
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
124+
assertEquals(expectedPassages.length, snippets.length);
125+
for (int i = 0; i < snippets.length; i++) {
126+
assertEquals(expectedPassages[i], snippets[i].getText());
127+
}
128+
}
117129
}
118-
reader.close();
119-
dir.close();
120130
}
121131

122-
123132
public void testAnnotatedTextStructuredMatch() throws Exception {
124133
// Check that a structured token eg a URL can be highlighted in a query
125134
// on marked-up
@@ -191,4 +200,65 @@ public void testBadAnnotation() throws Exception {
191200
assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
192201
}
193202

203+
public void testExceedMaxAnalyzedOffset() throws Exception {
204+
TermQuery query = new TermQuery(new Term("text", "exceeds"));
205+
BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
206+
assertHighlightOneDoc("text", new String[] { "[Short Text](Short+Text)" }, query, Locale.ROOT, breakIterator, 0, new String[] {},
207+
10, null);
208+
209+
IllegalArgumentException e = expectThrows(
210+
IllegalArgumentException.class,
211+
() -> assertHighlightOneDoc(
212+
"text",
213+
new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
214+
query,
215+
Locale.ROOT,
216+
breakIterator,
217+
0,
218+
new String[] {},
219+
20,
220+
null
221+
)
222+
);
223+
assertEquals(
224+
"The length [38] of field [text] in doc[0]/index[index] exceeds the [index.highlight.max_analyzed_offset] limit [20]. "
225+
+ "To avoid this error, set the query parameter [max_analyzed_offset] to a value less than index setting [20] and this "
226+
+ "will tolerate long field values by truncating them.",
227+
e.getMessage()
228+
);
229+
230+
final Integer queryMaxOffset = randomIntBetween(21, 1000);
231+
e = expectThrows(
232+
IllegalArgumentException.class,
233+
() -> assertHighlightOneDoc(
234+
"text",
235+
new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
236+
query,
237+
Locale.ROOT,
238+
breakIterator,
239+
0,
240+
new String[] {},
241+
20,
242+
queryMaxOffset
243+
)
244+
);
245+
assertEquals(
246+
"The length [38] of field [text] in doc[0]/index[index] exceeds the [index.highlight.max_analyzed_offset] limit [20]. "
247+
+ "To avoid this error, set the query parameter [max_analyzed_offset] to a value less than index setting [20] and this "
248+
+ "will tolerate long field values by truncating them.",
249+
e.getMessage()
250+
);
251+
252+
assertHighlightOneDoc(
253+
"text",
254+
new String[] { "[Long Text Exceeds](Long+Text+Exceeds) MAX analyzed offset [Long Text Exceeds](Long+Text+Exceeds)" },
255+
query,
256+
Locale.ROOT,
257+
breakIterator,
258+
0,
259+
new String[] { "Long Text [Exceeds](_hit_term=exceeds) MAX analyzed offset [Long Text Exceeds](Long+Text+Exceeds)" },
260+
20,
261+
15
262+
);
263+
}
194264
}

rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ setup:
66
body:
77
settings:
88
number_of_shards: 1
9-
index.highlight.max_analyzed_offset: 10
9+
index.highlight.max_analyzed_offset: 30
1010
mappings:
1111
properties:
1212
field1:
@@ -37,6 +37,20 @@ setup:
3737
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
3838
- match: { error.root_cause.0.type: "illegal_argument_exception" }
3939

40+
---
41+
"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":
42+
43+
- skip:
44+
version: " - 7.99.99"
45+
reason: max_analyzed_offset query param added in 8.0
46+
47+
- do:
48+
search:
49+
rest_total_hits_as_int: true
50+
index: test1
51+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}, "max_analyzed_offset": "20"}}
52+
- match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
53+
4054

4155
---
4256
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
@@ -46,9 +60,23 @@ setup:
4660
search:
4761
rest_total_hits_as_int: true
4862
index: test1
49-
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
63+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
5064
- match: { error.root_cause.0.type: "illegal_argument_exception" }
5165

66+
---
67+
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":
68+
69+
- skip:
70+
version: " - 7.99.99"
71+
reason: max_analyzed_offset query param added in 8.0
72+
73+
- do:
74+
search:
75+
rest_total_hits_as_int: true
76+
index: test1
77+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}, "max_analyzed_offset": 20}}
78+
- match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
79+
5280

5381
---
5482
"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
@@ -71,3 +99,35 @@ setup:
7199
index: test1
72100
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
73101
- match: { error.root_cause.0.type: "illegal_argument_exception" }
102+
103+
---
104+
"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":
105+
106+
- skip:
107+
version: " - 7.99.99"
108+
reason: max_analyzed_offset query param added in 8.0
109+
110+
- do:
111+
search:
112+
rest_total_hits_as_int: true
113+
index: test1
114+
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "max_analyzed_offset": 20}}
115+
- match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
116+
117+
---
118+
"Plain highlighter with max_analyzed_offset < 0 should FAIL":
119+
120+
- skip:
121+
version: " - 7.99.99"
122+
reason: max_analyzed_offset query param added in 8.0
123+
124+
- do:
125+
catch: bad_request
126+
search:
127+
rest_total_hits_as_int: true
128+
index: test1
129+
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "max_analyzed_offset": -10}}
130+
- match: { status: 400 }
131+
- match: { error.root_cause.0.type: "x_content_parse_exception" }
132+
- match: { error.caused_by.type: "illegal_argument_exception" }
133+
- match: { error.caused_by.reason: "[max_analyzed_offset] must be a positive integer" }

0 commit comments

Comments
 (0)