Skip to content

Commit 931e149

Browse files
authored
Add query param to limit highlighting to specified length (#67325) (#69016)
Add a `max_analyzed_offset` query parameter to allow users to limit the highlighting of text fields to a value less than or equal to the `index.highlight.max_analyzed_offset`, thus avoiding an exception when the length of the text field exceeds the limit. The highlighting still takes place, but stops at the length defined by the new parameter. Closes: #52155 (cherry picked from commit f9af60b)
1 parent 4b8c8f8 commit 931e149

File tree

14 files changed

+564
-214
lines changed

14 files changed

+564
-214
lines changed

docs/reference/index-modules.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ specific index module:
223223
The maximum number of tokens that can be produced using _analyze API.
224224
Defaults to `10000`.
225225

226+
[[index-max-analyzed-offset]]
226227
`index.highlight.max_analyzed_offset`::
227228

228229
The maximum number of characters that will be analyzed for a highlight request.

docs/reference/search/search-your-data/highlighting.asciidoc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
117117
Plain highlighting for large texts may require a substantial amount of time and memory.
118118
To protect against this, the maximum number of text characters that will be analyzed has been
119119
limited to 1000000. This default limit can be changed
120-
for a particular index with the index setting `index.highlight.max_analyzed_offset`.
120+
for a particular index with the index setting <<index-max-analyzed-offset,`index.highlight.max_analyzed_offset`>>.
121121

122122
[discrete]
123123
[[highlighting-settings]]
@@ -242,6 +242,17 @@ require_field_match:: By default, only fields that contains a query match are
242242
highlighted. Set `require_field_match` to `false` to highlight all fields.
243243
Defaults to `true`.
244244

245+
[[max-analyzed-offset]]
246+
max_analyzed_offset:: By default, the maximum number of characters
247+
analyzed for a highlight request is bounded by the value defined in the
248+
<<index-max-analyzed-offset, `index.highlight.max_analyzed_offset`>> setting,
249+
and when the number of characters exceeds this limit an error is returned. If
250+
this setting is set to a positive value, the highlighting stops at this defined
251+
maximum limit, and the rest of the text is not processed, thus not highlighted and
252+
no error is returned. The <<max-analyzed-offset, `max_analyzed_offset`>> query setting
253+
does *not* override the <<index-max-analyzed-offset, `index.highlight.max_analyzed_offset`>>
254+
which prevails when it's set to a lower value than the query setting.
255+
245256
tags_schema:: Set to `styled` to use the built-in tag schema. The `styled`
246257
schema defines the following `pre_tags` and defines `post_tags` as
247258
`</em>`.
@@ -1121,4 +1132,4 @@ using the passages's `matchStarts` and `matchEnds` information:
11211132
I'll be the <em>only</em> <em>fox</em> in the world for you.
11221133

11231134
This kind of formatted string is the final result of the highlighter returned
1124-
to the user.
1135+
to the user.

plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ protected List<Object> loadFieldValues(
5050
}
5151

5252
@Override
53-
protected Analyzer wrapAnalyzer(Analyzer analyzer) {
54-
return new AnnotatedHighlighterAnalyzer(super.wrapAnalyzer(analyzer));
53+
protected Analyzer wrapAnalyzer(Analyzer analyzer, Integer maxAnalyzedOffset) {
54+
return new AnnotatedHighlighterAnalyzer(super.wrapAnalyzer(analyzer, maxAnalyzedOffset));
5555
}
5656

5757
@Override

plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java

Lines changed: 143 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@
88

99
package org.elasticsearch.search.fetch.subphase.highlight;
1010

11+
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
12+
import static org.hamcrest.CoreMatchers.equalTo;
13+
14+
import java.net.URLEncoder;
15+
import java.text.BreakIterator;
16+
import java.util.ArrayList;
17+
import java.util.Locale;
18+
1119
import org.apache.lucene.analysis.Analyzer;
1220
import org.apache.lucene.analysis.standard.StandardAnalyzer;
1321
import org.apache.lucene.document.Document;
@@ -31,95 +39,96 @@
3139
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
3240
import org.apache.lucene.search.uhighlight.Snippet;
3341
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
42+
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
3443
import org.apache.lucene.store.Directory;
3544
import org.elasticsearch.common.Strings;
3645
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
3746
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
3847
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
3948
import org.elasticsearch.test.ESTestCase;
4049

41-
import java.net.URLEncoder;
42-
import java.text.BreakIterator;
43-
import java.util.ArrayList;
44-
import java.util.Locale;
50+
public class AnnotatedTextHighlighterTests extends ESTestCase {
4551

46-
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
47-
import static org.hamcrest.CoreMatchers.equalTo;
52+
private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
53+
Query query, Locale locale, BreakIterator breakIterator,
54+
int noMatchSize, String[] expectedPassages) throws Exception {
4855

49-
public class AnnotatedTextHighlighterTests extends ESTestCase {
56+
assertHighlightOneDoc(fieldName, markedUpInputs, query, locale, breakIterator, noMatchSize, expectedPassages,
57+
Integer.MAX_VALUE, null);
58+
}
5059

5160
private void assertHighlightOneDoc(String fieldName, String []markedUpInputs,
5261
Query query, Locale locale, BreakIterator breakIterator,
53-
int noMatchSize, String[] expectedPassages) throws Exception {
54-
55-
56-
// Annotated fields wrap the usual analyzer with one that injects extra tokens
57-
Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
58-
Directory dir = newDirectory();
59-
IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
60-
iwc.setMergePolicy(newTieredMergePolicy(random()));
61-
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
62-
FieldType ft = new FieldType(TextField.TYPE_STORED);
63-
if (randomBoolean()) {
64-
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
65-
} else {
66-
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
67-
}
68-
ft.freeze();
69-
Document doc = new Document();
70-
for (String input : markedUpInputs) {
71-
Field field = new Field(fieldName, "", ft);
72-
field.setStringValue(input);
73-
doc.add(field);
74-
}
75-
iw.addDocument(doc);
76-
DirectoryReader reader = iw.getReader();
77-
IndexSearcher searcher = newSearcher(reader);
78-
iw.close();
79-
80-
AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
81-
for (int i = 0; i < markedUpInputs.length; i++) {
82-
annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
83-
}
84-
AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
85-
hiliteAnalyzer.setAnnotations(annotations);
86-
AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
87-
passageFormatter.setAnnotations(annotations);
88-
89-
ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
90-
for (int i = 0; i < annotations.length; i++) {
91-
plainTextForHighlighter.add(annotations[i].textMinusMarkup);
92-
}
62+
int noMatchSize, String[] expectedPassages,
63+
int maxAnalyzedOffset, Integer queryMaxAnalyzedOffset) throws Exception {
9364

94-
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
95-
assertThat(topDocs.totalHits.value, equalTo(1L));
96-
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
97-
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
98-
searcher,
99-
hiliteAnalyzer,
100-
null,
101-
passageFormatter,
102-
locale,
103-
breakIterator,
104-
"index",
105-
"text",
106-
query,
107-
noMatchSize,
108-
expectedPassages.length,
109-
name -> "text".equals(name),
110-
Integer.MAX_VALUE
111-
);
112-
highlighter.setFieldMatcher((name) -> "text".equals(name));
113-
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
114-
assertEquals(expectedPassages.length, snippets.length);
115-
for (int i = 0; i < snippets.length; i++) {
116-
assertEquals(expectedPassages[i], snippets[i].getText());
65+
try (Directory dir = newDirectory()) {
66+
// Annotated fields wrap the usual analyzer with one that injects extra tokens
67+
Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
68+
IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
69+
iwc.setMergePolicy(newTieredMergePolicy(random()));
70+
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
71+
FieldType ft = new FieldType(TextField.TYPE_STORED);
72+
if (randomBoolean()) {
73+
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
74+
} else {
75+
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
76+
}
77+
ft.freeze();
78+
Document doc = new Document();
79+
for (String input : markedUpInputs) {
80+
Field field = new Field(fieldName, "", ft);
81+
field.setStringValue(input);
82+
doc.add(field);
83+
}
84+
iw.addDocument(doc);
85+
try (DirectoryReader reader = iw.getReader()) {
86+
IndexSearcher searcher = newSearcher(reader);
87+
iw.close();
88+
89+
AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
90+
for (int i = 0; i < markedUpInputs.length; i++) {
91+
annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
92+
}
93+
AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
94+
hiliteAnalyzer.setAnnotations(annotations);
95+
AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
96+
passageFormatter.setAnnotations(annotations);
97+
98+
ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
99+
for (int i = 0; i < annotations.length; i++) {
100+
plainTextForHighlighter.add(annotations[i].textMinusMarkup);
101+
}
102+
103+
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
104+
assertThat(topDocs.totalHits.value, equalTo(1L));
105+
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
106+
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
107+
searcher,
108+
hiliteAnalyzer,
109+
UnifiedHighlighter.OffsetSource.ANALYSIS,
110+
passageFormatter,
111+
locale,
112+
breakIterator,
113+
"index",
114+
"text",
115+
query,
116+
noMatchSize,
117+
expectedPassages.length,
118+
name -> "text".equals(name),
119+
maxAnalyzedOffset,
120+
queryMaxAnalyzedOffset
121+
);
122+
highlighter.setFieldMatcher((name) -> "text".equals(name));
123+
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
124+
assertEquals(expectedPassages.length, snippets.length);
125+
for (int i = 0; i < snippets.length; i++) {
126+
assertEquals(expectedPassages[i], snippets[i].getText());
127+
}
128+
}
117129
}
118-
reader.close();
119-
dir.close();
120130
}
121131

122-
123132
public void testAnnotatedTextStructuredMatch() throws Exception {
124133
// Check that a structured token eg a URL can be highlighted in a query
125134
// on marked-up
@@ -191,4 +200,65 @@ public void testBadAnnotation() throws Exception {
191200
assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
192201
}
193202

203+
public void testExceedMaxAnalyzedOffset() throws Exception {
204+
TermQuery query = new TermQuery(new Term("text", "exceeds"));
205+
BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
206+
assertHighlightOneDoc("text", new String[] { "[Short Text](Short+Text)" }, query, Locale.ROOT, breakIterator, 0, new String[] {},
207+
10, null);
208+
209+
IllegalArgumentException e = expectThrows(
210+
IllegalArgumentException.class,
211+
() -> assertHighlightOneDoc(
212+
"text",
213+
new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
214+
query,
215+
Locale.ROOT,
216+
breakIterator,
217+
0,
218+
new String[] {},
219+
20,
220+
null
221+
)
222+
);
223+
assertEquals(
224+
"The length [38] of field [text] in doc[0]/index[index] exceeds the [index.highlight.max_analyzed_offset] limit [20]. "
225+
+ "To avoid this error, set the query parameter [max_analyzed_offset] to a value less than index setting [20] and this "
226+
+ "will tolerate long field values by truncating them.",
227+
e.getMessage()
228+
);
229+
230+
final Integer queryMaxOffset = randomIntBetween(21, 1000);
231+
e = expectThrows(
232+
IllegalArgumentException.class,
233+
() -> assertHighlightOneDoc(
234+
"text",
235+
new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
236+
query,
237+
Locale.ROOT,
238+
breakIterator,
239+
0,
240+
new String[] {},
241+
20,
242+
queryMaxOffset
243+
)
244+
);
245+
assertEquals(
246+
"The length [38] of field [text] in doc[0]/index[index] exceeds the [index.highlight.max_analyzed_offset] limit [20]. "
247+
+ "To avoid this error, set the query parameter [max_analyzed_offset] to a value less than index setting [20] and this "
248+
+ "will tolerate long field values by truncating them.",
249+
e.getMessage()
250+
);
251+
252+
assertHighlightOneDoc(
253+
"text",
254+
new String[] { "[Long Text Exceeds](Long+Text+Exceeds) MAX analyzed offset [Long Text Exceeds](Long+Text+Exceeds)" },
255+
query,
256+
Locale.ROOT,
257+
breakIterator,
258+
0,
259+
new String[] { "Long Text [Exceeds](_hit_term=exceeds) MAX analyzed offset [Long Text Exceeds](Long+Text+Exceeds)" },
260+
20,
261+
15
262+
);
263+
}
194264
}

rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ setup:
66
body:
77
settings:
88
number_of_shards: 1
9-
index.highlight.max_analyzed_offset: 10
9+
index.highlight.max_analyzed_offset: 30
1010
mappings:
1111
properties:
1212
field1:
@@ -39,6 +39,20 @@ setup:
3939
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
4040
- match: { error.root_cause.0.type: "illegal_argument_exception" }
4141

42+
---
43+
"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":
44+
45+
- skip:
46+
version: " - 7.11.99"
47+
reason: max_analyzed_offset query param added in 7.12.0
48+
49+
- do:
50+
search:
51+
rest_total_hits_as_int: true
52+
index: test1
53+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}, "max_analyzed_offset": "20"}}
54+
- match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
55+
4256

4357
---
4458
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
@@ -50,9 +64,23 @@ setup:
5064
search:
5165
rest_total_hits_as_int: true
5266
index: test1
53-
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
67+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
5468
- match: { error.root_cause.0.type: "illegal_argument_exception" }
5569

70+
---
71+
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":
72+
73+
- skip:
74+
version: " - 7.11.99"
75+
reason: max_analyzed_offset query param added in 7.12.0
76+
77+
- do:
78+
search:
79+
rest_total_hits_as_int: true
80+
index: test1
81+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}, "max_analyzed_offset": 20}}
82+
- match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
83+
5684

5785
---
5886
"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
@@ -79,3 +107,35 @@ setup:
79107
index: test1
80108
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
81109
- match: { error.root_cause.0.type: "illegal_argument_exception" }
110+
111+
---
112+
"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset with max_analyzed_offset=20 should SUCCEED":
113+
114+
- skip:
115+
version: " - 7.11.99"
116+
reason: max_analyzed_offset query param added in 7.12.0
117+
118+
- do:
119+
search:
120+
rest_total_hits_as_int: true
121+
index: test1
122+
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "max_analyzed_offset": 20}}
123+
- match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
124+
125+
---
126+
"Plain highlighter with max_analyzed_offset < 0 should FAIL":
127+
128+
- skip:
129+
version: " - 7.11.99"
130+
reason: max_analyzed_offset query param added in 7.12.0
131+
132+
- do:
133+
catch: bad_request
134+
search:
135+
rest_total_hits_as_int: true
136+
index: test1
137+
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "max_analyzed_offset": -10}}
138+
- match: { status: 400 }
139+
- match: { error.root_cause.0.type: "x_content_parse_exception" }
140+
- match: { error.caused_by.type: "illegal_argument_exception" }
141+
- match: { error.caused_by.reason: "[max_analyzed_offset] must be a positive integer" }

0 commit comments

Comments
 (0)