Commit ed824b0

Add option to limit highlighting to max offset
Add a query parameter `limit_to_max_analyzed_offset` that lets users limit the highlighting of text fields to the value of the `index.highlight.max_analyzed_offset` index setting, preventing an exception from being thrown when the length of a text field exceeds that limit. Highlighting still takes place, but only up to the length set by the setting. Relates to: #52155
1 parent df7041f commit ed824b0
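
The behavioral contract, distilled: a field longer than the configured offset either fails the request (the default) or is highlighted only within the analyzed window. The sketch below is illustrative only, not code from this commit; the sentence and the 20-character window mirror the REST tests further down.

    // Illustrative sketch (not part of this commit): with
    // limit_to_max_analyzed_offset=true, only the first maxAnalyzedOffset
    // characters of a field are analyzed, so matches past the cutoff are
    // simply left unhighlighted instead of triggering an exception.
    public class LimitToMaxAnalyzedOffsetSketch {
        public static void main(String[] args) {
            String fieldValue = "The quick brown fox went to the forest and saw another fox.";
            int maxAnalyzedOffset = 20; // matches the REST test setup below

            String analyzedWindow = fieldValue.substring(0, Math.min(fieldValue.length(), maxAnalyzedOffset));
            System.out.println(analyzedWindow.contains("fox"));              // true: first "fox" (offset 16) is inside the window
            System.out.println(fieldValue.indexOf("fox", maxAnalyzedOffset)); // 55: second "fox" is past the window
        }
    }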

File tree

12 files changed (+402 lines, -131 lines)

docs/reference/index-modules.asciidoc

Lines changed: 1 addition & 0 deletions

@@ -223,6 +223,7 @@ specific index module:
     The maximum number of tokens that can be produced using _analyze API.
     Defaults to `10000`.
 
+[[index-max-analyzed-offset]]
 `index.highlight.max_analyzed_offset`::
 
     The maximum number of characters that will be analyzed for a highlight request.
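
For context, this is an ordinary index-level setting. A hedged sketch of raising it at index-creation time: the setting key comes from the documentation above, the `Settings` builder is standard Elasticsearch API, and the chosen value is arbitrary.

    import org.elasticsearch.common.settings.Settings;

    // Sketch: raising the per-index analysis cap for highlighting.
    public class MaxAnalyzedOffsetSettingExample {
        public static void main(String[] args) {
            Settings indexSettings = Settings.builder()
                .put("index.number_of_shards", 1)                      // as in the REST test setup below
                .put("index.highlight.max_analyzed_offset", 2_000_000) // double the 1000000 default
                .build();
            System.out.println(indexSettings.getAsInt("index.highlight.max_analyzed_offset", 1_000_000));
        }
    }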

docs/reference/search/search-your-data/highlighting.asciidoc

Lines changed: 10 additions & 2 deletions

@@ -117,7 +117,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
 Plain highlighting for large texts may require substantial amount of time and memory.
 To protect against this, the maximum number of text characters that will be analyzed has been
 limited to 1000000. This default limit can be changed
-for a particular index with the index setting `index.highlight.max_analyzed_offset`.
+for a particular index with the index setting <<index-max-analyzed-offset,`index.highlight.max_analyzed_offset`>>.
 
 [discrete]
 [[highlighting-settings]]
@@ -242,6 +242,14 @@ require_field_match:: By default, only fields that contains a query match are
 highlighted. Set `require_field_match` to `false` to highlight all fields.
 Defaults to `true`.
 
+limit_to_max_analyzed_offset:: By default, the maximum number of characters
+analyzed for a highlight request is bounded by the value defined in the
+<<index-max-analyzed-offset, `index.highlight.max_analyzed_offset`>> setting,
+and when the number of characters exceeds this limit an error is returned. If
+this setting is set to `true`, the analysis for the highlighting stops at this
+defined maximum limit, and the rest of the text is not processed, thus not
+highlighted and no error is returned.
+
 tags_schema:: Set to `styled` to use the built-in tag schema. The `styled`
 schema defines the following `pre_tags` and defines `post_tags` as
 `</em>`.
@@ -1119,4 +1127,4 @@ using the passages's `matchStarts` and `matchEnds` information:
 I'll be the <em>only</em> <em>fox</em> in the world for you.
 
 This kind of formatted strings are the final result of the highlighter returned
-to the user.
+to the user.

plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java

Lines changed: 116 additions & 67 deletions

@@ -42,6 +42,7 @@
 import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
 import org.apache.lucene.search.uhighlight.Snippet;
 import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
+import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
 import org.apache.lucene.store.Directory;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
@@ -59,78 +60,97 @@
 
 public class AnnotatedTextHighlighterTests extends ESTestCase {
 
+    private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
+                                       Query query, Locale locale, BreakIterator breakIterator,
+                                       int noMatchSize, String[] expectedPassages) throws Exception {
+
+        assertHighlightOneDoc(fieldName, markedUpInputs, query, locale, breakIterator, noMatchSize, expectedPassages,
+            Integer.MAX_VALUE, false);
+    }
+
     private void assertHighlightOneDoc(String fieldName, String []markedUpInputs,
                                        Query query, Locale locale, BreakIterator breakIterator,
-                                       int noMatchSize, String[] expectedPassages) throws Exception {
-
-
-        // Annotated fields wrap the usual analyzer with one that injects extra tokens
-        Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
-        Directory dir = newDirectory();
-        IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
-        iwc.setMergePolicy(newTieredMergePolicy(random()));
-        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
-        FieldType ft = new FieldType(TextField.TYPE_STORED);
-        if (randomBoolean()) {
-            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-        } else {
-            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
-        }
-        ft.freeze();
-        Document doc = new Document();
-        for (String input : markedUpInputs) {
-            Field field = new Field(fieldName, "", ft);
-            field.setStringValue(input);
-            doc.add(field);
+                                       int noMatchSize, String[] expectedPassages,
+                                       int maxAnalyzedOffset, boolean limitToMaxAnalyzedOffset) throws Exception {
+
+        Directory dir = null;
+        DirectoryReader reader = null;
+        try {
+            dir = newDirectory();
+
+            // Annotated fields wrap the usual analyzer with one that injects extra tokens
+            Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
+            ;
+            IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
+            iwc.setMergePolicy(newTieredMergePolicy(random()));
+            RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+            FieldType ft = new FieldType(TextField.TYPE_STORED);
+            if (randomBoolean()) {
+                ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+            } else {
+                ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+            }
+            ft.freeze();
+            Document doc = new Document();
+            for (String input : markedUpInputs) {
+                Field field = new Field(fieldName, "", ft);
+                field.setStringValue(input);
+                doc.add(field);
+            }
+            iw.addDocument(doc);
+            reader = iw.getReader();
+            IndexSearcher searcher = newSearcher(reader);
+            iw.close();
+
+            AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
+            for (int i = 0; i < markedUpInputs.length; i++) {
+                annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
+            }
+            AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
+            hiliteAnalyzer.setAnnotations(annotations);
+            AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
+            passageFormatter.setAnnotations(annotations);
+
+            ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
+            for (int i = 0; i < annotations.length; i++) {
+                plainTextForHighlighter.add(annotations[i].textMinusMarkup);
+            }
+
+            TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
+            assertThat(topDocs.totalHits.value, equalTo(1L));
+            String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
+            CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
+                searcher,
+                hiliteAnalyzer,
+                UnifiedHighlighter.OffsetSource.ANALYSIS,
+                passageFormatter,
+                locale,
+                breakIterator,
+                "index",
+                "text",
+                query,
+                noMatchSize,
+                expectedPassages.length,
+                name -> "text".equals(name),
+                maxAnalyzedOffset,
+                limitToMaxAnalyzedOffset
+            );
+            highlighter.setFieldMatcher((name) -> "text".equals(name));
+            final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
+            assertEquals(expectedPassages.length, snippets.length);
+            for (int i = 0; i < snippets.length; i++) {
+                assertEquals(expectedPassages[i], snippets[i].getText());
+            }
+        } finally {
+            if (reader != null) {
+                reader.close();
+            }
+            if (dir != null) {
+                dir.close();
+            }
         }
-        iw.addDocument(doc);
-        DirectoryReader reader = iw.getReader();
-        IndexSearcher searcher = newSearcher(reader);
-        iw.close();
-
-        AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
-        for (int i = 0; i < markedUpInputs.length; i++) {
-            annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
-        }
-        AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
-        hiliteAnalyzer.setAnnotations(annotations);
-        AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
-        passageFormatter.setAnnotations(annotations);
-
-        ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
-        for (int i = 0; i < annotations.length; i++) {
-            plainTextForHighlighter.add(annotations[i].textMinusMarkup);
-        }
-
-        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
-        assertThat(topDocs.totalHits.value, equalTo(1L));
-        String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
-        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
-            searcher,
-            hiliteAnalyzer,
-            null,
-            passageFormatter,
-            locale,
-            breakIterator,
-            "index",
-            "text",
-            query,
-            noMatchSize,
-            expectedPassages.length,
-            name -> "text".equals(name),
-            Integer.MAX_VALUE
-        );
-        highlighter.setFieldMatcher((name) -> "text".equals(name));
-        final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
-        assertEquals(expectedPassages.length, snippets.length);
-        for (int i = 0; i < snippets.length; i++) {
-            assertEquals(expectedPassages[i], snippets[i].getText());
-        }
-        reader.close();
-        dir.close();
     }
 
-
     public void testAnnotatedTextStructuredMatch() throws Exception {
         // Check that a structured token eg a URL can be highlighted in a query
         // on marked-up
@@ -202,4 +222,33 @@ public void testBadAnnotation() throws Exception {
         assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
     }
 
+    public void testExceedMaxAnalyzedOffset() throws Exception {
+        TermQuery query = new TermQuery(new Term("text", "exceeds"));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        assertHighlightOneDoc("text", new String[] { "[Short Text](Short+Text)" }, query, Locale.ROOT, breakIterator, 0, new String[] {});
+
+        IllegalArgumentException e = expectThrows(
+            IllegalArgumentException.class,
+            () -> assertHighlightOneDoc(
+                "text",
+                new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
+                query,
+                Locale.ROOT,
+                breakIterator,
+                0,
+                new String[] {},
+                15,
+                false
+            )
+        );
+        assertEquals(
+            "The length of [text] field of [0] doc of [index] index has exceeded [15] - maximum allowed to be analyzed for "
+                + "highlighting. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. "
+                + "For large texts, indexing with offsets or term vectors is recommended!",
+            e.getMessage()
+        );
+
+        assertHighlightOneDoc("text", new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset" },
+            query, Locale.ROOT, breakIterator, 0, new String[] {"Long Text [exceeds](_hit_term=exceeds) MAX analyzed offset"}, 15, true);
+    }
 }

rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml

Lines changed: 32 additions & 2 deletions

@@ -6,7 +6,7 @@ setup:
       body:
         settings:
           number_of_shards: 1
-          index.highlight.max_analyzed_offset: 10
+          index.highlight.max_analyzed_offset: 20
         mappings:
           properties:
             field1:
@@ -37,6 +37,16 @@ setup:
         body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
   - match: { error.root_cause.0.type: "illegal_argument_exception" }
 
+---
+"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with limit_to_max_analyzed_offset=true should SUCCEED":
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: test1
+        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}, "limit_to_max_analyzed_offset": "true"}}
+  - match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
+
 
 ---
 "Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
@@ -46,9 +56,19 @@ setup:
       search:
         rest_total_hits_as_int: true
        index: test1
-        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
+        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
   - match: { error.root_cause.0.type: "illegal_argument_exception" }
 
+---
+"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with limit_to_max_analyzed_offset=true should SUCCEED":
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: test1
+        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}, "limit_to_max_analyzed_offset": "true"}}
+  - match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
+
 
 ---
 "Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
@@ -71,3 +91,13 @@ setup:
         index: test1
         body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
   - match: { error.root_cause.0.type: "illegal_argument_exception" }
+
+---
+"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset with limit_to_max_analyzed_offset=true should SUCCEED":
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: test1
+        body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "limit_to_max_analyzed_offset": "true"}}
+  - match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
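
A likely reason the setup raises `index.highlight.max_analyzed_offset` from 10 to 20: the first `fox` in the test document starts at character offset 16, so a 20-character analysis window still covers it while the second occurrence falls outside, which is exactly what the SUCCEED expectations above assert. A quick check of the offsets (illustrative, not part of the commit):

    // Offsets of the two "fox" occurrences in the test sentence.
    public class FoxOffsets {
        public static void main(String[] args) {
            String text = "The quick brown fox went to the forest and saw another fox.";
            System.out.println(text.indexOf("fox"));      // 16 -> inside a 20-char window, outside a 10-char one
            System.out.println(text.indexOf("fox", 17));  // 55 -> beyond the window, left unhighlighted
        }
    }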

server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java

Lines changed: 5 additions & 2 deletions

@@ -65,6 +65,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     private final int noMatchSize;
     private final FieldHighlighter fieldHighlighter;
     private final int maxAnalyzedOffset;
+    private final boolean limitToMaxAnalyzedOffset;
 
     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -96,7 +97,8 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     int noMatchSize,
                                     int maxPassages,
                                     Predicate<String> fieldMatcher,
-                                    int maxAnalyzedOffset) throws IOException {
+                                    int maxAnalyzedOffset,
+                                    boolean limitToMaxAnalyzedOffset) throws IOException {
         super(searcher, analyzer);
         this.offsetSource = offsetSource;
         this.breakIterator = breakIterator;
@@ -107,6 +109,7 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
         this.noMatchSize = noMatchSize;
         this.setFieldMatcher(fieldMatcher);
         this.maxAnalyzedOffset = maxAnalyzedOffset;
+        this.limitToMaxAnalyzedOffset = limitToMaxAnalyzedOffset;
         fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
     }
 
@@ -123,7 +126,7 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<St
             return null;
         }
         int fieldValueLength = fieldValue.length();
-        if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {
+        if (((limitToMaxAnalyzedOffset == false) && (offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset))) {
             throw new IllegalArgumentException(
                 "The length of ["
                     + field
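
The guard above is the heart of the change: the exception is now thrown only when the new flag is off and the highlighter would have to re-analyze an over-long field. A self-contained restatement (names mirror the diff; the enum and message are abbreviated, so treat this as a sketch rather than the actual class):

    // Sketch of the highlightField guard added above; not the real class.
    public class MaxAnalyzedOffsetGuard {

        enum OffsetSource { ANALYSIS, POSTINGS, TERM_VECTORS }

        static void checkFieldLength(OffsetSource offsetSource, int fieldValueLength,
                                     int maxAnalyzedOffset, boolean limitToMaxAnalyzedOffset) {
            // Default behavior: re-analyzing an over-long field is an error.
            // With limitToMaxAnalyzedOffset == true we fall through and the highlighter
            // analyzes only the first maxAnalyzedOffset characters.
            if (limitToMaxAnalyzedOffset == false
                    && offsetSource == OffsetSource.ANALYSIS
                    && fieldValueLength > maxAnalyzedOffset) {
                throw new IllegalArgumentException("field length " + fieldValueLength
                    + " exceeds max_analyzed_offset " + maxAnalyzedOffset);
            }
        }

        public static void main(String[] args) {
            checkFieldLength(OffsetSource.ANALYSIS, 60, 15, true);   // truncation mode: no error
            checkFieldLength(OffsetSource.POSTINGS, 60, 15, false);  // offsets from index: no re-analysis, no error
            try {
                checkFieldLength(OffsetSource.ANALYSIS, 60, 15, false); // default mode: throws
            } catch (IllegalArgumentException expected) {
                System.out.println(expected.getMessage());
            }
        }
    }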
