Skip to content

Commit 691421f

Browse files
committed
Fix invalid break iterator highlighting on keyword field (#49566)
By default the unified highlighter splits the input into passages using a sentence break iterator. However, we don't check whether the field is tokenized or not, so the break iterator is also applied to `keyword` fields even though they can only match on their entire content. This means that, by default, we'll split the content of a `keyword` field on sentence breaks if the requested number of fragments is set to a value different from 0 (it defaults to 5). This commit changes this behavior to ignore the break iterator on non-tokenized fields (`keyword`) in order to always highlight the entire values. The number of requested fragments now controls how many matched values are returned, but the `boundary_scanner_type` is ignored for these fields. Note that this was the behavior in 6.x, but some refactoring of Lucene's highlighter exposed this bug in Elasticsearch 7.x.
1 parent 1bc3e69 commit 691421f

File tree

2 files changed

+34
-3
lines changed

2 files changed

+34
-3
lines changed

server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
6767
final int maxAnalyzedOffset = context.getIndexSettings().getHighlightMaxAnalyzedOffset();
6868

6969
List<Snippet> snippets = new ArrayList<>();
70-
int numberOfFragments;
70+
int numberOfFragments = field.fieldOptions().numberOfFragments();
7171
try {
7272

7373
final Analyzer analyzer = getAnalyzer(context.getMapperService().documentMapper(hitContext.hit().getType()),
@@ -90,14 +90,16 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
9090
"This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
9191
"] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
9292
}
93-
if (field.fieldOptions().numberOfFragments() == 0) {
93+
if (numberOfFragments == 0
94+
// non-tokenized fields should not use any break iterator (ignore boundaryScannerType)
95+
|| fieldType.tokenized() == false) {
9496
// we use a control char to separate values, which is the only char that the custom break iterator
9597
// breaks the text on, so we don't lose the distinction between the different values of a field and we
9698
// get back a snippet per value
9799
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
98100
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
99101
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
100-
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
102+
numberOfFragments = numberOfFragments == 0 ? fieldValues.size() : numberOfFragments;
101103
} else {
102104
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
103105
BreakIterator bi = getBreakIterator(field);

server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,35 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
121121
return Arrays.asList(InternalSettingsPlugin.class, MockKeywordPlugin.class, MockAnalysisPlugin.class);
122122
}
123123

124+
public void testHighlightingWithKeywordIgnoreBoundaryScanner() throws IOException {
125+
XContentBuilder mappings = jsonBuilder();
126+
mappings.startObject();
127+
mappings.startObject("type")
128+
.startObject("properties")
129+
.startObject("tags")
130+
.field("type", "keyword")
131+
.endObject()
132+
.endObject().endObject();
133+
mappings.endObject();
134+
assertAcked(prepareCreate("test")
135+
.addMapping("type", mappings));
136+
client().prepareIndex("test").setId("1")
137+
.setSource(jsonBuilder().startObject().array("tags", "foo bar", "foo bar", "foo bar", "foo baz").endObject())
138+
.get();
139+
client().prepareIndex("test").setId("2")
140+
.setSource(jsonBuilder().startObject().array("tags", "foo baz", "foo baz", "foo baz", "foo bar").endObject())
141+
.get();
142+
refresh();
143+
144+
for (BoundaryScannerType scanner : BoundaryScannerType.values()) {
145+
SearchResponse search = client().prepareSearch().setQuery(matchQuery("tags", "foo bar"))
146+
.highlighter(new HighlightBuilder().field(new Field("tags")).numOfFragments(2).boundaryScannerType(scanner)).get();
147+
assertHighlight(search, 0, "tags", 0, 2, equalTo("<em>foo bar</em>"));
148+
assertHighlight(search, 0, "tags", 1, 2, equalTo("<em>foo bar</em>"));
149+
assertHighlight(search, 1, "tags", 0, 1, equalTo("<em>foo bar</em>"));
150+
}
151+
}
152+
124153
public void testHighlightingWithStoredKeyword() throws IOException {
125154
XContentBuilder mappings = jsonBuilder();
126155
mappings.startObject();

0 commit comments

Comments (0)