Skip to content

Commit f641000

Browse files
authored
Add LimitedOffsetsEnum to Limited offset token (#86110)
Apply `max_analyzed_offset` to highlighting when offsets are recorded in the index. Fixes #86109
1 parent 5a4a42e commit f641000

File tree

5 files changed

+172
-3
lines changed

5 files changed

+172
-3
lines changed

docs/changelog/86110.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 86110
2+
summary: Add LimitedOffsetsEnum to Limited offset token
3+
area: Search
4+
type: enhancement
5+
issues:
6+
- 86109

server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomFieldHighlighter.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class CustomFieldHighlighter extends FieldHighlighter {
3636
private final Locale breakIteratorLocale;
3737
private final int noMatchSize;
3838
private String fieldValue;
39+
private final Integer queryMaxAnalyzedOffset;
3940

4041
CustomFieldHighlighter(
4142
String field,
@@ -46,11 +47,13 @@ class CustomFieldHighlighter extends FieldHighlighter {
4647
int maxPassages,
4748
int maxNoHighlightPassages,
4849
PassageFormatter passageFormatter,
49-
int noMatchSize
50+
int noMatchSize,
51+
Integer queryMaxAnalyzedOffset
5052
) {
5153
super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter);
5254
this.breakIteratorLocale = breakIteratorLocale;
5355
this.noMatchSize = noMatchSize;
56+
this.queryMaxAnalyzedOffset = queryMaxAnalyzedOffset;
5457
}
5558

5659
FieldOffsetStrategy getFieldOffsetStrategy() {
@@ -106,6 +109,10 @@ protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
106109
@Override
107110
protected Passage[] highlightOffsetsEnums(OffsetsEnum off) throws IOException {
108111

112+
if (queryMaxAnalyzedOffset != null) {
113+
off = new LimitedOffsetsEnum(off, queryMaxAnalyzedOffset);
114+
}
115+
109116
final int contentLength = this.breakIterator.getText().getEndIndex();
110117

111118
if (off.nextPosition() == false) {

server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighter.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,8 @@ protected CustomFieldHighlighter getFieldHighlighter(String field, Query query,
193193
maxPassages,
194194
(noMatchSize > 0 ? 1 : 0),
195195
getFormatter(field),
196-
noMatchSize
196+
noMatchSize,
197+
queryMaxAnalyzedOffset
197198
);
198199
}
199200

server/src/main/java/org/elasticsearch/lucene/search/uhighlight/LimitedOffsetsEnum.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.lucene.search.uhighlight;
10+
11+
import org.apache.lucene.search.uhighlight.OffsetsEnum;
12+
import org.apache.lucene.util.BytesRef;
13+
14+
import java.io.IOException;
15+
16+
public class LimitedOffsetsEnum extends OffsetsEnum {
17+
private final OffsetsEnum delegate;
18+
private final int maxOffset;
19+
20+
public LimitedOffsetsEnum(OffsetsEnum delegate, int maxOffset) {
21+
this.delegate = delegate;
22+
this.maxOffset = maxOffset;
23+
}
24+
25+
@Override
26+
public boolean nextPosition() throws IOException {
27+
boolean next = delegate.nextPosition();
28+
if (next == false) {
29+
return next;
30+
}
31+
if (delegate.startOffset() > maxOffset) {
32+
return false;
33+
}
34+
return next;
35+
}
36+
37+
@Override
38+
public int freq() throws IOException {
39+
return delegate.freq();
40+
}
41+
42+
@Override
43+
public BytesRef getTerm() throws IOException {
44+
return delegate.getTerm();
45+
}
46+
47+
@Override
48+
public int startOffset() throws IOException {
49+
return delegate.startOffset();
50+
}
51+
52+
@Override
53+
public int endOffset() throws IOException {
54+
return delegate.endOffset();
55+
}
56+
}

server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
package org.elasticsearch.lucene.search.uhighlight;
1010

1111
import org.apache.lucene.analysis.Analyzer;
12+
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
1213
import org.apache.lucene.analysis.custom.CustomAnalyzer;
1314
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
1415
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -40,6 +41,8 @@
4041

4142
import java.text.BreakIterator;
4243
import java.util.Locale;
44+
import java.util.Map;
45+
import java.util.TreeMap;
4346

4447
import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
4548
import static org.hamcrest.CoreMatchers.equalTo;
@@ -82,6 +85,34 @@ private void assertHighlightOneDoc(
8285
String[] expectedPassages,
8386
int maxAnalyzedOffset,
8487
Integer queryMaxAnalyzedOffset
88+
) throws Exception {
89+
assertHighlightOneDoc(
90+
fieldName,
91+
inputs,
92+
analyzer,
93+
query,
94+
locale,
95+
breakIterator,
96+
noMatchSize,
97+
expectedPassages,
98+
maxAnalyzedOffset,
99+
queryMaxAnalyzedOffset,
100+
UnifiedHighlighter.OffsetSource.ANALYSIS
101+
);
102+
}
103+
104+
private void assertHighlightOneDoc(
105+
String fieldName,
106+
String[] inputs,
107+
Analyzer analyzer,
108+
Query query,
109+
Locale locale,
110+
BreakIterator breakIterator,
111+
int noMatchSize,
112+
String[] expectedPassages,
113+
int maxAnalyzedOffset,
114+
Integer queryMaxAnalyzedOffset,
115+
UnifiedHighlighter.OffsetSource offsetSource
85116
) throws Exception {
86117
try (Directory dir = newDirectory()) {
87118
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
@@ -106,7 +137,7 @@ private void assertHighlightOneDoc(
106137
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
107138
searcher,
108139
analyzer,
109-
UnifiedHighlighter.OffsetSource.ANALYSIS,
140+
offsetSource,
110141
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()),
111142
locale,
112143
breakIterator,
@@ -394,4 +425,72 @@ public void testExceedMaxAnalyzedOffset() throws Exception {
394425
10
395426
);
396427
}
428+
429+
public void testExceedMaxAnalyzedOffsetWithRepeatedWords() throws Exception {
430+
431+
TermQuery query = new TermQuery(new Term("text", "Fun"));
432+
Analyzer analyzer = new WhitespaceAnalyzer();
433+
assertHighlightOneDoc(
434+
"text",
435+
new String[] { "Testing Fun Testing Fun" },
436+
analyzer,
437+
query,
438+
Locale.ROOT,
439+
BreakIterator.getSentenceInstance(Locale.ROOT),
440+
0,
441+
new String[] { "Testing <b>Fun</b> Testing Fun" },
442+
29,
443+
10,
444+
UnifiedHighlighter.OffsetSource.ANALYSIS
445+
);
446+
assertHighlightOneDoc(
447+
"text",
448+
new String[] { "Testing Fun Testing Fun" },
449+
analyzer,
450+
query,
451+
Locale.ROOT,
452+
BreakIterator.getSentenceInstance(Locale.ROOT),
453+
0,
454+
new String[] { "Testing <b>Fun</b> Testing Fun" },
455+
29,
456+
10,
457+
UnifiedHighlighter.OffsetSource.POSTINGS
458+
);
459+
}
460+
461+
public void testExceedMaxAnalyzedOffsetRandomOffset() throws Exception {
462+
TermQuery query = new TermQuery(new Term("text", "fun"));
463+
Analyzer analyzer = new WhitespaceAnalyzer();
464+
UnifiedHighlighter.OffsetSource offsetSource = randomBoolean()
465+
? UnifiedHighlighter.OffsetSource.ANALYSIS
466+
: UnifiedHighlighter.OffsetSource.POSTINGS;
467+
final String[] inputs = { "Fun fun fun fun fun" };
468+
TreeMap<Integer, String> outputs = new TreeMap<>(
469+
Map.of(
470+
7,
471+
"Fun <b>fun</b> fun fun fun",
472+
11,
473+
"Fun <b>fun</b> <b>fun</b> fun fun",
474+
15,
475+
"Fun <b>fun</b> <b>fun</b> <b>fun</b> fun",
476+
19,
477+
"Fun <b>fun</b> <b>fun</b> <b>fun</b> <b>fun</b>"
478+
)
479+
);
480+
Integer randomOffset = between(7, 19);
481+
String output = outputs.ceilingEntry(randomOffset).getValue();
482+
assertHighlightOneDoc(
483+
"text",
484+
inputs,
485+
analyzer,
486+
query,
487+
Locale.ROOT,
488+
BreakIterator.getSentenceInstance(Locale.ROOT),
489+
0,
490+
new String[] { output },
491+
47,
492+
randomOffset,
493+
offsetSource
494+
);
495+
}
397496
}

0 commit comments

Comments
 (0)