Skip to content

Commit 317a80f

Browse files
authored
Add stopword support to IntervalBuilder (#39637)
* Use Intervals.extend() to preserve stopword gaps when building interval queries - requires LUCENE-8697 * cleanup * More tests * Fix simple synonyms
1 parent fd22c80 commit 317a80f

File tree

2 files changed

+101
-11
lines changed

2 files changed

+101
-11
lines changed

server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,38 +143,50 @@ protected static IntervalsSource combineSources(List<IntervalsSource> sources, i
143143
protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
144144
List<IntervalsSource> terms = new ArrayList<>();
145145
TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
146+
PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
146147
ts.reset();
147148
while (ts.incrementToken()) {
148149
BytesRef term = bytesAtt.getBytesRef();
149-
terms.add(Intervals.term(BytesRef.deepCopyOf(term)));
150+
int precedingSpaces = posAtt.getPositionIncrement() - 1;
151+
terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces));
150152
}
151153
ts.end();
152154
return terms;
153155
}
154156

157+
public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
158+
if (precedingSpaces == 0) {
159+
return source;
160+
}
161+
return Intervals.extend(source, precedingSpaces, 0);
162+
}
163+
155164
protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) throws IOException {
156165
List<IntervalsSource> terms = new ArrayList<>();
157166
List<IntervalsSource> synonyms = new ArrayList<>();
158167
TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
159168
PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
160169
ts.reset();
170+
int spaces = 0;
161171
while (ts.incrementToken()) {
162-
if (posAtt.getPositionIncrement() == 1) {
172+
int posInc = posAtt.getPositionIncrement();
173+
if (posInc > 0) {
163174
if (synonyms.size() == 1) {
164-
terms.add(synonyms.get(0));
175+
terms.add(extend(synonyms.get(0), spaces));
165176
}
166177
else if (synonyms.size() > 1) {
167-
terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
178+
terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
168179
}
169180
synonyms.clear();
181+
spaces = posInc - 1;
170182
}
171183
synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
172184
}
173185
if (synonyms.size() == 1) {
174-
terms.add(synonyms.get(0));
186+
terms.add(extend(synonyms.get(0), spaces));
175187
}
176188
else {
177-
terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
189+
terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
178190
}
179191
return combineSources(terms, maxGaps, ordered);
180192
}

server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,22 @@ public void testPhrase() throws IOException {
9494

9595
}
9696

97+
public void testPhraseWithStopword() throws IOException {
98+
99+
CannedTokenStream ts = new CannedTokenStream(
100+
new Token("term1", 1, 1, 2),
101+
new Token("term3", 2, 5, 6)
102+
);
103+
104+
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), 0, true);
105+
IntervalsSource expected = Intervals.phrase(
106+
Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0)
107+
);
108+
109+
assertEquals(expected, source);
110+
111+
}
112+
97113
public void testSimpleSynonyms() throws IOException {
98114

99115
CannedTokenStream ts = new CannedTokenStream(
@@ -112,16 +128,32 @@ public void testSimpleSynonyms() throws IOException {
112128

113129
}
114130

115-
public void testGraphSynonyms() throws IOException {
131+
public void testSimpleSynonymsWithGap() throws IOException {
132+
// term1 [] term2/term3/term4 term5
133+
CannedTokenStream ts = new CannedTokenStream(
134+
new Token("term1", 1, 2),
135+
new Token("term2", 2, 3, 4),
136+
new Token("term3", 0, 3, 4),
137+
new Token("term4", 0, 3, 4),
138+
new Token("term5", 5, 6)
139+
);
140+
141+
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
142+
IntervalsSource expected = Intervals.ordered(
143+
Intervals.term("term1"),
144+
Intervals.extend(Intervals.or(Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")), 1, 0),
145+
Intervals.term("term5")
146+
);
147+
assertEquals(expected, source);
148+
}
116149

117-
// term1 term2/term3:2 term4 term5
150+
public void testGraphSynonyms() throws IOException {
118151

119-
Token graphToken = new Token("term2", 3, 4);
120-
graphToken.setPositionLength(2);
152+
// term1 term2:2/term3 term4 term5
121153

122154
CannedTokenStream ts = new CannedTokenStream(
123155
new Token("term1", 1, 2),
124-
graphToken,
156+
new Token("term2", 1, 3, 4, 2),
125157
new Token("term3", 0, 3, 4),
126158
new Token("term4", 5, 6),
127159
new Token("term5", 6, 7)
@@ -138,4 +170,50 @@ public void testGraphSynonyms() throws IOException {
138170

139171
}
140172

173+
public void testGraphSynonymsWithGaps() throws IOException {
174+
175+
// term1 [] term2:4/term3 [] [] term4 term5
176+
177+
CannedTokenStream ts = new CannedTokenStream(
178+
new Token("term1", 1, 2),
179+
new Token("term2", 2, 3, 4, 4),
180+
new Token("term3", 0, 3, 4),
181+
new Token("term4", 3, 5, 6),
182+
new Token("term5", 6, 7)
183+
);
184+
185+
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
186+
IntervalsSource expected = Intervals.ordered(
187+
Intervals.term("term1"),
188+
Intervals.or(
189+
Intervals.extend(Intervals.term("term2"), 1, 0),
190+
Intervals.phrase(
191+
Intervals.extend(Intervals.term("term3"), 1, 0),
192+
Intervals.extend(Intervals.term("term4"), 2, 0))),
193+
Intervals.term("term5")
194+
);
195+
196+
assertEquals(expected, source);
197+
198+
}
199+
200+
public void testGraphTerminatesOnGap() throws IOException {
201+
// term1 term2:2/term3 term4 [] term5
202+
CannedTokenStream ts = new CannedTokenStream(
203+
new Token("term1", 1, 2),
204+
new Token("term2", 1, 2, 3, 2),
205+
new Token("term3", 0, 2, 3),
206+
new Token("term4", 2, 3),
207+
new Token("term5", 2, 6, 7)
208+
);
209+
210+
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
211+
IntervalsSource expected = Intervals.ordered(
212+
Intervals.term("term1"),
213+
Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
214+
Intervals.extend(Intervals.term("term5"), 1, 0)
215+
);
216+
assertEquals(expected, source);
217+
}
218+
141219
}

0 commit comments

Comments
 (0)