Skip to content

Commit 38009ef

Browse files
authored
Disable graph analysis at query time for shingle and cjk filters producing tokens of different size (elastic#23920)
This change disables graph analysis of token streams containing a shingle or a cjk filters that produce shingle or ngram of different size. The graph analysis is disabled for phrase and boolean queries. Closes elastic#23918
1 parent 203f843 commit 38009ef

File tree

10 files changed

+473
-2
lines changed

10 files changed

+473
-2
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.lucene.analysis.miscellaneous;
21+
22+
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.util.Attribute;
24+
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
25+
26+
/**
27+
* This attribute can be used to indicate that the {@link PositionLengthAttribute}
28+
* should not be taken in account in this {@link TokenStream}.
29+
* Query parsers can extract this information to decide if this token stream should be analyzed
30+
* as a graph or not.
31+
*/
32+
public interface DisableGraphAttribute extends Attribute {}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.lucene.analysis.miscellaneous;
21+
22+
import org.apache.lucene.util.AttributeImpl;
23+
import org.apache.lucene.util.AttributeReflector;
24+
25+
/** Default implementation of {@link DisableGraphAttribute}. */
26+
public class DisableGraphAttributeImpl extends AttributeImpl implements DisableGraphAttribute {
27+
public DisableGraphAttributeImpl() {}
28+
29+
@Override
30+
public void clear() {}
31+
32+
@Override
33+
public void reflectWith(AttributeReflector reflector) {
34+
}
35+
36+
@Override
37+
public void copyTo(AttributeImpl target) {}
38+
}

core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java

+28-1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.apache.lucene.queryparser.classic;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2526
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -49,14 +50,14 @@
4950
import org.elasticsearch.index.mapper.StringFieldType;
5051
import org.elasticsearch.index.query.QueryShardContext;
5152
import org.elasticsearch.index.query.support.QueryParsers;
53+
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
5254

5355
import java.io.IOException;
5456
import java.util.ArrayList;
5557
import java.util.Collection;
5658
import java.util.HashMap;
5759
import java.util.List;
5860
import java.util.Map;
59-
import java.util.Objects;
6061

6162
import static java.util.Collections.unmodifiableMap;
6263
import static org.elasticsearch.common.lucene.search.Queries.fixNegativeQueryIfNeeded;
@@ -805,4 +806,30 @@ public Query parse(String query) throws ParseException {
805806
}
806807
return super.parse(query);
807808
}
809+
810+
/**
811+
* Checks if graph analysis should be enabled for the field depending
812+
* on the provided {@link Analyzer}
813+
*/
814+
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
815+
String queryText, boolean quoted, int phraseSlop) {
816+
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
817+
818+
// Use the analyzer to get all the tokens, and then build an appropriate
819+
// query based on the analysis chain.
820+
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
821+
if (source.hasAttribute(DisableGraphAttribute.class)) {
822+
/**
823+
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
824+
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
825+
*/
826+
setEnableGraphQueries(false);
827+
}
828+
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
829+
setEnableGraphQueries(true);
830+
return query;
831+
} catch (IOException e) {
832+
throw new RuntimeException("Error analyzing query text", e);
833+
}
834+
}
808835
}

core/src/main/java/org/elasticsearch/index/analysis/CJKBigramFilterFactory.java

+12-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.lucene.analysis.TokenStream;
2323
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
24+
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
2627
import org.elasticsearch.index.IndexSettings;
@@ -74,7 +75,17 @@ public CJKBigramFilterFactory(IndexSettings indexSettings, Environment environme
7475

7576
@Override
7677
public TokenStream create(TokenStream tokenStream) {
77-
return new CJKBigramFilter(tokenStream, flags, outputUnigrams);
78+
CJKBigramFilter filter = new CJKBigramFilter(tokenStream, flags, outputUnigrams);
79+
if (outputUnigrams) {
80+
/**
81+
* We disable the graph analysis on this token stream
82+
* because it produces bigrams AND unigrams.
83+
* Graph analysis on such token stream is useless and dangerous as it may create too many paths
84+
* since shingles of different size are not aligned in terms of positions.
85+
*/
86+
filter.addAttribute(DisableGraphAttribute.class);
87+
}
88+
return filter;
7889
}
7990

8091
}

core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java

+10
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.elasticsearch.index.analysis;
2121

2222
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2324
import org.apache.lucene.analysis.shingle.ShingleFilter;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
@@ -86,6 +87,15 @@ public TokenStream create(TokenStream tokenStream) {
8687
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
8788
filter.setTokenSeparator(tokenSeparator);
8889
filter.setFillerToken(fillerToken);
90+
if (outputUnigrams || (minShingleSize != maxShingleSize)) {
91+
/**
92+
* We disable the graph analysis on this token stream
93+
* because it produces shingles of different size.
94+
* Graph analysis on such token stream is useless and dangerous as it may create too many paths
95+
* since shingles of different size are not aligned in terms of positions.
96+
*/
97+
filter.addAttribute(DisableGraphAttribute.class);
98+
}
8999
return filter;
90100
}
91101

core/src/main/java/org/elasticsearch/index/query/SimpleQueryParser.java

+28
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index.query;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2223
import org.apache.lucene.analysis.TokenStream;
2324
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2425
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -31,6 +32,7 @@
3132
import org.apache.lucene.search.Query;
3233
import org.apache.lucene.search.SynonymQuery;
3334
import org.apache.lucene.util.BytesRef;
35+
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
3436
import org.elasticsearch.index.mapper.MappedFieldType;
3537

3638
import java.io.IOException;
@@ -167,6 +169,32 @@ public Query newPrefixQuery(String text) {
167169
return super.simplify(bq.build());
168170
}
169171

172+
/**
173+
* Checks if graph analysis should be enabled for the field depending
174+
* on the provided {@link Analyzer}
175+
*/
176+
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
177+
String queryText, boolean quoted, int phraseSlop) {
178+
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
179+
180+
// Use the analyzer to get all the tokens, and then build an appropriate
181+
// query based on the analysis chain.
182+
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
183+
if (source.hasAttribute(DisableGraphAttribute.class)) {
184+
/**
185+
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
186+
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
187+
*/
188+
setEnableGraphQueries(false);
189+
}
190+
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
191+
setEnableGraphQueries(true);
192+
return query;
193+
} catch (IOException e) {
194+
throw new RuntimeException("Error analyzing query text", e);
195+
}
196+
}
197+
170198
private static Query wrapWithBoost(Query query, float boost) {
171199
if (boost != AbstractQueryBuilder.DEFAULT_BOOST) {
172200
return new BoostQuery(query, boost);

core/src/main/java/org/elasticsearch/index/search/MatchQuery.java

+29
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
package org.elasticsearch.index.search;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
24+
import org.apache.lucene.analysis.TokenStream;
2325
import org.apache.lucene.index.Term;
2426
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
2527
import org.apache.lucene.search.BooleanClause;
@@ -49,6 +51,7 @@
4951
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
5052
import org.elasticsearch.common.lucene.search.Queries;
5153
import org.elasticsearch.common.unit.Fuzziness;
54+
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
5255
import org.elasticsearch.index.mapper.MappedFieldType;
5356
import org.elasticsearch.index.query.QueryShardContext;
5457
import org.elasticsearch.index.query.support.QueryParsers;
@@ -320,6 +323,32 @@ protected Query newSynonymQuery(Term[] terms) {
320323
return blendTermsQuery(terms, mapper);
321324
}
322325

326+
/**
327+
* Checks if graph analysis should be enabled for the field depending
328+
* on the provided {@link Analyzer}
329+
*/
330+
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
331+
String queryText, boolean quoted, int phraseSlop) {
332+
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
333+
334+
// Use the analyzer to get all the tokens, and then build an appropriate
335+
// query based on the analysis chain.
336+
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
337+
if (source.hasAttribute(DisableGraphAttribute.class)) {
338+
/**
339+
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
340+
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
341+
*/
342+
setEnableGraphQueries(false);
343+
}
344+
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
345+
setEnableGraphQueries(true);
346+
return query;
347+
} catch (IOException e) {
348+
throw new RuntimeException("Error analyzing query text", e);
349+
}
350+
}
351+
323352
public Query createPhrasePrefixQuery(String field, String queryText, int phraseSlop, int maxExpansions) {
324353
final Query query = createFieldQuery(getAnalyzer(), Occur.MUST, field, queryText, true, phraseSlop);
325354
return toMultiPhrasePrefix(query, phraseSlop, maxExpansions);

core/src/test/java/org/elasticsearch/index/analysis/CJKFilterFactoryTests.java

+23
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.lucene.analysis.TokenStream;
2223
import org.apache.lucene.analysis.Tokenizer;
24+
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2325
import org.apache.lucene.analysis.standard.StandardTokenizer;
2426
import org.elasticsearch.test.ESTestCase;
2527
import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -69,4 +71,25 @@ public void testHanUnigramOnly() throws IOException {
6971
tokenizer.setReader(new StringReader(source));
7072
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
7173
}
74+
75+
public void testDisableGraph() throws IOException {
76+
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
77+
TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
78+
TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");
79+
80+
String source = "多くの学生が試験に落ちた。";
81+
Tokenizer tokenizer = new StandardTokenizer();
82+
tokenizer.setReader(new StringReader(source));
83+
try (TokenStream tokenStream = allFlagsFactory.create(tokenizer)) {
84+
// This config outputs different size of ngrams so graph analysis is disabled
85+
assertTrue(tokenStream.hasAttribute(DisableGraphAttribute.class));
86+
}
87+
88+
tokenizer = new StandardTokenizer();
89+
tokenizer.setReader(new StringReader(source));
90+
try (TokenStream tokenStream = hanOnlyFactory.create(tokenizer)) {
91+
// This config uses only bigrams so graph analysis is enabled
92+
assertFalse(tokenStream.hasAttribute(DisableGraphAttribute.class));
93+
}
94+
}
7295
}

core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java

+22
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.lucene.analysis.TokenStream;
2727
import org.apache.lucene.analysis.Tokenizer;
2828
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
29+
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2930
import org.elasticsearch.test.ESTestCase;
3031
import org.elasticsearch.test.ESTokenStreamTestCase;
3132

@@ -80,4 +81,25 @@ public void testFillerToken() throws IOException {
8081
TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
8182
assertTokenStreamContents(tokenFilter.create(stream), expected);
8283
}
84+
85+
public void testDisableGraph() throws IOException {
86+
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
87+
TokenFilterFactory shingleFiller = analysis.tokenFilter.get("shingle_filler");
88+
TokenFilterFactory shingleInverse = analysis.tokenFilter.get("shingle_inverse");
89+
90+
String source = "hello world";
91+
Tokenizer tokenizer = new WhitespaceTokenizer();
92+
tokenizer.setReader(new StringReader(source));
93+
try (TokenStream stream = shingleFiller.create(tokenizer)) {
94+
// This config uses different size of shingles so graph analysis is disabled
95+
assertTrue(stream.hasAttribute(DisableGraphAttribute.class));
96+
}
97+
98+
tokenizer = new WhitespaceTokenizer();
99+
tokenizer.setReader(new StringReader(source));
100+
try (TokenStream stream = shingleInverse.create(tokenizer)) {
101+
// This config uses a single size of shingles so graph analysis is enabled
102+
assertFalse(stream.hasAttribute(DisableGraphAttribute.class));
103+
}
104+
}
83105
}

0 commit comments

Comments
 (0)