Skip to content

Commit ab8e632

Browse files
committed
Revert "Disable graph analysis at query time for shingle and cjk filters producing tokens of different size (#23920)"
This reverts commit 6cc7df7.
1 parent 6cc7df7 commit ab8e632

File tree

10 files changed

+2
-473
lines changed

10 files changed

+2
-473
lines changed

core/src/main/java/org/apache/lucene/analysis/miscellaneous/DisableGraphAttribute.java

Lines changed: 0 additions & 32 deletions
This file was deleted.

core/src/main/java/org/apache/lucene/analysis/miscellaneous/DisableGraphAttributeImpl.java

Lines changed: 0 additions & 38 deletions
This file was deleted.

core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import static org.elasticsearch.common.lucene.search.Queries.fixNegativeQueryIfNeeded;
2424

2525
import org.apache.lucene.analysis.Analyzer;
26-
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2726
import org.apache.lucene.analysis.TokenStream;
2827
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2928
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -52,14 +51,14 @@
5251
import org.elasticsearch.index.mapper.StringFieldType;
5352
import org.elasticsearch.index.query.QueryShardContext;
5453
import org.elasticsearch.index.query.support.QueryParsers;
55-
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
5654

5755
import java.io.IOException;
5856
import java.util.ArrayList;
5957
import java.util.Collection;
6058
import java.util.HashMap;
6159
import java.util.List;
6260
import java.util.Map;
61+
import java.util.Objects;
6362

6463
/**
6564
* A query parser that uses the {@link MapperService} in order to build smarter
@@ -808,30 +807,4 @@ public Query parse(String query) throws ParseException {
808807
}
809808
return super.parse(query);
810809
}
811-
812-
/**
813-
* Checks if graph analysis should be enabled for the field depending
814-
* on the provided {@link Analyzer}
815-
*/
816-
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
817-
String queryText, boolean quoted, int phraseSlop) {
818-
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
819-
820-
// Use the analyzer to get all the tokens, and then build an appropriate
821-
// query based on the analysis chain.
822-
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
823-
if (source.hasAttribute(DisableGraphAttribute.class)) {
824-
/**
825-
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
826-
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
827-
*/
828-
setEnableGraphQueries(false);
829-
}
830-
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
831-
setEnableGraphQueries(true);
832-
return query;
833-
} catch (IOException e) {
834-
throw new RuntimeException("Error analyzing query text", e);
835-
}
836-
}
837810
}

core/src/main/java/org/elasticsearch/index/analysis/CJKBigramFilterFactory.java

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import org.apache.lucene.analysis.TokenStream;
2323
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
24-
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2524
import org.elasticsearch.common.settings.Settings;
2625
import org.elasticsearch.env.Environment;
2726
import org.elasticsearch.index.IndexSettings;
@@ -74,17 +73,7 @@ public CJKBigramFilterFactory(IndexSettings indexSettings, Environment environme
7473

7574
@Override
7675
public TokenStream create(TokenStream tokenStream) {
77-
CJKBigramFilter filter = new CJKBigramFilter(tokenStream, flags, outputUnigrams);
78-
if (outputUnigrams) {
79-
/**
80-
* We disable the graph analysis on this token stream
81-
* because it produces bigrams AND unigrams.
82-
* Graph analysis on such token stream is useless and dangerous as it may create too many paths
83-
* since shingles of different size are not aligned in terms of positions.
84-
*/
85-
filter.addAttribute(DisableGraphAttribute.class);
86-
}
87-
return filter;
76+
return new CJKBigramFilter(tokenStream, flags, outputUnigrams);
8877
}
8978

9079
}

core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
package org.elasticsearch.index.analysis;
2121

2222
import org.apache.lucene.analysis.TokenStream;
23-
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2423
import org.apache.lucene.analysis.shingle.ShingleFilter;
2524
import org.elasticsearch.common.settings.Settings;
2625
import org.elasticsearch.env.Environment;
@@ -90,15 +89,6 @@ public TokenStream create(TokenStream tokenStream) {
9089
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
9190
filter.setTokenSeparator(tokenSeparator);
9291
filter.setFillerToken(fillerToken);
93-
if (outputUnigrams || (minShingleSize != maxShingleSize)) {
94-
/**
95-
* We disable the graph analysis on this token stream
96-
* because it produces shingles of different size.
97-
* Graph analysis on such token stream is useless and dangerous as it may create too many paths
98-
* since shingles of different size are not aligned in terms of positions.
99-
*/
100-
filter.addAttribute(DisableGraphAttribute.class);
101-
}
10292
return filter;
10393
}
10494

core/src/main/java/org/elasticsearch/index/query/SimpleQueryParser.java

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
package org.elasticsearch.index.query;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22-
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2322
import org.apache.lucene.analysis.TokenStream;
2423
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2524
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -32,7 +31,6 @@
3231
import org.apache.lucene.search.Query;
3332
import org.apache.lucene.search.SynonymQuery;
3433
import org.apache.lucene.util.BytesRef;
35-
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
3634
import org.elasticsearch.index.mapper.MappedFieldType;
3735

3836
import java.io.IOException;
@@ -169,32 +167,6 @@ public Query newPrefixQuery(String text) {
169167
return super.simplify(bq.build());
170168
}
171169

172-
/**
173-
* Checks if graph analysis should be enabled for the field depending
174-
* on the provided {@link Analyzer}
175-
*/
176-
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
177-
String queryText, boolean quoted, int phraseSlop) {
178-
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
179-
180-
// Use the analyzer to get all the tokens, and then build an appropriate
181-
// query based on the analysis chain.
182-
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
183-
if (source.hasAttribute(DisableGraphAttribute.class)) {
184-
/**
185-
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
186-
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
187-
*/
188-
setEnableGraphQueries(false);
189-
}
190-
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
191-
setEnableGraphQueries(true);
192-
return query;
193-
} catch (IOException e) {
194-
throw new RuntimeException("Error analyzing query text", e);
195-
}
196-
}
197-
198170
private static Query wrapWithBoost(Query query, float boost) {
199171
if (boost != AbstractQueryBuilder.DEFAULT_BOOST) {
200172
return new BoostQuery(query, boost);

core/src/main/java/org/elasticsearch/index/search/MatchQuery.java

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
package org.elasticsearch.index.search;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23-
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
24-
import org.apache.lucene.analysis.TokenStream;
2523
import org.apache.lucene.index.Term;
2624
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
2725
import org.apache.lucene.search.BooleanClause;
@@ -46,7 +44,6 @@
4644
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
4745
import org.elasticsearch.common.lucene.search.Queries;
4846
import org.elasticsearch.common.unit.Fuzziness;
49-
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
5047
import org.elasticsearch.index.mapper.MappedFieldType;
5148
import org.elasticsearch.index.query.QueryShardContext;
5249
import org.elasticsearch.index.query.support.QueryParsers;
@@ -319,32 +316,6 @@ protected Query newSynonymQuery(Term[] terms) {
319316
return blendTermsQuery(terms, mapper);
320317
}
321318

322-
/**
323-
* Checks if graph analysis should be enabled for the field depending
324-
* on the provided {@link Analyzer}
325-
*/
326-
protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
327-
String queryText, boolean quoted, int phraseSlop) {
328-
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
329-
330-
// Use the analyzer to get all the tokens, and then build an appropriate
331-
// query based on the analysis chain.
332-
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
333-
if (source.hasAttribute(DisableGraphAttribute.class)) {
334-
/**
335-
* A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
336-
* paths explosion. See {@link ShingleTokenFilterFactory} for details.
337-
*/
338-
setEnableGraphQueries(false);
339-
}
340-
Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
341-
setEnableGraphQueries(true);
342-
return query;
343-
} catch (IOException e) {
344-
throw new RuntimeException("Error analyzing query text", e);
345-
}
346-
}
347-
348319
public Query createPhrasePrefixQuery(String field, String queryText, int phraseSlop, int maxExpansions) {
349320
final Query query = createFieldQuery(getAnalyzer(), Occur.MUST, field, queryText, true, phraseSlop);
350321
if (query instanceof GraphQuery) {

core/src/test/java/org/elasticsearch/index/analysis/CJKFilterFactoryTests.java

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,7 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22-
import org.apache.lucene.analysis.TokenStream;
2322
import org.apache.lucene.analysis.Tokenizer;
24-
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2523
import org.apache.lucene.analysis.standard.StandardTokenizer;
2624
import org.elasticsearch.test.ESTestCase;
2725
import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -71,25 +69,4 @@ public void testHanUnigramOnly() throws IOException {
7169
tokenizer.setReader(new StringReader(source));
7270
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
7371
}
74-
75-
public void testDisableGraph() throws IOException {
76-
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
77-
TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
78-
TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");
79-
80-
String source = "多くの学生が試験に落ちた。";
81-
Tokenizer tokenizer = new StandardTokenizer();
82-
tokenizer.setReader(new StringReader(source));
83-
try (TokenStream tokenStream = allFlagsFactory.create(tokenizer)) {
84-
// This config outputs different size of ngrams so graph analysis is disabled
85-
assertTrue(tokenStream.hasAttribute(DisableGraphAttribute.class));
86-
}
87-
88-
tokenizer = new StandardTokenizer();
89-
tokenizer.setReader(new StringReader(source));
90-
try (TokenStream tokenStream = hanOnlyFactory.create(tokenizer)) {
91-
// This config uses only bigrams so graph analysis is enabled
92-
assertFalse(tokenStream.hasAttribute(DisableGraphAttribute.class));
93-
}
94-
}
9572
}

core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import org.apache.lucene.analysis.TokenStream;
2727
import org.apache.lucene.analysis.Tokenizer;
2828
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
29-
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
3029
import org.elasticsearch.test.ESTestCase;
3130
import org.elasticsearch.test.ESTokenStreamTestCase;
3231

@@ -81,25 +80,4 @@ public void testFillerToken() throws IOException {
8180
TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
8281
assertTokenStreamContents(tokenFilter.create(stream), expected);
8382
}
84-
85-
public void testDisableGraph() throws IOException {
86-
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
87-
TokenFilterFactory shingleFiller = analysis.tokenFilter.get("shingle_filler");
88-
TokenFilterFactory shingleInverse = analysis.tokenFilter.get("shingle_inverse");
89-
90-
String source = "hello world";
91-
Tokenizer tokenizer = new WhitespaceTokenizer();
92-
tokenizer.setReader(new StringReader(source));
93-
try (TokenStream stream = shingleFiller.create(tokenizer)) {
94-
// This config uses different size of shingles so graph analysis is disabled
95-
assertTrue(stream.hasAttribute(DisableGraphAttribute.class));
96-
}
97-
98-
tokenizer = new WhitespaceTokenizer();
99-
tokenizer.setReader(new StringReader(source));
100-
try (TokenStream stream = shingleInverse.create(tokenizer)) {
101-
// This config uses a single size of shingles so graph analysis is enabled
102-
assertFalse(stream.hasAttribute(DisableGraphAttribute.class));
103-
}
104-
}
10583
}

0 commit comments

Comments
 (0)