Revert "Disable graph analysis at query time for shingle and cjk filters producing tokens of different size (#23920)"

jimczi · jimczi · commit ab8e63298dca · 2017-04-06T09:18:08.000+02:00
This reverts commit 6cc7df7.
diff --git a/core/src/main/java/org/apache/lucene/analysis/miscellaneous/DisableGraphAttribute.java b/core/src/main/java/org/apache/lucene/analysis/miscellaneous/DisableGraphAttribute.java
diff --git a/core/src/main/java/org/apache/lucene/analysis/miscellaneous/DisableGraphAttributeImpl.java b/core/src/main/java/org/apache/lucene/analysis/miscellaneous/DisableGraphAttributeImpl.java
diff --git a/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java b/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java
@@ -23,7 +23,6 @@
 import static org.elasticsearch.common.lucene.search.Queries.fixNegativeQueryIfNeeded;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -52,14 +51,14 @@
 import org.elasticsearch.index.mapper.StringFieldType;
 import org.elasticsearch.index.query.QueryShardContext;
 import org.elasticsearch.index.query.support.QueryParsers;
-import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 
 /**
  * A query parser that uses the {@link MapperService} in order to build smarter
@@ -808,30 +807,4 @@ public Query parse(String query) throws ParseException {
         }
         return super.parse(query);
     }
-
-    /**
-     * Checks if graph analysis should be enabled for the field depending
-     * on the provided {@link Analyzer}
-     */
-    protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
-                                     String queryText, boolean quoted, int phraseSlop) {
-        assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
-
-        // Use the analyzer to get all the tokens, and then build an appropriate
-        // query based on the analysis chain.
-        try (TokenStream source = analyzer.tokenStream(field, queryText)) {
-            if (source.hasAttribute(DisableGraphAttribute.class)) {
-                /**
-                 * A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
-                 * paths explosion. See {@link ShingleTokenFilterFactory} for details.
-                 */
-                setEnableGraphQueries(false);
-            }
-            Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
-            setEnableGraphQueries(true);
-            return query;
-        } catch (IOException e) {
-            throw new RuntimeException("Error analyzing query text", e);
-        }
-    }
 }
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CJKBigramFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CJKBigramFilterFactory.java
@@ -21,7 +21,6 @@
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
-import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
@@ -74,17 +73,7 @@ public CJKBigramFilterFactory(IndexSettings indexSettings, Environment environme
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        CJKBigramFilter filter = new CJKBigramFilter(tokenStream, flags, outputUnigrams);
-        if (outputUnigrams) {
-            /**
-             * We disable the graph analysis on this token stream
-             * because it produces bigrams AND unigrams.
-             * Graph analysis on such token stream is useless and dangerous as it may create too many paths
-             * since shingles of different size are not aligned in terms of positions.
-             */
-            filter.addAttribute(DisableGraphAttribute.class);
-        }
-        return filter;
+        return new CJKBigramFilter(tokenStream, flags, outputUnigrams);
     }
 
 }
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java
@@ -20,7 +20,6 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
@@ -90,15 +89,6 @@ public TokenStream create(TokenStream tokenStream) {
             filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
             filter.setTokenSeparator(tokenSeparator);
             filter.setFillerToken(fillerToken);
-            if (outputUnigrams || (minShingleSize != maxShingleSize)) {
-                /**
-                 * We disable the graph analysis on this token stream
-                 * because it produces shingles of different size.
-                 * Graph analysis on such token stream is useless and dangerous as it may create too many paths
-                 * since shingles of different size are not aligned in terms of positions.
-                 */
-                filter.addAttribute(DisableGraphAttribute.class);
-            }
             return filter;
         }
 
diff --git a/core/src/main/java/org/elasticsearch/index/query/SimpleQueryParser.java b/core/src/main/java/org/elasticsearch/index/query/SimpleQueryParser.java
@@ -19,7 +19,6 @@
 package org.elasticsearch.index.query;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -32,7 +31,6 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SynonymQuery;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.mapper.MappedFieldType;
 
 import java.io.IOException;
@@ -169,32 +167,6 @@ public Query newPrefixQuery(String text) {
         return super.simplify(bq.build());
     }
 
-    /**
-     * Checks if graph analysis should be enabled for the field depending
-     * on the provided {@link Analyzer}
-     */
-    protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
-                                     String queryText, boolean quoted, int phraseSlop) {
-        assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
-
-        // Use the analyzer to get all the tokens, and then build an appropriate
-        // query based on the analysis chain.
-        try (TokenStream source = analyzer.tokenStream(field, queryText)) {
-            if (source.hasAttribute(DisableGraphAttribute.class)) {
-                /**
-                 * A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
-                 * paths explosion. See {@link ShingleTokenFilterFactory} for details.
-                 */
-                setEnableGraphQueries(false);
-            }
-            Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
-            setEnableGraphQueries(true);
-            return query;
-        } catch (IOException e) {
-            throw new RuntimeException("Error analyzing query text", e);
-        }
-    }
-
     private static Query wrapWithBoost(Query query, float boost) {
         if (boost != AbstractQueryBuilder.DEFAULT_BOOST) {
             return new BoostQuery(query, boost);
diff --git a/core/src/main/java/org/elasticsearch/index/search/MatchQuery.java b/core/src/main/java/org/elasticsearch/index/search/MatchQuery.java
@@ -20,8 +20,6 @@
 package org.elasticsearch.index.search;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.ExtendedCommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
@@ -46,7 +44,6 @@
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
 import org.elasticsearch.common.lucene.search.Queries;
 import org.elasticsearch.common.unit.Fuzziness;
-import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.query.QueryShardContext;
 import org.elasticsearch.index.query.support.QueryParsers;
@@ -319,32 +316,6 @@ protected Query newSynonymQuery(Term[] terms) {
             return blendTermsQuery(terms, mapper);
         }
 
-        /**
-         * Checks if graph analysis should be enabled for the field depending
-         * on the provided {@link Analyzer}
-         */
-        protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field,
-                                         String queryText, boolean quoted, int phraseSlop) {
-            assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
-
-            // Use the analyzer to get all the tokens, and then build an appropriate
-            // query based on the analysis chain.
-            try (TokenStream source = analyzer.tokenStream(field, queryText)) {
-                if (source.hasAttribute(DisableGraphAttribute.class)) {
-                    /**
-                     * A {@link TokenFilter} in this {@link TokenStream} disabled the graph analysis to avoid
-                     * paths explosion. See {@link ShingleTokenFilterFactory} for details.
-                     */
-                    setEnableGraphQueries(false);
-                }
-                Query query = super.createFieldQuery(source, operator, field, quoted, phraseSlop);
-                setEnableGraphQueries(true);
-                return query;
-            } catch (IOException e) {
-                throw new RuntimeException("Error analyzing query text", e);
-            }
-        }
-
         public Query createPhrasePrefixQuery(String field, String queryText, int phraseSlop, int maxExpansions) {
             final Query query = createFieldQuery(getAnalyzer(), Occur.MUST, field, queryText, true, phraseSlop);
             if (query instanceof GraphQuery) {
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CJKFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CJKFilterFactoryTests.java
@@ -19,9 +19,7 @@
 
 package org.elasticsearch.index.analysis;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -71,25 +69,4 @@ public void testHanUnigramOnly() throws IOException {
         tokenizer.setReader(new StringReader(source));
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
     }
-
-    public void testDisableGraph() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
-        TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
-        TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");
-
-        String source = "多くの学生が試験に落ちた。";
-        Tokenizer tokenizer = new StandardTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        try (TokenStream tokenStream = allFlagsFactory.create(tokenizer)) {
-            // This config outputs different size of ngrams so graph analysis is disabled
-            assertTrue(tokenStream.hasAttribute(DisableGraphAttribute.class));
-        }
-
-        tokenizer = new StandardTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        try (TokenStream tokenStream = hanOnlyFactory.create(tokenizer)) {
-            // This config uses only bigrams so graph analysis is enabled
-            assertFalse(tokenStream.hasAttribute(DisableGraphAttribute.class));
-        }
-    }
 }
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java
@@ -26,7 +26,6 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
 
@@ -81,25 +80,4 @@ public void testFillerToken() throws IOException {
         TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
         assertTokenStreamContents(tokenFilter.create(stream), expected);
     }
-
-    public void testDisableGraph() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
-        TokenFilterFactory shingleFiller = analysis.tokenFilter.get("shingle_filler");
-        TokenFilterFactory shingleInverse = analysis.tokenFilter.get("shingle_inverse");
-
-        String source = "hello world";
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        try (TokenStream stream = shingleFiller.create(tokenizer)) {
-            // This config uses different size of shingles so graph analysis is disabled
-            assertTrue(stream.hasAttribute(DisableGraphAttribute.class));
-        }
-
-        tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader(source));
-        try (TokenStream stream = shingleInverse.create(tokenizer)) {
-            // This config uses a single size of shingles so graph analysis is enabled
-            assertFalse(stream.hasAttribute(DisableGraphAttribute.class));
-        }
-    }
 }
diff --git a/core/src/test/java/org/elasticsearch/index/query/DisableGraphQueryTests.java b/core/src/test/java/org/elasticsearch/index/query/DisableGraphQueryTests.java