Skip to content

Commit 8c16725

Browse files
committed
Check for deprecations when analyzers are built (#50908)
Generally speaking, deprecated analysis components in Elasticsearch issue deprecation warnings when they are first used. However, this means that no warnings are emitted when indexes are created with deprecated components, and users have to actually index a document to see the warnings. This makes it much harder to see these warnings and act on them at the appropriate time. It is worse in the case where components throw exceptions on upgrade: users will not be aware of the problem until a document is indexed, instead of at index creation time. This commit adds a new check that pushes an empty string through all user-defined analyzers and normalizers when an IndexAnalyzers object is built for each index; deprecation warnings and exceptions are now emitted when indexes are created or opened. Fixes #42349
1 parent 9c6ffdc commit 8c16725

File tree

6 files changed

+245
-64
lines changed

6 files changed

+245
-64
lines changed

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java

+36-52
Original file line numberDiff line numberDiff line change
@@ -20,23 +20,18 @@
2020
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23-
import org.apache.lucene.analysis.MockTokenizer;
24-
import org.apache.lucene.analysis.Tokenizer;
2523
import org.elasticsearch.Version;
2624
import org.elasticsearch.cluster.metadata.IndexMetaData;
2725
import org.elasticsearch.common.settings.Settings;
2826
import org.elasticsearch.env.Environment;
2927
import org.elasticsearch.index.IndexSettings;
3028
import org.elasticsearch.index.analysis.IndexAnalyzers;
3129
import org.elasticsearch.index.analysis.NamedAnalyzer;
32-
import org.elasticsearch.index.analysis.TokenFilterFactory;
3330
import org.elasticsearch.test.ESTestCase;
3431
import org.elasticsearch.test.IndexSettingsModule;
3532
import org.elasticsearch.test.VersionUtils;
3633

3734
import java.io.IOException;
38-
import java.io.StringReader;
39-
import java.util.Map;
4035

4136
public class CommonAnalysisPluginTests extends ESTestCase {
4237

@@ -45,42 +40,37 @@ public class CommonAnalysisPluginTests extends ESTestCase {
4540
*/
4641
public void testNGramDeprecationWarning() throws IOException {
4742
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
48-
.put(IndexMetaData.SETTING_VERSION_CREATED,
43+
.put(IndexMetaData.SETTING_VERSION_CREATED,
4944
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
50-
.build();
45+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
46+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
47+
.putList("index.analysis.analyzer.custom_analyzer.filter", "nGram")
48+
.build();
5149

52-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
5350
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
54-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
55-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
56-
Tokenizer tokenizer = new MockTokenizer();
57-
tokenizer.setReader(new StringReader("foo bar"));
58-
assertNotNull(tokenFilterFactory.create(tokenizer));
59-
assertWarnings(
60-
"The [nGram] token filter name is deprecated and will be removed in a future version. "
61-
+ "Please change the filter name to [ngram] instead.");
51+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
6252
}
53+
54+
assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
55+
+ "Please change the filter name to [ngram] instead.");
6356
}
6457

6558
/**
6659
* Check that the deprecated name "nGram" throws an error since 7.0.0
6760
*/
6861
public void testNGramDeprecationError() throws IOException {
6962
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
70-
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
71-
.build();
63+
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
64+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
65+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
66+
.putList("index.analysis.analyzer.custom_analyzer.filter", "nGram")
67+
.build();
7268

73-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
7469
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
75-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
76-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
77-
Tokenizer tokenizer = new MockTokenizer();
78-
tokenizer.setReader(new StringReader("foo bar"));
79-
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
80-
assertEquals(
81-
"The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. Please change the filter"
82-
+ " name to [ngram] instead.",
83-
ex.getMessage());
70+
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
71+
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
72+
assertEquals("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
73+
+ "Please change the filter name to [ngram] instead.", e.getMessage());
8474
}
8575
}
8676

@@ -89,42 +79,36 @@ public void testNGramDeprecationError() throws IOException {
8979
*/
9080
public void testEdgeNGramDeprecationWarning() throws IOException {
9181
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
92-
.put(IndexMetaData.SETTING_VERSION_CREATED,
93-
VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
94-
.build();
82+
.put(IndexMetaData.SETTING_VERSION_CREATED,
83+
VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
84+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
85+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
86+
.putList("index.analysis.analyzer.custom_analyzer.filter", "edgeNGram")
87+
.build();
9588

96-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
9789
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
98-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
99-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
100-
Tokenizer tokenizer = new MockTokenizer();
101-
tokenizer.setReader(new StringReader("foo bar"));
102-
assertNotNull(tokenFilterFactory.create(tokenizer));
103-
assertWarnings(
104-
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
105-
+ "Please change the filter name to [edge_ngram] instead.");
90+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
10691
}
92+
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
93+
+ "Please change the filter name to [edge_ngram] instead.");
10794
}
10895

10996
/**
11097
* Check that the deprecated name "edgeNGram" throws an error for indices created since 7.0.0
11198
*/
11299
public void testEdgeNGramDeprecationError() throws IOException {
113100
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
114-
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
115-
.build();
101+
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
102+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
103+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
104+
.putList("index.analysis.analyzer.custom_analyzer.filter", "edgeNGram")
105+
.build();
116106

117-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
118107
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
119-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
120-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
121-
Tokenizer tokenizer = new MockTokenizer();
122-
tokenizer.setReader(new StringReader("foo bar"));
123-
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
124-
assertEquals(
125-
"The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. Please change the filter"
126-
+ " name to [edge_ngram] instead.",
127-
ex.getMessage());
108+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
109+
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
110+
assertEquals("The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
111+
+ "Please change the filter name to [edge_ngram] instead.", ex.getMessage());
128112
}
129113
}
130114

server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

+22
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index.analysis;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.TokenStream;
2223
import org.apache.lucene.analysis.core.KeywordTokenizer;
2324
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2425
import org.elasticsearch.ElasticsearchException;
@@ -542,6 +543,10 @@ public IndexAnalyzers build(IndexSettings indexSettings,
542543
tokenFilterFactoryFactories, charFilterFactoryFactories);
543544
}
544545

546+
for (Analyzer analyzer : normalizers.values()) {
547+
analyzer.normalize("", ""); // check for deprecations
548+
}
549+
545550
if (!analyzers.containsKey(DEFAULT_ANALYZER_NAME)) {
546551
analyzers.put(DEFAULT_ANALYZER_NAME,
547552
produceAnalyzer(DEFAULT_ANALYZER_NAME,
@@ -605,6 +610,7 @@ private static NamedAnalyzer produceAnalyzer(String name,
605610
} else {
606611
analyzer = new NamedAnalyzer(name, analyzerFactory.scope(), analyzerF, overridePositionIncrementGap);
607612
}
613+
checkVersions(analyzer);
608614
return analyzer;
609615
}
610616

@@ -632,4 +638,20 @@ private void processNormalizerFactory(
632638
NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF);
633639
normalizers.put(name, normalizer);
634640
}
641+
642+
// Some analysis components emit deprecation warnings or throw exceptions when used
643+
// with the wrong version of elasticsearch. These exceptions and warnings are
644+
// normally thrown when tokenstreams are constructed, which unless we build a
645+
// tokenstream up-front does not happen until a document is indexed. In order to
646+
// surface these warnings or exceptions as early as possible, we build an empty
647+
// tokenstream and pull it through an Analyzer at construction time.
648+
private static void checkVersions(Analyzer analyzer) {
649+
try (TokenStream ts = analyzer.tokenStream("", "")) {
650+
ts.reset();
651+
while (ts.incrementToken()) {}
652+
ts.end();
653+
} catch (IOException e) {
654+
throw new UncheckedIOException(e);
655+
}
656+
}
635657
}

server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java

+48-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.elasticsearch.index.analysis.AnalysisRegistry;
3939
import org.elasticsearch.index.analysis.CharFilterFactory;
4040
import org.elasticsearch.index.analysis.IndexAnalyzers;
41+
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
4142
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
4243
import org.elasticsearch.index.analysis.TokenFilterFactory;
4344
import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -109,6 +110,25 @@ public TokenStream create(TokenStream tokenStream) {
109110
}
110111
}
111112

113+
class DeprecatedTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
114+
115+
DeprecatedTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
116+
super(indexSettings, name, settings);
117+
}
118+
119+
@Override
120+
public TokenStream create(TokenStream tokenStream) {
121+
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
122+
return tokenStream;
123+
}
124+
125+
@Override
126+
public TokenStream normalize(TokenStream tokenStream) {
127+
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
128+
return tokenStream;
129+
}
130+
}
131+
112132
class AppendCharFilterFactory extends AbstractCharFilterFactory {
113133

114134
final String suffix;
@@ -137,7 +157,10 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
137157

138158
@Override
139159
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
140-
return singletonMap("mock", MockFactory::new);
160+
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
161+
filters.put("mock", MockFactory::new);
162+
filters.put("deprecated", DeprecatedTokenFilterFactory::new);
163+
return filters;
141164
}
142165

143166
@Override
@@ -507,4 +530,28 @@ public void testExceedSetMaxTokenLimit() {
507530
assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
508531
+ idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
509532
}
533+
534+
public void testDeprecationWarnings() throws IOException {
535+
AnalyzeAction.Request req = new AnalyzeAction.Request();
536+
req.tokenizer("standard");
537+
req.addTokenFilter("lowercase");
538+
req.addTokenFilter("deprecated");
539+
req.text("test text");
540+
541+
AnalyzeAction.Response analyze =
542+
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
543+
assertEquals(2, analyze.getTokens().size());
544+
assertWarnings("Using deprecated token filter [deprecated]");
545+
546+
// normalizer
547+
req = new AnalyzeAction.Request();
548+
req.addTokenFilter("lowercase");
549+
req.addTokenFilter("deprecated");
550+
req.text("text");
551+
552+
analyze =
553+
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
554+
assertEquals(1, analyze.getTokens().size());
555+
assertWarnings("Using deprecated token filter [deprecated]");
556+
}
510557
}

server/src/test/java/org/elasticsearch/index/IndexModuleTests.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.standard.StandardTokenizer;
2223
import org.apache.lucene.index.AssertingDirectoryReader;
2324
import org.apache.lucene.index.DirectoryReader;
2425
import org.apache.lucene.index.FieldInvertState;
@@ -442,7 +443,7 @@ public Analyzer get() {
442443
final Analyzer analyzer = new Analyzer() {
443444
@Override
444445
protected TokenStreamComponents createComponents(String fieldName) {
445-
throw new AssertionError("should not be here");
446+
return new TokenStreamComponents(new StandardTokenizer());
446447
}
447448

448449
@Override

0 commit comments

Comments (0)