Skip to content

Commit 736ed47

Browse files
authored
Check for deprecations when analyzers are built (#50908)
Generally speaking, deprecated analysis components in elasticsearch will issue deprecation warnings when they are first used. However, this means that no warnings are emitted when indexes are created with deprecated components, and users have to actually index a document to see warnings. This makes it much harder to see these warnings and act on them at appropriate times. This is worse in the case where components throw exceptions on upgrade. In this case, users will not be aware of a problem until a document is indexed, instead of at index creation time. This commit adds a new check that pushes an empty string through all user-defined analyzers and normalizers when an IndexAnalyzers object is built for each index; deprecation warnings and exceptions are now emitted when indexes are created or opened. Fixes #42349
1 parent b146740 commit 736ed47

File tree

6 files changed

+208
-41
lines changed

6 files changed

+208
-41
lines changed

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java

+7-31
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,15 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22-
import org.apache.lucene.analysis.MockTokenizer;
23-
import org.apache.lucene.analysis.Tokenizer;
2422
import org.elasticsearch.Version;
2523
import org.elasticsearch.cluster.metadata.IndexMetaData;
2624
import org.elasticsearch.common.settings.Settings;
2725
import org.elasticsearch.env.Environment;
28-
import org.elasticsearch.index.analysis.TokenFilterFactory;
2926
import org.elasticsearch.test.ESTestCase;
3027
import org.elasticsearch.test.IndexSettingsModule;
3128
import org.elasticsearch.test.VersionUtils;
3229

3330
import java.io.IOException;
34-
import java.io.StringReader;
35-
import java.util.Map;
3631

3732
public class CommonAnalysisPluginTests extends ESTestCase {
3833

@@ -51,13 +46,8 @@ public void testNGramFilterInCustomAnalyzerDeprecationError() throws IOException
5146
.build();
5247

5348
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
54-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings),
55-
settings, commonAnalysisPlugin).tokenFilter;
56-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
57-
Tokenizer tokenizer = new MockTokenizer();
58-
tokenizer.setReader(new StringReader("foo bar"));
59-
60-
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
49+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
50+
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
6151
assertEquals("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
6252
+ "Please change the filter name to [ngram] instead.", ex.getMessage());
6353
}
@@ -69,12 +59,7 @@ public void testNGramFilterInCustomAnalyzerDeprecationError() throws IOException
6959
.putList("index.analysis.analyzer.custom_analyzer.filter", "my_ngram").put("index.analysis.filter.my_ngram.type", "nGram")
7060
.build();
7161
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
72-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7),
73-
settingsPre7, commonAnalysisPlugin).tokenFilter;
74-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
75-
Tokenizer tokenizer = new MockTokenizer();
76-
tokenizer.setReader(new StringReader("foo bar"));
77-
assertNotNull(tokenFilterFactory.create(tokenizer));
62+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7), settingsPre7, commonAnalysisPlugin);
7863
assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
7964
+ "Please change the filter name to [ngram] instead.");
8065
}
@@ -95,13 +80,8 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
9580
.build();
9681

9782
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
98-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings),
99-
settings, commonAnalysisPlugin).tokenFilter;
100-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
101-
Tokenizer tokenizer = new MockTokenizer();
102-
tokenizer.setReader(new StringReader("foo bar"));
103-
104-
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
83+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
84+
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
10585
assertEquals("The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
10686
+ "Please change the filter name to [edge_ngram] instead.", ex.getMessage());
10787
}
@@ -116,12 +96,8 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
11696
.build();
11797

11898
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
119-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7),
120-
settingsPre7, commonAnalysisPlugin).tokenFilter;
121-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
122-
Tokenizer tokenizer = new MockTokenizer();
123-
tokenizer.setReader(new StringReader("foo bar"));
124-
assertNotNull(tokenFilterFactory.create(tokenizer));
99+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7),
100+
settingsPre7, commonAnalysisPlugin);
125101
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
126102
+ "Please change the filter name to [edge_ngram] instead.");
127103
}

server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

+22
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index.analysis;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.TokenStream;
2223
import org.apache.lucene.analysis.core.KeywordTokenizer;
2324
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2425
import org.elasticsearch.ElasticsearchException;
@@ -536,6 +537,10 @@ public IndexAnalyzers build(IndexSettings indexSettings,
536537
tokenFilterFactoryFactories, charFilterFactoryFactories);
537538
}
538539

540+
for (Analyzer analyzer : normalizers.values()) {
541+
analyzer.normalize("", ""); // check for deprecations
542+
}
543+
539544
if (!analyzers.containsKey(DEFAULT_ANALYZER_NAME)) {
540545
analyzers.put(DEFAULT_ANALYZER_NAME,
541546
produceAnalyzer(DEFAULT_ANALYZER_NAME,
@@ -599,6 +604,7 @@ private static NamedAnalyzer produceAnalyzer(String name,
599604
} else {
600605
analyzer = new NamedAnalyzer(name, analyzerFactory.scope(), analyzerF, overridePositionIncrementGap);
601606
}
607+
checkVersions(analyzer);
602608
return analyzer;
603609
}
604610

@@ -626,4 +632,20 @@ private void processNormalizerFactory(
626632
NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF);
627633
normalizers.put(name, normalizer);
628634
}
635+
636+
// Some analysis components emit deprecation warnings or throw exceptions when used
637+
// with the wrong version of elasticsearch. These exceptions and warnings are
638+
// normally thrown when tokenstreams are constructed, which unless we build a
639+
// tokenstream up-front does not happen until a document is indexed. In order to
640+
// surface these warnings or exceptions as early as possible, we build an empty
641+
// tokenstream and pull it through an Analyzer at construction time.
642+
private static void checkVersions(Analyzer analyzer) {
643+
try (TokenStream ts = analyzer.tokenStream("", "")) {
644+
ts.reset();
645+
while (ts.incrementToken()) {}
646+
ts.end();
647+
} catch (IOException e) {
648+
throw new UncheckedIOException(e);
649+
}
650+
}
629651
}

server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java

+45-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.elasticsearch.index.analysis.AnalysisRegistry;
3939
import org.elasticsearch.index.analysis.CharFilterFactory;
4040
import org.elasticsearch.index.analysis.IndexAnalyzers;
41+
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
4142
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
4243
import org.elasticsearch.index.analysis.TokenFilterFactory;
4344
import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -108,6 +109,25 @@ public TokenStream create(TokenStream tokenStream) {
108109
}
109110
}
110111

112+
class DeprecatedTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
113+
114+
DeprecatedTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
115+
super(indexSettings, name, settings);
116+
}
117+
118+
@Override
119+
public TokenStream create(TokenStream tokenStream) {
120+
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
121+
return tokenStream;
122+
}
123+
124+
@Override
125+
public TokenStream normalize(TokenStream tokenStream) {
126+
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
127+
return tokenStream;
128+
}
129+
}
130+
111131
class AppendCharFilterFactory extends AbstractCharFilterFactory {
112132

113133
final String suffix;
@@ -136,7 +156,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
136156

137157
@Override
138158
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
139-
return singletonMap("mock", MockFactory::new);
159+
return Map.of("mock", MockFactory::new, "deprecated", DeprecatedTokenFilterFactory::new);
140160
}
141161

142162
@Override
@@ -492,4 +512,28 @@ public void testExceedSetMaxTokenLimit() {
492512
assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
493513
+ idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
494514
}
515+
516+
public void testDeprecationWarnings() throws IOException {
517+
AnalyzeAction.Request req = new AnalyzeAction.Request();
518+
req.tokenizer("standard");
519+
req.addTokenFilter("lowercase");
520+
req.addTokenFilter("deprecated");
521+
req.text("test text");
522+
523+
AnalyzeAction.Response analyze =
524+
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
525+
assertEquals(2, analyze.getTokens().size());
526+
assertWarnings("Using deprecated token filter [deprecated]");
527+
528+
// normalizer
529+
req = new AnalyzeAction.Request();
530+
req.addTokenFilter("lowercase");
531+
req.addTokenFilter("deprecated");
532+
req.text("text");
533+
534+
analyze =
535+
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
536+
assertEquals(1, analyze.getTokens().size());
537+
assertWarnings("Using deprecated token filter [deprecated]");
538+
}
495539
}

server/src/test/java/org/elasticsearch/index/IndexModuleTests.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.standard.StandardTokenizer;
2223
import org.apache.lucene.index.AssertingDirectoryReader;
2324
import org.apache.lucene.index.DirectoryReader;
2425
import org.apache.lucene.index.FieldInvertState;
@@ -442,7 +443,7 @@ public Analyzer get() {
442443
final Analyzer analyzer = new Analyzer() {
443444
@Override
444445
protected TokenStreamComponents createComponents(String fieldName) {
445-
throw new AssertionError("should not be here");
446+
return new TokenStreamComponents(new StandardTokenizer());
446447
}
447448

448449
@Override

server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java

+130-5
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@
2020
package org.elasticsearch.index.analysis;
2121

2222
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
23-
2423
import org.apache.lucene.analysis.Analyzer;
2524
import org.apache.lucene.analysis.MockTokenFilter;
2625
import org.apache.lucene.analysis.TokenStream;
26+
import org.apache.lucene.analysis.Tokenizer;
2727
import org.apache.lucene.analysis.en.EnglishAnalyzer;
2828
import org.apache.lucene.analysis.standard.StandardAnalyzer;
29+
import org.apache.lucene.analysis.standard.StandardTokenizer;
2930
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
3031
import org.elasticsearch.Version;
3132
import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -108,19 +109,25 @@ public void testOverrideDefaultAnalyzer() throws IOException {
108109
public void testOverrideDefaultAnalyzerWithoutAnalysisModeAll() throws IOException {
109110
Version version = VersionUtils.randomVersion(random());
110111
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
111-
TokenFilterFactory tokenFilter = new AbstractTokenFilterFactory(IndexSettingsModule.newIndexSettings("index", settings),
112-
"my_filter", Settings.EMPTY) {
112+
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings);
113+
TokenFilterFactory tokenFilter = new AbstractTokenFilterFactory(indexSettings, "my_filter", Settings.EMPTY) {
113114
@Override
114115
public AnalysisMode getAnalysisMode() {
115116
return randomFrom(AnalysisMode.SEARCH_TIME, AnalysisMode.INDEX_TIME);
116117
}
117118

118119
@Override
119120
public TokenStream create(TokenStream tokenStream) {
120-
return null;
121+
return tokenStream;
121122
}
122123
};
123-
Analyzer analyzer = new CustomAnalyzer(null, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
124+
TokenizerFactory tokenizer = new AbstractTokenizerFactory(indexSettings, Settings.EMPTY, "my_tokenizer") {
125+
@Override
126+
public Tokenizer create() {
127+
return new StandardTokenizer();
128+
}
129+
};
130+
Analyzer analyzer = new CustomAnalyzer(tokenizer, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
124131
MapperException ex = expectThrows(MapperException.class,
125132
() -> emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings),
126133
singletonMap("default", new PreBuiltAnalyzerProvider("default", AnalyzerScope.INDEX, analyzer)), emptyMap(),
@@ -264,4 +271,122 @@ public void testEnsureCloseInvocationProperlyDelegated() throws IOException {
264271
registry.close();
265272
verify(mock).close();
266273
}
274+
275+
public void testDeprecationsAndExceptions() throws IOException {
276+
277+
AnalysisPlugin plugin = new AnalysisPlugin() {
278+
279+
class MockFactory extends AbstractTokenFilterFactory {
280+
MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
281+
super(indexSettings, name, settings);
282+
}
283+
284+
@Override
285+
public TokenStream create(TokenStream tokenStream) {
286+
if (indexSettings.getIndexVersionCreated().equals(Version.CURRENT)) {
287+
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
288+
}
289+
return tokenStream;
290+
}
291+
}
292+
293+
class ExceptionFactory extends AbstractTokenFilterFactory {
294+
295+
ExceptionFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
296+
super(indexSettings, name, settings);
297+
}
298+
299+
@Override
300+
public TokenStream create(TokenStream tokenStream) {
301+
if (indexSettings.getIndexVersionCreated().equals(Version.CURRENT)) {
302+
throw new IllegalArgumentException("Cannot use token filter [exception]");
303+
}
304+
return tokenStream;
305+
}
306+
}
307+
308+
class UnusedMockFactory extends AbstractTokenFilterFactory {
309+
UnusedMockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
310+
super(indexSettings, name, settings);
311+
}
312+
313+
@Override
314+
public TokenStream create(TokenStream tokenStream) {
315+
deprecationLogger.deprecated("Using deprecated token filter [unused]");
316+
return tokenStream;
317+
}
318+
}
319+
320+
class NormalizerFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
321+
322+
NormalizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
323+
super(indexSettings, name, settings);
324+
}
325+
326+
@Override
327+
public TokenStream create(TokenStream tokenStream) {
328+
deprecationLogger.deprecated("Using deprecated token filter [deprecated_normalizer]");
329+
return tokenStream;
330+
}
331+
332+
}
333+
334+
@Override
335+
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
336+
return Map.of("deprecated", MockFactory::new, "unused", UnusedMockFactory::new,
337+
"deprecated_normalizer", NormalizerFactory::new, "exception", ExceptionFactory::new);
338+
}
339+
};
340+
341+
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
342+
Settings indexSettings = Settings.builder()
343+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
344+
.put("index.analysis.filter.deprecated.type", "deprecated")
345+
.put("index.analysis.analyzer.custom.tokenizer", "standard")
346+
.putList("index.analysis.analyzer.custom.filter", "lowercase", "deprecated")
347+
.build();
348+
349+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
350+
351+
new AnalysisModule(TestEnvironment.newEnvironment(settings),
352+
singletonList(plugin)).getAnalysisRegistry().build(idxSettings);
353+
354+
// We should only get a warning from the token filter that is referenced in settings
355+
assertWarnings("Using deprecated token filter [deprecated]");
356+
357+
indexSettings = Settings.builder()
358+
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.getPreviousVersion())
359+
.put("index.analysis.filter.deprecated.type", "deprecated_normalizer")
360+
.putList("index.analysis.normalizer.custom.filter", "lowercase", "deprecated_normalizer")
361+
.put("index.analysis.filter.deprecated.type", "deprecated")
362+
.put("index.analysis.filter.exception.type", "exception")
363+
.put("index.analysis.analyzer.custom.tokenizer", "standard")
364+
// exception will not throw because we're not on Version.CURRENT
365+
.putList("index.analysis.analyzer.custom.filter", "lowercase", "deprecated", "exception")
366+
.build();
367+
idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
368+
369+
new AnalysisModule(TestEnvironment.newEnvironment(settings),
370+
singletonList(plugin)).getAnalysisRegistry().build(idxSettings);
371+
372+
// We should only get a warning from the normalizer, because we're on a version where 'deprecated'
373+
// works fine
374+
assertWarnings("Using deprecated token filter [deprecated_normalizer]");
375+
376+
indexSettings = Settings.builder()
377+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
378+
.put("index.analysis.filter.exception.type", "exception")
379+
.put("index.analysis.analyzer.custom.tokenizer", "standard")
380+
// exception will throw because the index was created on Version.CURRENT
381+
.putList("index.analysis.analyzer.custom.filter", "lowercase", "exception")
382+
.build();
383+
IndexSettings exceptionSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
384+
385+
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
386+
new AnalysisModule(TestEnvironment.newEnvironment(settings),
387+
singletonList(plugin)).getAnalysisRegistry().build(exceptionSettings);
388+
});
389+
assertEquals("Cannot use token filter [exception]", e.getMessage());
390+
391+
}
267392
}

0 commit comments

Comments
 (0)