Deprecate and remove camel-case nGram and edgeNGram tokenizers

Christoph Büscher · Christoph Büscher · commit 00d5965223a8 · 2020-01-10T17:40:59.000+01:00
We already deprecated and removed the camel-case versions of the nGram and edgeNGram filters a while ago and we should do the same with the nGram and edgeNGram tokenizers. This PR deprecates the use of these names in favour of ngram and edge_ngram in 7 and disallows usage in new indices starting with 8. The deprecation part will be backported to 7.6. Closes elastic#50561
diff --git a/docs/reference/migration/migrate_8_0/mappings.asciidoc b/docs/reference/migration/migrate_8_0/mappings.asciidoc
@@ -39,3 +39,12 @@ The setting has been deprecated with 7.5 and is no longer supported on new indic
 Mappings for older indices will continue to work but emit a deprecation warning.
 The `enabled` setting for `_field_names` should be removed from templates and mappings. 
 Disabling _field_names is not necessary because it no longer carries a large index overhead.
+
+[float]
+[[nGram-edgeNGram-deprecation]]
+==== Disallow use of the `nGram` and `edgeNGram` tokenizer names
+
+The `nGram` and `edgeNGram` tokenizer names have been deprecated with 7.6 and are no longer
+supported on new indices. Mappings for indices created with 7.6 or a later 7.x version will
+continue to work but emit a deprecation warning. The tokenizer name should be changed to the
+fully equivalent `ngram` or `edge_ngram` names for new indices and in index templates.
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -337,9 +337,29 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
         tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
         tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
         tokenizers.put("thai", ThaiTokenizerFactory::new);
-        tokenizers.put("nGram", NGramTokenizerFactory::new);
+        tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
+            if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
+                throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
+                        + "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
+            } else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
+                deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
+                        "The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+                                + "Please change the tokenizer name to [ngram] instead.");
+            }
+            return new NGramTokenizerFactory(indexSettings, environment, name, settings);
+        });
         tokenizers.put("ngram", NGramTokenizerFactory::new);
-        tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
+            if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) { // 8.0+ indices are rejected
+                throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
+                        + "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead.");
+            } else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) { // 7.6+ only warns
+                deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
+                        "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+                                + "Please change the tokenizer name to [edge_ngram] instead.");
+            }
+            return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings); // same factory as "edge_ngram"
+        });
         tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
         tokenizers.put("char_group", CharGroupTokenizerFactory::new);
         tokenizers.put("classic", ClassicTokenizerFactory::new);
@@ -522,8 +542,26 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));
 
         // Temporary shim for aliases. TODO deprecate after they are moved
-        tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
+        tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("nGram", (version) -> {
+            if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
+                throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
+                        + "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
+            } else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
+                deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
+                        "The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+                                + "Please change the tokenizer name to [ngram] instead.");
+            }
+            return new NGramTokenizer();
+        }));
         tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
+            if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) { // 8.0+ indices are rejected
+                throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
+                        + "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead.");
+            } else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) { // 7.6+ only warns
+                deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation", // key distinct from the [nGram] warning
+                        "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+                                + "Please change the tokenizer name to [edge_ngram] instead.");
+            }
             if (version.onOrAfter(Version.V_7_3_0)) {
                 return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
             }
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java
@@ -26,6 +26,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 import org.elasticsearch.test.VersionUtils;
@@ -126,4 +127,87 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
                     + "Please change the filter name to [edge_ngram] instead.");
         }
     }
+
+    /**
+     * Check that the "nGram" and "edgeNGram" tokenizer names log a deprecation warning for indices created
+     * on or after 7.6 (and none before that) and are rejected for indices created on or after 8.0
+     */
+    public void testNGramTokenizerDeprecation() throws IOException {
+        // tests for prebuilt tokenizer
+        doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
+        doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
+        doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
+                        Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
+                true);
+        doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
+                        Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
+        expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
+        expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
+
+        // same batch of tests for custom tokenizer definition in the settings
+        doTestCustomTokenizerDeprecation("nGram", "ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
+        doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
+        doTestCustomTokenizerDeprecation("nGram", "ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
+                        Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
+                true);
+        doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
+                        Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
+        expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("nGram", "ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
+        expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
+                VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
+
+    }
+
+    public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
+            throws IOException {
+        final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
+            .put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); // index-created version under test
+
+        try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
+            Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
+                    IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
+            TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName); // looked up under the deprecated name
+
+            Tokenizer tokenizer = tokenizerFactory.create();
+            assertNotNull(tokenizer);
+            if (expectWarning) { // callers pass false for versions before 7.6
+                assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
+                        + "Please change the tokenizer name to [" + replacement + "] instead.");
+            }
+        }
+    }
+
+    public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
+            throws IOException {
+        final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
+            .put(IndexMetaData.SETTING_VERSION_CREATED, version) // index-created version under test
+            .put("index.analysis.analyzer.custom_analyzer.type", "custom")
+            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer")
+            .put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName) // custom tokenizer using the deprecated type
+        .build();
+
+        try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
+            Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
+                    IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
+            TokenizerFactory tokenizerFactory = tokenizers.get("my_tokenizer"); // the custom tokenizer configured above
+
+            Tokenizer tokenizer = tokenizerFactory.create();
+            assertNotNull(tokenizer);
+            if (expectWarning) { // callers pass false for versions before 7.6
+                assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
+                        + "Please change the tokenizer name to [" + replacement + "] instead.");
+            }
+        }
+    }
 }