Skip to content

Deprecate and remove camel-case nGram and edgeNGram tokenizers (#50862) #50991

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion docs/reference/migration/migrate_7_6.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,13 @@ GitHub or the 'discuss' forums.
The vector functions of the form `function(query, doc['field'])` are
deprecated, and the form `function(query, 'field')` should be used instead.
For example, `cosineSimilarity(query, doc['field'])` is replaced by
`cosineSimilarity(query, 'field')`.
`cosineSimilarity(query, 'field')`.

[discrete]
==== Disallow use of the `nGram` and `edgeNGram` tokenizer names

The `nGram` and `edgeNGram` tokenizer names have been deprecated in 7.6.
Mappings for indices created in 7.6 or later will continue to work but emit a
deprecation warning. The tokenizer name should be changed to the fully
equivalent `ngram` or `edge_ngram` names for new indices and in index
templates.
Original file line number Diff line number Diff line change
Expand Up @@ -317,9 +317,23 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
tokenizers.put("thai", ThaiTokenizerFactory::new);
tokenizers.put("nGram", NGramTokenizerFactory::new);
tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead.");
}
return new NGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("ngram", NGramTokenizerFactory::new);
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead.");
}
return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
tokenizers.put("classic", ClassicTokenizerFactory::new);
Expand Down Expand Up @@ -548,8 +562,20 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));

// Temporary shim for aliases. TODO deprecate after they are moved
tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("nGram", (version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead.");
}
return new NGramTokenizer();
}));
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead.");
}
if (version.onOrAfter(Version.V_7_3_0)) {
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.util.Map;

public class CommonAnalysisPluginTests extends ESTestCase {

Expand Down Expand Up @@ -192,4 +195,64 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [edge_ngram] instead.");
}

/**
 * Exercises the deprecation handling of the camel-case "nGram" and "edgeNGram" tokenizer names.
 * For indices created on 7.6.0 the helpers are asked to verify the deprecation warning that points
 * to "ngram" / "edge_ngram"; for indices created between 7.0.0 and 7.5.2 they are called with
 * {@code expectWarning == false}. Both prebuilt tokenizers and custom tokenizer definitions in the
 * index settings are covered.
 */
public void testNGramTokenizerDeprecation() throws IOException {
    // each entry is { deprecated name, replacement name }
    final String[][] renamedTokenizers = { { "nGram", "ngram" }, { "edgeNGram", "edge_ngram" } };

    // prebuilt tokenizers: indices created before 7.6 first, then indices created on 7.6.0
    for (String[] names : renamedTokenizers) {
        doTestPrebuiltTokenizerDeprecation(names[0], names[1],
            VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
    }
    for (String[] names : renamedTokenizers) {
        doTestPrebuiltTokenizerDeprecation(names[0], names[1], Version.V_7_6_0, true);
    }

    // the same sequence for tokenizers declared as custom tokenizers in the index settings
    for (String[] names : renamedTokenizers) {
        doTestCustomTokenizerDeprecation(names[0], names[1],
            VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
    }
    for (String[] names : renamedTokenizers) {
        doTestCustomTokenizerDeprecation(names[0], names[1], Version.V_7_6_0, true);
    }
}

/**
 * Builds the test analysis for an index created with {@code version}, looks up the tokenizer factory
 * registered under {@code deprecatedName}, creates a tokenizer from it and, if requested, verifies the
 * deprecation warning.
 *
 * @param deprecatedName tokenizer name to look up, e.g. "nGram"
 * @param replacement    name the warning message is expected to recommend, e.g. "ngram"
 * @param version        value stored as the index-created version in the index settings
 * @param expectWarning  whether the deprecation warning is asserted; when {@code false} this method
 *                       makes no assertion about warnings
 * @throws IOException declared for the analysis setup and plugin close calls made here
 */
public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
        throws IOException {
    final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
        .put(IndexMetaData.SETTING_VERSION_CREATED, version).build();

    try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
        Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
            IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
        TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName);
        // fail with a readable message instead of a NullPointerException if the name is not registered
        assertNotNull("no tokenizer factory registered under [" + deprecatedName + "]", tokenizerFactory);

        Tokenizer tokenizer = tokenizerFactory.create();
        assertNotNull(tokenizer);
        if (expectWarning) {
            assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
                + "Please change the tokenizer name to [" + replacement + "] instead.");
        }
    }
}

/**
 * Builds the test analysis for an index whose settings declare a custom analyzer backed by a custom
 * tokenizer of type {@code deprecatedName} and, if requested, verifies the deprecation warning.
 *
 * @param deprecatedName tokenizer type written into the index settings, e.g. "edgeNGram"
 * @param replacement    name the warning message is expected to recommend, e.g. "edge_ngram"
 * @param version        value stored as the index-created version in the index settings
 * @param expectWarning  whether the deprecation warning is asserted; when {@code false} this method
 *                       makes no assertion about warnings
 * @throws IOException declared for the analysis setup and plugin close calls made here
 */
public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
        throws IOException {
    final Settings.Builder indexConfig = Settings.builder();
    indexConfig.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir());
    indexConfig.put(IndexMetaData.SETTING_VERSION_CREATED, version);
    // custom analyzer -> custom tokenizer -> deprecated tokenizer type
    indexConfig.put("index.analysis.analyzer.custom_analyzer.type", "custom");
    indexConfig.put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer");
    indexConfig.put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName);
    final Settings settings = indexConfig.build();

    try (CommonAnalysisPlugin plugin = new CommonAnalysisPlugin()) {
        createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, plugin);

        if (expectWarning == false) {
            return;
        }
        final String expectedMessage = "The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
            + "Please change the tokenizer name to [" + replacement + "] instead.";
        assertWarnings(expectedMessage);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,11 @@ public void testPreConfiguredTokenizer() throws IOException {
}
}

// Check deprecated name as well
// Check deprecated name as well, needs version before 8.0 because throws IAE after that
{
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(
VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, Version.CURRENT),
"edgeNGram")) {
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
assertNotNull(analyzer);
assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
Expand Down