Skip to content

Commit 2f13751

Browse files
author
Christoph Büscher
authored
Deprecate and remove camel-case nGram and edgeNGram tokenizers (#50862) (#50991)
We deprecated and removed the camel-case versions of the nGram and edgeNGram filters a while ago, and we should do the same with the nGram and edgeNGram tokenizers. This PR deprecates the use of these names in favour of ngram and edge_ngram in 7.x. Their usage will then be disallowed on new indices starting with 8.0.
1 parent 6848dee commit 2f13751

File tree

4 files changed

+106
-6
lines changed

4 files changed

+106
-6
lines changed

docs/reference/migration/migrate_7_6.asciidoc

+10-1
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,13 @@ GitHub or the 'discuss' forums.
3535
The vector functions of the form `function(query, doc['field'])` are
3636
deprecated, and the form `function(query, 'field')` should be used instead.
3737
For example, `cosineSimilarity(query, doc['field'])` is replaced by
38-
`cosineSimilarity(query, 'field')`.
38+
`cosineSimilarity(query, 'field')`.
39+
40+
[discrete]
41+
==== Deprecate use of the `nGram` and `edgeNGram` tokenizer names
42+
43+
The `nGram` and `edgeNGram` tokenizer names have been deprecated in 7.6.
44+
Mappings for indices created in 7.6 or later will continue to work but emit a
45+
deprecation warning. The tokenizer name should be changed to the fully
46+
equivalent `ngram` or `edge_ngram` names for new indices and in index
47+
templates.

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

+29-3
Original file line numberDiff line numberDiff line change
@@ -317,9 +317,23 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
317317
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
318318
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
319319
tokenizers.put("thai", ThaiTokenizerFactory::new);
320-
tokenizers.put("nGram", NGramTokenizerFactory::new);
320+
tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
321+
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
322+
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
323+
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
324+
+ "Please change the tokenizer name to [ngram] instead.");
325+
}
326+
return new NGramTokenizerFactory(indexSettings, environment, name, settings);
327+
});
321328
tokenizers.put("ngram", NGramTokenizerFactory::new);
322-
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
329+
tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
330+
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
331+
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
332+
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
333+
+ "Please change the tokenizer name to [edge_ngram] instead.");
334+
}
335+
return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings);
336+
});
323337
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
324338
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
325339
tokenizers.put("classic", ClassicTokenizerFactory::new);
@@ -548,8 +562,20 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
548562
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));
549563

550564
// Temporary shim for aliases. TODO deprecate after they are moved
551-
tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
565+
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("nGram", (version) -> {
566+
if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
567+
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
568+
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
569+
+ "Please change the tokenizer name to [ngram] instead.");
570+
}
571+
return new NGramTokenizer();
572+
}));
552573
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
574+
if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
575+
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
576+
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
577+
+ "Please change the tokenizer name to [edge_ngram] instead.");
578+
}
553579
if (version.onOrAfter(Version.V_7_3_0)) {
554580
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
555581
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java

+63
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,21 @@
2020
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.analysis.Tokenizer;
2324
import org.elasticsearch.Version;
2425
import org.elasticsearch.cluster.metadata.IndexMetaData;
2526
import org.elasticsearch.common.settings.Settings;
2627
import org.elasticsearch.env.Environment;
2728
import org.elasticsearch.index.IndexSettings;
2829
import org.elasticsearch.index.analysis.IndexAnalyzers;
2930
import org.elasticsearch.index.analysis.NamedAnalyzer;
31+
import org.elasticsearch.index.analysis.TokenizerFactory;
3032
import org.elasticsearch.test.ESTestCase;
3133
import org.elasticsearch.test.IndexSettingsModule;
3234
import org.elasticsearch.test.VersionUtils;
3335

3436
import java.io.IOException;
37+
import java.util.Map;
3538

3639
public class CommonAnalysisPluginTests extends ESTestCase {
3740

@@ -192,4 +195,64 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
192195
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
193196
+ "Please change the filter name to [edge_ngram] instead.");
194197
}
198+
199+
/**
200+
* Check that we log a deprecation warning for the "nGram" and "edgeNGram" tokenizer names on indices
201+
* created on or after 7.6 (disallowing them for indices created on or after 8.0 is not exercised here)
202+
*/
203+
public void testNGramTokenizerDeprecation() throws IOException {
204+
// tests for prebuilt tokenizer
205+
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
206+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
207+
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
208+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
209+
doTestPrebuiltTokenizerDeprecation("nGram", "ngram", Version.V_7_6_0, true);
210+
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram", Version.V_7_6_0, true);
211+
212+
// same batch of tests for custom tokenizer definition in the settings
213+
doTestCustomTokenizerDeprecation("nGram", "ngram",
214+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
215+
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
216+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
217+
doTestCustomTokenizerDeprecation("nGram", "ngram", Version.V_7_6_0, true);
218+
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram", Version.V_7_6_0, true);
219+
}
220+
221+
public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
222+
throws IOException {
223+
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
224+
.put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
225+
226+
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
227+
Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
228+
IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
229+
TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName);
230+
231+
Tokenizer tokenizer = tokenizerFactory.create();
232+
assertNotNull(tokenizer);
233+
if (expectWarning) {
234+
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
235+
+ "Please change the tokenizer name to [" + replacement + "] instead.");
236+
}
237+
}
238+
}
239+
240+
public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
241+
throws IOException {
242+
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
243+
.put(IndexMetaData.SETTING_VERSION_CREATED, version)
244+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
245+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer")
246+
.put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName)
247+
.build();
248+
249+
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
250+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
251+
252+
if (expectWarning) {
253+
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
254+
+ "Please change the tokenizer name to [" + replacement + "] instead.");
255+
}
256+
}
257+
}
195258
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java

+4-2
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,11 @@ public void testPreConfiguredTokenizer() throws IOException {
8686
}
8787
}
8888

89-
// Check deprecated name as well
89+
// Check the deprecated name as well; this needs a version before 8.0 because it throws an IAE after that
9090
{
91-
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
91+
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(
92+
VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, Version.CURRENT),
93+
"edgeNGram")) {
9294
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
9395
assertNotNull(analyzer);
9496
assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});

0 commit comments

Comments
 (0)