diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index f095b766ee1d5..44eeace79be43 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -83,6 +83,7 @@
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
@@ -110,6 +111,7 @@
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.util.SetOnce;
+import org.elasticsearch.Version;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
@@ -455,13 +457,15 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
                 | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                 | WordDelimiterFilter.SPLIT_ON_NUMERICS
                 | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
-        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, false, input ->
-            new WordDelimiterGraphFilter(input,
+        filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, false, (input, version) -> {
+            boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0);
+            return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                 WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                 | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                 | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                 | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
-                | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
+                | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
+        }));
         return filters;
     }
@@ -475,8 +479,12 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new));
         tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new));
         tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new));
-        tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
-            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE)));
+        tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edge_ngram", (version) -> {
+            if (version.onOrAfter(Version.V_7_3_0)) {
+                return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+            }
+            return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+        }));
         tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1)));
         tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new));
         // TODO deprecate and remove in API
@@ -485,8 +493,12 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         // Temporary shim for aliases. TODO deprecate after they are moved
         tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
-        tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
-            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE)));
+        tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
+            if (version.onOrAfter(Version.V_7_3_0)) {
+                return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+            }
+            return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+        }));
         tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));
 
         return tokenizers;
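Context for the version checks above: before 7.3 the pre-configured edge_ngram tokenizer was built with EdgeNGramTokenizer's own defaults (min and max gram size of 1, so unigrams only), while indices created on or after 7.3 get NGramTokenizer's defaults (min 1, max 2), matching the configurable edge_ngram factory. A minimal standalone sketch of the difference, not part of the patch; the class name EdgeNGramDefaultsDemo is made up, everything else is plain Lucene:

import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

// Hypothetical demo class: prints the grams each set of defaults produces for "test".
public class EdgeNGramDefaultsDemo {

    static void printGrams(int minGram, int maxGram) throws Exception {
        try (EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(minGram, maxGram)) {
            tokenizer.setReader(new StringReader("test"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term);
            }
            tokenizer.end();
        }
    }

    public static void main(String[] args) throws Exception {
        // Pre-7.3 defaults (EdgeNGramTokenizer: min 1, max 1) -> prints "t"
        printGrams(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
        // 7.3+ defaults (NGramTokenizer: min 1, max 2) -> prints "t", "te"
        printGrams(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    }
}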
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
new file mode 100644
index 0000000000000..0172f7cbc2657
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;
+
+import java.io.IOException;
+import java.util.Collections;
+
+public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
+
+    private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+        return new AnalysisModule(TestEnvironment.newEnvironment(settings),
+            Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
+    }
+
+    public void testPreConfiguredTokenizer() throws IOException {
+
+        // Before 7.3 we return ngrams of length 1 only
+        {
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edge_ngram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t"});
+            }
+        }
+
+        // Check deprecated name as well
+        {
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edgeNGram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t"});
+            }
+        }
+
+        // Afterwards, we return ngrams of length 1 and 2, to match the default factory settings
+        {
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edge_ngram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
+            }
+        }
+
+        // Check deprecated name as well
+        {
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
+            }
+        }
+    }
+}
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
index d799674f231a1..c8e3699ea840d 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
@@ -20,14 +20,24 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.Collections;
 
 public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
@@ -107,4 +117,51 @@ public void testAdjustingOffsets() throws IOException {
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
+
+    public void testPreconfiguredFilter() throws IOException {
+        // Before 7.3 we don't adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED,
+                    VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0)))
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 0 }, new int[]{ 4, 4 });
+            }
+        }
+
+        // After 7.3 we do adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 1 }, new int[]{ 1, 4 });
+            }
+        }
+    }
 }
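For reference, the offset behaviour this test pins down can be reproduced with plain Lucene classes. A sketch only, not part of the patch; WordDelimiterOffsetsDemo is a made-up name, and the constructor call mirrors the one added to CommonAnalysisPlugin above:

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.StringReader;

// Hypothetical demo: shows how the adjustOffsets flag changes the offsets
// that word_delimiter_graph reports for the split parts of "h100".
public class WordDelimiterOffsetsDemo {

    static void show(boolean adjustOffsets) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("h100"));
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
            | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
        try (WordDelimiterGraphFilter filter = new WordDelimiterGraphFilter(
                tokenizer, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null)) {
            CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = filter.addAttribute(OffsetAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            filter.end();
        }
    }

    public static void main(String[] args) throws Exception {
        show(false); // pre-7.3 behaviour: h [0,4], 100 [0,4]
        show(true);  // 7.3+ behaviour:   h [0,1], 100 [1,4]
    }
}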
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
index b198a66d24a49..496b1eb3bfaea 100644
--- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
+++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
@@ -111,6 +111,7 @@ private static Settings getSettingsFromIndexSettings(IndexSettings indexSettings
     private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod, String componentType,
                                       Function<String, AnalysisProvider<T>> globalComponentProvider,
+                                      Function<String, AnalysisProvider<T>> prebuiltComponentProvider,
                                       BiFunction<String, IndexSettings, AnalysisProvider<T>> indexComponentProvider) throws IOException {
         if (nod.definition != null) {
             // custom component, so we build it from scratch
@@ -128,10 +129,14 @@ private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
             return factory.get(settings, environment, "__anonymous__" + type, nod.definition);
         }
         if (settings == null) {
-            // no index provided, so we use global analysis components only
-            AnalysisProvider<T> factory = globalComponentProvider.apply(nod.name);
+            // no index provided, so we use prebuilt analysis components
+            AnalysisProvider<T> factory = prebuiltComponentProvider.apply(nod.name);
             if (factory == null) {
-                throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                // if there's no prebuilt component, try loading a global one to build with no settings
+                factory = globalComponentProvider.apply(nod.name);
+                if (factory == null) {
+                    throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                }
             }
             return factory.get(environment, nod.name);
         } else {
@@ -217,25 +222,26 @@ public IndexAnalyzers build(IndexSettings indexSettings) throws IOException {
     public NamedAnalyzer buildCustomAnalyzer(IndexSettings indexSettings, boolean normalizer, NameOrDefinition tokenizer,
                                              List<NameOrDefinition> charFilters, List<NameOrDefinition> tokenFilters) throws IOException {
         TokenizerFactory tokenizerFactory
-            = getComponentFactory(indexSettings, tokenizer, "tokenizer", this::getTokenizerProvider, this::getTokenizerProvider);
+            = getComponentFactory(indexSettings, tokenizer, "tokenizer",
+                this::getTokenizerProvider, prebuiltAnalysis::getTokenizerFactory, this::getTokenizerProvider);
 
         List<CharFilterFactory> charFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : charFilters) {
             charFilterFactories.add(getComponentFactory(indexSettings, nod, "char_filter",
-                this::getCharFilterProvider, this::getCharFilterProvider));
+                this::getCharFilterProvider, prebuiltAnalysis::getCharFilterFactory, this::getCharFilterProvider));
         }
 
         List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : tokenFilters) {
             TokenFilterFactory tff = getComponentFactory(indexSettings, nod, "filter",
-                this::getTokenFilterProvider, this::getTokenFilterProvider);
+                this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
             if (normalizer && tff instanceof NormalizingTokenFilterFactory == false) {
                 throw new IllegalArgumentException("Custom normalizer may not use filter [" + tff.name() + "]");
             }
             tff = tff.getChainAwareTokenFilterFactory(tokenizerFactory, charFilterFactories, tokenFilterFactories, name -> {
                 try {
                     return getComponentFactory(indexSettings, new NameOrDefinition(name), "filter",
-                        this::getTokenFilterProvider, this::getTokenFilterProvider);
+                        this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
                 } catch (IOException e) {
                     throw new UncheckedIOException(e);
                 }
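The behavioural core of the AnalysisRegistry change is the lookup order when getComponentFactory is called without index settings. A condensed, self-contained sketch, with generic Function stand-ins in place of the real AnalysisProvider types and a made-up class name:

import java.util.function.Function;

// Sketch of the new no-index lookup order: version-aware prebuilt components
// are preferred, global components are the fallback, built with no settings.
final class LookupOrderSketch {

    static <T> T resolve(String name, String componentType,
                         Function<String, T> prebuiltComponentProvider,
                         Function<String, T> globalComponentProvider) {
        T factory = prebuiltComponentProvider.apply(name);   // 1. prebuilt analysis component
        if (factory == null) {
            factory = globalComponentProvider.apply(name);   // 2. global component with no settings
        }
        if (factory == null) {                               // 3. neither exists: same error as before
            throw new IllegalArgumentException("failed to find global " + componentType + " under [" + name + "]");
        }
        return factory;
    }
}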
diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
index 1ffd7410fa66a..72830f79889c0 100644
--- a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
+++ b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
@@ -141,7 +141,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
 
             @Override
             public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
-                return singletonList(PreConfiguredCharFilter.singleton("append_foo", false, reader -> new AppendCharFilter(reader, "foo")));
+                return singletonList(PreConfiguredCharFilter.singleton("append", false, reader -> new AppendCharFilter(reader, "foo")));
             }
         };
         registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
@@ -170,24 +170,11 @@ public void testNoIndexAnalyzers() throws IOException {
         List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();
         assertEquals(4, tokens.size());
 
-        // Refer to a token filter by its type so we get its default configuration
-        request = new AnalyzeAction.Request();
-        request.text("the qu1ck brown fox");
-        request.tokenizer("standard");
-        request.addTokenFilter("mock");
-        analyze
-            = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
-        tokens = analyze.getTokens();
-        assertEquals(3, tokens.size());
-        assertEquals("qu1ck", tokens.get(0).getTerm());
-        assertEquals("brown", tokens.get(1).getTerm());
-        assertEquals("fox", tokens.get(2).getTerm());
-
         // We can refer to a pre-configured char filter by its name to get it
         request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
         request.tokenizer("standard");
-        request.addCharFilter("append_foo");
+        request.addCharFilter("append"); // <-- no config, so use preconfigured filter
         analyze = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
         tokens = analyze.getTokens();
@@ -197,35 +184,46 @@ public void testNoIndexAnalyzers() throws IOException {
         assertEquals("brown", tokens.get(2).getTerm());
         assertEquals("foxfoo", tokens.get(3).getTerm());
 
-        // We can refer to a token filter by its type to get its default configuration
+        // If the preconfigured filter doesn't exist, we use a global filter with no settings
         request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
         request.tokenizer("standard");
-        request.addCharFilter("append");
+        request.addTokenFilter("mock"); // <-- not preconfigured, but a global one available
+        analyze
+            = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
+        tokens = analyze.getTokens();
+        assertEquals(3, tokens.size());
+        assertEquals("qu1ck", tokens.get(0).getTerm());
+        assertEquals("brown", tokens.get(1).getTerm());
+        assertEquals("fox", tokens.get(2).getTerm());
+
+        // We can build a new char filter to get default values
+        request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
+        request.tokenizer("standard");
+        request.addTokenFilter(Map.of("type", "mock", "stopword", "brown"));
+        request.addCharFilter(Map.of("type", "append")); // <-- basic config, uses defaults
         analyze = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
         tokens = analyze.getTokens();
-        assertEquals(4, tokens.size());
+        assertEquals(3, tokens.size());
         assertEquals("the", tokens.get(0).getTerm());
         assertEquals("qu1ck", tokens.get(1).getTerm());
-        assertEquals("brown", tokens.get(2).getTerm());
-        assertEquals("foxbar", tokens.get(3).getTerm());
+        assertEquals("foxbar", tokens.get(2).getTerm());
 
         // We can pass a new configuration
         request = new AnalyzeAction.Request();
         request.text("the qu1ck brown fox");
         request.tokenizer("standard");
         request.addTokenFilter(Map.of("type", "mock", "stopword", "brown"));
-        request.addCharFilter("append");
-        request.text("the qu1ck brown fox");
+        request.addCharFilter(Map.of("type", "append", "suffix", "baz"));
         analyze = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
         tokens = analyze.getTokens();
         assertEquals(3, tokens.size());
         assertEquals("the", tokens.get(0).getTerm());
         assertEquals("qu1ck", tokens.get(1).getTerm());
-        assertEquals("foxbar", tokens.get(2).getTerm());
+        assertEquals("foxbaz", tokens.get(2).getTerm());
     }
 
     public void testFillsAttributes() throws IOException {
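Taken together, the rewritten test walks the three resolution paths that the AnalysisRegistry change introduces. A condensed fragment, assuming the surrounding test's fixtures and fields (the "append" AppendCharFilter, the "mock" token filter, registry, maxTokenCount) and that analyze is an AnalyzeAction.Response as elsewhere in this file:

// Sketch of the three resolution paths exercised above, inside testNoIndexAnalyzers().
AnalyzeAction.Request request = new AnalyzeAction.Request();
request.text("the qu1ck brown fox");
request.tokenizer("standard");
request.addCharFilter("append");                  // bare name -> version-aware preconfigured component
request.addTokenFilter("mock");                   // bare name, no preconfigured entry -> global factory, no settings
request.addCharFilter(Map.of("type", "append"));  // inline definition -> built from scratch with its defaults
AnalyzeAction.Response analyze = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);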