
Commit fbefb46

Use preconfigured filters correctly in Analyze API (#43568)
When a named token filter or char filter is passed as part of an Analyze API request with no index, we currently try to build the relevant filter using no index settings. However, this can miss cases where there is a pre-configured filter defined in the analysis registry. One example is the elision filter, which has a pre-configured version built with the French elision set; when used as part of normal analysis this pre-configured set is used, but when used as part of the Analyze API we end up with NPEs because it tries to instantiate the filter with no index settings.

This commit changes the Analyze API to check for pre-configured filters when the request has no index defined and uses a name rather than a custom definition for a filter. It also changes the pre-configured `word_delimiter_graph` filter and `edge_ngram` tokenizer to make their settings consistent with the defaults used when creating them with no settings.

Closes #43002
Closes #43621
Closes #43582
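For illustration, a request of this shape, with no index in the path and a filter referenced only by name, is the kind of call that hit the NPE: building `elision` from empty index settings fails, while a pre-configured version built with the French elision set sits unused in the registry (request body assumed for illustration, not copied from the linked issues):

GET /_analyze
{
  "tokenizer": "standard",
  "filter": ["elision"],
  "text": "l'avion"
}

After this change the registry's pre-configured filter is picked up first, so the request should return the token "avion" rather than failing.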
1 parent d2c696d commit fbefb46

File tree: 5 files changed, +208 −37


modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

+19 −7
@@ -83,6 +83,7 @@
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
@@ -110,6 +111,7 @@
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.util.SetOnce;
+import org.elasticsearch.Version;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
@@ -455,13 +457,15 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
                   | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                   | WordDelimiterFilter.SPLIT_ON_NUMERICS
                   | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
-        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, false, input ->
-            new WordDelimiterGraphFilter(input,
+        filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, false, (input, version) -> {
+            boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0);
+            return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                 WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                   | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                   | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                   | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
-                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
+                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
+        }));
         return filters;
     }
 
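The offset change can be seen through the Analyze API (request shape assumed; expected tokens and offsets taken from WordDelimiterGraphTokenFilterFactoryTests below):

GET /my-index/_analyze
{
  "tokenizer": "standard",
  "filter": ["word_delimiter_graph"],
  "text": "h100"
}

Against indices created before 7.3 both tokens report the offsets of the original token ("h" and "100", both at 0..4); from 7.3 onwards the offsets are adjusted ("h" at 0..1, "100" at 1..4), matching what the filter does when configured with no settings.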

@@ -475,8 +479,12 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new));
         tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new));
         tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new));
-        tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
-            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE)));
+        tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edge_ngram", (version) -> {
+            if (version.onOrAfter(Version.V_7_3_0)) {
+                return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+            }
+            return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+        }));
         tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1)));
         tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new));
         // TODO deprecate and remove in API
@@ -485,8 +493,12 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
 
         // Temporary shim for aliases. TODO deprecate after they are moved
         tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
-        tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
-            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE)));
+        tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
+            if (version.onOrAfter(Version.V_7_3_0)) {
+                return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+            }
+            return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+        }));
         tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));
 
         return tokenizers;
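The practical effect for the pre-configured tokenizer (request shape assumed; expected tokens taken from EdgeNGramTokenizerTests below):

GET /_analyze
{
  "tokenizer": "edge_ngram",
  "text": "test"
}

For indices created before 7.3 this produced only "t" (EdgeNGramTokenizer defaults: min and max gram 1); from 7.3 onwards, and presumably for indexless requests like the one above, it produces "t" and "te" (NGramTokenizer defaults: min 1, max 2), consistent with an `edge_ngram` tokenizer created with no settings.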
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java

+98
@@ -0,0 +1,98 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;
+
+import java.io.IOException;
+import java.util.Collections;
+
+public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
+
+    private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+        return new AnalysisModule(TestEnvironment.newEnvironment(settings),
+            Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
+    }
+
+    public void testPreConfiguredTokenizer() throws IOException {
+
+        // Before 7.3 we return ngrams of length 1 only
+        {
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edge_ngram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t"});
+            }
+        }
+
+        // Check deprecated name as well
+        {
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edgeNGram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t"});
+            }
+        }
+
+        // Afterwards, we return ngrams of length 1 and 2, to match the default factory settings
+        {
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edge_ngram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
+            }
+        }
+
+        // Check deprecated name as well
+        {
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
+            }
+        }
+
+    }
+
+}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java

+57
@@ -20,14 +20,24 @@
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.Collections;
 
 public class WordDelimiterGraphTokenFilterFactoryTests
     extends BaseWordDelimiterTokenFilterFactoryTestCase {
@@ -107,4 +117,51 @@ public void testAdjustingOffsets() throws IOException {
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
+
+    public void testPreconfiguredFilter() throws IOException {
+        // Before 7.3 we don't adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED,
+                    VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0)))
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 0 }, new int[]{ 4, 4 });
+
+            }
+        }
+
+        // After 7.3 we do adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 1 }, new int[]{ 1, 4 });
+
+            }
+        }
+    }
 }

server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

+13 −7
@@ -111,6 +111,7 @@ private static Settings getSettingsFromIndexSettings(IndexSettings indexSettings
     private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
                                       String componentType,
                                       Function<String, AnalysisProvider<T>> globalComponentProvider,
+                                      Function<String, AnalysisProvider<T>> prebuiltComponentProvider,
                                       BiFunction<String, IndexSettings, AnalysisProvider<T>> indexComponentProvider) throws IOException {
         if (nod.definition != null) {
             // custom component, so we build it from scratch
@@ -128,10 +129,14 @@ private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
             return factory.get(settings, environment, "__anonymous__" + type, nod.definition);
         }
         if (settings == null) {
-            // no index provided, so we use global analysis components only
-            AnalysisProvider<T> factory = globalComponentProvider.apply(nod.name);
+            // no index provided, so we use prebuilt analysis components
+            AnalysisProvider<T> factory = prebuiltComponentProvider.apply(nod.name);
             if (factory == null) {
-                throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                // if there's no prebuilt component, try loading a global one to build with no settings
+                factory = globalComponentProvider.apply(nod.name);
+                if (factory == null) {
+                    throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                }
             }
             return factory.get(environment, nod.name);
         } else {
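In short, for a request with no index a named component is now looked up among the pre-built (pre-configured) components first, falling back to the global provider built with no settings only when no pre-built entry exists; inline definitions are still built from scratch. A sketch of the two request styles, both hypothetical:

A named filter, resolved from the pre-built registry first:

GET /_analyze
{
  "tokenizer": "standard",
  "filter": ["stop"],
  "text": "a quick fox"
}

An inline definition, always built from its own settings:

GET /_analyze
{
  "tokenizer": "standard",
  "filter": [{"type": "stop", "stopwords": ["a"]}],
  "text": "a quick fox"
}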
@@ -217,25 +222,26 @@ public IndexAnalyzers build(IndexSettings indexSettings) throws IOException {
     public NamedAnalyzer buildCustomAnalyzer(IndexSettings indexSettings, boolean normalizer, NameOrDefinition tokenizer,
                                              List<NameOrDefinition> charFilters, List<NameOrDefinition> tokenFilters) throws IOException {
         TokenizerFactory tokenizerFactory
-            = getComponentFactory(indexSettings, tokenizer, "tokenizer", this::getTokenizerProvider, this::getTokenizerProvider);
+            = getComponentFactory(indexSettings, tokenizer, "tokenizer",
+                this::getTokenizerProvider, prebuiltAnalysis::getTokenizerFactory, this::getTokenizerProvider);
 
         List<CharFilterFactory> charFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : charFilters) {
             charFilterFactories.add(getComponentFactory(indexSettings, nod, "char_filter",
-                this::getCharFilterProvider, this::getCharFilterProvider));
+                this::getCharFilterProvider, prebuiltAnalysis::getCharFilterFactory, this::getCharFilterProvider));
         }
 
         List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : tokenFilters) {
             TokenFilterFactory tff = getComponentFactory(indexSettings, nod, "filter",
-                this::getTokenFilterProvider, this::getTokenFilterProvider);
+                this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
             if (normalizer && tff instanceof NormalizingTokenFilterFactory == false) {
                 throw new IllegalArgumentException("Custom normalizer may not use filter [" + tff.name() + "]");
             }
             tff = tff.getChainAwareTokenFilterFactory(tokenizerFactory, charFilterFactories, tokenFilterFactories, name -> {
                 try {
                     return getComponentFactory(indexSettings, new NameOrDefinition(name), "filter",
-                        this::getTokenFilterProvider, this::getTokenFilterProvider);
+                        this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
                 } catch (IOException e) {
                     throw new UncheckedIOException(e);
                 }
