Commit 8ff5519
Use preconfigured filters correctly in Analyze API (#43568)

When a named token filter or char filter is passed as part of an Analyze API request with no index, we currently try to build the relevant filter using no index settings. However, this can miss cases where there is a pre-configured filter defined in the analysis registry. One example is the elision filter, which has a pre-configured version built with the French elision set; when used as part of normal analysis this pre-configured set is applied, but when used as part of the Analyze API we end up with NPEs because we try to instantiate the filter with no index settings. This commit changes the Analyze API to check for pre-configured filters when the request has no index defined and refers to a filter by name rather than by a custom definition. It also changes the pre-configured `word_delimiter_graph` filter and `edge_ngram` tokenizer to make their settings consistent with the defaults used when creating them with no settings.

Closes #43002
Closes #43621
Closes #43582
1 parent 05a7333 commit 8ff5519
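As a minimal sketch of the failure mode fixed here: the request below references the `elision` filter by name with no index, which previously made the registry build the filter with no index settings. This assumes the 7.x high-level REST client against a node on localhost:9200; the class name, host, and sample text are illustrative, not part of this commit.

import org.apache.http.HttpHost;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.AnalyzeRequest;
import org.elasticsearch.client.indices.AnalyzeResponse;

public class ElisionAnalyzeRepro {
    public static void main(String[] args) throws Exception {
        try (RestHighLevelClient client = new RestHighLevelClient(
                RestClient.builder(new HttpHost("localhost", 9200, "http")))) {
            // No index on the request, and "elision" is referenced by name only.
            // Before this commit the registry instantiated the filter with no index
            // settings, missing the pre-configured French elision set and hitting an NPE.
            AnalyzeRequest request = AnalyzeRequest.buildCustomAnalyzer("standard")
                .addTokenFilter("elision")
                .build("l'avion");
            AnalyzeResponse response = client.indices().analyze(request, RequestOptions.DEFAULT);
            // With the fix, the pre-configured filter is found first: "l'avion" -> "avion"
            response.getTokens().forEach(token -> System.out.println(token.getTerm()));
        }
    }
}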

File tree

5 files changed: +216 −40 lines

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java (+19 −7)

@@ -83,6 +83,7 @@
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
@@ -110,6 +111,7 @@
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.util.SetOnce;
+import org.elasticsearch.Version;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
@@ -488,13 +490,15 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
                 | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                 | WordDelimiterFilter.SPLIT_ON_NUMERICS
                 | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
-        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
-            new WordDelimiterGraphFilter(input,
+        filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, (input, version) -> {
+            boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0);
+            return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                 WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                 | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                 | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                 | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
-                | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
+                | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
+        }));
         return filters;
     }

@@ -508,8 +512,12 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new));
         tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new));
         tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new));
-        tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
-            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE)));
+        tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edge_ngram", (version) -> {
+            if (version.onOrAfter(Version.V_7_3_0)) {
+                return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+            }
+            return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+        }));
         tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1)));
         tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new));
         // TODO deprecate and remove in API
@@ -518,8 +526,12 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {

         // Temporary shim for aliases. TODO deprecate after they are moved
         tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
-        tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
-            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE)));
+        tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
+            if (version.onOrAfter(Version.V_7_3_0)) {
+                return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+            }
+            return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+        }));
         tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));

         return tokenizers;
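The `edge_ngram`/`edgeNGram` change above hinges on the differing default gram sizes of the two Lucene tokenizer classes. A small sketch printing those defaults (constant values as defined by Lucene, consistent with the test expectations below; worth verifying against the Lucene version in use):

import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;

public class EdgeNGramDefaults {
    public static void main(String[] args) {
        // Pre-7.3 pre-configured "edge_ngram" uses EdgeNGramTokenizer defaults,
        // min = max = 1, so "test" yields only ["t"].
        System.out.println(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE + ".." + EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
        // 7.3+ switches to NGramTokenizer defaults, min = 1 and max = 2, matching the
        // unconfigured factory and yielding ["t", "te"] for "test".
        System.out.println(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE + ".." + NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    }
}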
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java (new file, +98)

@@ -0,0 +1,98 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;
+
+import java.io.IOException;
+import java.util.Collections;
+
+public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
+
+    private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+        return new AnalysisModule(TestEnvironment.newEnvironment(settings),
+            Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
+    }
+
+    public void testPreConfiguredTokenizer() throws IOException {
+
+        // Before 7.3 we return ngrams of length 1 only
+        {
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edge_ngram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t"});
+            }
+        }
+
+        // Check deprecated name as well
+        {
+            Version version = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0,
+                VersionUtils.getPreviousVersion(Version.V_7_3_0));
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(version, "edgeNGram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t"});
+            }
+        }
+
+        // Afterwards, we return ngrams of length 1 and 2, to match the default factory settings
+        {
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edge_ngram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
+            }
+        }
+
+        // Check deprecated name as well
+        {
+            try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
+            }
+        }
+    }
+}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java (+57)

@@ -20,14 +20,24 @@

 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;

 import java.io.IOException;
 import java.io.StringReader;
+import java.util.Collections;

 public class WordDelimiterGraphTokenFilterFactoryTests
     extends BaseWordDelimiterTokenFilterFactoryTestCase {
@@ -107,4 +117,51 @@ public void testAdjustingOffsets() throws IOException {
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
+
+    public void testPreconfiguredFilter() throws IOException {
+        // Before 7.3 we don't adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED,
+                    VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, VersionUtils.getPreviousVersion(Version.V_7_3_0)))
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 0 }, new int[]{ 4, 4 });
+
+            }
+        }
+
+        // After 7.3 we do adjust offsets
+        {
+            Settings settings = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            Settings indexSettings = Settings.builder()
+                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putList("index.analysis.analyzer.my_analyzer.filter", "word_delimiter_graph")
+                .build();
+            IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+            try (IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
+                Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings)) {
+
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertAnalyzesTo(analyzer, "h100", new String[]{"h", "100"}, new int[]{ 0, 1 }, new int[]{ 1, 4 });
+
+            }
+        }
+    }
 }

server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java (+13 −7)

@@ -111,6 +111,7 @@ private static Settings getSettingsFromIndexSettings(IndexSettings indexSettings
     private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
                                       String componentType,
                                       Function<String, AnalysisProvider<T>> globalComponentProvider,
+                                      Function<String, AnalysisProvider<T>> prebuiltComponentProvider,
                                       BiFunction<String, IndexSettings, AnalysisProvider<T>> indexComponentProvider) throws IOException {
         if (nod.definition != null) {
             // custom component, so we build it from scratch
@@ -128,10 +129,14 @@ private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
             return factory.get(settings, environment, "__anonymous__" + type, nod.definition);
         }
         if (settings == null) {
-            // no index provided, so we use global analysis components only
-            AnalysisProvider<T> factory = globalComponentProvider.apply(nod.name);
+            // no index provided, so we use prebuilt analysis components
+            AnalysisProvider<T> factory = prebuiltComponentProvider.apply(nod.name);
             if (factory == null) {
-                throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                // if there's no prebuilt component, try loading a global one to build with no settings
+                factory = globalComponentProvider.apply(nod.name);
+                if (factory == null) {
+                    throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
+                }
             }
             return factory.get(environment, nod.name);
         } else {
@@ -219,25 +224,26 @@ public IndexAnalyzers build(IndexSettings indexSettings) throws IOException {
     public NamedAnalyzer buildCustomAnalyzer(IndexSettings indexSettings, boolean normalizer, NameOrDefinition tokenizer,
                                              List<NameOrDefinition> charFilters, List<NameOrDefinition> tokenFilters) throws IOException {
         TokenizerFactory tokenizerFactory
-            = getComponentFactory(indexSettings, tokenizer, "tokenizer", this::getTokenizerProvider, this::getTokenizerProvider);
+            = getComponentFactory(indexSettings, tokenizer, "tokenizer",
+                this::getTokenizerProvider, prebuiltAnalysis::getTokenizerFactory, this::getTokenizerProvider);

         List<CharFilterFactory> charFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : charFilters) {
             charFilterFactories.add(getComponentFactory(indexSettings, nod, "char_filter",
-                this::getCharFilterProvider, this::getCharFilterProvider));
+                this::getCharFilterProvider, prebuiltAnalysis::getCharFilterFactory, this::getCharFilterProvider));
         }

         List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>();
         for (NameOrDefinition nod : tokenFilters) {
             TokenFilterFactory tff = getComponentFactory(indexSettings, nod, "filter",
-                this::getTokenFilterProvider, this::getTokenFilterProvider);
+                this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
             if (normalizer && tff instanceof NormalizingTokenFilterFactory == false) {
                 throw new IllegalArgumentException("Custom normalizer may not use filter [" + tff.name() + "]");
             }
             tff = tff.getChainAwareTokenFilterFactory(tokenizerFactory, charFilterFactories, tokenFilterFactories, name -> {
                 try {
                     return getComponentFactory(indexSettings, new NameOrDefinition(name), "filter",
-                        this::getTokenFilterProvider, this::getTokenFilterProvider);
+                        this::getTokenFilterProvider, prebuiltAnalysis::getTokenFilterFactory, this::getTokenFilterProvider);
                 } catch (IOException e) {
                     throw new UncheckedIOException(e);
                 }
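The core of the registry change is a two-step lookup: prefer a pre-built component, then fall back to a global provider built with no settings. A self-contained sketch of that pattern follows; the generic types and toy providers are illustrative stand-ins for `prebuiltAnalysis` and the global registry, not the commit's actual code.

import java.util.function.Function;

public class ComponentLookup {
    /**
     * Mirrors the fallback added to getComponentFactory: with no index settings,
     * consult the prebuilt provider first, then the global provider.
     */
    static <T> T resolve(String name,
                         Function<String, T> prebuiltProvider,
                         Function<String, T> globalProvider,
                         String componentType) {
        T factory = prebuiltProvider.apply(name);
        if (factory == null) {
            factory = globalProvider.apply(name);
            if (factory == null) {
                throw new IllegalArgumentException(
                    "failed to find global " + componentType + " under [" + name + "]");
            }
        }
        return factory;
    }

    public static void main(String[] args) {
        // Toy providers: "elision" exists only as a prebuilt component,
        // "lowercase" only as a global one.
        Function<String, String> prebuilt = n -> n.equals("elision") ? "prebuilt elision filter" : null;
        Function<String, String> global = n -> n.equals("lowercase") ? "global lowercase filter" : null;
        System.out.println(resolve("elision", prebuilt, global, "filter"));   // prebuilt wins
        System.out.println(resolve("lowercase", prebuilt, global, "filter")); // falls back to global
    }
}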
