Skip to content

Commit ab50d5a

Browse files
committed
Fix PreConfiguredTokenFilters getSynonymFilter() implementations (#38858)
When we added support for TokenFilterFactories to specialise how they were used when parsing synonym files, PreConfiguredTokenFilters were set up to either apply themselves, or be ignored. This behaviour is a leftover from an earlier iteration, and also has an incorrect default. This commit makes preconfigured token filters usable in synonym file parsing by default, and brings those filters that should not be used into line with index-specific filter factories; in indexes created before version 7 we emit a deprecation warning, and we throw an error in indexes created after. Backport of #38839 Fixes #38793
1 parent 352aa7c commit ab50d5a

File tree

3 files changed

+84
-31
lines changed

3 files changed

+84
-31
lines changed

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
395395
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
396396
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
397397
filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
398-
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
398+
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, false,
399399
input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
400400
filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
401401
filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
@@ -408,9 +408,9 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
408408
DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
409409
DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
410410
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
411-
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
411+
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input ->
412412
new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
413-
filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, (reader, version) -> {
413+
filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, false, (reader, version) -> {
414414
if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
415415
DEPRECATION_LOGGER.deprecatedAndMaybeLog("edgeNGram_deprecation",
416416
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
@@ -433,8 +433,8 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
433433
new LimitTokenCountFilter(input,
434434
LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
435435
LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
436-
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
437-
filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, (reader, version) -> {
436+
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, NGramTokenFilter::new));
437+
filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, false, (reader, version) -> {
438438
if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
439439
DEPRECATION_LOGGER.deprecatedAndMaybeLog("nGram_deprecation",
440440
"The [nGram] token filter name is deprecated and will be removed in a future version. "
@@ -448,7 +448,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
448448
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
449449
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
450450
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
451-
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
451+
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, false, input -> {
452452
TokenStream ts = new ShingleFilter(input);
453453
/**
454454
* We disable the graph analysis on this token stream
@@ -469,14 +469,14 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
469469
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
470470
filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
471471
filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
472-
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
472+
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, false, input ->
473473
new WordDelimiterFilter(input,
474474
WordDelimiterFilter.GENERATE_WORD_PARTS
475475
| WordDelimiterFilter.GENERATE_NUMBER_PARTS
476476
| WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
477477
| WordDelimiterFilter.SPLIT_ON_NUMERICS
478478
| WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
479-
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
479+
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, false, input ->
480480
new WordDelimiterGraphFilter(input,
481481
WordDelimiterGraphFilter.GENERATE_WORD_PARTS
482482
| WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,24 @@
3232
import org.elasticsearch.index.IndexSettings;
3333
import org.elasticsearch.index.analysis.IndexAnalyzers;
3434
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
35+
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
3536
import org.elasticsearch.index.analysis.TokenFilterFactory;
3637
import org.elasticsearch.index.analysis.TokenizerFactory;
3738
import org.elasticsearch.test.ESTestCase;
3839
import org.elasticsearch.test.IndexSettingsModule;
40+
import org.elasticsearch.test.VersionUtils;
3941
import org.hamcrest.MatcherAssert;
4042

4143
import java.io.IOException;
4244
import java.io.InputStream;
4345
import java.nio.file.Files;
4446
import java.nio.file.Path;
4547
import java.util.ArrayList;
48+
import java.util.Arrays;
4649
import java.util.Collections;
50+
import java.util.HashSet;
4751
import java.util.List;
52+
import java.util.Set;
4853

4954
import static org.hamcrest.Matchers.equalTo;
5055
import static org.hamcrest.Matchers.instanceOf;
@@ -164,23 +169,21 @@ public void testAsciiFoldingFilterForSynonyms() throws IOException {
164169
new int[]{ 1, 0 });
165170
}
166171

167-
public void testKeywordRepeatAndSynonyms() throws IOException {
172+
public void testPreconfigured() throws IOException {
168173
Settings settings = Settings.builder()
169174
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
170175
.put("path.home", createTempDir().toString())
171176
.put("index.analysis.filter.synonyms.type", "synonym")
172-
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
173-
.put("index.analysis.filter.my_english.type", "stemmer")
174-
.put("index.analysis.filter.my_english.language", "porter2")
175-
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
176-
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms")
177+
.putList("index.analysis.filter.synonyms.synonyms", "würst, sausage")
178+
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
179+
.putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "asciifolding", "synonyms")
177180
.build();
178181
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
179182
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
180183

181-
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
182-
new String[]{ "programmers", "programm", "develop" },
183-
new int[]{ 1, 0, 0 });
184+
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("my_analyzer"), "würst",
185+
new String[]{ "wurst", "sausage"},
186+
new int[]{ 1, 0 });
184187
}
185188

186189
public void testChainedSynonymFilters() throws IOException {
@@ -247,6 +250,36 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
247250

248251
}
249252

253+
public void testPreconfiguredTokenFilters() throws IOException {
254+
Set<String> disallowedFilters = new HashSet<>(Arrays.asList(
255+
"common_grams", "edge_ngram", "edgeNGram", "keyword_repeat", "ngram", "nGram",
256+
"shingle", "word_delimiter", "word_delimiter_graph"
257+
));
258+
259+
Settings settings = Settings.builder()
260+
.put(IndexMetaData.SETTING_VERSION_CREATED,
261+
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, Version.CURRENT))
262+
.put("path.home", createTempDir().toString())
263+
.putList("common_words", "a", "b")
264+
.put("output_unigrams", "true")
265+
.build();
266+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
267+
268+
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
269+
270+
List<String> expectedWarnings = new ArrayList<>();
271+
for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
272+
if (disallowedFilters.contains(tf.getName())) {
273+
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
274+
expectedWarnings.add("Token filter [" + tf.getName() + "] will not be usable to parse synonyms after v7.0");
275+
}
276+
else {
277+
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
278+
}
279+
}
280+
assertWarnings(expectedWarnings.toArray(new String[0]));
281+
}
282+
250283
public void testDisallowedTokenFilters() throws IOException {
251284

252285
Settings settings = Settings.builder()

server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.TokenFilter;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.elasticsearch.Version;
26+
import org.elasticsearch.common.logging.DeprecationLogger;
2527
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
2628
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
2729

@@ -32,40 +34,54 @@
3234
* Provides pre-configured, shared {@link TokenFilter}s.
3335
*/
3436
public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {
37+
38+
private static final DeprecationLogger DEPRECATION_LOGGER
39+
= new DeprecationLogger(LogManager.getLogger(PreConfiguredTokenFilter.class));
40+
3541
/**
3642
* Create a pre-configured token filter that may not vary at all.
3743
*/
3844
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
3945
Function<TokenStream, TokenStream> create) {
40-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
46+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
4147
(tokenStream, version) -> create.apply(tokenStream));
4248
}
4349

4450
/**
4551
* Create a pre-configured token filter that may not vary at all.
4652
*/
4753
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
48-
boolean useFilterForParsingSynonyms,
54+
boolean allowForSynonymParsing,
4955
Function<TokenStream, TokenStream> create) {
50-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
56+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, allowForSynonymParsing, CachingStrategy.ONE,
5157
(tokenStream, version) -> create.apply(tokenStream));
5258
}
5359

5460
/**
55-
* Create a pre-configured token filter that may not vary at all.
61+
* Create a pre-configured token filter that may vary based on the Elasticsearch version.
5662
*/
5763
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
5864
BiFunction<TokenStream, Version, TokenStream> create) {
59-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
65+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
6066
(tokenStream, version) -> create.apply(tokenStream, version));
6167
}
6268

69+
/**
70+
* Create a pre-configured token filter that may vary based on the Elasticsearch version.
71+
*/
72+
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
73+
boolean useFilterForParsingSynonyms,
74+
BiFunction<TokenStream, Version, TokenStream> create) {
75+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
76+
(tokenStream, version) -> create.apply(tokenStream, version));
77+
}
78+
6379
/**
6480
* Create a pre-configured token filter that may vary based on the Lucene version.
6581
*/
6682
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
6783
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
68-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE,
84+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.LUCENE,
6985
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
7086
}
7187

@@ -74,18 +90,18 @@ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFil
7490
*/
7591
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
7692
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
77-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create);
93+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ELASTICSEARCH, create);
7894
}
7995

8096
private final boolean useFilterForMultitermQueries;
81-
private final boolean useFilterForParsingSynonyms;
97+
private final boolean allowForSynonymParsing;
8298
private final BiFunction<TokenStream, Version, TokenStream> create;
8399

84-
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms,
100+
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean allowForSynonymParsing,
85101
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
86102
super(name, cache);
87103
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
88-
this.useFilterForParsingSynonyms = useFilterForParsingSynonyms;
104+
this.allowForSynonymParsing = allowForSynonymParsing;
89105
this.create = create;
90106
}
91107

@@ -118,10 +134,12 @@ public Object getMultiTermComponent() {
118134
}
119135

120136
public TokenFilterFactory getSynonymFilter() {
121-
if (useFilterForParsingSynonyms) {
137+
if (allowForSynonymParsing) {
122138
return this;
123139
}
124-
return IDENTITY_FILTER;
140+
DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
141+
+ "] will not be usable to parse synonyms after v7.0");
142+
return this;
125143
}
126144
};
127145
}
@@ -138,10 +156,12 @@ public TokenStream create(TokenStream tokenStream) {
138156

139157
@Override
140158
public TokenFilterFactory getSynonymFilter() {
141-
if (useFilterForParsingSynonyms) {
159+
if (allowForSynonymParsing) {
142160
return this;
143161
}
144-
return IDENTITY_FILTER;
162+
DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
163+
+ "] will not be usable to parse synonyms after v7.0");
164+
return this;
145165
}
146166
};
147167
}

0 commit comments

Comments
 (0)