Skip to content

Commit 51b230f

Browse files
authored
Fix PreConfiguredTokenFilters getSynonymFilter() implementations (#38839) (#43678)
When we added support for TokenFilterFactories to specialise how they are used when parsing synonym files, PreConfiguredTokenFilters were set up to either apply themselves or be ignored. This behaviour is a leftover from an earlier iteration, and it also has an incorrect default. This commit makes preconfigured token filters usable in synonym file parsing by default, and brings those filters that should not be used into line with index-specific filter factories: in indices created before version 7 we emit a deprecation warning, and in indices created on or after version 7 we throw an error. Fixes #38793
1 parent 94f18da commit 51b230f

File tree

3 files changed

+115
-31
lines changed

3 files changed

+115
-31
lines changed

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

+8-8
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
401401
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
402402
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
403403
filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
404-
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
404+
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, false,
405405
input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
406406
filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
407407
filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
@@ -422,9 +422,9 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
422422
DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
423423
DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
424424
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
425-
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
425+
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input ->
426426
new EdgeNGramTokenFilter(input, 1)));
427-
filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, (reader, version) -> {
427+
filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, false, (reader, version) -> {
428428
if (version.onOrAfter(org.elasticsearch.Version.V_7_0_0)) {
429429
throw new IllegalArgumentException(
430430
"The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
@@ -451,8 +451,8 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
451451
new LimitTokenCountFilter(input,
452452
LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
453453
LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
454-
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
455-
filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, (reader, version) -> {
454+
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
455+
filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, false, (reader, version) -> {
456456
if (version.onOrAfter(org.elasticsearch.Version.V_7_0_0)) {
457457
throw new IllegalArgumentException("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
458458
+ "Please change the filter name to [ngram] instead.");
@@ -469,7 +469,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
469469
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
470470
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
471471
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
472-
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
472+
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, false, input -> {
473473
TokenStream ts = new ShingleFilter(input);
474474
/**
475475
* We disable the graph analysis on this token stream
@@ -491,14 +491,14 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
491491
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
492492
filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
493493
filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
494-
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
494+
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, false, input ->
495495
new WordDelimiterFilter(input,
496496
WordDelimiterFilter.GENERATE_WORD_PARTS
497497
| WordDelimiterFilter.GENERATE_NUMBER_PARTS
498498
| WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
499499
| WordDelimiterFilter.SPLIT_ON_NUMERICS
500500
| WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
501-
filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, (input, version) -> {
501+
filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, false, (input, version) -> {
502502
boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0);
503503
return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
504504
WordDelimiterGraphFilter.GENERATE_WORD_PARTS

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java

+63-9
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.env.Environment;
3131
import org.elasticsearch.index.IndexSettings;
3232
import org.elasticsearch.index.analysis.IndexAnalyzers;
33+
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
3334
import org.elasticsearch.index.analysis.TokenFilterFactory;
3435
import org.elasticsearch.index.analysis.TokenizerFactory;
3536
import org.elasticsearch.test.ESTestCase;
@@ -42,8 +43,11 @@
4243
import java.nio.file.Files;
4344
import java.nio.file.Path;
4445
import java.util.ArrayList;
46+
import java.util.Arrays;
4547
import java.util.Collections;
48+
import java.util.HashSet;
4649
import java.util.List;
50+
import java.util.Set;
4751

4852
import static org.hamcrest.Matchers.equalTo;
4953
import static org.hamcrest.Matchers.instanceOf;
@@ -163,23 +167,21 @@ public void testAsciiFoldingFilterForSynonyms() throws IOException {
163167
new int[]{ 1, 0 });
164168
}
165169

166-
public void testKeywordRepeatAndSynonyms() throws IOException {
170+
public void testPreconfigured() throws IOException {
167171
Settings settings = Settings.builder()
168172
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
169173
.put("path.home", createTempDir().toString())
170174
.put("index.analysis.filter.synonyms.type", "synonym")
171-
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
172-
.put("index.analysis.filter.my_english.type", "stemmer")
173-
.put("index.analysis.filter.my_english.language", "porter2")
174-
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
175-
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms")
175+
.putList("index.analysis.filter.synonyms.synonyms", "würst, sausage")
176+
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
177+
.putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "asciifolding", "synonyms")
176178
.build();
177179
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
178180
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
179181

180-
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
181-
new String[]{ "programmers", "programm", "develop" },
182-
new int[]{ 1, 0, 0 });
182+
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("my_analyzer"), "würst",
183+
new String[]{ "wurst", "sausage"},
184+
new int[]{ 1, 0 });
183185
}
184186

185187
public void testChainedSynonymFilters() throws IOException {
@@ -248,6 +250,58 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
248250

249251
}
250252

253+
public void testPreconfiguredTokenFilters() throws IOException {
254+
Set<String> disallowedFilters = new HashSet<>(Arrays.asList(
255+
"common_grams", "edge_ngram", "edgeNGram", "keyword_repeat", "ngram", "nGram",
256+
"shingle", "word_delimiter", "word_delimiter_graph"
257+
));
258+
259+
Settings settings = Settings.builder()
260+
.put(IndexMetaData.SETTING_VERSION_CREATED,
261+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
262+
.put("path.home", createTempDir().toString())
263+
.build();
264+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
265+
266+
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
267+
268+
for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
269+
if (disallowedFilters.contains(tf.getName())) {
270+
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
271+
"Expected exception for factory " + tf.getName(), () -> {
272+
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
273+
});
274+
assertEquals(tf.getName(), "Token filter [" + tf.getName()
275+
+ "] cannot be used to parse synonyms",
276+
e.getMessage());
277+
}
278+
else {
279+
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
280+
}
281+
}
282+
283+
Settings settings2 = Settings.builder()
284+
.put(IndexMetaData.SETTING_VERSION_CREATED,
285+
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
286+
.put("path.home", createTempDir().toString())
287+
.putList("common_words", "a", "b")
288+
.put("output_unigrams", "true")
289+
.build();
290+
IndexSettings idxSettings2 = IndexSettingsModule.newIndexSettings("index", settings2);
291+
292+
List<String> expectedWarnings = new ArrayList<>();
293+
for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
294+
if (disallowedFilters.contains(tf.getName())) {
295+
tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
296+
expectedWarnings.add("Token filter [" + tf.getName() + "] will not be usable to parse synonyms after v7.0");
297+
}
298+
else {
299+
tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
300+
}
301+
}
302+
assertWarnings(expectedWarnings.toArray(new String[0]));
303+
}
304+
251305
public void testDisallowedTokenFilters() throws IOException {
252306

253307
Settings settings = Settings.builder()

server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java

+44-14
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.TokenFilter;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.elasticsearch.Version;
26+
import org.elasticsearch.common.logging.DeprecationLogger;
2527
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
2628
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
2729

@@ -32,40 +34,54 @@
3234
* Provides pre-configured, shared {@link TokenFilter}s.
3335
*/
3436
public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {
37+
38+
private static final DeprecationLogger DEPRECATION_LOGGER
39+
= new DeprecationLogger(LogManager.getLogger(PreConfiguredTokenFilter.class));
40+
3541
/**
3642
* Create a pre-configured token filter that may not vary at all.
3743
*/
3844
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
3945
Function<TokenStream, TokenStream> create) {
40-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
46+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
4147
(tokenStream, version) -> create.apply(tokenStream));
4248
}
4349

4450
/**
4551
* Create a pre-configured token filter that may not vary at all.
4652
*/
4753
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
48-
boolean useFilterForParsingSynonyms,
54+
boolean allowForSynonymParsing,
4955
Function<TokenStream, TokenStream> create) {
50-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
56+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, allowForSynonymParsing, CachingStrategy.ONE,
5157
(tokenStream, version) -> create.apply(tokenStream));
5258
}
5359

5460
/**
55-
* Create a pre-configured token filter that may not vary at all.
61+
* Create a pre-configured token filter that may vary based on the Elasticsearch version.
5662
*/
5763
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
5864
BiFunction<TokenStream, Version, TokenStream> create) {
59-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
65+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
6066
(tokenStream, version) -> create.apply(tokenStream, version));
6167
}
6268

69+
/**
70+
* Create a pre-configured token filter that may vary based on the Elasticsearch version.
71+
*/
72+
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
73+
boolean useFilterForParsingSynonyms,
74+
BiFunction<TokenStream, Version, TokenStream> create) {
75+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
76+
(tokenStream, version) -> create.apply(tokenStream, version));
77+
}
78+
6379
/**
6480
* Create a pre-configured token filter that may vary based on the Lucene version.
6581
*/
6682
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
6783
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
68-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE,
84+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.LUCENE,
6985
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
7086
}
7187

@@ -74,18 +90,18 @@ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFil
7490
*/
7591
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
7692
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
77-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create);
93+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ELASTICSEARCH, create);
7894
}
7995

8096
private final boolean useFilterForMultitermQueries;
81-
private final boolean useFilterForParsingSynonyms;
97+
private final boolean allowForSynonymParsing;
8298
private final BiFunction<TokenStream, Version, TokenStream> create;
8399

84-
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms,
100+
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean allowForSynonymParsing,
85101
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
86102
super(name, cache);
87103
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
88-
this.useFilterForParsingSynonyms = useFilterForParsingSynonyms;
104+
this.allowForSynonymParsing = allowForSynonymParsing;
89105
this.create = create;
90106
}
91107

@@ -118,10 +134,17 @@ public TokenStream create(TokenStream tokenStream) {
118134

119135
@Override
120136
public TokenFilterFactory getSynonymFilter() {
121-
if (useFilterForParsingSynonyms) {
137+
if (allowForSynonymParsing) {
138+
return this;
139+
}
140+
if (version.onOrAfter(Version.V_7_0_0)) {
141+
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
142+
}
143+
else {
144+
DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
145+
+ "] will not be usable to parse synonyms after v7.0");
122146
return this;
123147
}
124-
return IDENTITY_FILTER;
125148
}
126149
};
127150
}
@@ -138,10 +161,17 @@ public TokenStream create(TokenStream tokenStream) {
138161

139162
@Override
140163
public TokenFilterFactory getSynonymFilter() {
141-
if (useFilterForParsingSynonyms) {
164+
if (allowForSynonymParsing) {
165+
return this;
166+
}
167+
if (version.onOrAfter(Version.V_7_0_0)) {
168+
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
169+
}
170+
else {
171+
DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
172+
+ "] will not be usable to parse synonyms after v7.0");
142173
return this;
143174
}
144-
return IDENTITY_FILTER;
145175
}
146176
};
147177
}

0 commit comments

Comments
 (0)