Fix PreConfiguredTokenFilters getSynonymFilter() implementations #43678

Merged · 2 commits · Jun 28, 2019
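
This change makes pre-configured token filters report, rather than hide, their unsuitability for synonym rule parsing. Filters that stack or split tokens (common_grams, edge_ngram/edgeNGram, keyword_repeat, ngram/nGram, shingle, word_delimiter and word_delimiter_graph) used to have getSynonymFilter() silently substitute an identity filter; they now throw an IllegalArgumentException on indices created on or after 7.0.0 and emit a deprecation warning on older indices. Every other pre-configured filter is allowed for synonym parsing by default.

For orientation, a minimal sketch of the consuming side; the helper below is hypothetical and not part of this PR. When a synonym filter builds the analyzer that parses its rules, it asks each filter in the chain for its synonym-parsing variant via getSynonymFilter(), which is where the new check fires:

import java.util.ArrayList;
import java.util.List;

import org.elasticsearch.index.analysis.TokenFilterFactory;

// Illustrative helper (assumed, not from this PR): remap an analyzer's filter
// chain to the variants used while parsing synonym rules.
final class SynonymChainSketch {
    static List<TokenFilterFactory> synonymParsingChain(List<TokenFilterFactory> chain) {
        List<TokenFilterFactory> parsingChain = new ArrayList<>(chain.size());
        for (TokenFilterFactory factory : chain) {
            // After this change, a disallowed pre-configured filter throws here
            // on 7.0+ indices and logs a deprecation warning on older ones,
            // instead of being silently replaced by an identity filter.
            parsingChain.add(factory.getSynonymFilter());
        }
        return parsingChain;
    }
}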
@@ -401,7 +401,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
-filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
+filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, false,
input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
@@ -422,9 +422,9 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
-filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
+filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input ->
new EdgeNGramTokenFilter(input, 1)));
-filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, (reader, version) -> {
+filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, false, (reader, version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_7_0_0)) {
throw new IllegalArgumentException(
"The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
@@ -451,8 +451,8 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
new LimitTokenCountFilter(input,
LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
-filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
-filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, (reader, version) -> {
+filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
+filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, false, (reader, version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_7_0_0)) {
throw new IllegalArgumentException("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
+ "Please change the filter name to [ngram] instead.");
@@ -469,7 +469,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
-filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
+filters.add(PreConfiguredTokenFilter.singleton("shingle", false, false, input -> {
TokenStream ts = new ShingleFilter(input);
/**
* We disable the graph analysis on this token stream
@@ -491,14 +491,14 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
-filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
+filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, false, input ->
new WordDelimiterFilter(input,
WordDelimiterFilter.GENERATE_WORD_PARTS
| WordDelimiterFilter.GENERATE_NUMBER_PARTS
| WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
| WordDelimiterFilter.SPLIT_ON_NUMERICS
| WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
-filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, (input, version) -> {
+filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, false, (input, version) -> {
boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0);
return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
WordDelimiterGraphFilter.GENERATE_WORD_PARTS
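
The user-visible effect of the registrations above: on a 7.x index, an analyzer that feeds one of these filters into a synonym filter now fails while the analysis chain is built. A sketch in the style of this PR's tests; the analyzer name and the choice of shingle are illustrative:

Settings settings = Settings.builder()
    .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
    .put("path.home", createTempDir().toString())
    .put("index.analysis.filter.synonyms.type", "synonym")
    .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
    .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
    .putList("index.analysis.analyzer.my_analyzer.filter", "shingle", "synonyms")
    .build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
// Expected failure: Token filter [shingle] cannot be used to parse synonyms
expectThrows(IllegalArgumentException.class,
    () -> createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()));
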
@@ -30,6 +30,7 @@
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.test.ESTestCase;
@@ -42,8 +43,11 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
@@ -163,23 +167,21 @@ public void testAsciiFoldingFilterForSynonyms() throws IOException {
new int[]{ 1, 0 });
}

-public void testKeywordRepeatAndSynonyms() throws IOException {
+public void testPreconfigured() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonyms.type", "synonym")
-.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
-.put("index.analysis.filter.my_english.type", "stemmer")
-.put("index.analysis.filter.my_english.language", "porter2")
-.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
-.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms")
+.putList("index.analysis.filter.synonyms.synonyms", "würst, sausage")
+.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+.putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "asciifolding", "synonyms")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;

-BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
-new String[]{ "programmers", "programm", "develop" },
-new int[]{ 1, 0, 0 });
+BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("my_analyzer"), "würst",
+new String[]{ "wurst", "sausage"},
+new int[]{ 1, 0 });
}

public void testChainedSynonymFilters() throws IOException {
@@ -248,6 +250,58 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {

}

+public void testPreconfiguredTokenFilters() throws IOException {
+Set<String> disallowedFilters = new HashSet<>(Arrays.asList(
+"common_grams", "edge_ngram", "edgeNGram", "keyword_repeat", "ngram", "nGram",
+"shingle", "word_delimiter", "word_delimiter_graph"
+));
+
+Settings settings = Settings.builder()
+.put(IndexMetaData.SETTING_VERSION_CREATED,
+VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
+.put("path.home", createTempDir().toString())
+.build();
+IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+
+CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
+
+for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
+if (disallowedFilters.contains(tf.getName())) {
+IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+"Expected exception for factory " + tf.getName(), () -> {
+tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
+});
+assertEquals(tf.getName(), "Token filter [" + tf.getName()
++ "] cannot be used to parse synonyms",
+e.getMessage());
+}
+else {
+tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
+}
+}
+
+Settings settings2 = Settings.builder()
+.put(IndexMetaData.SETTING_VERSION_CREATED,
+VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
+.put("path.home", createTempDir().toString())
+.putList("common_words", "a", "b")
+.put("output_unigrams", "true")
+.build();
+IndexSettings idxSettings2 = IndexSettingsModule.newIndexSettings("index", settings2);
+
+List<String> expectedWarnings = new ArrayList<>();
+for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
+if (disallowedFilters.contains(tf.getName())) {
+tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
+expectedWarnings.add("Token filter [" + tf.getName() + "] will not be usable to parse synonyms after v7.0");
+}
+else {
+tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
+}
+}
+assertWarnings(expectedWarnings.toArray(new String[0]));
+}
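
The new test exercises both halves of the behavior: against an index created on 7.0.0 or later every disallowed filter must throw, and against a pre-7.0 index the same call succeeds but records a deprecation warning, verified in bulk by assertWarnings. The production code issues those warnings through DeprecationLogger.deprecatedAndMaybeLog with the filter name as the key, which throttles repeated logging of the same key, so each disallowed filter is expected to warn once. Condensed form of that call, with a concrete name substituted for illustration:

// Repeated calls with the same key do not multiply the expected warnings.
DEPRECATION_LOGGER.deprecatedAndMaybeLog("shingle",
    "Token filter [shingle] will not be usable to parse synonyms after v7.0");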

public void testDisallowedTokenFilters() throws IOException {

Settings settings = Settings.builder()
@@ -19,9 +19,11 @@

package org.elasticsearch.index.analysis;

+import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
+import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

@@ -32,40 +34,54 @@
* Provides pre-configured, shared {@link TokenFilter}s.
*/
public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {

+private static final DeprecationLogger DEPRECATION_LOGGER
+= new DeprecationLogger(LogManager.getLogger(PreConfiguredTokenFilter.class));

/**
* Create a pre-configured token filter that may not vary at all.
*/
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
Function<TokenStream, TokenStream> create) {
-return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream));
}

/**
* Create a pre-configured token filter that may not vary at all.
*/
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
-boolean useFilterForParsingSynonyms,
+boolean allowForSynonymParsing,
Function<TokenStream, TokenStream> create) {
-return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, allowForSynonymParsing, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream));
}

/**
-* Create a pre-configured token filter that may not vary at all.
+* Create a pre-configured token filter that may vary based on the Elasticsearch version.
*/
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, Version, TokenStream> create) {
-return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream, version));
}

+/**
+* Create a pre-configured token filter that may vary based on the Elasticsearch version.
+*/
+public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
+boolean useFilterForParsingSynonyms,
+BiFunction<TokenStream, Version, TokenStream> create) {
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
+(tokenStream, version) -> create.apply(tokenStream, version));
+}

/**
* Create a pre-configured token filter that may vary based on the Lucene version.
*/
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
-return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE,
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.LUCENE,
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
}
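
Note the flipped default running through the factory methods above: the pre-existing overloads now pass true (allowed for synonym parsing) where they previously passed false, so restriction becomes an explicit opt-in through the extra boolean parameter, and the multi-position filters registered in CommonAnalysisPlugin are the only callers that opt in. Hypothetical registrations to show both forms (the filter names and classes are made up):

// Default overload: the filter stays usable for synonym parsing.
filters.add(PreConfiguredTokenFilter.singleton("my_folding", true, MyFoldingFilter::new));
// Explicit opt-out: getSynonymFilter() throws on 7.0+ indices, warns before that.
filters.add(PreConfiguredTokenFilter.singleton("my_shingles", false, false, MyShingleFilter::new));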

@@ -74,18 +90,18 @@ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
*/
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
-return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create);
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ELASTICSEARCH, create);
}

private final boolean useFilterForMultitermQueries;
-private final boolean useFilterForParsingSynonyms;
+private final boolean allowForSynonymParsing;
private final BiFunction<TokenStream, Version, TokenStream> create;

-private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms,
+private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean allowForSynonymParsing,
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
super(name, cache);
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
-this.useFilterForParsingSynonyms = useFilterForParsingSynonyms;
+this.allowForSynonymParsing = allowForSynonymParsing;
this.create = create;
}

@@ -118,10 +134,17 @@ public TokenStream create(TokenStream tokenStream) {

@Override
public TokenFilterFactory getSynonymFilter() {
-if (useFilterForParsingSynonyms) {
+if (allowForSynonymParsing) {
return this;
}
+if (version.onOrAfter(Version.V_7_0_0)) {
+throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
+}
+else {
+DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
++ "] will not be usable to parse synonyms after v7.0");
+return this;
+}
-return IDENTITY_FILTER;
}
};
}
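
The same decision logic appears once more in the factory variant below. Its observable behavior, sketched as a test fragment; the idxSettings7x, idxSettings6x and nodeSettings fixtures are assumed, and ShingleFilter is org.apache.lucene.analysis.shingle.ShingleFilter:

PreConfiguredTokenFilter shingle = PreConfiguredTokenFilter.singleton(
    "shingle", false, false, ShingleFilter::new); // not allowed for synonym parsing

// Index created on 7.0.0 or later: fail fast.
TokenFilterFactory on7x = shingle.get(idxSettings7x, null, "shingle", nodeSettings);
expectThrows(IllegalArgumentException.class, on7x::getSynonymFilter);

// Index created before 7.0.0: still works, but emits a deprecation warning.
TokenFilterFactory on6x = shingle.get(idxSettings6x, null, "shingle", nodeSettings);
on6x.getSynonymFilter();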
@@ -138,10 +161,17 @@ public TokenStream create(TokenStream tokenStream) {

@Override
public TokenFilterFactory getSynonymFilter() {
-if (useFilterForParsingSynonyms) {
+if (allowForSynonymParsing) {
return this;
}
+if (version.onOrAfter(Version.V_7_0_0)) {
+throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
+}
+else {
+DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
++ "] will not be usable to parse synonyms after v7.0");
+return this;
+}
-return IDENTITY_FILTER;
}
};
}