Skip to content

Commit 5107949

Browse files
authored
Allow TokenFilterFactories to rewrite themselves against their preceding chain (#33702)
We currently special-case SynonymFilterFactory and SynonymGraphFilterFactory, which need to know their predecessors in the analysis chain in order to correctly analyze their synonym lists. This special-casing doesn't work with Referring filter factories, such as the Multiplexer or Conditional filters. We also have a number of filters (e.g. the Multiplexer) that will break synonyms when they appear before them in a chain, because they produce multiple tokens at the same position. This commit adds two methods to the TokenFilterFactory interface:

* `getChainAwareTokenFilterFactory()` allows a filter factory to rewrite itself against its preceding filter chain, or to resolve references to other filters. It replaces `ReferringFilterFactory` and `CustomAnalyzerProvider.checkAndApplySynonymFilter`, and by default returns `this`.
* `getSynonymFilter()` defines whether or not a filter should be applied when building a synonym list `Analyzer`. By default it returns `true`.

Fixes #33609
1 parent 4190a9f commit 5107949

File tree

12 files changed

+271
-241
lines changed

12 files changed

+271
-241
lines changed

docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,4 +113,12 @@ And it'd respond:
113113
// TESTRESPONSE
114114

115115
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
116-
duplicate of this token it has been removed from the token stream
116+
duplicate of this token it has been removed from the token stream
117+
118+
NOTE: The synonym and synonym_graph filters use their preceding analysis chain to
119+
parse and analyse their synonym lists, and ignore any token filters in the chain
120+
that produce multiple tokens at the same position. This means that any filters
121+
within the multiplexer will be ignored for the purpose of synonyms. If you want to
122+
use filters contained within the multiplexer for parsing synonyms (for example, to
123+
apply stemming to the synonym lists), then you should append the synonym filter
124+
to the relevant multiplexer filter list.

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java

Lines changed: 42 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -29,33 +29,20 @@
2929
import org.elasticsearch.env.Environment;
3030
import org.elasticsearch.index.IndexSettings;
3131
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
32-
import org.elasticsearch.index.analysis.ReferringFilterFactory;
32+
import org.elasticsearch.index.analysis.CharFilterFactory;
3333
import org.elasticsearch.index.analysis.TokenFilterFactory;
34+
import org.elasticsearch.index.analysis.TokenizerFactory;
3435

3536
import java.io.IOException;
3637
import java.util.ArrayList;
3738
import java.util.List;
38-
import java.util.Map;
3939
import java.util.function.Function;
4040

41-
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
41+
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
4242

43-
private List<TokenFilterFactory> filters;
4443
private List<String> filterNames;
4544
private final boolean preserveOriginal;
4645

47-
private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
48-
@Override
49-
public String name() {
50-
return "identity";
51-
}
52-
53-
@Override
54-
public TokenStream create(TokenStream tokenStream) {
55-
return tokenStream;
56-
}
57-
};
58-
5946
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
6047
super(indexSettings, name, settings);
6148
this.filterNames = settings.getAsList("filters");
@@ -64,31 +51,56 @@ public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment en
6451

6552
@Override
6653
public TokenStream create(TokenStream tokenStream) {
67-
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
68-
for (TokenFilterFactory tff : filters) {
69-
functions.add(tff::create);
70-
}
71-
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
54+
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
7255
}
7356

7457
@Override
75-
public void setReferences(Map<String, TokenFilterFactory> factories) {
76-
filters = new ArrayList<>();
58+
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
59+
List<TokenFilterFactory> previousTokenFilters,
60+
Function<String, TokenFilterFactory> allFilters) {
61+
List<TokenFilterFactory> filters = new ArrayList<>();
7762
if (preserveOriginal) {
78-
filters.add(IDENTITY_FACTORY);
63+
filters.add(IDENTITY_FILTER);
7964
}
8065
for (String filter : filterNames) {
8166
String[] parts = Strings.tokenizeToStringArray(filter, ",");
8267
if (parts.length == 1) {
83-
filters.add(resolveFilterFactory(factories, parts[0]));
68+
TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
69+
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
70+
filters.add(factory);
8471
} else {
72+
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
8573
List<TokenFilterFactory> chain = new ArrayList<>();
8674
for (String subfilter : parts) {
87-
chain.add(resolveFilterFactory(factories, subfilter));
75+
TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
76+
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
77+
chain.add(factory);
78+
existingChain.add(factory);
8879
}
8980
filters.add(chainFilters(filter, chain));
9081
}
9182
}
83+
84+
return new TokenFilterFactory() {
85+
@Override
86+
public String name() {
87+
return MultiplexerTokenFilterFactory.this.name();
88+
}
89+
90+
@Override
91+
public TokenStream create(TokenStream tokenStream) {
92+
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
93+
for (TokenFilterFactory tff : filters) {
94+
functions.add(tff::create);
95+
}
96+
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
97+
}
98+
99+
@Override
100+
public TokenFilterFactory getSynonymFilter() {
101+
return IDENTITY_FILTER;
102+
}
103+
};
92104
}
93105

94106
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
@@ -108,11 +120,12 @@ public TokenStream create(TokenStream tokenStream) {
108120
};
109121
}
110122

111-
private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
112-
if (factories.containsKey(name) == false) {
123+
private TokenFilterFactory resolveFilterFactory(Function<String, TokenFilterFactory> factories, String name) {
124+
TokenFilterFactory factory = factories.apply(name);
125+
if (factory == null) {
113126
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
114127
} else {
115-
return factories.get(name);
128+
return factory;
116129
}
117130
}
118131

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,26 +24,24 @@
2424
import org.elasticsearch.common.settings.Settings;
2525
import org.elasticsearch.index.IndexSettings;
2626
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
27-
import org.elasticsearch.index.analysis.ReferringFilterFactory;
27+
import org.elasticsearch.index.analysis.CharFilterFactory;
2828
import org.elasticsearch.index.analysis.TokenFilterFactory;
29+
import org.elasticsearch.index.analysis.TokenizerFactory;
2930
import org.elasticsearch.script.Script;
3031
import org.elasticsearch.script.ScriptService;
3132
import org.elasticsearch.script.ScriptType;
3233

33-
import java.io.IOException;
3434
import java.util.ArrayList;
3535
import java.util.List;
36-
import java.util.Map;
3736
import java.util.function.Function;
3837

3938
/**
4039
* A factory for a conditional token filter that only applies child filters if the underlying token
4140
* matches an {@link AnalysisPredicateScript}
4241
*/
43-
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
42+
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory {
4443

4544
private final AnalysisPredicateScript.Factory factory;
46-
private final List<TokenFilterFactory> filters = new ArrayList<>();
4745
private final List<String> filterNames;
4846

4947
ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name,
@@ -65,13 +63,43 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
6563

6664
@Override
6765
public TokenStream create(TokenStream tokenStream) {
68-
Function<TokenStream, TokenStream> filter = in -> {
69-
for (TokenFilterFactory tff : filters) {
70-
in = tff.create(in);
66+
throw new UnsupportedOperationException("getChainAwareTokenFilterFactory should be called first");
67+
}
68+
69+
@Override
70+
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
71+
List<TokenFilterFactory> previousTokenFilters,
72+
Function<String, TokenFilterFactory> allFilters) {
73+
List<TokenFilterFactory> filters = new ArrayList<>();
74+
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
75+
for (String filter : filterNames) {
76+
TokenFilterFactory tff = allFilters.apply(filter);
77+
if (tff == null) {
78+
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
79+
"] refers to undefined token filter [" + filter + "]");
80+
}
81+
tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
82+
filters.add(tff);
83+
existingChain.add(tff);
84+
}
85+
86+
return new TokenFilterFactory() {
87+
@Override
88+
public String name() {
89+
return ScriptedConditionTokenFilterFactory.this.name();
90+
}
91+
92+
@Override
93+
public TokenStream create(TokenStream tokenStream) {
94+
Function<TokenStream, TokenStream> filter = in -> {
95+
for (TokenFilterFactory tff : filters) {
96+
in = tff.create(in);
97+
}
98+
return in;
99+
};
100+
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
71101
}
72-
return in;
73102
};
74-
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
75103
}
76104

77105
private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
@@ -80,29 +108,17 @@ private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter
80108
private final AnalysisPredicateScript.Token token;
81109

82110
ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
83-
AnalysisPredicateScript script) {
111+
AnalysisPredicateScript script) {
84112
super(input, inputFactory);
85113
this.script = script;
86114
this.token = new AnalysisPredicateScript.Token(this);
87115
}
88116

89117
@Override
90-
protected boolean shouldFilter() throws IOException {
118+
protected boolean shouldFilter() {
91119
token.updatePosition();
92120
return script.execute(token);
93121
}
94122
}
95123

96-
@Override
97-
public void setReferences(Map<String, TokenFilterFactory> factories) {
98-
for (String filter : filterNames) {
99-
TokenFilterFactory tff = factories.get(filter);
100-
if (tff == null) {
101-
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
102-
"] refers to undefined token filter [" + filter + "]");
103-
}
104-
filters.add(tff);
105-
}
106-
}
107-
108124
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2526
import org.elasticsearch.Version;
@@ -117,6 +118,26 @@ public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
117118
}
118119
}
119120

121+
public void testSynonymsWithMultiplexer() throws IOException {
122+
Settings settings = Settings.builder()
123+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
124+
.put("path.home", createTempDir().toString())
125+
.put("index.analysis.filter.synonyms.type", "synonym")
126+
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
127+
.put("index.analysis.filter.my_english.type", "stemmer")
128+
.put("index.analysis.filter.my_english.language", "porter2")
129+
.put("index.analysis.filter.stem_repeat.type", "multiplexer")
130+
.putList("index.analysis.filter.stem_repeat.filters", "my_english, synonyms")
131+
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
132+
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "stem_repeat")
133+
.build();
134+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
135+
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
136+
137+
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "Some developers are odd",
138+
new String[]{ "some", "developers", "develop", "programm", "are", "odd" },
139+
new int[]{ 1, 1, 0, 0, 1, 1 });
140+
}
120141

121142
private void match(String analyzerName, String source, String target) throws IOException {
122143
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

0 commit comments

Comments
 (0)