Skip to content

Commit 81014e0

Browse files
committed
Add conditional token filter to elasticsearch (#31958)
This allows tokenfilters to be applied selectively, depending on the status of the current token in the tokenstream. The filter takes a scripted predicate, and only applies its subfilter when the predicate returns true.
1 parent aff5658 commit 81014e0

File tree

13 files changed

+578
-1
lines changed

13 files changed

+578
-1
lines changed

docs/painless/painless-contexts/index.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ include::painless-metric-agg-reduce-context.asciidoc[]
3030

3131
include::painless-bucket-agg-context.asciidoc[]
3232

33+
include::painless-analysis-predicate-context.asciidoc[]
34+
3335
include::painless-watcher-condition-context.asciidoc[]
3436

3537
include::painless-watcher-transform-context.asciidoc[]
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
[[painless-analysis-predicate-context]]
2+
=== Analysis Predicate Context
3+
4+
Use a painless script to determine whether or not the current token in an
5+
analysis chain matches a predicate.
6+
7+
*Variables*
8+
9+
`params` (`Map`, read-only)::
10+
User-defined parameters passed in as part of the query.
11+
12+
`token.term` (`CharSequence`, read-only)::
13+
The characters of the current token
14+
15+
`token.position` (`int`, read-only)::
16+
The position of the current token
17+
18+
`token.positionIncrement` (`int`, read-only)::
19+
The position increment of the current token
20+
21+
`token.positionLength` (`int`, read-only)::
22+
The position length of the current token
23+
24+
`token.startOffset` (`int`, read-only)::
25+
The start offset of the current token
26+
27+
`token.endOffset` (`int`, read-only)::
28+
The end offset of the current token
29+
30+
`token.type` (`String`, read-only)::
31+
The type of the current token
32+
33+
`token.keyword` (`boolean`, read-only)::
34+
Whether or not the current token is marked as a keyword
35+
36+
*Return*
37+
38+
`boolean`::
39+
Whether or not the current token matches the predicate
40+
41+
*API*
42+
43+
The standard <<painless-api-reference, Painless API>> is available.

docs/reference/analysis/tokenfilters.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]
3737

3838
include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
3939

40+
include::tokenfilters/condition-tokenfilter.asciidoc[]
41+
4042
include::tokenfilters/stemmer-tokenfilter.asciidoc[]
4143

4244
include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
[[analysis-condition-tokenfilter]]
2+
=== Conditional Token Filter
3+
4+
The conditional token filter takes a predicate script and a list of subfilters, and
5+
only applies the subfilters to the current token if it matches the predicate.
6+
7+
[float]
8+
=== Options
9+
[horizontal]
10+
filter:: a chain of token filters to apply to the current token if the predicate
11+
matches. These can be any token filters defined elsewhere in the index mappings.
12+
13+
script:: a predicate script that determines whether or not the filters will be applied
14+
to the current token. Note that only inline scripts are supported.
15+
16+
[float]
17+
=== Settings example
18+
19+
You can set it up like:
20+
21+
[source,js]
22+
--------------------------------------------------
23+
PUT /condition_example
24+
{
25+
"settings" : {
26+
"analysis" : {
27+
"analyzer" : {
28+
"my_analyzer" : {
29+
"tokenizer" : "standard",
30+
"filter" : [ "my_condition" ]
31+
}
32+
},
33+
"filter" : {
34+
"my_condition" : {
35+
"type" : "condition",
36+
"filter" : [ "lowercase" ],
37+
"script" : {
38+
"source" : "token.getTerm().length() < 5" <1>
39+
}
40+
}
41+
}
42+
}
43+
}
44+
}
45+
--------------------------------------------------
46+
// CONSOLE
47+
48+
<1> This will only apply the lowercase filter to terms that are less than 5
49+
characters in length
50+
51+
And test it like:
52+
53+
[source,js]
54+
--------------------------------------------------
55+
POST /condition_example/_analyze
56+
{
57+
"analyzer" : "my_analyzer",
58+
"text" : "What Flapdoodle"
59+
}
60+
--------------------------------------------------
61+
// CONSOLE
62+
// TEST[continued]
63+
64+
And it'd respond:
65+
66+
[source,js]
67+
--------------------------------------------------
68+
{
69+
"tokens": [
70+
{
71+
"token": "what", <1>
72+
"start_offset": 0,
73+
"end_offset": 4,
74+
"type": "<ALPHANUM>",
75+
"position": 0
76+
},
77+
{
78+
"token": "Flapdoodle", <2>
79+
"start_offset": 5,
80+
"end_offset": 15,
81+
"type": "<ALPHANUM>",
82+
"position": 1
83+
}
84+
]
85+
}
86+
--------------------------------------------------
87+
// TESTRESPONSE
88+
<1> The term `What` has been lowercased, because it is only 4 characters long
89+
<2> The term `Flapdoodle` has been left in its original case, because it doesn't pass
90+
the predicate

modules/analysis-common/build.gradle

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,13 @@
2020
esplugin {
2121
description 'Adds "built in" analyzers to Elasticsearch.'
2222
classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
23+
extendedPlugins = ['lang-painless']
24+
}
25+
26+
dependencies {
27+
compileOnly project(':modules:lang-painless')
28+
}
29+
30+
integTestCluster {
31+
module project(':modules:lang-painless')
2332
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.analysis.common;
21+
22+
import org.elasticsearch.painless.spi.PainlessExtension;
23+
import org.elasticsearch.painless.spi.Whitelist;
24+
import org.elasticsearch.painless.spi.WhitelistLoader;
25+
import org.elasticsearch.script.ScriptContext;
26+
27+
import java.util.Collections;
28+
import java.util.List;
29+
import java.util.Map;
30+
31+
public class AnalysisPainlessExtension implements PainlessExtension {
32+
33+
private static final Whitelist WHITELIST =
34+
WhitelistLoader.loadFromResourceFiles(AnalysisPainlessExtension.class, "painless_whitelist.txt");
35+
36+
@Override
37+
public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
38+
return Collections.singletonMap(AnalysisPredicateScript.CONTEXT, Collections.singletonList(WHITELIST));
39+
}
40+
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.analysis.common;
21+
22+
import org.elasticsearch.script.ScriptContext;
23+
24+
/**
25+
* A predicate based on the current token in a TokenStream
26+
*/
27+
public abstract class AnalysisPredicateScript {

    /**
     * Encapsulation of the state of the current token.
     * <p>
     * Fields are public and mutable so the driving token filter can refresh
     * them per token; scripts read them through the getters below.
     */
    public static class Token {
        // Characters of the current token
        public CharSequence term;
        // Position of the current token
        public int pos;
        // Position increment of the current token
        public int posInc;
        // Position length of the current token
        public int posLen;
        // Start offset of the current token
        public int startOffset;
        // End offset of the current token
        public int endOffset;
        // Type of the current token
        public String type;
        // Whether the current token is marked as a keyword
        public boolean isKeyword;

        /** Returns the characters of the current token. */
        public CharSequence getTerm() {
            return term;
        }

        /** Returns the position increment of the current token. */
        public int getPositionIncrement() {
            return posInc;
        }

        /** Returns the position of the current token. */
        public int getPosition() {
            return pos;
        }

        /** Returns the position length of the current token. */
        public int getPositionLength() {
            return posLen;
        }

        /** Returns the start offset of the current token. */
        public int getStartOffset() {
            return startOffset;
        }

        /** Returns the end offset of the current token. */
        public int getEndOffset() {
            return endOffset;
        }

        /** Returns the type of the current token. */
        public String getType() {
            return type;
        }

        /** Returns whether or not the current token is marked as a keyword. */
        public boolean isKeyword() {
            return isKeyword;
        }
    }

    /**
     * Returns {@code true} if the current term matches the predicate
     */
    public abstract boolean execute(Token token);

    /** Factory for obtaining script instances. */
    public interface Factory {
        AnalysisPredicateScript newInstance();
    }

    // Name of the single script parameter exposed to painless ("token")
    public static final String[] PARAMETERS = new String[]{ "token" };
    // Script context under which analysis predicates are compiled
    public static final ScriptContext<Factory> CONTEXT = new ScriptContext<>("analysis", Factory.class);

}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,16 @@
111111
import org.apache.lucene.analysis.tr.ApostropheFilter;
112112
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
113113
import org.apache.lucene.analysis.util.ElisionFilter;
114+
import org.apache.lucene.util.SetOnce;
115+
import org.elasticsearch.client.Client;
116+
import org.elasticsearch.cluster.service.ClusterService;
117+
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
114118
import org.elasticsearch.common.logging.DeprecationLogger;
115119
import org.elasticsearch.common.logging.Loggers;
116120
import org.elasticsearch.common.regex.Regex;
121+
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
122+
import org.elasticsearch.env.Environment;
123+
import org.elasticsearch.env.NodeEnvironment;
117124
import org.elasticsearch.index.analysis.AnalyzerProvider;
118125
import org.elasticsearch.index.analysis.CharFilterFactory;
119126
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
@@ -127,20 +134,44 @@
127134
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
128135
import org.elasticsearch.plugins.AnalysisPlugin;
129136
import org.elasticsearch.plugins.Plugin;
137+
import org.elasticsearch.plugins.ScriptPlugin;
138+
import org.elasticsearch.script.ScriptContext;
139+
import org.elasticsearch.script.ScriptService;
140+
import org.elasticsearch.threadpool.ThreadPool;
141+
import org.elasticsearch.watcher.ResourceWatcherService;
130142
import org.tartarus.snowball.ext.DutchStemmer;
131143
import org.tartarus.snowball.ext.FrenchStemmer;
132144

133145
import java.util.ArrayList;
146+
import java.util.Collection;
147+
import java.util.Collections;
134148
import java.util.List;
135149
import java.util.Map;
136150
import java.util.TreeMap;
137151

138152
import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings;
139153

140-
public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
154+
public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin {
141155

142156
private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(CommonAnalysisPlugin.class));
143157

158+
private final SetOnce<ScriptService> scriptService = new SetOnce<>();
159+
160+
@Override
161+
public Collection<Object> createComponents(Client client, ClusterService clusterService, ThreadPool threadPool,
162+
ResourceWatcherService resourceWatcherService, ScriptService scriptService,
163+
NamedXContentRegistry xContentRegistry, Environment environment,
164+
NodeEnvironment nodeEnvironment, NamedWriteableRegistry namedWriteableRegistry) {
165+
this.scriptService.set(scriptService);
166+
return Collections.emptyList();
167+
}
168+
169+
@Override
170+
@SuppressWarnings("rawtypes") // TODO ScriptPlugin needs to change this to pass precommit?
171+
public List<ScriptContext> getContexts() {
172+
return Collections.singletonList(AnalysisPredicateScript.CONTEXT);
173+
}
174+
144175
@Override
145176
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
146177
Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> analyzers = new TreeMap<>();
@@ -202,6 +233,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
202233
filters.put("classic", ClassicFilterFactory::new);
203234
filters.put("czech_stem", CzechStemTokenFilterFactory::new);
204235
filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
236+
filters.put("condition",
237+
requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get())));
205238
filters.put("decimal_digit", DecimalDigitFilterFactory::new);
206239
filters.put("delimited_payload_filter", LegacyDelimitedPayloadTokenFilterFactory::new);
207240
filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new);

0 commit comments

Comments
 (0)