Skip to content

Commit bf7bb3d

Browse files
author
Christoph Büscher
committed
Add exclusion option to keep_types token filter (elastic#32012)
Currently the `keep_types` token filter includes all token types specified using its `types` parameter. Lucene's `TypeTokenFilter` also provides a second mode in which the specified token types are filtered out (exclude) instead of being kept (include). This change exposes this option as a new `mode` parameter that can take either the value `include` (the default, if not specified) or `exclude`. Closes elastic#29277
1 parent f555426 commit bf7bb3d

File tree

3 files changed

+142
-16
lines changed

3 files changed

+142
-16
lines changed

docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ contained in a predefined set.
88
[float]
99
=== Options
1010
[horizontal]
11-
types:: a list of types to keep
12-
11+
types:: a list of types to include (default mode) or exclude
12+
mode:: if set to `include` (default), the specified token types will be kept;
13+
if set to `exclude` the specified token types will be removed from the stream
1314

1415
[float]
1516
=== Settings example
@@ -53,7 +54,7 @@ POST /keep_types_example/_analyze
5354
// CONSOLE
5455
// TEST[continued]
5556

56-
And it'd respond:
57+
The response will be:
5758

5859
[source,js]
5960
--------------------------------------------------
@@ -72,3 +73,70 @@ And it'd respond:
7273
// TESTRESPONSE
7374

7475
Note how only the `<NUM>` token is in the output.
76+
77+
=== Exclude mode settings example
78+
79+
If the `mode` parameter is set to `exclude`, the specified token types are removed instead of kept, as in the following example:
80+
81+
[source,js]
82+
--------------------------------------------------
83+
PUT /keep_types_exclude_example
84+
{
85+
"settings" : {
86+
"analysis" : {
87+
"analyzer" : {
88+
"my_analyzer" : {
89+
"tokenizer" : "standard",
90+
"filter" : ["standard", "lowercase", "remove_numbers"]
91+
}
92+
},
93+
"filter" : {
94+
"remove_numbers" : {
95+
"type" : "keep_types",
96+
"mode" : "exclude",
97+
"types" : [ "<NUM>" ]
98+
}
99+
}
100+
}
101+
}
102+
}
103+
--------------------------------------------------
104+
// CONSOLE
105+
106+
We can then test it like this:
107+
108+
[source,js]
109+
--------------------------------------------------
110+
POST /keep_types_exclude_example/_analyze
111+
{
112+
"analyzer" : "my_analyzer",
113+
"text" : "hello 101 world"
114+
}
115+
--------------------------------------------------
116+
// CONSOLE
117+
// TEST[continued]
118+
119+
The response will be:
120+
121+
[source,js]
122+
--------------------------------------------------
123+
{
124+
"tokens": [
125+
{
126+
"token": "hello",
127+
"start_offset": 0,
128+
"end_offset": 5,
129+
"type": "<ALPHANUM>",
130+
"position": 0
131+
},
132+
{
133+
"token": "world",
134+
"start_offset": 10,
135+
"end_offset": 15,
136+
"type": "<ALPHANUM>",
137+
"position": 2
138+
}
139+
]
140+
}
141+
--------------------------------------------------
142+
// TESTRESPONSE

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,47 @@
2929

3030
import java.util.HashSet;
3131
import java.util.List;
32+
import java.util.Locale;
3233
import java.util.Set;
3334

3435
/**
3536
* A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only
3637
* keeps tokens that are contained in the set configured via the
37-
* {@value #KEEP_TYPES_KEY} setting.
38+
* {@value #KEEP_TYPES_KEY} setting, or removes them if {@value #KEEP_TYPES_MODE_KEY} is set to "exclude".
3839
* <p>
3940
* Configuration options:
4041
* <ul>
41-
* <li>{@value #KEEP_TYPES_KEY} the array of words / tokens to keep.</li>
42+
* <li>{@value #KEEP_TYPES_KEY} the array of token types to keep or discard.</li>
43+
* <li>{@value #KEEP_TYPES_MODE_KEY} whether to keep ("include") or discard
44+
* ("exclude") the specified token types.</li>
4245
* </ul>
4346
*/
4447
public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
4548
private final Set<String> keepTypes;
46-
private static final String KEEP_TYPES_KEY = "types";
49+
private final KeepTypesMode includeMode;
50+
static final String KEEP_TYPES_KEY = "types";
51+
static final String KEEP_TYPES_MODE_KEY = "mode";
52+
53+
enum KeepTypesMode {
54+
INCLUDE, EXCLUDE;
55+
56+
@Override
57+
public String toString() {
58+
return this.name().toLowerCase(Locale.ROOT);
59+
}
60+
61+
private static KeepTypesMode fromString(String modeString) {
62+
String lc = modeString.toLowerCase(Locale.ROOT);
63+
if (lc.equals("include")) {
64+
return INCLUDE;
65+
} else if (lc.equals("exclude")) {
66+
return EXCLUDE;
67+
} else {
68+
throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KeepTypesMode.INCLUDE + "] or ["
69+
+ KeepTypesMode.EXCLUDE + "] but was [" + modeString + "].");
70+
}
71+
}
72+
};
4773

4874
KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
4975
super(indexSettings, name, settings);
@@ -52,12 +78,12 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
5278
if ((arrayKeepTypes == null)) {
5379
throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
5480
}
55-
81+
this.includeMode = KeepTypesMode.fromString(settings.get(KEEP_TYPES_MODE_KEY, "include"));
5682
this.keepTypes = new HashSet<>(arrayKeepTypes);
5783
}
5884

5985
@Override
6086
public TokenStream create(TokenStream tokenStream) {
61-
return new TypeTokenFilter(tokenStream, keepTypes, true);
87+
return new TypeTokenFilter(tokenStream, keepTypes, includeMode == KeepTypesMode.INCLUDE);
6288
}
6389
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,51 @@
3434
import static org.hamcrest.Matchers.instanceOf;
3535

3636
public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
37-
public void testKeepTypes() throws IOException {
38-
Settings settings = Settings.builder()
39-
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
40-
.put("index.analysis.filter.keep_numbers.type", "keep_types")
41-
.putList("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
42-
.build();
37+
38+
private static final String BASE_SETTING = "index.analysis.filter.keep_numbers";
39+
40+
public void testKeepTypesInclude() throws IOException {
41+
Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
42+
.put(BASE_SETTING + ".type", "keep_types")
43+
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" });
44+
// either use default mode or set "include" mode explicitly
45+
if (random().nextBoolean()) {
46+
settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY,
47+
KeepTypesFilterFactory.KeepTypesMode.INCLUDE);
48+
}
49+
Settings settings = settingsBuilder.build();
50+
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
51+
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
52+
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
53+
String source = "Hello 123 world";
54+
String[] expected = new String[] { "123" };
55+
Tokenizer tokenizer = new StandardTokenizer();
56+
tokenizer.setReader(new StringReader(source));
57+
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
58+
}
59+
60+
public void testKeepTypesExclude() throws IOException {
61+
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
62+
.put(BASE_SETTING + ".type", "keep_types")
63+
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
64+
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build();
4365
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
4466
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
4567
assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
4668
String source = "Hello 123 world";
47-
String[] expected = new String[]{"123"};
69+
String[] expected = new String[] { "Hello", "world" };
4870
Tokenizer tokenizer = new StandardTokenizer();
4971
tokenizer.setReader(new StringReader(source));
50-
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2});
72+
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 });
73+
}
74+
75+
public void testKeepTypesException() throws IOException {
76+
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
77+
.put(BASE_SETTING + ".type", "keep_types")
78+
.putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
79+
.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build();
80+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
81+
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
82+
assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage());
5183
}
5284
}

0 commit comments

Comments
 (0)