Skip to content

Commit 4ffa050

Browse files
author
Christoph Büscher
committed
Allow custom characters in token_chars of ngram tokenizers (#49250)
Currently the `token_chars` setting in both `edgeNGram` and `ngram` tokenizers only allows for a list of predefined character classes, which might not fit every use case. For example, including underscore "_" in a token would currently require the `punctuation` class which comes with a lot of other characters. This change adds an additional "custom" option to the `token_chars` setting, which requires an additional `custom_token_chars` setting to be present and which will be interpreted as a set of characters to include in a token. Closes #25894
1 parent c6b3116 commit 4ffa050

File tree

6 files changed

+91
-16
lines changed

6 files changed

+91
-16
lines changed

docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc

+8
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,14 @@ Character classes may be any of the following:
9696
* `whitespace` -- for example `" "` or `"\n"`
9797
* `punctuation` -- for example `!` or `"`
9898
* `symbol` -- for example `$` or `√`
99+
* `custom` -- custom characters which need to be set using the
100+
`custom_token_chars` setting.
101+
102+
`custom_token_chars`::
103+
104+
Custom characters that should be treated as part of a token. For example,
105+
setting this to `+-_` will make the tokenizer treat the plus, minus and
106+
underscore sign as part of a token.
99107

100108
[[max-gram-limits]]
101109
=== Limitations of the `max_gram` parameter

docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc

+8
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ Character classes may be any of the following:
190190
* `whitespace` -- for example `" "` or `"\n"`
191191
* `punctuation` -- for example `!` or `"`
192192
* `symbol` -- for example `$` or `√`
193+
* `custom` -- custom characters which need to be set using the
194+
`custom_token_chars` setting.
195+
196+
`custom_token_chars`::
197+
198+
Custom characters that should be treated as part of a token. For example,
199+
setting this to `+-_` will make the tokenizer treat the plus, minus and
200+
underscore sign as part of a token.
193201

194202
TIP: It usually makes sense to set `min_gram` and `max_gram` to the same
195203
value. The smaller the length, the more documents will match but the lower

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
3939
super(indexSettings, settings, name);
4040
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
4141
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
42-
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
42+
this.matcher = parseTokenChars(settings);
4343
}
4444

4545
@Override

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java

+24-3
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,14 @@
2929

3030
import java.lang.reflect.Field;
3131
import java.lang.reflect.Modifier;
32+
import java.util.Collections;
3233
import java.util.HashMap;
3334
import java.util.List;
3435
import java.util.Locale;
3536
import java.util.Map;
37+
import java.util.Set;
38+
import java.util.stream.Collectors;
39+
import java.util.stream.Stream;
3640

3741
import static java.util.Collections.unmodifiableMap;
3842

@@ -68,7 +72,8 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
6872
MATCHERS = unmodifiableMap(matchers);
6973
}
7074

71-
static CharMatcher parseTokenChars(List<String> characterClasses) {
75+
static CharMatcher parseTokenChars(Settings settings) {
76+
List<String> characterClasses = settings.getAsList("token_chars");
7277
if (characterClasses == null || characterClasses.isEmpty()) {
7378
return null;
7479
}
@@ -77,7 +82,23 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
7782
characterClass = characterClass.toLowerCase(Locale.ROOT).trim();
7883
CharMatcher matcher = MATCHERS.get(characterClass);
7984
if (matcher == null) {
80-
throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + MATCHERS.keySet());
85+
if (characterClass.equals("custom") == false) {
86+
throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + Stream
87+
.of(MATCHERS.keySet(), Collections.singleton("custom")).flatMap(x -> x.stream()).collect(Collectors.toSet()));
88+
}
89+
String customCharacters = settings.get("custom_token_chars");
90+
if (customCharacters == null) {
91+
throw new IllegalArgumentException("Token type: 'custom' requires setting `custom_token_chars`");
92+
}
93+
final Set<Integer> customCharSet = customCharacters.chars().boxed().collect(Collectors.toSet());
94+
matcher = new CharMatcher() {
95+
96+
@Override
97+
public boolean isTokenChar(int c) {
98+
return customCharSet.contains(c);
99+
}
100+
101+
};
81102
}
82103
builder.or(matcher);
83104
}
@@ -101,7 +122,7 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
101122
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
102123
}
103124
}
104-
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
125+
this.matcher = parseTokenChars(settings);
105126
}
106127

107128
@Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java

+16
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.lucene.analysis.Tokenizer;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.cluster.metadata.IndexMetaData;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
2627
import org.elasticsearch.env.TestEnvironment;
28+
import org.elasticsearch.index.Index;
2729
import org.elasticsearch.index.IndexSettings;
2830
import org.elasticsearch.index.analysis.IndexAnalyzers;
2931
import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -33,6 +35,7 @@
3335
import org.elasticsearch.test.VersionUtils;
3436

3537
import java.io.IOException;
38+
import java.io.StringReader;
3639
import java.util.Collections;
3740

3841
public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
@@ -95,4 +98,17 @@ public void testPreConfiguredTokenizer() throws IOException {
9598

9699
}
97100

101+
public void testCustomTokenChars() throws IOException {
102+
final Index index = new Index("test", "_na_");
103+
final String name = "engr";
104+
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
105+
106+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
107+
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
108+
Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name,
109+
settings).create();
110+
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
111+
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "-g", "-gh", "_j", "_jk", "lm"});
112+
}
113+
98114
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java

+34-12
Original file line numberDiff line numberDiff line change
@@ -46,25 +46,34 @@ public void testParseTokenChars() {
4646
final Index index = new Index("test", "_na_");
4747
final String name = "ngr";
4848
final Settings indexSettings = newAnalysisSettingsBuilder().build();
49-
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
50-
for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) {
49+
final IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
50+
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
5151
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
5252
.put("token_chars", tokenChars).build();
53-
try {
54-
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
55-
fail();
56-
} catch (IllegalArgumentException expected) {
57-
// OK
58-
}
53+
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
54+
// no exception
5955
}
60-
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
56+
{
6157
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
62-
.put("token_chars", tokenChars).build();
63-
indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
64-
58+
.put("token_chars", "DIRECTIONALITY_UNDEFINED").build();
59+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
60+
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
61+
assertEquals("Unknown token type: 'directionality_undefined'", ex.getMessage().substring(0, 46));
62+
assertTrue(ex.getMessage().contains("custom"));
63+
}
64+
{
65+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
66+
.put("custom_token_chars", "_-").build();
6567
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
6668
// no exception
6769
}
70+
{
71+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
72+
.build();
73+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
74+
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
75+
assertEquals("Token type: 'custom' requires setting `custom_token_chars`", ex.getMessage());
76+
}
6877
}
6978

7079
public void testNoTokenChars() throws IOException {
@@ -80,6 +89,19 @@ public void testNoTokenChars() throws IOException {
8089
assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
8190
}
8291

92+
public void testCustomTokenChars() throws IOException {
93+
final Index index = new Index("test", "_na_");
94+
final String name = "ngr";
95+
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
96+
97+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
98+
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
99+
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
100+
.create();
101+
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
102+
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "bc", "-g", "-gh", "gh", "_j", "_jk", "jk", "lm"});
103+
}
104+
83105
public void testPreTokenization() throws IOException {
84106
// Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
85107
final Index index = new Index("test", "_na_");

0 commit comments

Comments
 (0)