Skip to content

Commit ccf2913

Browse files
author
Christoph Büscher
committed
Allow custom characters in token_chars of ngram tokenizers
Currently the `token_chars` setting in both `edgeNGram` and `ngram` tokenizers only allows for a list of predefined character classes, which might not fit every use case. For example, including underscore "_" in a token would currently require the `punctuation` class which comes with a lot of other characters. This change adds an additional "custom" option to the `token_chars` setting, which requires an additional `custom_token_chars` setting to be present and which will be interpreted as a set of characters to include in a token. Closes #25894
1 parent 529ebea commit ccf2913

File tree

6 files changed

+87
-16
lines changed

6 files changed

+87
-16
lines changed

docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc

+8
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,14 @@ Character classes may be any of the following:
9696
* `whitespace` -- for example `" "` or `"\n"`
9797
* `punctuation` -- for example `!` or `"`
9898
* `symbol` -- for example `$` or `√`
99+
* `custom` -- custom characters which need to be set using the
100+
`custom_token_chars` setting.
101+
102+
`custom_token_chars`::
103+
104+
Custom characters that should be treated as part of a token. For example,
105+
setting this to `+-_` will make the tokenizer treat the plus, minus and
106+
underscore signs as part of a token.
99107

100108
[[max-gram-limits]]
101109
=== Limitations of the `max_gram` parameter

docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc

+8
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ Character classes may be any of the following:
190190
* `whitespace` -- for example `" "` or `"\n"`
191191
* `punctuation` -- for example `!` or `"`
192192
* `symbol` -- for example `$` or `√`
193+
* `custom` -- custom characters which need to be set using the
194+
`custom_token_chars` setting.
195+
196+
`custom_token_chars`::
197+
198+
Custom characters that should be treated as part of a token. For example,
199+
setting this to `+-_` will make the tokenizer treat the plus, minus and
200+
underscore signs as part of a token.
193201

194202
TIP: It usually makes sense to set `min_gram` and `max_gram` to the same
195203
value. The smaller the length, the more documents will match but the lower

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
3939
super(indexSettings, settings, name);
4040
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
4141
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
42-
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
42+
this.matcher = parseTokenChars(settings);
4343
}
4444

4545
@Override

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java

+21-3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import java.util.List;
3333
import java.util.Locale;
3434
import java.util.Map;
35+
import java.util.Set;
36+
import java.util.stream.Collectors;
3537

3638
import static java.util.Collections.unmodifiableMap;
3739

@@ -67,7 +69,8 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
6769
MATCHERS = unmodifiableMap(matchers);
6870
}
6971

70-
static CharMatcher parseTokenChars(List<String> characterClasses) {
72+
static CharMatcher parseTokenChars(Settings settings) {
73+
List<String> characterClasses = settings.getAsList("token_chars");
7174
if (characterClasses == null || characterClasses.isEmpty()) {
7275
return null;
7376
}
@@ -76,7 +79,22 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
7679
characterClass = characterClass.toLowerCase(Locale.ROOT).trim();
7780
CharMatcher matcher = MATCHERS.get(characterClass);
7881
if (matcher == null) {
79-
throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + MATCHERS.keySet());
82+
if (characterClass.equals("custom") == false) {
83+
throw new IllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + MATCHERS.keySet());
84+
}
85+
String customCharacters = settings.get("custom_token_chars");
86+
if (customCharacters == null) {
87+
throw new IllegalArgumentException("Token type: 'custom' requires setting `custom_token_chars`");
88+
}
89+
final Set<Integer> customCharSet = customCharacters.chars().boxed().collect(Collectors.toSet());
90+
matcher = new CharMatcher() {
91+
92+
@Override
93+
public boolean isTokenChar(int c) {
94+
return customCharSet.contains(c);
95+
}
96+
97+
};
8098
}
8199
builder.or(matcher);
82100
}
@@ -95,7 +113,7 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
95113
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
96114
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
97115
}
98-
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
116+
this.matcher = parseTokenChars(settings);
99117
}
100118

101119
@Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java

+16
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.lucene.analysis.Tokenizer;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.cluster.metadata.IndexMetaData;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
2627
import org.elasticsearch.env.TestEnvironment;
28+
import org.elasticsearch.index.Index;
2729
import org.elasticsearch.index.IndexSettings;
2830
import org.elasticsearch.index.analysis.IndexAnalyzers;
2931
import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -33,6 +35,7 @@
3335
import org.elasticsearch.test.VersionUtils;
3436

3537
import java.io.IOException;
38+
import java.io.StringReader;
3639
import java.util.Collections;
3740

3841
public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {
@@ -95,4 +98,17 @@ public void testPreConfiguredTokenizer() throws IOException {
9598

9699
}
97100

101+
public void testCustomTokenChars() throws IOException {
102+
final Index index = new Index("test", "_na_");
103+
final String name = "engr";
104+
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
105+
106+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
107+
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
108+
Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name,
109+
settings).create();
110+
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
111+
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "-g", "-gh", "_j", "_jk", "lm"});
112+
}
113+
98114
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java

+33-12
Original file line numberDiff line numberDiff line change
@@ -46,25 +46,33 @@ public void testParseTokenChars() {
4646
final Index index = new Index("test", "_na_");
4747
final String name = "ngr";
4848
final Settings indexSettings = newAnalysisSettingsBuilder().build();
49-
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
50-
for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) {
49+
final IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
50+
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
5151
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
5252
.put("token_chars", tokenChars).build();
53-
try {
54-
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
55-
fail();
56-
} catch (IllegalArgumentException expected) {
57-
// OK
58-
}
53+
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
54+
// no exception
5955
}
60-
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
56+
{
6157
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
62-
.put("token_chars", tokenChars).build();
63-
indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
64-
58+
.put("token_chars", "DIRECTIONALITY_UNDEFINED").build();
59+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
60+
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
61+
assertEquals("Unknown token type: 'directionality_undefined'", ex.getMessage().substring(0, 46));
62+
}
63+
{
64+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
65+
.put("custom_token_chars", "_-").build();
6566
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
6667
// no exception
6768
}
69+
{
70+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "custom")
71+
.build();
72+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
73+
() -> new NGramTokenizerFactory(indexProperties, null, name, settings).create());
74+
assertEquals("Token type: 'custom' requires setting `custom_token_chars`", ex.getMessage());
75+
}
6876
}
6977

7078
public void testNoTokenChars() throws IOException {
@@ -80,6 +88,19 @@ public void testNoTokenChars() throws IOException {
8088
assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
8189
}
8290

91+
public void testCustomTokenChars() throws IOException {
92+
final Index index = new Index("test", "_na_");
93+
final String name = "ngr";
94+
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
95+
96+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
97+
.putList("token_chars", "letter", "custom").put("custom_token_chars","_-").build();
98+
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
99+
.create();
100+
tokenizer.setReader(new StringReader("Abc -gh _jk =lm"));
101+
assertTokenStreamContents(tokenizer, new String[] {"Ab", "Abc", "bc", "-g", "-gh", "gh", "_j", "_jk", "jk", "lm"});
102+
}
103+
83104
public void testPreTokenization() throws IOException {
84105
// Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
85106
final Index index = new Index("test", "_na_");

0 commit comments

Comments
 (0)