Skip to content

Commit fe2a752

Browse files
committed
Add support for inlined user dictionary in the Kuromoji plugin (#45489)
This change adds a new option called user_dictionary_rules to Kuromoji's tokenizer. It can be used to define additional tokenization rules for the Japanese tokenizer directly in the settings (instead of using a file). This commit also adds a check that no rules are duplicated, since duplicate entries are not allowed in the UserDictionary. Closes #25343
1 parent 3318c91 commit fe2a752

File tree

7 files changed

+125
-16
lines changed

7 files changed

+125
-16
lines changed

docs/plugins/analysis-kuromoji.asciidoc

+33
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,39 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
9898
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
9999
-----------------------
100100

101+
--
102+
103+
You can also inline the rules directly in the tokenizer definition using
104+
the `user_dictionary_rules` option:
105+
106+
[source,js]
107+
--------------------------------------------------
108+
PUT kuromoji_sample
109+
{
110+
"settings": {
111+
"index": {
112+
"analysis": {
113+
"tokenizer": {
114+
"kuromoji_user_dict": {
115+
"type": "kuromoji_tokenizer",
116+
"mode": "extended",
117+
"user_dictionary_rules": ["東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞"]
118+
}
119+
},
120+
"analyzer": {
121+
"my_analyzer": {
122+
"type": "custom",
123+
"tokenizer": "kuromoji_user_dict"
124+
}
125+
}
126+
}
127+
}
128+
}
129+
}
130+
--------------------------------------------------
131+
// CONSOLE
132+
--
133+
101134
`nbest_cost`/`nbest_examples`::
102135
+
103136
--

plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java

+30-9
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,22 @@
2323
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
2424
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
2525
import org.apache.lucene.analysis.ja.dict.UserDictionary;
26+
import org.apache.lucene.analysis.ja.util.CSVUtil;
2627
import org.elasticsearch.ElasticsearchException;
2728
import org.elasticsearch.common.settings.Settings;
2829
import org.elasticsearch.env.Environment;
2930
import org.elasticsearch.index.IndexSettings;
3031

3132
import java.io.IOException;
32-
import java.io.Reader;
33+
import java.io.StringReader;
34+
import java.util.HashSet;
35+
import java.util.List;
36+
import java.util.Set;
3337

3438
public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
3539

36-
private static final String USER_DICT_OPTION = "user_dictionary";
40+
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
41+
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
3742
private static final String NBEST_COST = "nbest_cost";
3843
private static final String NBEST_EXAMPLES = "nbest_examples";
3944

@@ -54,17 +59,33 @@ public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, St
5459
}
5560

5661
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
62+
if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
63+
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
64+
" with [" + USER_DICT_RULES_OPTION + "]");
65+
}
5766
try {
58-
final Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION);
59-
if (reader == null) {
67+
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false);
68+
if (ruleList == null || ruleList.isEmpty()) {
6069
return null;
61-
} else {
62-
try {
63-
return UserDictionary.open(reader);
64-
} finally {
65-
reader.close();
70+
}
71+
Set<String> dup = new HashSet<>();
72+
int lineNum = 0;
73+
for (String line : ruleList) {
74+
// ignore comments
75+
if (line.startsWith("#") == false) {
76+
String[] values = CSVUtil.parse(line);
77+
if (dup.add(values[0]) == false) {
78+
throw new IllegalArgumentException("Found duplicate term [" + values[0] + "] in user dictionary " +
79+
"at line [" + lineNum + "]");
80+
}
6681
}
82+
++ lineNum;
83+
}
84+
StringBuilder sb = new StringBuilder();
85+
for (String line : ruleList) {
86+
sb.append(line).append(System.lineSeparator());
6787
}
88+
return UserDictionary.open(new StringReader(sb.toString()));
6889
} catch (IOException e) {
6990
throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
7091
}

plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java

+54
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.lucene.analysis.Analyzer;
2223
import org.apache.lucene.analysis.TokenStream;
2324
import org.apache.lucene.analysis.Tokenizer;
2425
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
@@ -39,6 +40,8 @@
3940
import java.nio.file.Files;
4041
import java.nio.file.Path;
4142

43+
import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
44+
import static org.hamcrest.CoreMatchers.containsString;
4245
import static org.hamcrest.Matchers.equalTo;
4346
import static org.hamcrest.Matchers.greaterThan;
4447
import static org.hamcrest.Matchers.instanceOf;
@@ -307,4 +310,55 @@ public void testNumberFilterFactory() throws Exception {
307310
tokenizer.setReader(new StringReader(source));
308311
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
309312
}
313+
314+
public void testKuromojiAnalyzerUserDict() throws Exception {
315+
Settings settings = Settings.builder()
316+
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
317+
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
318+
.build();
319+
TestAnalysis analysis = createTestAnalysis(settings);
320+
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
321+
try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
322+
assertTokenStreamContents(stream, new String[]{"制限スピード"});
323+
}
324+
325+
try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
326+
assertTokenStreamContents(stream, new String[]{"c++", "world"});
327+
}
328+
}
329+
330+
public void testKuromojiAnalyzerInvalidUserDictOption() throws Exception {
331+
Settings settings = Settings.builder()
332+
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
333+
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
334+
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w")
335+
.build();
336+
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
337+
assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " +
338+
"with [user_dictionary_rules]"));
339+
}
340+
341+
public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
342+
Settings settings = Settings.builder()
343+
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
344+
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules",
345+
"c++,c++,w,w", "#comment", "制限スピード,制限スピード,セイゲンスピード,テスト名詞", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
346+
.build();
347+
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
348+
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
349+
}
350+
351+
private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
352+
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
353+
Path home = createTempDir();
354+
Path config = home.resolve("config");
355+
Files.createDirectory(config);
356+
Files.copy(dict, config.resolve("user_dict.txt"));
357+
Settings settings = Settings.builder()
358+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
359+
.put(Environment.PATH_HOME_SETTING.getKey(), home)
360+
.put(analysisSettings)
361+
.build();
362+
return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisKuromojiPlugin());
363+
}
310364
}

plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
5151
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
5252
" with [" + USER_DICT_RULES_OPTION + "]");
5353
}
54-
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
54+
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, true);
5555
StringBuilder sb = new StringBuilder();
5656
if (ruleList == null || ruleList.isEmpty()) {
5757
return null;

server/src/main/java/org/elasticsearch/index/analysis/Analysis.java

+7-6
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String
221221
* If the word list cannot be found at either key.
222222
*/
223223
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
224-
return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
224+
return getWordList(env, settings, settingPrefix + "_path", settingPrefix, true);
225225
}
226226

227227
/**
@@ -231,7 +231,8 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
231231
* @throws IllegalArgumentException
232232
* If the word list cannot be found at either key.
233233
*/
234-
public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
234+
public static List<String> getWordList(Environment env, Settings settings,
235+
String settingPath, String settingList, boolean removeComments) {
235236
String wordListPath = settings.get(settingPath, null);
236237

237238
if (wordListPath == null) {
@@ -246,7 +247,7 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
246247
final Path path = env.configFile().resolve(wordListPath);
247248

248249
try {
249-
return loadWordList(path, "#");
250+
return loadWordList(path, removeComments);
250251
} catch (CharacterCodingException ex) {
251252
String message = String.format(Locale.ROOT,
252253
"Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
@@ -258,15 +259,15 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
258259
}
259260
}
260261

261-
private static List<String> loadWordList(Path path, String comment) throws IOException {
262+
private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
262263
final List<String> result = new ArrayList<>();
263264
try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
264265
String word;
265266
while ((word = br.readLine()) != null) {
266-
if (!Strings.hasText(word)) {
267+
if (Strings.hasText(word) == false) {
267268
continue;
268269
}
269-
if (!word.startsWith(comment)) {
270+
if (removeComments == false || word.startsWith("#") == false) {
270271
result.add(word.trim());
271272
}
272273
}

0 commit comments

Comments
 (0)