Skip to content

Commit a53e865

Browse files
authored
Add support for inlined user dictionary in Nori (#36123)
Add support for inlined user dictionary in Nori This change adds a new option called `user_dictionary_rules` to the Nori tokenizer. It can be used to set additional tokenization rules for the Korean tokenizer directly in the settings (instead of using a file). Closes #35842
1 parent ca09936 commit a53e865

File tree

4 files changed

+99
-13
lines changed

4 files changed

+99
-13
lines changed

docs/plugins/analysis-nori.asciidoc

+37-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ The first token is mandatory and represents the custom noun that should be added
7070
the dictionary. For compound nouns the custom segmentation can be provided
7171
after the first token (`[<token 1> ... <token n>]`). The segmentation of the
7272
custom compound nouns is controlled by the `decompound_mode` setting.
73-
--
73+
7474

7575
As a demonstration of how the user dictionary can be used, save the following
7676
dictionary to `$ES_HOME/config/userdict_ko.txt`:
@@ -153,6 +153,42 @@ The above `analyze` request returns the following:
153153
// TESTRESPONSE
154154

155155
<1> This is a compound token that spans two positions (`mixed` mode).
156+
--
157+
158+
`user_dictionary_rules`::
159+
+
160+
--
161+
162+
You can also inline the rules directly in the tokenizer definition using
163+
the `user_dictionary_rules` option:
164+
165+
[source,js]
166+
--------------------------------------------------
167+
PUT nori_sample
168+
{
169+
"settings": {
170+
"index": {
171+
"analysis": {
172+
"tokenizer": {
173+
"nori_user_dict": {
174+
"type": "nori_tokenizer",
175+
"decompound_mode": "mixed",
176+
"user_dictionary_rules": ["c++", "C샤프", "세종", "세종시 세종 시"]
177+
}
178+
},
179+
"analyzer": {
180+
"my_analyzer": {
181+
"type": "custom",
182+
"tokenizer": "nori_user_dict"
183+
}
184+
}
185+
}
186+
}
187+
}
188+
}
189+
--------------------------------------------------
190+
// CONSOLE
191+
--
156192

157193
The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters
158194
to modify the stream.

plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java

+18-7
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,13 @@
2929

3030
import java.io.IOException;
3131
import java.io.Reader;
32+
import java.io.StringReader;
33+
import java.util.List;
3234
import java.util.Locale;
3335

3436
public class NoriTokenizerFactory extends AbstractTokenizerFactory {
35-
private static final String USER_DICT_OPTION = "user_dictionary";
37+
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
38+
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
3639

3740
private final UserDictionary userDictionary;
3841
private final KoreanTokenizer.DecompoundMode decompoundMode;
@@ -44,12 +47,20 @@ public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String
4447
}
4548

4649
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
47-
try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) {
48-
if (reader == null) {
49-
return null;
50-
} else {
51-
return UserDictionary.open(reader);
52-
}
50+
if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
51+
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
52+
" with [" + USER_DICT_RULES_OPTION + "]");
53+
}
54+
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
55+
StringBuilder sb = new StringBuilder();
56+
if (ruleList == null || ruleList.isEmpty()) {
57+
return null;
58+
}
59+
for (String line : ruleList) {
60+
sb.append(line).append(System.lineSeparator());
61+
}
62+
try (Reader rulesReader = new StringReader(sb.toString())) {
63+
return UserDictionary.open(rulesReader);
5364
} catch (IOException e) {
5465
throw new ElasticsearchException("failed to load nori user dictionary", e);
5566
}

plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java

+28
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import java.nio.file.Files;
3939
import java.nio.file.Path;
4040

41+
import static org.hamcrest.Matchers.containsString;
4142
import static org.hamcrest.Matchers.instanceOf;
4243

4344
public class NoriAnalysisTests extends ESTokenStreamTestCase {
@@ -76,6 +77,22 @@ public void testNoriAnalyzer() throws Exception {
7677
}
7778

7879
public void testNoriAnalyzerUserDict() throws Exception {
80+
Settings settings = Settings.builder()
81+
.put("index.analysis.analyzer.my_analyzer.type", "nori")
82+
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시")
83+
.build();
84+
TestAnalysis analysis = createTestAnalysis(settings);
85+
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
86+
try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
87+
assertTokenStreamContents(stream, new String[]{"세종", "시"});
88+
}
89+
90+
try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
91+
assertTokenStreamContents(stream, new String[]{"c++", "world"});
92+
}
93+
}
94+
95+
public void testNoriAnalyzerUserDictPath() throws Exception {
7996
Settings settings = Settings.builder()
8097
.put("index.analysis.analyzer.my_analyzer.type", "nori")
8198
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
@@ -91,6 +108,17 @@ public void testNoriAnalyzerUserDict() throws Exception {
91108
}
92109
}
93110

111+
public void testNoriAnalyzerInvalidUserDictOption() throws Exception {
112+
Settings settings = Settings.builder()
113+
.put("index.analysis.analyzer.my_analyzer.type", "nori")
114+
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
115+
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시")
116+
.build();
117+
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
118+
assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " +
119+
"with [user_dictionary_rules]"));
120+
}
121+
94122
public void testNoriTokenizer() throws Exception {
95123
Settings settings = Settings.builder()
96124
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")

server/src/main/java/org/elasticsearch/index/analysis/Analysis.java

+16-5
Original file line numberDiff line numberDiff line change
@@ -221,10 +221,21 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String
221221
* If the word list cannot be found at either key.
222222
*/
223223
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
224-
String wordListPath = settings.get(settingPrefix + "_path", null);
224+
return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
225+
}
226+
227+
/**
228+
* Fetches a list of words from the specified settings file. The list should either be available at the key
229+
* specified by <code>settingList</code> or in a file specified by <code>settingPath</code>.
230+
*
231+
* @throws IllegalArgumentException
232+
* If the word list cannot be found at either key.
233+
*/
234+
public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
235+
String wordListPath = settings.get(settingPath, null);
225236

226237
if (wordListPath == null) {
227-
List<String> explicitWordList = settings.getAsList(settingPrefix, null);
238+
List<String> explicitWordList = settings.getAsList(settingList, null);
228239
if (explicitWordList == null) {
229240
return null;
230241
} else {
@@ -238,11 +249,11 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
238249
return loadWordList(path, "#");
239250
} catch (CharacterCodingException ex) {
240251
String message = String.format(Locale.ROOT,
241-
"Unsupported character encoding detected while reading %s_path: %s - files must be UTF-8 encoded",
242-
settingPrefix, path.toString());
252+
"Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
253+
settingPath, path.toString());
243254
throw new IllegalArgumentException(message, ex);
244255
} catch (IOException ioe) {
245-
String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString());
256+
String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPath, path.toString());
246257
throw new IllegalArgumentException(message, ioe);
247258
}
248259
}

0 commit comments

Comments
 (0)