diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 7a6ab21595881..68ec943533aa9 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -70,7 +70,7 @@ The first token is mandatory and represents the custom noun that should be added the dictionary. For compound nouns the custom segmentation can be provided after the first token (`[ ... ]`). The segmentation of the custom compound nouns is controlled by the `decompound_mode` setting. --- + As a demonstration of how the user dictionary can be used, save the following dictionary to `$ES_HOME/config/userdict_ko.txt`: @@ -153,6 +153,42 @@ The above `analyze` request returns the following: // TESTRESPONSE <1> This is a compound token that spans two positions (`mixed` mode). +-- + +`user_dictionary_rules`:: ++ +-- + +You can also inline the rules directly in the tokenizer definition using +the `user_dictionary_rules` option: + +[source,js] +-------------------------------------------------- +PUT nori_sample +{ + "settings": { + "index": { + "analysis": { + "tokenizer": { + "nori_user_dict": { + "type": "nori_tokenizer", + "decompound_mode": "mixed", + "user_dictionary_rules": ["c++", "C샤프", "세종", "세종시 세종 시"] + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "nori_user_dict" + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +-- The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters to modify the stream. 
diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index 9295ed95c3fb8..aa96da807c80f 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -29,10 +29,13 @@ import java.io.IOException; import java.io.Reader; +import java.io.StringReader; +import java.util.List; import java.util.Locale; public class NoriTokenizerFactory extends AbstractTokenizerFactory { - private static final String USER_DICT_OPTION = "user_dictionary"; + private static final String USER_DICT_PATH_OPTION = "user_dictionary"; + private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules"; private final UserDictionary userDictionary; private final KoreanTokenizer.DecompoundMode decompoundMode; @@ -44,12 +47,20 @@ public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String } public static UserDictionary getUserDictionary(Environment env, Settings settings) { - try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) { - if (reader == null) { - return null; - } else { - return UserDictionary.open(reader); - } + if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) { + throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"); + } + List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION); + StringBuilder sb = new StringBuilder(); + if (ruleList == null || ruleList.isEmpty()) { + return null; + } + for (String line : ruleList) { + sb.append(line).append(System.lineSeparator()); + } + try (Reader rulesReader = new StringReader(sb.toString())) { + return
UserDictionary.open(rulesReader); } catch (IOException e) { throw new ElasticsearchException("failed to load nori user dictionary", e); } diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index fa5858a7bbbb8..051a2f3e4dc32 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -38,6 +38,7 @@ import java.nio.file.Files; import java.nio.file.Path; +import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.instanceOf; public class NoriAnalysisTests extends ESTokenStreamTestCase { @@ -76,6 +77,22 @@ public void testNoriAnalyzer() throws Exception { } public void testNoriAnalyzerUserDict() throws Exception { + Settings settings = Settings.builder() + .put("index.analysis.analyzer.my_analyzer.type", "nori") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시") + .build(); + TestAnalysis analysis = createTestAnalysis(settings); + Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer"); + try (TokenStream stream = analyzer.tokenStream("", "세종시")) { + assertTokenStreamContents(stream, new String[]{"세종", "시"}); + } + + try (TokenStream stream = analyzer.tokenStream("", "c++world")) { + assertTokenStreamContents(stream, new String[]{"c++", "world"}); + } + } + + public void testNoriAnalyzerUserDictPath() throws Exception { Settings settings = Settings.builder() .put("index.analysis.analyzer.my_analyzer.type", "nori") .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt") @@ -91,6 +108,17 @@ public void testNoriAnalyzerUserDict() throws Exception { } } + public void testNoriAnalyzerInvalidUserDictOption() throws Exception { + Settings settings = Settings.builder() + 
.put("index.analysis.analyzer.my_analyzer.type", "nori") + .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시") + .build(); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); + assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " + "with [user_dictionary_rules]")); + } + public void testNoriTokenizer() throws Exception { Settings settings = Settings.builder() .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer") diff --git a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java index d56b8820e9b1c..09a87124110b3 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -221,10 +221,21 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String * If the word list cannot be found at either key. */ public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) { - String wordListPath = settings.get(settingPrefix + "_path", null); + return getWordList(env, settings, settingPrefix + "_path", settingPrefix); + } + + /** + * Fetches a list of words from the specified settings file. The list should either be available at the key + * specified by settingList or in a file specified by settingPath. + * + * @throws IllegalArgumentException + * If the word list cannot be found at either key.
+ */ + public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) { + String wordListPath = settings.get(settingPath, null); if (wordListPath == null) { - List<String> explicitWordList = settings.getAsList(settingPrefix, null); + List<String> explicitWordList = settings.getAsList(settingList, null); if (explicitWordList == null) { return null; } else { @@ -238,11 +249,11 @@ public static List<String> getWordList(Environment env, Settings settings, Strin return loadWordList(path, "#"); } catch (CharacterCodingException ex) { String message = String.format(Locale.ROOT, - "Unsupported character encoding detected while reading %s_path: %s - files must be UTF-8 encoded", - settingPrefix, path.toString()); + "Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded", + settingPath, path.toString()); throw new IllegalArgumentException(message, ex); } catch (IOException ioe) { - String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString()); + String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPath, path.toString()); throw new IllegalArgumentException(message, ioe); } }