Skip to content

Commit 9d5409a

Browse files
authored
Expose discard_compound_token option to kuromoji_tokenizer (#57421)
This commit exposes the new Lucene option `discard_compound_token` to the Elasticsearch Kuromoji plugin.
1 parent 34e2535 commit 9d5409a

File tree

4 files changed

+29
-5
lines changed

4 files changed

+29
-5
lines changed

docs/plugins/analysis-kuromoji.asciidoc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ unknown words. It can be set to:
7070

7171
Extended mode outputs unigrams for unknown words. Example output:
7272

73-
関西, 国際, 空港
73+
関西, 関西国際空港, 国際, 空港
7474
ア, ブ, ラ, カ, ダ, ブ, ラ
7575
--
7676

@@ -208,6 +208,12 @@ The above `analyze` request returns the following:
208208
}
209209
--------------------------------------------------
210210

211+
`discard_compound_token`::
212+
Whether original compound tokens should be discarded from the output with `search` or `extended` mode. Defaults to `false`.
213+
Example output with `search` or `extended` mode with this option set to `true`:
214+
215+
関西, 国際, 空港
216+
211217

212218
[[analysis-kuromoji-baseform]]
213219
==== `kuromoji_baseform` token filter

plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,24 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
4141
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
4242
private static final String NBEST_COST = "nbest_cost";
4343
private static final String NBEST_EXAMPLES = "nbest_examples";
44+
private static final String DISCARD_COMPOUND_TOKEN = "discard_compound_token";
4445

4546
private final UserDictionary userDictionary;
4647
private final Mode mode;
4748
private final String nBestExamples;
4849
private final int nBestCost;
4950

50-
private boolean discartPunctuation;
51+
private boolean discardPunctuation;
52+
private boolean discardCompoundToken;
5153

5254
public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
5355
super(indexSettings, settings, name);
5456
mode = getMode(settings);
5557
userDictionary = getUserDictionary(env, settings);
56-
discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
58+
discardPunctuation = settings.getAsBoolean("discard_punctuation", true);
5759
nBestCost = settings.getAsInt(NBEST_COST, -1);
5860
nBestExamples = settings.get(NBEST_EXAMPLES);
61+
discardCompoundToken = settings.getAsBoolean(DISCARD_COMPOUND_TOKEN, false);
5962
}
6063

6164
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
@@ -108,7 +111,7 @@ public static JapaneseTokenizer.Mode getMode(Settings settings) {
108111

109112
@Override
110113
public Tokenizer create() {
111-
JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
114+
JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discardPunctuation, discardCompoundToken, mode);
112115
int nBestCost = this.nBestCost;
113116
if (nBestExamples != null) {
114117
nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));

plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,17 @@ public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
348348
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
349349
}
350350

351+
public void testDiscardCompoundToken() throws Exception {
352+
TestAnalysis analysis = createTestAnalysis();
353+
TokenizerFactory tokenizerFactory = analysis.tokenizer.get("kuromoji_discard_compound_token");
354+
String source = "株式会社";
355+
String[] expected = new String[] {"株式", "会社"};
356+
357+
Tokenizer tokenizer = tokenizerFactory.create();
358+
tokenizer.setReader(new StringReader(source));
359+
assertSimpleTSOutput(tokenizer, expected);
360+
}
361+
351362
private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
352363
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
353364
Path home = createTempDir();

plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@
6060
"type": "kuromoji_tokenizer",
6161
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
6262
"nbest_cost" : "1000"
63+
},
64+
"kuromoji_discard_compound_token": {
65+
"type": "kuromoji_tokenizer",
66+
"discard_compound_token": true
6367
}
6468
},
6569
"analyzer" : {
@@ -68,7 +72,7 @@
6872
"tokenizer" : "kuromoji_tokenizer"
6973
}
7074
}
71-
75+
7276
}
7377
}
7478
}

0 commit comments

Comments
 (0)