Skip to content

Commit 98b2544

Browse files
johtanidadoonet
authored andcommitted
Add JapaneseIterationMarkCharFilter support
Currently, Kuromoji has JapaneseIterationMarkCharFilter. Add IterationMarkCharFilter to analysis-kuromoji. Closes #7.
1 parent 4c95a3e commit 98b2544

File tree

6 files changed

+133
-8
lines changed

6 files changed

+133
-8
lines changed

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
2525

2626
| name | type |
2727
|-------------------------|-------------|
28+
| kuromoji_iteration_mark | charfilter |
2829
| kuromoji | analyzer |
2930
| kuromoji_tokenizer | tokenizer |
3031
| kuromoji_baseform | tokenfilter |
@@ -49,6 +50,18 @@ This analyzer is the following tokenizer and tokenfilter combination.
4950
* `kuromoji_stemmer` : Kuromoji Katakana Stemmer Filter (TokenFilter)
5051
* `lowercase` : LowerCase Filter (TokenFilter)
5152

53+
## CharFilter : kuromoji_iteration_mark
54+
55+
A charfilter of type `kuromoji_iteration_mark`.
56+
This charfilter normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
57+
58+
The following are settings that can be set for a `kuromoji_iteration_mark` charfilter type:
59+
60+
| **Setting** | **Description** | **Default value** |
61+
|:----------------|:-------------------------------------------------------------|:------------------|
62+
| normalize_kanji | indicates whether kanji iteration marks should be normalized | `true` |
63+
| normalize_kana | indicates whether kana iteration marks should be normalized | `true` |
64+
5265
## Tokenizer : kuromoji_tokenizer
5366

5467
A tokenizer of type `kuromoji_tokenizer`.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package org.elasticsearch.index.analysis;
2+
3+
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;
4+
import org.elasticsearch.common.inject.Inject;
5+
import org.elasticsearch.common.inject.assistedinject.Assisted;
6+
import org.elasticsearch.common.settings.Settings;
7+
import org.elasticsearch.index.Index;
8+
import org.elasticsearch.index.settings.IndexSettings;
9+
10+
import java.io.Reader;
11+
12+
public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory {
13+
14+
private final boolean normalizeKanji;
15+
private final boolean normalizeKana;
16+
17+
@Inject
18+
public KuromojiIterationMarkCharFilterFactory(Index index, @IndexSettings Settings indexSettings,
19+
@Assisted String name, @Assisted Settings settings) {
20+
super(index, indexSettings, name);
21+
normalizeKanji = settings.getAsBoolean("normalize_kanji", JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
22+
normalizeKana = settings.getAsBoolean("normalize_kana", JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
23+
}
24+
25+
@Override
26+
public Reader create(Reader reader) {
27+
return new JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana);
28+
}
29+
}

src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,7 @@
2626
import org.elasticsearch.common.component.AbstractComponent;
2727
import org.elasticsearch.common.inject.Inject;
2828
import org.elasticsearch.common.settings.Settings;
29-
import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
30-
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
31-
import org.elasticsearch.index.analysis.TokenFilterFactory;
32-
import org.elasticsearch.index.analysis.TokenizerFactory;
29+
import org.elasticsearch.index.analysis.*;
3330

3431
import java.io.Reader;
3532

@@ -41,9 +38,24 @@ public class KuromojiIndicesAnalysis extends AbstractComponent {
4138

4239
@Inject
4340
public KuromojiIndicesAnalysis(Settings settings,
44-
IndicesAnalysisService indicesAnalysisService) {
41+
IndicesAnalysisService indicesAnalysisService) {
4542
super(settings);
4643

44+
indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
45+
new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
46+
@Override
47+
public String name() {
48+
return "kuromoji_iteration_mark";
49+
}
50+
51+
@Override
52+
public Reader create(Reader reader) {
53+
return new JapaneseIterationMarkCharFilter(reader,
54+
JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT,
55+
JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
56+
}
57+
}));
58+
4759
indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
4860
new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
4961
@Override
@@ -83,7 +95,7 @@ public String name() {
8395
public TokenStream create(TokenStream tokenStream) {
8496
return new JapanesePartOfSpeechStopFilter(Version.LUCENE_44,
8597
tokenStream, JapaneseAnalyzer
86-
.getDefaultStopTags());
98+
.getDefaultStopTags());
8799
}
88100
}));
89101

src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ public Collection<Class<? extends Module>> modules() {
4848
}
4949

5050
public void onModule(AnalysisModule module) {
51+
module.addCharFilter("kuromoji_iteration_mark", KuromojiIterationMarkCharFilterFactory.class);
5152
module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class);
5253
module.addTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class);
5354
module.addTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class);

src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@
4141
import org.junit.Test;
4242

4343
import java.io.IOException;
44+
import java.io.Reader;
4445
import java.io.StringReader;
4546

46-
import static org.hamcrest.MatcherAssert.assertThat;
4747
import static org.hamcrest.Matchers.*;
4848

4949
/**
@@ -75,6 +75,9 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
7575
analyzer = analysisService.analyzer("my_analyzer");
7676
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
7777
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
78+
79+
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
80+
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
7881
}
7982

8083
@Test
@@ -130,6 +133,41 @@ public void testKatakanaStemFilter() throws IOException {
130133
expected_tokens_katakana = new String[]{"明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"};
131134
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
132135
}
136+
@Test
137+
public void testIterationMarkCharFilter() throws IOException {
138+
AnalysisService analysisService = createAnalysisService();
139+
// test only kanji
140+
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_im_only_kanji");
141+
assertNotNull(charFilterFactory);
142+
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
143+
144+
String source = "ところゞゝゝ、ジヾが、時々、馬鹿々々しい";
145+
String expected = "ところゞゝゝ、ジヾが、時時、馬鹿馬鹿しい";
146+
147+
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
148+
149+
// test only kana
150+
151+
charFilterFactory = analysisService.charFilter("kuromoji_im_only_kana");
152+
assertNotNull(charFilterFactory);
153+
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
154+
155+
expected = "ところどころ、ジジが、時々、馬鹿々々しい";
156+
157+
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
158+
159+
// test default
160+
161+
charFilterFactory = analysisService.charFilter("kuromoji_im_default");
162+
assertNotNull(charFilterFactory);
163+
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
164+
165+
expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";
166+
167+
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
168+
169+
170+
}
133171

134172
public AnalysisService createAnalysisService() {
135173
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json").build();
@@ -165,4 +203,20 @@ public static void assertSimpleTSOutput(TokenStream stream,
165203
}
166204
assertThat("not all tokens produced", i, equalTo(expected.length));
167205
}
206+
207+
private void assertCharFilterEquals(Reader filtered,
208+
String expected) throws IOException {
209+
String actual = readFully(filtered);
210+
assertThat(actual, equalTo(expected));
211+
}
212+
213+
private String readFully(Reader reader) throws IOException {
214+
StringBuilder buffer = new StringBuilder();
215+
int ch;
216+
while((ch = reader.read()) != -1){
217+
buffer.append((char)ch);
218+
}
219+
return buffer.toString();
220+
}
221+
168222
}

src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,23 @@
1717

1818

1919
},
20-
20+
21+
"char_filter":{
22+
"kuromoji_im_only_kanji":{
23+
"type":"kuromoji_iteration_mark",
24+
"normalize_kanji":true,
25+
"normalize_kana":false
26+
},
27+
"kuromoji_im_only_kana":{
28+
"type":"kuromoji_iteration_mark",
29+
"normalize_kanji":false,
30+
"normalize_kana":true
31+
},
32+
"kuromoji_im_default":{
33+
"type":"kuromoji_iteration_mark"
34+
}
35+
},
36+
2137
"tokenizer" : {
2238
"kuromoji" : {
2339
"type":"kuromoji_tokenizer"

0 commit comments

Comments
 (0)