Skip to content

Commit d063fe6

Browse files
committed
Deprecate smartcn_word
`WordTokenFilter` has been [deprecated in Lucene 4.8](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/WordTokenFilter.html), and according to the javadoc only the [HMMChineseTokenizer](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.html) will be supported going forward. We need to deprecate the `smartcn_word` token filter and the `smartcn_sentence` tokenizer. We add `smartcn_tokenizer`, which does both things in a single tokenizer. Closes #22. (cherry picked from commit 64dcb9b)
1 parent 45dfe9a commit d063fe6

8 files changed

+71
-18
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ Please read documentation relative to the version you are using:
2020

2121
* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-smartcn/blob/master/README.md)
2222

23-
The plugin includes the `smartcn` analyzer, `smartcn_sentence` tokenizer, and `smartcn_word` token filter.
23+
The plugin includes the `smartcn` analyzer and `smartcn_tokenizer` tokenizer.
24+
25+
Note that the `smartcn_word` token filter and the `smartcn_sentence` tokenizer have been deprecated; use `smartcn_tokenizer` instead.
2426

2527
License
2628
-------

src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalysisBinderProcessor.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,14 @@ public void processAnalyzers(AnalyzersBindings analyzersBindings) {
3030

3131
@Override
3232
public void processTokenizers(TokenizersBindings tokenizersBindings) {
33+
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
3334
tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseSentenceTokenizerFactory.class);
35+
tokenizersBindings.processTokenizer("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory.class);
3436
}
3537

3638
@Override
3739
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
40+
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
3841
tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseWordTokenFilterFactory.class);
3942
}
4043
}

src/main/java/org/elasticsearch/index/analysis/SmartChineseSentenceTokenizerFactory.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
import java.io.Reader;
3131

3232
/**
33+
* SentenceTokenizer has been deprecated in Lucene 4.8
3334
*/
35+
@Deprecated
3436
public class SmartChineseSentenceTokenizerFactory extends AbstractTokenizerFactory {
3537

3638
@Inject
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import org.apache.lucene.analysis.Tokenizer;
23+
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
24+
import org.elasticsearch.common.inject.Inject;
25+
import org.elasticsearch.common.inject.assistedinject.Assisted;
26+
import org.elasticsearch.common.settings.Settings;
27+
import org.elasticsearch.index.Index;
28+
import org.elasticsearch.index.settings.IndexSettings;
29+
30+
import java.io.Reader;
31+
32+
public class SmartChineseTokenizerTokenizerFactory extends AbstractTokenizerFactory {
33+
34+
@Inject
35+
public SmartChineseTokenizerTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
36+
super(index, indexSettings, name, settings);
37+
}
38+
39+
@Override
40+
public Tokenizer create(Reader reader) {
41+
return new HMMChineseTokenizer(reader);
42+
}
43+
}

src/main/java/org/elasticsearch/index/analysis/SmartChineseWordTokenFilterFactory.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@
2828
import org.elasticsearch.index.settings.IndexSettings;
2929

3030
/**
31+
* WordTokenFilter has been deprecated in Lucene 4.8
3132
*/
33+
@Deprecated
3234
public class SmartChineseWordTokenFilterFactory extends AbstractTokenFilterFactory {
3335

3436
@Inject

src/main/java/org/elasticsearch/indices/analysis/smartcn/SmartChineseIndicesAnalysis.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.lucene.analysis.TokenStream;
2323
import org.apache.lucene.analysis.Tokenizer;
24+
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
2425
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
2526
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
2627
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
@@ -47,6 +48,7 @@ public SmartChineseIndicesAnalysis(Settings settings, IndicesAnalysisService ind
4748
indicesAnalysisService.analyzerProviderFactories().put("smartcn", new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer(Lucene.ANALYZER_VERSION)));
4849

4950
// Register smartcn_word token filter
51+
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
5052
indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
5153
@Override public String name() {
5254
return "smartcn_word";
@@ -70,6 +72,18 @@ public Tokenizer create(Reader reader) {
7072
}
7173
}));
7274

75+
// Register smartcn_tokenizer tokenizer
76+
indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
77+
@Override
78+
public String name() {
79+
return "smartcn_tokenizer";
80+
}
81+
82+
@Override
83+
public Tokenizer create(Reader reader) {
84+
return new HMMChineseTokenizer(reader);
85+
}
86+
}));
7387

7488
}
7589
}

src/test/java/org/elasticsearch/index/analysis/SimpleSmartChineseAnalysisTests.java

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,7 @@ public void testDefaultsIcuAnalysis() {
5353

5454
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
5555

56-
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_sentence");
57-
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseSentenceTokenizerFactory.class));
58-
59-
TokenFilterFactory filterFactory = analysisService.tokenFilter("smartcn_word");
60-
MatcherAssert.assertThat(filterFactory, instanceOf(SmartChineseWordTokenFilterFactory.class));
56+
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_tokenizer");
57+
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseTokenizerTokenizerFactory.class));
6158
}
6259
}

src/test/java/org/elasticsearch/index/analysis/SimpleSmartChineseIntegrationTests.java

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,10 @@ public void testSmartcnAnalyzer() throws ExecutionException, InterruptedExceptio
5454
@Test
5555
public void testSmartcnTokenizer() throws ExecutionException, InterruptedException {
5656
AnalyzeResponse response = client().admin().indices()
57-
.prepareAnalyze("叻出色").setTokenizer("smartcn_sentence")
57+
.prepareAnalyze("叻出色").setTokenizer("smartcn_tokenizer")
5858
.execute().get();
5959

6060
assertThat(response, notNullValue());
61-
assertThat(response.getTokens().size(), is(1));
62-
}
63-
64-
@Test
65-
public void testSmartcnTokenFilter() throws ExecutionException, InterruptedException {
66-
AnalyzeResponse response = client().admin().indices()
67-
.prepareAnalyze("叻出色").setTokenFilters("smartcn_word")
68-
.execute().get();
69-
70-
assertThat(response, notNullValue());
71-
assertThat(response.getTokens().size(), is(3));
61+
assertThat(response.getTokens().size(), is(2));
7262
}
7363
}

0 commit comments

Comments
 (0)