Skip to content

Commit d34a2b0

Browse files
xyudadoonet
authored and committed
Add smartcn_word & smartcn_sentence back for backwards compatibility
In f4d0d27 the deprecated `smartcn_sentence` tokenizer and the deprecated `smartcn_word` token filter were removed, as the new all-in-one `smartcn_tokenizer` should be used instead. However, for those with pre-existing indices whose mappings reference the deprecated tokenizer and token filter, this change causes indexing and search errors. This change preserves the `smartcn_sentence` tokenizer name and aliases it to the new `smartcn_tokenizer`. The change set also reintroduces a `smartcn_word` token filter that is a no-op filter. Together, these changes should allow Elasticsearch instances with the existing deprecated mappings to upgrade and take advantage of the new tokenizer in Lucene. (cherry picked from commit 25a0071) Closes #29
1 parent 940297c commit d34a2b0

File tree

3 files changed

+75
-4
lines changed

3 files changed

+75
-4
lines changed

src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalysisBinderProcessor.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,13 @@ public void processAnalyzers(AnalyzersBindings analyzersBindings) {
3131
/**
 * Binds the tokenizer names provided by this plugin.
 *
 * Both "smartcn_tokenizer" and "smartcn_sentence" are bound to the same
 * factory class, {@code SmartChineseTokenizerTokenizerFactory}.
 *
 * @param tokenizersBindings the bindings object the tokenizer names are registered on
 */
@Override
3232
public void processTokenizers(TokenizersBindings tokenizersBindings) {
3333
tokenizersBindings.processTokenizer("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory.class);
34+
// "smartcn_sentence" is an alias of "smartcn_tokenizer" (same factory class); the old name is
// kept for backwards compatibility.
35+
tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseTokenizerTokenizerFactory.class);
36+
}
37+
38+
/**
 * Binds the token filter names provided by this plugin.
 *
 * Only "smartcn_word" is registered here, bound to
 * {@code SmartChineseNoOpTokenFilterFactory}.
 *
 * @param tokenFiltersBindings the bindings object the token filter names are registered on
 */
@Override
39+
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
40+
// "smartcn_word" is a no-op token filter; the name is kept for backwards compatibility with
// configurations written before "smartcn_tokenizer" existed.
41+
tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseNoOpTokenFilterFactory.class);
3442
}
3543
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import org.apache.lucene.analysis.TokenStream;
23+
import org.elasticsearch.common.inject.Inject;
24+
import org.elasticsearch.common.inject.assistedinject.Assisted;
25+
import org.elasticsearch.common.settings.Settings;
26+
import org.elasticsearch.index.Index;
27+
import org.elasticsearch.index.settings.IndexSettings;
28+
29+
/**
 * Token filter factory whose {@link #create(TokenStream)} hands back the
 * token stream it was given, without wrapping or modifying it.
 */
public class SmartChineseNoOpTokenFilterFactory extends AbstractTokenFilterFactory {
30+
31+
/**
 * Creates the factory; every argument is forwarded unchanged to the
 * {@code AbstractTokenFilterFactory} constructor.
 *
 * @param index         the index passed through to the superclass
 * @param indexSettings the index-level settings passed through to the superclass
 * @param name          the filter name passed through to the superclass
 * @param settings      the filter settings passed through to the superclass
 */
@Inject
32+
public SmartChineseNoOpTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
33+
super(index, indexSettings, name, settings);
34+
}
35+
36+
/**
 * Returns the input stream as-is (no-op).
 *
 * @param tokenStream the incoming token stream
 * @return the same {@code tokenStream} instance that was passed in
 */
@Override
37+
public TokenStream create(TokenStream tokenStream) {
38+
// Intentionally no filtering: the very same stream object is returned.
return tokenStream;
39+
}
40+
}

src/main/java/org/elasticsearch/indices/analysis/smartcn/SmartChineseIndicesAnalysis.java

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,13 @@
2020
package org.elasticsearch.indices.analysis.smartcn;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
23+
import org.apache.lucene.analysis.TokenStream;
2324
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
2425
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
2526
import org.elasticsearch.common.component.AbstractComponent;
2627
import org.elasticsearch.common.inject.Inject;
2728
import org.elasticsearch.common.settings.Settings;
28-
import org.elasticsearch.index.analysis.AnalyzerScope;
29-
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
30-
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
31-
import org.elasticsearch.index.analysis.TokenizerFactory;
29+
import org.elasticsearch.index.analysis.*;
3230
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
3331

3432
import java.io.Reader;
@@ -59,5 +57,30 @@ public Tokenizer create(Reader reader) {
5957
}
6058
}));
6159

60+
// Register smartcn_sentence tokenizer -- for backwards compat an alias to smartcn_tokenizer
61+
indicesAnalysisService.tokenizerFactories().put("smartcn_sentence", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
62+
@Override
63+
public String name() {
64+
return "smartcn_sentence";
65+
}
66+
67+
@Override
68+
public Tokenizer create(Reader reader) {
69+
return new HMMChineseTokenizer(reader);
70+
}
71+
}));
72+
73+
// Register smartcn_word token filter -- noop
74+
indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
75+
@Override
76+
public String name() {
77+
return "smartcn_word";
78+
}
79+
80+
@Override
81+
public TokenStream create(TokenStream tokenStream) {
82+
return tokenStream;
83+
}
84+
}));
6285
}
6386
}

0 commit comments

Comments
 (0)