Skip to content

Commit a344ebb

Browse files
committed
Index Analysis: Add language analyzers and stemmers, closes #72
1 parent 45234f4 commit a344ebb

20 files changed

+1052
-1
lines changed

modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java

+19-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,25 @@ public AnalysisModule(Settings settings) {
8181
if (!tokenFiltersSettings.containsKey("shingle")) {
8282
tokenFilterBinder.addBinding("shingle").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, ShingleTokenFilterFactory.class)).in(Scopes.SINGLETON);
8383
}
84-
84+
// extends defaults
85+
if (!tokenFiltersSettings.containsKey("arabicStem")) {
86+
tokenFilterBinder.addBinding("arabicStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, ArabicStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
87+
}
88+
if (!tokenFiltersSettings.containsKey("brazilianStem")) {
89+
tokenFilterBinder.addBinding("brazilianStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, BrazilianStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
90+
}
91+
if (!tokenFiltersSettings.containsKey("dutchStem")) {
92+
tokenFilterBinder.addBinding("dutchStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, DutchStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
93+
}
94+
if (!tokenFiltersSettings.containsKey("frenchStem")) {
95+
tokenFilterBinder.addBinding("frenchStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, FrenchStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
96+
}
97+
if (!tokenFiltersSettings.containsKey("germanStem")) {
98+
tokenFilterBinder.addBinding("germanStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, GermanStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
99+
}
100+
if (!tokenFiltersSettings.containsKey("russianStem")) {
101+
tokenFilterBinder.addBinding("russianStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, RussianStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
102+
}
85103

86104
MapBinder<String, TokenizerFactoryFactory> tokenizerBinder
87105
= MapBinder.newMapBinder(binder(), String.class, TokenizerFactoryFactory.class);

modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java

+39
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,45 @@ public AnalysisService(Index index) {
101101
analyzerProviders.put("defaultSearch", analyzerProviders.get("default"));
102102
}
103103

104+
// extended analyzers defaults
105+
if (!analyzerProviders.containsKey("arabic")) {
106+
analyzerProviders.put("arabic", new ArabicAnalyzerProvider(index, indexSettings, "arabic", ImmutableSettings.Builder.EMPTY_SETTINGS));
107+
}
108+
if (!analyzerProviders.containsKey("brazilian")) {
109+
analyzerProviders.put("brazilian", new BrazilianAnalyzerProvider(index, indexSettings, "brazilian", ImmutableSettings.Builder.EMPTY_SETTINGS));
110+
}
111+
if (!analyzerProviders.containsKey("chinese")) {
112+
analyzerProviders.put("chinese", new ChineseAnalyzerProvider(index, indexSettings, "chinese", ImmutableSettings.Builder.EMPTY_SETTINGS));
113+
}
114+
// NOTE(review): "cjk" is registered with ChineseAnalyzerProvider, the same provider class used for
// the "chinese" entry above, so both names end up backed by a ChineseAnalyzer. This looks like a
// copy/paste slip; presumably a CJK-specific analyzer provider was intended here -- TODO confirm.
if (!analyzerProviders.containsKey("cjk")) {
115+
analyzerProviders.put("cjk", new ChineseAnalyzerProvider(index, indexSettings, "cjk", ImmutableSettings.Builder.EMPTY_SETTINGS));
116+
}
117+
if (!analyzerProviders.containsKey("czech")) {
118+
analyzerProviders.put("czech", new CzechAnalyzerProvider(index, indexSettings, "czech", ImmutableSettings.Builder.EMPTY_SETTINGS));
119+
}
120+
if (!analyzerProviders.containsKey("dutch")) {
121+
analyzerProviders.put("dutch", new DutchAnalyzerProvider(index, indexSettings, "dutch", ImmutableSettings.Builder.EMPTY_SETTINGS));
122+
}
123+
if (!analyzerProviders.containsKey("french")) {
124+
analyzerProviders.put("french", new FrenchAnalyzerProvider(index, indexSettings, "french", ImmutableSettings.Builder.EMPTY_SETTINGS));
125+
}
126+
if (!analyzerProviders.containsKey("german")) {
127+
analyzerProviders.put("german", new GermanAnalyzerProvider(index, indexSettings, "german", ImmutableSettings.Builder.EMPTY_SETTINGS));
128+
}
129+
if (!analyzerProviders.containsKey("greek")) {
130+
analyzerProviders.put("greek", new GreekAnalyzerProvider(index, indexSettings, "greek", ImmutableSettings.Builder.EMPTY_SETTINGS));
131+
}
132+
if (!analyzerProviders.containsKey("persian")) {
133+
analyzerProviders.put("persian", new PersianAnalyzerProvider(index, indexSettings, "persian", ImmutableSettings.Builder.EMPTY_SETTINGS));
134+
}
135+
if (!analyzerProviders.containsKey("russian")) {
136+
analyzerProviders.put("russian", new RussianAnalyzerProvider(index, indexSettings, "russian", ImmutableSettings.Builder.EMPTY_SETTINGS));
137+
}
138+
if (!analyzerProviders.containsKey("thai")) {
139+
analyzerProviders.put("thai", new ThaiAnalyzerProvider(index, indexSettings, "thai", ImmutableSettings.Builder.EMPTY_SETTINGS));
140+
}
141+
142+
104143
this.analyzerProviders = ImmutableMap.copyOf(analyzerProviders);
105144

106145
Map<String, NamedAnalyzer> analyzers = newHashMap();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Licensed to Elastic Search and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. Elastic Search licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import com.google.common.collect.ImmutableSet;
23+
import com.google.common.collect.Iterators;
24+
import com.google.inject.Inject;
25+
import com.google.inject.assistedinject.Assisted;
26+
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
27+
import org.apache.lucene.util.Version;
28+
import org.elasticsearch.index.Index;
29+
import org.elasticsearch.index.settings.IndexSettings;
30+
import org.elasticsearch.util.settings.Settings;
31+
32+
import java.util.Set;
33+
34+
/**
35+
* @author kimchy (shay.banon)
36+
*/
37+
public class ArabicAnalyzerProvider extends AbstractAnalyzerProvider<ArabicAnalyzer> {
38+
39+
private final Set<String> stopWords;
40+
41+
private final ArabicAnalyzer arabicAnalyzer;
42+
43+
@Inject public ArabicAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
44+
super(index, indexSettings, name);
45+
String[] stopWords = settings.getAsArray("stopwords");
46+
if (stopWords.length > 0) {
47+
this.stopWords = ImmutableSet.copyOf(Iterators.forArray(stopWords));
48+
} else {
49+
this.stopWords = ArabicAnalyzer.getDefaultStopSet();
50+
}
51+
arabicAnalyzer = new ArabicAnalyzer(Version.LUCENE_CURRENT, this.stopWords);
52+
}
53+
54+
@Override public ArabicAnalyzer get() {
55+
return this.arabicAnalyzer;
56+
}
57+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Licensed to Elastic Search and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. Elastic Search licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import com.google.inject.Inject;
23+
import com.google.inject.assistedinject.Assisted;
24+
import org.apache.lucene.analysis.TokenStream;
25+
import org.apache.lucene.analysis.ar.ArabicStemFilter;
26+
import org.elasticsearch.index.Index;
27+
import org.elasticsearch.index.settings.IndexSettings;
28+
import org.elasticsearch.util.settings.Settings;
29+
30+
/**
31+
* @author kimchy (shay.banon)
32+
*/
33+
public class ArabicStemTokenFilterFactory extends AbstractTokenFilterFactory {
34+
35+
@Inject public ArabicStemTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
36+
super(index, indexSettings, name);
37+
}
38+
39+
@Override public TokenStream create(TokenStream tokenStream) {
40+
return new ArabicStemFilter(tokenStream);
41+
}
42+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Licensed to Elastic Search and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. Elastic Search licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import com.google.common.collect.ImmutableSet;
23+
import com.google.common.collect.Iterators;
24+
import com.google.inject.Inject;
25+
import com.google.inject.assistedinject.Assisted;
26+
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
27+
import org.apache.lucene.util.Version;
28+
import org.elasticsearch.index.Index;
29+
import org.elasticsearch.index.settings.IndexSettings;
30+
import org.elasticsearch.util.settings.Settings;
31+
32+
import java.util.Set;
33+
34+
/**
35+
* @author kimchy (shay.banon)
36+
*/
37+
public class BrazilianAnalyzerProvider extends AbstractAnalyzerProvider<BrazilianAnalyzer> {
38+
39+
private final Set<?> stopWords;
40+
41+
private final Set<?> stemExclusion;
42+
43+
private final BrazilianAnalyzer analyzer;
44+
45+
@Inject public BrazilianAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
46+
super(index, indexSettings, name);
47+
String[] stopWords = settings.getAsArray("stopwords");
48+
if (stopWords.length > 0) {
49+
this.stopWords = ImmutableSet.copyOf(Iterators.forArray(stopWords));
50+
} else {
51+
this.stopWords = BrazilianAnalyzer.getDefaultStopSet();
52+
}
53+
54+
String[] stemExclusion = settings.getAsArray("stemExclusion");
55+
if (stemExclusion.length > 0) {
56+
this.stemExclusion = ImmutableSet.copyOf(Iterators.forArray(stemExclusion));
57+
} else {
58+
this.stemExclusion = ImmutableSet.of();
59+
}
60+
analyzer = new BrazilianAnalyzer(Version.LUCENE_CURRENT, this.stopWords, this.stemExclusion);
61+
}
62+
63+
@Override public BrazilianAnalyzer get() {
64+
return this.analyzer;
65+
}
66+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Licensed to Elastic Search and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. Elastic Search licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import com.google.common.collect.ImmutableSet;
23+
import com.google.common.collect.Iterators;
24+
import com.google.inject.Inject;
25+
import com.google.inject.assistedinject.Assisted;
26+
import org.apache.lucene.analysis.TokenStream;
27+
import org.apache.lucene.analysis.br.BrazilianStemFilter;
28+
import org.elasticsearch.index.Index;
29+
import org.elasticsearch.index.settings.IndexSettings;
30+
import org.elasticsearch.util.settings.Settings;
31+
32+
import java.util.Set;
33+
34+
/**
35+
* @author kimchy (shay.banon)
36+
*/
37+
public class BrazilianStemTokenFilterFactory extends AbstractTokenFilterFactory {
38+
39+
private final Set<?> exclusions;
40+
41+
@Inject public BrazilianStemTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
42+
super(index, indexSettings, name);
43+
String[] stemExclusion = settings.getAsArray("stemExclusion");
44+
if (stemExclusion.length > 0) {
45+
this.exclusions = ImmutableSet.copyOf(Iterators.forArray(stemExclusion));
46+
} else {
47+
this.exclusions = ImmutableSet.of();
48+
}
49+
}
50+
51+
@Override public TokenStream create(TokenStream tokenStream) {
52+
return new BrazilianStemFilter(tokenStream, exclusions);
53+
}
54+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Licensed to Elastic Search and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. Elastic Search licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import com.google.inject.Inject;
23+
import com.google.inject.assistedinject.Assisted;
24+
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
25+
import org.elasticsearch.index.Index;
26+
import org.elasticsearch.index.settings.IndexSettings;
27+
import org.elasticsearch.util.settings.Settings;
28+
29+
/**
30+
* @author kimchy (shay.banon)
31+
*/
32+
public class ChineseAnalyzerProvider extends AbstractAnalyzerProvider<ChineseAnalyzer> {
33+
34+
private final ChineseAnalyzer analyzer;
35+
36+
@Inject public ChineseAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
37+
super(index, indexSettings, name);
38+
analyzer = new ChineseAnalyzer();
39+
}
40+
41+
@Override public ChineseAnalyzer get() {
42+
return this.analyzer;
43+
}
44+
}

0 commit comments

Comments
 (0)