Skip to content

Commit 3b37b79

Browse files
committed
Fix beidermorse phonetic token filter for unspecified languageset (#27112)
Currently, when a BeiderMorseFilter is created without a `languageset` setting, the filter does not guess the language, even though language guessing should be the default behaviour. This change fixes that and adds simple tests for the cases with and without a provided `languageset` setting. Closes #26771.
1 parent f972116 commit 3b37b79

File tree

3 files changed

+44
-9
lines changed

3 files changed

+44
-9
lines changed

plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java

+7-7
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,6 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22-
import java.util.HashSet;
23-
import java.util.List;
24-
2522
import org.apache.commons.codec.Encoder;
2623
import org.apache.commons.codec.language.Caverphone1;
2724
import org.apache.commons.codec.language.Caverphone2;
@@ -45,6 +42,9 @@
4542
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
4643
import org.elasticsearch.index.analysis.phonetic.Nysiis;
4744

45+
import java.util.HashSet;
46+
import java.util.List;
47+
4848
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
4949

5050
private final Encoder encoder;
@@ -116,11 +116,11 @@ public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment envir
116116
public TokenStream create(TokenStream tokenStream) {
117117
if (encoder == null) {
118118
if (ruletype != null && nametype != null) {
119-
if (languageset != null) {
120-
final LanguageSet languages = LanguageSet.from(new HashSet<>(languageset));
121-
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
119+
LanguageSet langset = null;
120+
if (languageset != null && languageset.size() > 0) {
121+
langset = LanguageSet.from(new HashSet<>(languageset));
122122
}
123-
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
123+
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), langset);
124124
}
125125
if (maxcodelength > 0) {
126126
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);

plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java

+33-2
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,57 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
23+
import org.apache.lucene.analysis.Tokenizer;
24+
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2225
import org.elasticsearch.Version;
2326
import org.elasticsearch.cluster.metadata.IndexMetaData;
2427
import org.elasticsearch.common.settings.Settings;
2528
import org.elasticsearch.index.Index;
2629
import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin;
2730
import org.elasticsearch.test.ESTestCase;
2831
import org.hamcrest.MatcherAssert;
32+
import org.junit.Before;
2933

3034
import java.io.IOException;
35+
import java.io.StringReader;
3136

3237
import static org.hamcrest.Matchers.instanceOf;
3338

3439
public class SimplePhoneticAnalysisTests extends ESTestCase {
35-
public void testPhoneticTokenFilterFactory() throws IOException {
40+
41+
private TestAnalysis analysis;
42+
43+
@Before
44+
public void setup() throws IOException {
3645
String yaml = "/org/elasticsearch/index/analysis/phonetic-1.yml";
3746
Settings settings = Settings.builder().loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
3847
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
3948
.build();
40-
TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
49+
this.analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
50+
}
51+
52+
public void testPhoneticTokenFilterFactory() throws IOException {
4153
TokenFilterFactory filterFactory = analysis.tokenFilter.get("phonetic");
4254
MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
4355
}
56+
57+
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
58+
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
59+
Tokenizer tokenizer = new WhitespaceTokenizer();
60+
tokenizer.setReader(new StringReader("ABADIAS"));
61+
String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia",
62+
"abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS",
63+
"obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
64+
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
65+
}
66+
67+
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
68+
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
69+
Tokenizer tokenizer = new WhitespaceTokenizer();
70+
tokenizer.setReader(new StringReader("Rimbault"));
71+
String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt",
72+
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
73+
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
74+
}
4475
}

plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml

+4
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ index:
1919
beidermorsefilter:
2020
type: phonetic
2121
encoder: beidermorse
22+
beidermorsefilterfrench:
23+
type: phonetic
24+
encoder: beidermorse
25+
languageset : [ "french" ]
2226
koelnerphonetikfilter:
2327
type: phonetic
2428
encoder: koelnerphonetik

0 commit comments

Comments
 (0)