Skip to content

Commit fa330a1

Browse files
committed
Fix beidermorse phonetic token filter for unspecified languageset (#27112)
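Currently, when a BeiderMorseFilter is created without a `languageset` setting, the filter does not guess the language, although language guessing should be the default behaviour. The factory only checked `languageset != null`, so an empty `languageset` still produced an explicit `LanguageSet` instead of falling back to guessing. This change builds a `LanguageSet` only when `languageset` is non-null and non-empty, and otherwise passes `null` to the filter. It also adds simple tests covering the cases with and without a provided `languageset` setting. Closes #26771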
1 parent 9c56b41 commit fa330a1

File tree

3 files changed

+44
-9
lines changed

3 files changed

+44
-9
lines changed

plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,6 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22-
import java.util.Arrays;
23-
import java.util.HashSet;
24-
2522
import org.apache.commons.codec.Encoder;
2623
import org.apache.commons.codec.language.Caverphone1;
2724
import org.apache.commons.codec.language.Caverphone2;
@@ -45,6 +42,9 @@
4542
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
4643
import org.elasticsearch.index.analysis.phonetic.Nysiis;
4744

45+
import java.util.Arrays;
46+
import java.util.HashSet;
47+
4848
/**
4949
*
5050
*/
@@ -119,11 +119,11 @@ public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment envir
119119
public TokenStream create(TokenStream tokenStream) {
120120
if (encoder == null) {
121121
if (ruletype != null && nametype != null) {
122-
if (languageset != null) {
123-
final LanguageSet languages = LanguageSet.from(new HashSet<>(Arrays.asList(languageset)));
124-
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
122+
LanguageSet langset = null;
123+
if (languageset != null && languageset.length > 0) {
124+
langset = LanguageSet.from(new HashSet<>(Arrays.asList(languageset)));
125125
}
126-
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
126+
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), langset);
127127
}
128128
if (maxcodelength > 0) {
129129
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);

plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,28 +19,59 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
23+
import org.apache.lucene.analysis.Tokenizer;
24+
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2225
import org.elasticsearch.Version;
2326
import org.elasticsearch.cluster.metadata.IndexMetaData;
2427
import org.elasticsearch.common.settings.Settings;
2528
import org.elasticsearch.index.Index;
2629
import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin;
2730
import org.elasticsearch.test.ESTestCase;
2831
import org.hamcrest.MatcherAssert;
32+
import org.junit.Before;
2933

3034
import java.io.IOException;
35+
import java.io.StringReader;
3136

3237
import static org.hamcrest.Matchers.instanceOf;
3338

3439
/**
3540
*/
3641
public class SimplePhoneticAnalysisTests extends ESTestCase {
37-
public void testPhoneticTokenFilterFactory() throws IOException {
42+
43+
private TestAnalysis analysis;
44+
45+
@Before
46+
public void setup() throws IOException {
3847
String yaml = "/org/elasticsearch/index/analysis/phonetic-1.yml";
3948
Settings settings = Settings.builder().loadFromStream(yaml, getClass().getResourceAsStream(yaml))
4049
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
4150
.build();
42-
TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
51+
this.analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
52+
}
53+
54+
public void testPhoneticTokenFilterFactory() throws IOException {
4355
TokenFilterFactory filterFactory = analysis.tokenFilter.get("phonetic");
4456
MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
4557
}
58+
59+
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
60+
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
61+
Tokenizer tokenizer = new WhitespaceTokenizer();
62+
tokenizer.setReader(new StringReader("ABADIAS"));
63+
String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia",
64+
"abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS",
65+
"obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
66+
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
67+
}
68+
69+
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
70+
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
71+
Tokenizer tokenizer = new WhitespaceTokenizer();
72+
tokenizer.setReader(new StringReader("Rimbault"));
73+
String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt",
74+
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
75+
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
76+
}
4677
}

plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ index:
1919
beidermorsefilter:
2020
type: phonetic
2121
encoder: beidermorse
22+
beidermorsefilterfrench:
23+
type: phonetic
24+
encoder: beidermorse
25+
languageset : [ "french" ]
2226
koelnerphonetikfilter:
2327
type: phonetic
2428
encoder: koelnerphonetik

0 commit comments

Comments
 (0)