Skip to content

Commit 3c76b8b

Browse files
committed
Added support for the "-ches" suffix, with exceptions for words like "avalanche"
1 parent 0171b46 commit 3c76b8b

File tree

2 files changed

+33
-9
lines changed

2 files changed

+33
-9
lines changed

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,19 @@ public static class EnglishPluralStemmer {
7777
"canoes".toCharArray(),
7878
"oboes".toCharArray()
7979
};
80+
// Words ending in "ches" that keep the trailing "e" when stemmed (only the final "s" is removed)
81+
public static final char [][] chesExceptions = {
82+
"cliches".toCharArray(),
83+
"avalanches".toCharArray(),
84+
"mustaches".toCharArray(),
85+
"moustaches".toCharArray(),
86+
"quiches".toCharArray(),
87+
"headaches".toCharArray(),
88+
"heartaches".toCharArray(),
89+
"porsches".toCharArray(),
90+
"tranches".toCharArray(),
91+
"caches".toCharArray()
92+
};
8093

8194
@SuppressWarnings("fallthrough")
8295
public int stem(char s[], int len) {
@@ -105,7 +118,7 @@ public int stem(char s[], int len) {
105118
}
106119
// oes
107120
if (len > 3 && s[len -3] == 'o') {
108-
if (isOesException(s, len)) {
121+
if (isException(s, len, oesExceptions)) {
109122
// Only remove the S
110123
return len -1;
111124
}
@@ -118,10 +131,16 @@ public int stem(char s[], int len) {
118131
return len - 2;
119132
}
120133

121-
// tches (TODO consider just ches? Gains: lunches == lunch, losses: moustaches!= moustache
122-
if (len > 5) {
123-
if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){
134+
// ches: remove the trailing "es" (lunches -> lunch) unless the word is listed in chesExceptions
135+
if (len > 4) {
136+
if (s[len -4] == 'c' && s[len -3] == 'h' ){
137+
if (isException(s, len, chesExceptions)) {
138+
// Only remove the S
139+
return len -1;
140+
}
141+
// Remove the es
124142
return len - 2;
143+
125144
}
126145
}
127146
}
@@ -132,8 +151,8 @@ public int stem(char s[], int len) {
132151
}
133152
}
134153

135-
private boolean isOesException(char[] s, int len) {
136-
for (char[] oesRule : oesExceptions) {
154+
private boolean isException(char[] s, int len, char [][] exceptionsList) {
155+
for (char[] oesRule : exceptionsList) {
137156
int rulePos = oesRule.length - 1;
138157
int sPos = len - 1;
139158
boolean matched = true;

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,11 +163,16 @@ public void testEnglishPluralFilter() throws IOException {
163163
assertAnalyzesTo(analyzer, "dies", new String[]{"die"});
164164

165165

166-
// *CHES - would be good to find a simple rule that solves lunches, churches but doesn't break aches
167-
// documenting current behaviour here as a known issue:
168-
assertAnalyzesTo(analyzer, "lunches", new String[]{"lunche"});
166+
assertAnalyzesTo(analyzer, "lunches", new String[]{"lunch"});
169167
assertAnalyzesTo(analyzer, "avalanches", new String[]{"avalanche"});
170168
assertAnalyzesTo(analyzer, "headaches", new String[]{"headache"});
169+
assertAnalyzesTo(analyzer, "caches", new String[]{"cache"});
170+
assertAnalyzesTo(analyzer, "beaches", new String[]{"beach"});
171+
assertAnalyzesTo(analyzer, "britches", new String[]{"britch"});
172+
assertAnalyzesTo(analyzer, "cockroaches", new String[]{"cockroach"});
173+
assertAnalyzesTo(analyzer, "cliches", new String[]{"cliche"});
174+
assertAnalyzesTo(analyzer, "quiches", new String[]{"quiche"});
175+
171176
}
172177
}
173178

0 commit comments

Comments
 (0)