diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
index b5d5426ff2710..1e82b2f47417a 100644
--- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
@@ -84,6 +84,7 @@ English::
http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*],
http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`],
http://www.researchgate.net/publication/220433848_How_effective_is_suffixing[`minimal_english`],
+https://github.com/elastic/elasticsearch/issues/42892[`plural_english`],
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/en/EnglishPossessiveFilter.html[`possessive_english`],
http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`],
http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`]
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
new file mode 100644
index 0000000000000..98e0936dc0faa
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+public final class EnglishPluralStemFilter extends TokenFilter {
+ private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public EnglishPluralStemFilter(TokenStream input) {
+ super(input);
+ }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+        // Nothing left upstream: report end of stream.
+        if (!input.incrementToken()) {
+            return false;
+        }
+        // Tokens flagged as keywords pass through untouched; all others are stemmed in place.
+        if (!keywordAttr.isKeyword()) {
+            termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+        }
+        return true;
+    }
+
+ /**
+ * Plural stemmer for English based on the {@link EnglishMinimalStemFilter}
+ *
+ * This stemmer removes plurals but, beyond EnglishMinimalStemFilter, adds
+ * four new suffix rules to remove dangling e characters:
+ *
+ * - xes - "boxes" becomes "box" (but "axes" stays "axe")
+ * - sses - "dresses" becomes "dress"
+ * - shes - "dishes" becomes "dish"
+ * - ches - "watches" becomes "watch" (listed exceptions such as "caches" keep the e)
+ *
+ * See https://github.com/elastic/elasticsearch/issues/42892
+ *
+ * In addition, the s-stemmer logic is amended so that:
+ *
+ * - ees->ee, so that bees matches bee
+ * - ies->y applies only to words longer than four characters, so that ties matches tie
+ * - oes->o, so that tomatoes matches tomato, while retaining the e for some words, e.g. shoes to shoe
+ *
+ */
+ public static class EnglishPluralStemmer {
+
+ // Plurals ending in "oes" whose stem keeps the e (shoes -> shoe); matched against the end of a term, so horseshoes qualifies too
+ public static final char [][] oesExceptions = {
+ "shoes".toCharArray(),
+ "canoes".toCharArray(),
+ "oboes".toCharArray()
+ };
+ // Plurals ending in "ches" whose stem keeps the e (caches -> cache); also matched against the end of a term
+ public static final char [][] chesExceptions = {
+ "cliches".toCharArray(),
+ "avalanches".toCharArray(),
+ "mustaches".toCharArray(),
+ "moustaches".toCharArray(),
+ "quiches".toCharArray(),
+ "headaches".toCharArray(),
+ "heartaches".toCharArray(),
+ "porsches".toCharArray(),
+ "tranches".toCharArray(),
+ "caches".toCharArray()
+ };
+ /** Stems the first len chars of s in place (plural removal only) and returns the stemmed length. */
+ @SuppressWarnings("fallthrough")
+ public int stem(char s[], int len) {
+ if (len < 3 || s[len - 1] != 's')
+ return len;
+
+ switch (s[len - 2]) {
+ case 'u':
+ case 's':
+ return len;
+ case 'e':
+ // ies->y, but only for terms longer than 4 chars: spies -> spy, whereas pies skips this
+ // rule and ends up as pie via the plain s-removal at the bottom of the switch.
+ // The original s-stemmer also special-cased aies and eies; that is deliberately not carried over
+ // ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
+ if (len > 4 && s[len - 3] == 'i') {
+ s[len - 3] = 'y';
+ return len - 2;
+ }
+
+ // Suffix rules to remove any dangling "e"
+ if (len > 3) {
+ // xes: needs at least two chars before the suffix, so "boxes" -> "box" but "axes" -> "axe"
+ if (len > 4 && s[len -3] == 'x') {
+ return len - 2;
+ }
+ // oes: drop "es" unless the term ends with one of the oesExceptions
+ if (len > 3 && s[len -3] == 'o') {
+ if (isException(s, len, oesExceptions)) {
+ // Only remove the S
+ return len -1;
+ }
+ // Remove the es
+ return len - 2;
+ }
+ if (len > 4) {
+ // shes/sses: always drop "es"
+ if (s[len -4] == 's' && (s[len -3] == 'h' || s[len -3] == 's')){
+ return len - 2;
+ }
+
+ // ches: drop "es" unless the term ends with one of the chesExceptions
+ if (len > 4) {
+ if (s[len -4] == 'c' && s[len -3] == 'h' ){
+ if (isException(s, len, chesExceptions)) {
+ // Only remove the S
+ return len -1;
+ }
+ // Remove the es
+ return len - 2;
+
+ }
+ }
+ }
+ }
+ // No suffix rule matched: deliberately fall through and strip only the trailing s (e.g. bees -> bee)
+ default:
+ return len - 1;
+ }
+ }
+
+        /**
+         * Checks whether the first {@code len} chars of {@code s} end with a complete entry
+         * of {@code exceptionsList}, e.g. "horseshoes" ends with the exception "shoes".
+         */
+        private boolean isException(char[] s, int len, char [][] exceptionsList) {
+            for (char[] rule : exceptionsList) {
+                // A term shorter than the rule cannot end with it: without this guard
+                // "ranches" would pass for "tranches" and be stemmed to "ranche".
+                boolean matched = rule.length <= len;
+                final int offset = len - rule.length;
+                for (int i = rule.length - 1; matched && i >= 0; i--) {
+                    matched = rule[i] == s[offset + i];
+                }
+                if (matched) {
+                    return true;
+                }
+            }
+            return false;
+        }
+ }
+
+}
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
index b94f7f6499a97..396db78707a36 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
@@ -139,6 +139,8 @@ public TokenStream create(TokenStream tokenStream) {
return new SnowballFilter(tokenStream, new EnglishStemmer());
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
return new EnglishMinimalStemFilter(tokenStream);
+ } else if ("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) {
+ return new EnglishPluralStemFilter(tokenStream);
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
return new EnglishPossessiveFilter(tokenStream);
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
index 8e3e862f462e2..c4f598dea2f73 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
@@ -97,6 +97,84 @@ public void testPorter2FilterFactory() throws IOException {
assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"});
}
}
+
+ public void testEnglishPluralFilter() throws IOException {
+ int iters = scaledRandomIntBetween(20, 100);
+ for (int i = 0; i < iters; i++) {
+ // Each iteration rebuilds the analysis chain with a randomly chosen index-created version
+ Version v = VersionUtils.randomVersion(random());
+ Settings settings = Settings.builder()
+ .put("index.analysis.filter.my_plurals.type", "stemmer")
+ .put("index.analysis.filter.my_plurals.language", "plural_english")
+ .put("index.analysis.analyzer.my_plurals.tokenizer","whitespace")
+ .put("index.analysis.analyzer.my_plurals.filter","my_plurals")
+ .put(SETTING_VERSION_CREATED,v)
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .build();
+
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
+ TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_plurals");
+ assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
+ Tokenizer tokenizer = new WhitespaceTokenizer();
+ tokenizer.setReader(new StringReader("dresses"));
+ TokenStream create = tokenFilter.create(tokenizer);
+ IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
+ NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals");
+ assertThat(create, instanceOf(EnglishPluralStemFilter.class));
+
+ // Check old EnglishMinimalStemmer ("S" stemmer) logic
+ assertAnalyzesTo(analyzer, "phones", new String[]{"phone"});
+ assertAnalyzesTo(analyzer, "horses", new String[]{"horse"});
+ assertAnalyzesTo(analyzer, "cameras", new String[]{"camera"});
+
+ // The original s stemmer gives up on stemming oes words because English has no fixed rule for the stem
+ // (see https://howtospell.co.uk/making-O-words-plural )
+ // This stemmer removes the es but retains e for a small number of exceptions
+ assertAnalyzesTo(analyzer, "mosquitoes", new String[]{"mosquito"});
+ assertAnalyzesTo(analyzer, "heroes", new String[]{"hero"});
+ // oes exceptions that retain the e.
+ assertAnalyzesTo(analyzer, "shoes", new String[]{"shoe"});
+ assertAnalyzesTo(analyzer, "horseshoes", new String[]{"horseshoe"});
+ assertAnalyzesTo(analyzer, "canoes", new String[]{"canoe"});
+ assertAnalyzesTo(analyzer, "oboes", new String[]{"oboe"});
+
+ // Check improved EnglishPluralStemFilter logic
+ // sses
+ assertAnalyzesTo(analyzer, "dresses", new String[]{"dress"});
+ assertAnalyzesTo(analyzer, "possess", new String[]{"possess"});
+ assertAnalyzesTo(analyzer, "possesses", new String[]{"possess"});
+ // xes
+ assertAnalyzesTo(analyzer, "boxes", new String[]{"box"});
+ assertAnalyzesTo(analyzer, "axes", new String[]{"axe"});
+ // shes
+ assertAnalyzesTo(analyzer, "dishes", new String[]{"dish"});
+ assertAnalyzesTo(analyzer, "washes", new String[]{"wash"});
+ // ees
+ assertAnalyzesTo(analyzer, "employees", new String[]{"employee"});
+ assertAnalyzesTo(analyzer, "bees", new String[]{"bee"});
+ // tches
+ assertAnalyzesTo(analyzer, "watches", new String[]{"watch"});
+ assertAnalyzesTo(analyzer, "itches", new String[]{"itch"});
+ // ies->y but only for length >4
+ assertAnalyzesTo(analyzer, "spies", new String[]{"spy"});
+ assertAnalyzesTo(analyzer, "ties", new String[]{"tie"});
+ assertAnalyzesTo(analyzer, "lies", new String[]{"lie"});
+ assertAnalyzesTo(analyzer, "pies", new String[]{"pie"});
+ assertAnalyzesTo(analyzer, "dies", new String[]{"die"});
+
+ // ches in general, including the listed exceptions that retain the e
+ assertAnalyzesTo(analyzer, "lunches", new String[]{"lunch"});
+ assertAnalyzesTo(analyzer, "avalanches", new String[]{"avalanche"});
+ assertAnalyzesTo(analyzer, "headaches", new String[]{"headache"});
+ assertAnalyzesTo(analyzer, "caches", new String[]{"cache"});
+ assertAnalyzesTo(analyzer, "beaches", new String[]{"beach"});
+ assertAnalyzesTo(analyzer, "britches", new String[]{"britch"});
+ assertAnalyzesTo(analyzer, "cockroaches", new String[]{"cockroach"});
+ assertAnalyzesTo(analyzer, "cliches", new String[]{"cliche"});
+ assertAnalyzesTo(analyzer, "quiches", new String[]{"quiche"});
+
+ }
+ }
public void testMultipleLanguagesThrowsException() throws IOException {
Version v = VersionUtils.randomVersion(random());