diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc index b5d5426ff2710..1e82b2f47417a 100644 --- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -84,6 +84,7 @@ English:: http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*], http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`], http://www.researchgate.net/publication/220433848_How_effective_is_suffixing[`minimal_english`], +https://github.com/elastic/elasticsearch/issues/42892[`plural_english`], http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/en/EnglishPossessiveFilter.html[`possessive_english`], http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`], http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`] diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java new file mode 100644 index 0000000000000..98e0936dc0faa --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java @@ -0,0 +1,175 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * Token filter that applies {@link EnglishPluralStemmer} to every token that is
+ * not flagged as a keyword.
+ */
+public final class EnglishPluralStemFilter extends TokenFilter {
+    private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer();
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+    public EnglishPluralStemFilter(TokenStream input) {
+        super(input);
+    }
+
+    /** Stems the next token in place; keyword tokens pass through unchanged. */
+    @Override
+    public boolean incrementToken() throws IOException {
+        if (!input.incrementToken()) {
+            return false;
+        }
+        if (!keywordAttr.isKeyword()) {
+            // The stemmer edits the term buffer in place and returns the new length
+            termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+        }
+        return true;
+    }
+
+    /**
+     * Plural stemmer for English based on the {@link EnglishMinimalStemFilter}
+     * <p>
+     * This stemmer removes plurals but beyond EnglishMinimalStemFilter adds
+     * four new suffix rules to remove dangling e characters:
+     * <ul>
+     * <li>xes - "boxes" becomes "box"</li>
+     * <li>sses - "dresses" becomes "dress"</li>
+     * <li>shes - "dishes" becomes "dish"</li>
+     * <li>ches - "watches" becomes "watch"</li>
+     * </ul>
+     * See https://github.com/elastic/elasticsearch/issues/42892
+     * <p>
+     * In addition the s stemmer logic is amended so that
+     * <ul>
+     * <li>ees becomes ee - "bees" becomes "bee"</li>
+     * <li>ies becomes y only for words longer than four characters - "spies"
+     * becomes "spy" but "ties" becomes "tie"</li>
+     * <li>oes becomes o - "heroes" becomes "hero" - unless the word ends with an
+     * entry of {@link #oesExceptions}, e.g. "shoes" becomes "shoe"</li>
+     * </ul>
+     */
+    public static class EnglishPluralStemmer {
+
+        // Words ending in oes that retain the e when stemmed
+        public static final char [][] oesExceptions = {
+            "shoes".toCharArray(),
+            "canoes".toCharArray(),
+            "oboes".toCharArray()
+        };
+        // Words ending in ches that retain the e when stemmed
+        public static final char [][] chesExceptions = {
+            "cliches".toCharArray(),
+            "avalanches".toCharArray(),
+            "mustaches".toCharArray(),
+            "moustaches".toCharArray(),
+            "quiches".toCharArray(),
+            "headaches".toCharArray(),
+            "heartaches".toCharArray(),
+            "porsches".toCharArray(),
+            "tranches".toCharArray(),
+            "caches".toCharArray()
+        };
+
+        /**
+         * Strips an English plural suffix from the first {@code len} characters of {@code s}.
+         * The ies rule overwrites one character of {@code s} in place.
+         *
+         * @param s   buffer holding the term
+         * @param len number of characters of {@code s} that make up the term
+         * @return the length of the stemmed term
+         */
+        public int stem(char[] s, int len) {
+            if (len < 3 || s[len - 1] != 's') {
+                return len;
+            }
+            final char beforeS = s[len - 2];
+            if (beforeS == 'u' || beforeS == 's') {
+                return len;
+            }
+            if (beforeS == 'e') {
+                final char beforeE = s[len - 3];
+                if (len > 4) {
+                    // Modified ies->y logic from original s-stemmer - only work on strings > 4
+                    // so spies -> spy still but pies->pie.
+                    // The original code also special-cased aies and eies for no good reason as far as I can tell.
+                    // ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
+                    if (beforeE == 'i') {
+                        s[len - 3] = 'y';
+                        return len - 2;
+                    }
+                    // xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
+                    if (beforeE == 'x') {
+                        return len - 2;
+                    }
+                }
+                // oes: remove the es, or only the s for the exceptions that keep their e
+                if (len > 3 && beforeE == 'o') {
+                    return isException(s, len, oesExceptions) ? len - 1 : len - 2;
+                }
+                if (len > 4) {
+                    final char twoBeforeE = s[len - 4];
+                    // shes/sses
+                    if (twoBeforeE == 's' && (beforeE == 'h' || beforeE == 's')) {
+                        return len - 2;
+                    }
+                    // ches: remove the es, or only the s for the exceptions that keep their e
+                    if (twoBeforeE == 'c' && beforeE == 'h') {
+                        return isException(s, len, chesExceptions) ? len - 1 : len - 2;
+                    }
+                }
+            }
+            // Any other ending only loses the trailing s
+            return len - 1;
+        }
+
+        /**
+         * Tells whether the first {@code len} characters of {@code s} end with one of the
+         * given exception words. A term shorter than an exception never matches it, so
+         * "ranches" is not mistaken for the exception "tranches".
+         */
+        private boolean isException(char[] s, int len, char [][] exceptionsList) {
+            for (char[] exception : exceptionsList) {
+                final int offset = len - exception.length;
+                if (offset < 0) {
+                    continue;
+                }
+                int matched = 0;
+                while (matched < exception.length && s[offset + matched] == exception[matched]) {
+                    matched++;
+                }
+                if (matched == exception.length) {
+                    return true;
+                }
+            }
+            return false;
+        }
+    }
+
+}
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
index b94f7f6499a97..396db78707a36 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
@@ -139,6 +139,8 @@ public TokenStream create(TokenStream tokenStream) {
             return new SnowballFilter(tokenStream, new EnglishStemmer());
         } else if
("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
             return new EnglishMinimalStemFilter(tokenStream);
+        } else if ("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) {
+            return new EnglishPluralStemFilter(tokenStream);
         } else if ("possessive_english".equalsIgnoreCase(language)
                 || "possessiveEnglish".equalsIgnoreCase(language)) {
             return new EnglishPossessiveFilter(tokenStream);
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
index 8e3e862f462e2..c4f598dea2f73 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
@@ -97,6 +97,84 @@ public void testPorter2FilterFactory() throws IOException {
             assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"});
         }
     }
+
+    public void testEnglishPluralFilter() throws IOException {
+        // Input token and expected stem pairs, checked in this order on every iteration
+        final String[][] expectedStems = {
+            // Check old EnglishMinimalStemmer ("S" stemmer) logic
+            {"phones", "phone"},
+            {"horses", "horse"},
+            {"cameras", "camera"},
+            // The original s stemmer gives up on stemming oes words because English has no fixed rule for the stem
+            // (see https://howtospell.co.uk/making-O-words-plural )
+            // This stemmer removes the es but retains e for a small number of exceptions
+            {"mosquitoes", "mosquito"},
+            {"heroes", "hero"},
+            // oes exceptions that retain the e.
+            {"shoes", "shoe"},
+            {"horseshoes", "horseshoe"},
+            {"canoes", "canoe"},
+            {"oboes", "oboe"},
+            // Check improved EnglishPluralStemFilter logic
+            // sses
+            {"dresses", "dress"},
+            {"possess", "possess"},
+            {"possesses", "possess"},
+            // xes
+            {"boxes", "box"},
+            {"axes", "axe"},
+            // shes
+            {"dishes", "dish"},
+            {"washes", "wash"},
+            // ees
+            {"employees", "employee"},
+            {"bees", "bee"},
+            // tches
+            {"watches", "watch"},
+            {"itches", "itch"},
+            // ies->y but only for length >4
+            {"spies", "spy"},
+            {"ties", "tie"},
+            {"lies", "lie"},
+            {"pies", "pie"},
+            {"dies", "die"},
+            // ches, including the exceptions that retain the e
+            {"lunches", "lunch"},
+            {"avalanches", "avalanche"},
+            {"headaches", "headache"},
+            {"caches", "cache"},
+            {"beaches", "beach"},
+            {"britches", "britch"},
+            {"cockroaches", "cockroach"},
+            {"cliches", "cliche"},
+            {"quiches", "quiche"},
+        };
+        final int rounds = scaledRandomIntBetween(20, 100);
+        for (int round = 0; round < rounds; round++) {
+            final Version createdVersion = VersionUtils.randomVersion(random());
+            final Settings indexSettings = Settings.builder()
+                .put("index.analysis.filter.my_plurals.type", "stemmer")
+                .put("index.analysis.filter.my_plurals.language", "plural_english")
+                .put("index.analysis.analyzer.my_plurals.tokenizer", "whitespace")
+                .put("index.analysis.analyzer.my_plurals.filter", "my_plurals")
+                .put(SETTING_VERSION_CREATED, createdVersion)
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .build();
+            final ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(indexSettings, PLUGIN);
+            final TokenFilterFactory pluralFactory = analysis.tokenFilter.get("my_plurals");
+            assertThat(pluralFactory, instanceOf(StemmerTokenFilterFactory.class));
+
+            final Tokenizer whitespace = new WhitespaceTokenizer();
+            whitespace.setReader(new StringReader("dresses"));
+            final TokenStream stemmed = pluralFactory.create(whitespace);
+            final NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_plurals");
+            assertThat(stemmed, instanceOf(EnglishPluralStemFilter.class));
+
+            for (String[] pair : expectedStems) {
+                assertAnalyzesTo(analyzer, pair[0], new String[]{pair[1]});
+            }
+        }
+    }
 
     public
void testMultipleLanguagesThrowsException() throws IOException { Version v = VersionUtils.randomVersion(random());