Ukrainian language plugin can fill up heap (#71998)

romseygeek · romseygeek · commit d6038a3ec7b5 · 2021-04-21T12:14:05.000+01:00
The Lucene Ukrainian analyzer has a bug where a large in-memory dictionary is loaded and stored on a thread local for every token stream generated in a new thread (for more details see https://issues.apache.org/jira/browse/LUCENE-9930). Due to checks added in #50908, we create a token stream for every registered analyzer in every shard, which means that any node with the Ukrainian plugin installed will leak one copy of this dictionary per shard, whether or not the Ukrainian analyzer is actually being used. This commit makes the plugin use a fixed version of the UkrainianMorfologikAnalyzer, until we merge a version of Lucene that contains the upstream fix.
diff --git a/plugins/analysis-ukrainian/src/main/java/org/apache/lucene/analysis/uk/XUkrainianMorfologikAnalyzer.java b/plugins/analysis-ukrainian/src/main/java/org/apache/lucene/analysis/uk/XUkrainianMorfologikAnalyzer.java
@@ -0,0 +1,158 @@
+/*@notice
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.uk;
+
+import morfologik.stemming.Dictionary;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.morfologik.MorfologikFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.IOUtils;
+import org.elasticsearch.common.SuppressForbidden;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * A dictionary-based {@link Analyzer} for Ukrainian.
+ *
+ * Modified from Lucene 8.8.0 sources to incorporate the bugfix for
+ * https://issues.apache.org/jira/browse/LUCENE-9930 (the dictionary is loaded once and shared).
+ */
+public final class XUkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
+    private final CharArraySet stemExclusionSet;
+
+    /** Name of the classpath resource containing the default Ukrainian stopwords. */
+    public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+    /**
+     * Returns the default stop words set held by {@link DefaultSetHolder}.
+     * @return default stop words set.
+     */
+    public static CharArraySet getDefaultStopSet() {
+        return DefaultSetHolder.DEFAULT_STOP_SET;
+    }
+
+    /**
+     * Lazily loads DEFAULT_STOP_SET and DICTIONARY once, when this holder class is first
+     * initialized, so that a single Dictionary instance is shared by every token stream.
+     */
+    @SuppressForbidden(reason="Lucene uses IOUtils")
+    private static class DefaultSetHolder {
+        static final CharArraySet DEFAULT_STOP_SET;
+        static final Dictionary DICTIONARY;
+
+        static {
+            try {
+                DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(UkrainianMorfologikAnalyzer.class,
+                    DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+                DICTIONARY = Dictionary.read(
+                    UkrainianMorfologikAnalyzer.class.getClassLoader().getResource("ua/net/nlp/ukrainian.dict"));
+            } catch (IOException ex) {
+                // both resources are expected to be present, as they ship with the
+                // distribution (JAR); surface a failure as an unchecked exception
+                throw new RuntimeException("Unable to load resources", ex);
+            }
+        }
+    }
+
+    /**
+     * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+     */
+    public XUkrainianMorfologikAnalyzer() {
+        this(DefaultSetHolder.DEFAULT_STOP_SET);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words and an empty stem exclusion set.
+     *
+     * @param stopwords a stopword set
+     */
+    public XUkrainianMorfologikAnalyzer(CharArraySet stopwords) {
+        this(stopwords, CharArraySet.EMPTY_SET);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+     * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
+     * stemming. The exclusion set is copied and wrapped as unmodifiable.
+     *
+     * @param stopwords a stopword set
+     * @param stemExclusionSet a set of terms not to be stemmed
+     */
+    public XUkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+        super(stopwords);
+        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+    }
+
+    @Override
+    protected Reader initReader(String fieldName, Reader reader) {
+        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+        // map the different apostrophe variants to the plain ASCII apostrophe
+        builder.add("\u2019", "'");
+        builder.add("\u2018", "'");
+        builder.add("\u02BC", "'");
+        builder.add("`", "'");
+        builder.add("´", "'");
+        // drop combining acute accent (U+0301) and soft hyphen (U+00AD); fold ґ/Ґ to г/Г
+        builder.add("\u0301", "");
+        builder.add("\u00AD", "");
+        builder.add("ґ", "г");
+        builder.add("Ґ", "Г");
+
+        NormalizeCharMap normMap = builder.build();
+        reader = new MappingCharFilter(normMap, reader);
+        return reader;
+    }
+
+    /**
+     * Creates a
+     * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+     * which tokenizes all the text in the provided {@link Reader}.
+     *
+     * @return A
+     *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+     *         built from a {@link StandardTokenizer} filtered with
+     *         {@link LowerCaseFilter}, {@link StopFilter},
+     *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
+     *         and {@link MorfologikFilter} on the shared Ukrainian dictionary.
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new StandardTokenizer();
+        TokenStream result = new LowerCaseFilter(source);
+        result = new StopFilter(result, stopwords);
+
+        if (stemExclusionSet.isEmpty() == false) {
+            result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+        }
+
+        result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
+        return new TokenStreamComponents(source, result);
+    }
+
+}
diff --git a/plugins/analysis-ukrainian/src/main/java/org/elasticsearch/index/analysis/UkrainianAnalyzerProvider.java b/plugins/analysis-ukrainian/src/main/java/org/elasticsearch/index/analysis/UkrainianAnalyzerProvider.java
@@ -10,27 +10,27 @@
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
+import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
-public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider<UkrainianMorfologikAnalyzer> {
+public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider<XUkrainianMorfologikAnalyzer> {
 
-    private final UkrainianMorfologikAnalyzer analyzer;
+    private final XUkrainianMorfologikAnalyzer analyzer;
 
     public UkrainianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
-        analyzer = new UkrainianMorfologikAnalyzer(
+        analyzer = new XUkrainianMorfologikAnalyzer(
             Analysis.parseStopWords(env, settings, UkrainianMorfologikAnalyzer.getDefaultStopSet()),
             Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
         );
         analyzer.setVersion(version);
     }
 
     @Override
-    public UkrainianMorfologikAnalyzer get() {
+    public XUkrainianMorfologikAnalyzer get() {
         return this.analyzer;
     }
 
-
 }
diff --git a/plugins/analysis-ukrainian/src/test/java/org/elasticsearch/index/analysis/UkrainianAnalysisTests.java b/plugins/analysis-ukrainian/src/test/java/org/elasticsearch/index/analysis/UkrainianAnalysisTests.java
@@ -9,7 +9,7 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
+import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.plugin.analysis.ukrainian.AnalysisUkrainianPlugin;
@@ -27,6 +27,6 @@ public void testDefaultsUkranianAnalysis() throws IOException {
                 new AnalysisUkrainianPlugin());
 
         Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
-        MatcherAssert.assertThat(analyzer, instanceOf(UkrainianMorfologikAnalyzer.class));
+        MatcherAssert.assertThat(analyzer, instanceOf(XUkrainianMorfologikAnalyzer.class));
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`	`package org.elasticsearch.index.analysis;`
`10`	`10`
`11`	`11`	`import org.apache.lucene.analysis.Analyzer;`
`12`		`-import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;`
	`12`	`+import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;`
`13`	`13`	`import org.elasticsearch.common.settings.Settings;`
`14`	`14`	`import org.elasticsearch.index.Index;`
`15`	`15`	`import org.elasticsearch.plugin.analysis.ukrainian.AnalysisUkrainianPlugin;`
`@@ -27,6 +27,6 @@ public void testDefaultsUkranianAnalysis() throws IOException {`
`27`	`27`	`new AnalysisUkrainianPlugin());`
`28`	`28`
`29`	`29`	`Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();`
`30`		`- MatcherAssert.assertThat(analyzer, instanceOf(UkrainianMorfologikAnalyzer.class));`
	`30`	`+ MatcherAssert.assertThat(analyzer, instanceOf(XUkrainianMorfologikAnalyzer.class));`
`31`	`31`	`}`
`32`	`32`	`}`