Skip to content

Commit 993f0b0

Browse files
authored
Ukrainian language plugin can fill up heap (#71998)
The Lucene Ukrainian analyzer has a bug where a large in-memory dictionary is loaded and stored in a thread local for every token stream generated in a new thread (for more details see https://issues.apache.org/jira/browse/LUCENE-9930). Due to checks added in #50908, we create a token stream for every registered analyzer in every shard, which means that any node with the Ukrainian plugin installed will leak one copy of this dictionary per shard, whether or not the Ukrainian analyzer is actually being used. This commit makes the plugin use a fixed version of the UkrainianMorfologikAnalyzer until we merge a version of Lucene that contains the upstream fix.
1 parent a1cd67f commit 993f0b0

File tree

3 files changed

+165
-7
lines changed

3 files changed

+165
-7
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
/*@notice
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.analysis.uk;
18+
19+
import morfologik.stemming.Dictionary;
20+
import org.apache.lucene.analysis.Analyzer;
21+
import org.apache.lucene.analysis.CharArraySet;
22+
import org.apache.lucene.analysis.LowerCaseFilter;
23+
import org.apache.lucene.analysis.StopFilter;
24+
import org.apache.lucene.analysis.StopwordAnalyzerBase;
25+
import org.apache.lucene.analysis.TokenStream;
26+
import org.apache.lucene.analysis.Tokenizer;
27+
import org.apache.lucene.analysis.WordlistLoader;
28+
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
29+
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
30+
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
31+
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
32+
import org.apache.lucene.analysis.standard.StandardTokenizer;
33+
import org.apache.lucene.util.IOUtils;
34+
import org.elasticsearch.common.SuppressForbidden;
35+
36+
import java.io.IOException;
37+
import java.io.Reader;
38+
import java.nio.charset.StandardCharsets;
39+
40+
/**
41+
* A dictionary-based {@link Analyzer} for Ukrainian.
42+
*
43+
* Modified from lucene 8.8.0 sources to incorporate a bugfix for
44+
* https://issues.apache.org/jira/browse/LUCENE-9930
45+
*/
46+
public final class XUkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
47+
private final CharArraySet stemExclusionSet;
48+
49+
/** File containing default Ukrainian stopwords. */
50+
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
51+
52+
/**
53+
* Returns an unmodifiable instance of the default stop words set.
54+
* @return default stop words set.
55+
*/
56+
public static CharArraySet getDefaultStopSet() {
57+
return DefaultSetHolder.DEFAULT_STOP_SET;
58+
}
59+
60+
/**
61+
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
62+
* accesses the static final set the first time.;
63+
*/
64+
@SuppressForbidden(reason="Lucene uses IOUtils")
65+
private static class DefaultSetHolder {
66+
static final CharArraySet DEFAULT_STOP_SET;
67+
static final Dictionary DICTIONARY;
68+
69+
static {
70+
try {
71+
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(UkrainianMorfologikAnalyzer.class,
72+
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
73+
DICTIONARY = Dictionary.read(
74+
UkrainianMorfologikAnalyzer.class.getClassLoader().getResource("ua/net/nlp/ukrainian.dict"));
75+
} catch (IOException ex) {
76+
// default set should always be present as it is part of the
77+
// distribution (JAR)
78+
throw new RuntimeException("Unable to load resources", ex);
79+
}
80+
}
81+
}
82+
83+
/**
84+
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
85+
*/
86+
public XUkrainianMorfologikAnalyzer() {
87+
this(DefaultSetHolder.DEFAULT_STOP_SET);
88+
}
89+
90+
/**
91+
* Builds an analyzer with the given stop words.
92+
*
93+
* @param stopwords a stopword set
94+
*/
95+
public XUkrainianMorfologikAnalyzer(CharArraySet stopwords) {
96+
this(stopwords, CharArraySet.EMPTY_SET);
97+
}
98+
99+
/**
100+
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
101+
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
102+
* stemming.
103+
*
104+
* @param stopwords a stopword set
105+
* @param stemExclusionSet a set of terms not to be stemmed
106+
*/
107+
public XUkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
108+
super(stopwords);
109+
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
110+
}
111+
112+
@Override
113+
protected Reader initReader(String fieldName, Reader reader) {
114+
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
115+
// different apostrophes
116+
builder.add("\u2019", "'");
117+
builder.add("\u2018", "'");
118+
builder.add("\u02BC", "'");
119+
builder.add("`", "'");
120+
builder.add("´", "'");
121+
// ignored characters
122+
builder.add("\u0301", "");
123+
builder.add("\u00AD", "");
124+
builder.add("ґ", "г");
125+
builder.add("Ґ", "Г");
126+
127+
NormalizeCharMap normMap = builder.build();
128+
reader = new MappingCharFilter(normMap, reader);
129+
return reader;
130+
}
131+
132+
/**
133+
* Creates a
134+
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
135+
* which tokenizes all the text in the provided {@link Reader}.
136+
*
137+
* @return A
138+
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
139+
* built from an {@link StandardTokenizer} filtered with
140+
* {@link LowerCaseFilter}, {@link StopFilter}
141+
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
142+
* provided and {@link MorfologikFilter} on the Ukrainian dictionary.
143+
*/
144+
@Override
145+
protected TokenStreamComponents createComponents(String fieldName) {
146+
Tokenizer source = new StandardTokenizer();
147+
TokenStream result = new LowerCaseFilter(source);
148+
result = new StopFilter(result, stopwords);
149+
150+
if (stemExclusionSet.isEmpty() == false) {
151+
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
152+
}
153+
154+
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
155+
return new TokenStreamComponents(source, result);
156+
}
157+
158+
}

plugins/analysis-ukrainian/src/main/java/org/elasticsearch/index/analysis/UkrainianAnalyzerProvider.java

+5-5
Original file line numberDiff line numberDiff line change
@@ -10,27 +10,27 @@
1010

1111
import org.apache.lucene.analysis.CharArraySet;
1212
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
13+
import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
1314
import org.elasticsearch.common.settings.Settings;
1415
import org.elasticsearch.env.Environment;
1516
import org.elasticsearch.index.IndexSettings;
1617

17-
public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider<UkrainianMorfologikAnalyzer> {
18+
public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider<XUkrainianMorfologikAnalyzer> {
1819

19-
private final UkrainianMorfologikAnalyzer analyzer;
20+
private final XUkrainianMorfologikAnalyzer analyzer;
2021

2122
public UkrainianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
2223
super(indexSettings, name, settings);
23-
analyzer = new UkrainianMorfologikAnalyzer(
24+
analyzer = new XUkrainianMorfologikAnalyzer(
2425
Analysis.parseStopWords(env, settings, UkrainianMorfologikAnalyzer.getDefaultStopSet()),
2526
Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
2627
);
2728
analyzer.setVersion(version);
2829
}
2930

3031
@Override
31-
public UkrainianMorfologikAnalyzer get() {
32+
public XUkrainianMorfologikAnalyzer get() {
3233
return this.analyzer;
3334
}
3435

35-
3636
}

plugins/analysis-ukrainian/src/test/java/org/elasticsearch/index/analysis/UkrainianAnalysisTests.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
package org.elasticsearch.index.analysis;
1010

1111
import org.apache.lucene.analysis.Analyzer;
12-
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
12+
import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
1313
import org.elasticsearch.common.settings.Settings;
1414
import org.elasticsearch.index.Index;
1515
import org.elasticsearch.plugin.analysis.ukrainian.AnalysisUkrainianPlugin;
@@ -27,6 +27,6 @@ public void testDefaultsUkranianAnalysis() throws IOException {
2727
new AnalysisUkrainianPlugin());
2828

2929
Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
30-
MatcherAssert.assertThat(analyzer, instanceOf(UkrainianMorfologikAnalyzer.class));
30+
MatcherAssert.assertThat(analyzer, instanceOf(XUkrainianMorfologikAnalyzer.class));
3131
}
3232
}

0 commit comments

Comments
 (0)