Skip to content

Commit 8c16725

Browse files
committed
Check for deprecations when analyzers are built (#50908)
Generally speaking, deprecated analysis components in Elasticsearch issue deprecation warnings when they are first used. However, this means that no warnings are emitted when indexes are created with deprecated components, and users have to actually index a document to see the warnings. This makes it much harder to see these warnings and act on them at the appropriate time. It is worse in the case where components throw exceptions on upgrade: users will not be aware of the problem until a document is indexed, instead of at index creation time. This commit adds a new check that pushes an empty string through all user-defined analyzers and normalizers when an IndexAnalyzers object is built for each index; deprecation warnings and exceptions are now emitted when indexes are created or opened. Fixes #42349
1 parent 9c6ffdc commit 8c16725

File tree

6 files changed

+245
-64
lines changed

6 files changed

+245
-64
lines changed

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java

+36-52
Original file line numberDiff line numberDiff line change
@@ -20,23 +20,18 @@
2020
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23-
import org.apache.lucene.analysis.MockTokenizer;
24-
import org.apache.lucene.analysis.Tokenizer;
2523
import org.elasticsearch.Version;
2624
import org.elasticsearch.cluster.metadata.IndexMetaData;
2725
import org.elasticsearch.common.settings.Settings;
2826
import org.elasticsearch.env.Environment;
2927
import org.elasticsearch.index.IndexSettings;
3028
import org.elasticsearch.index.analysis.IndexAnalyzers;
3129
import org.elasticsearch.index.analysis.NamedAnalyzer;
32-
import org.elasticsearch.index.analysis.TokenFilterFactory;
3330
import org.elasticsearch.test.ESTestCase;
3431
import org.elasticsearch.test.IndexSettingsModule;
3532
import org.elasticsearch.test.VersionUtils;
3633

3734
import java.io.IOException;
38-
import java.io.StringReader;
39-
import java.util.Map;
4035

4136
public class CommonAnalysisPluginTests extends ESTestCase {
4237

@@ -45,42 +40,37 @@ public class CommonAnalysisPluginTests extends ESTestCase {
4540
*/
4641
public void testNGramDeprecationWarning() throws IOException {
4742
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
48-
.put(IndexMetaData.SETTING_VERSION_CREATED,
43+
.put(IndexMetaData.SETTING_VERSION_CREATED,
4944
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
50-
.build();
45+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
46+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
47+
.putList("index.analysis.analyzer.custom_analyzer.filter", "nGram")
48+
.build();
5149

52-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
5350
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
54-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
55-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
56-
Tokenizer tokenizer = new MockTokenizer();
57-
tokenizer.setReader(new StringReader("foo bar"));
58-
assertNotNull(tokenFilterFactory.create(tokenizer));
59-
assertWarnings(
60-
"The [nGram] token filter name is deprecated and will be removed in a future version. "
61-
+ "Please change the filter name to [ngram] instead.");
51+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
6252
}
53+
54+
assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
55+
+ "Please change the filter name to [ngram] instead.");
6356
}
6457

6558
/**
6659
* Check that the deprecated name "nGram" throws an error since 7.0.0
6760
*/
6861
public void testNGramDeprecationError() throws IOException {
6962
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
70-
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
71-
.build();
63+
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
64+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
65+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
66+
.putList("index.analysis.analyzer.custom_analyzer.filter", "nGram")
67+
.build();
7268

73-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
7469
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
75-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
76-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
77-
Tokenizer tokenizer = new MockTokenizer();
78-
tokenizer.setReader(new StringReader("foo bar"));
79-
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
80-
assertEquals(
81-
"The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. Please change the filter"
82-
+ " name to [ngram] instead.",
83-
ex.getMessage());
70+
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
71+
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
72+
assertEquals("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
73+
+ "Please change the filter name to [ngram] instead.", e.getMessage());
8474
}
8575
}
8676

@@ -89,42 +79,36 @@ public void testNGramDeprecationError() throws IOException {
8979
*/
9080
public void testEdgeNGramDeprecationWarning() throws IOException {
9181
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
92-
.put(IndexMetaData.SETTING_VERSION_CREATED,
93-
VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
94-
.build();
82+
.put(IndexMetaData.SETTING_VERSION_CREATED,
83+
VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
84+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
85+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
86+
.putList("index.analysis.analyzer.custom_analyzer.filter", "edgeNGram")
87+
.build();
9588

96-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
9789
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
98-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
99-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
100-
Tokenizer tokenizer = new MockTokenizer();
101-
tokenizer.setReader(new StringReader("foo bar"));
102-
assertNotNull(tokenFilterFactory.create(tokenizer));
103-
assertWarnings(
104-
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
105-
+ "Please change the filter name to [edge_ngram] instead.");
90+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
10691
}
92+
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
93+
+ "Please change the filter name to [edge_ngram] instead.");
10794
}
10895

10996
/**
11097
* Check that the deprecated name "edgeNGram" throws an error for indices created since 7.0.0
11198
*/
11299
public void testEdgeNGramDeprecationError() throws IOException {
113100
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
114-
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
115-
.build();
101+
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
102+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
103+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
104+
.putList("index.analysis.analyzer.custom_analyzer.filter", "edgeNGram")
105+
.build();
116106

117-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
118107
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
119-
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
120-
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
121-
Tokenizer tokenizer = new MockTokenizer();
122-
tokenizer.setReader(new StringReader("foo bar"));
123-
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
124-
assertEquals(
125-
"The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. Please change the filter"
126-
+ " name to [edge_ngram] instead.",
127-
ex.getMessage());
108+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
109+
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
110+
assertEquals("The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
111+
+ "Please change the filter name to [edge_ngram] instead.", ex.getMessage());
128112
}
129113
}
130114

server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

+22
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index.analysis;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.TokenStream;
2223
import org.apache.lucene.analysis.core.KeywordTokenizer;
2324
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2425
import org.elasticsearch.ElasticsearchException;
@@ -542,6 +543,10 @@ public IndexAnalyzers build(IndexSettings indexSettings,
542543
tokenFilterFactoryFactories, charFilterFactoryFactories);
543544
}
544545

546+
for (Analyzer analyzer : normalizers.values()) {
547+
analyzer.normalize("", ""); // check for deprecations
548+
}
549+
545550
if (!analyzers.containsKey(DEFAULT_ANALYZER_NAME)) {
546551
analyzers.put(DEFAULT_ANALYZER_NAME,
547552
produceAnalyzer(DEFAULT_ANALYZER_NAME,
@@ -605,6 +610,7 @@ private static NamedAnalyzer produceAnalyzer(String name,
605610
} else {
606611
analyzer = new NamedAnalyzer(name, analyzerFactory.scope(), analyzerF, overridePositionIncrementGap);
607612
}
613+
checkVersions(analyzer);
608614
return analyzer;
609615
}
610616

@@ -632,4 +638,20 @@ private void processNormalizerFactory(
632638
NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF);
633639
normalizers.put(name, normalizer);
634640
}
641+
642+
// Some analysis components emit deprecation warnings or throw exceptions when used
643+
// with the wrong version of elasticsearch. These exceptions and warnings are
644+
// normally thrown when tokenstreams are constructed, which unless we build a
645+
// tokenstream up-front does not happen until a document is indexed. In order to
646+
// surface these warnings or exceptions as early as possible, we build an empty
647+
// tokenstream and pull it through an Analyzer at construction time.
648+
private static void checkVersions(Analyzer analyzer) {
649+
try (TokenStream ts = analyzer.tokenStream("", "")) {
650+
ts.reset();
651+
while (ts.incrementToken()) {}
652+
ts.end();
653+
} catch (IOException e) {
654+
throw new UncheckedIOException(e);
655+
}
656+
}
635657
}

server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java

+48-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.elasticsearch.index.analysis.AnalysisRegistry;
3939
import org.elasticsearch.index.analysis.CharFilterFactory;
4040
import org.elasticsearch.index.analysis.IndexAnalyzers;
41+
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
4142
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
4243
import org.elasticsearch.index.analysis.TokenFilterFactory;
4344
import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -109,6 +110,25 @@ public TokenStream create(TokenStream tokenStream) {
109110
}
110111
}
111112

113+
class DeprecatedTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
114+
115+
DeprecatedTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
116+
super(indexSettings, name, settings);
117+
}
118+
119+
@Override
120+
public TokenStream create(TokenStream tokenStream) {
121+
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
122+
return tokenStream;
123+
}
124+
125+
@Override
126+
public TokenStream normalize(TokenStream tokenStream) {
127+
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
128+
return tokenStream;
129+
}
130+
}
131+
112132
class AppendCharFilterFactory extends AbstractCharFilterFactory {
113133

114134
final String suffix;
@@ -137,7 +157,10 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
137157

138158
@Override
139159
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
140-
return singletonMap("mock", MockFactory::new);
160+
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
161+
filters.put("mock", MockFactory::new);
162+
filters.put("deprecated", DeprecatedTokenFilterFactory::new);
163+
return filters;
141164
}
142165

143166
@Override
@@ -507,4 +530,28 @@ public void testExceedSetMaxTokenLimit() {
507530
assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
508531
+ idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
509532
}
533+
534+
public void testDeprecationWarnings() throws IOException {
535+
AnalyzeAction.Request req = new AnalyzeAction.Request();
536+
req.tokenizer("standard");
537+
req.addTokenFilter("lowercase");
538+
req.addTokenFilter("deprecated");
539+
req.text("test text");
540+
541+
AnalyzeAction.Response analyze =
542+
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
543+
assertEquals(2, analyze.getTokens().size());
544+
assertWarnings("Using deprecated token filter [deprecated]");
545+
546+
// normalizer
547+
req = new AnalyzeAction.Request();
548+
req.addTokenFilter("lowercase");
549+
req.addTokenFilter("deprecated");
550+
req.text("text");
551+
552+
analyze =
553+
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
554+
assertEquals(1, analyze.getTokens().size());
555+
assertWarnings("Using deprecated token filter [deprecated]");
556+
}
510557
}

server/src/test/java/org/elasticsearch/index/IndexModuleTests.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.standard.StandardTokenizer;
2223
import org.apache.lucene.index.AssertingDirectoryReader;
2324
import org.apache.lucene.index.DirectoryReader;
2425
import org.apache.lucene.index.FieldInvertState;
@@ -442,7 +443,7 @@ public Analyzer get() {
442443
final Analyzer analyzer = new Analyzer() {
443444
@Override
444445
protected TokenStreamComponents createComponents(String fieldName) {
445-
throw new AssertionError("should not be here");
446+
return new TokenStreamComponents(new StandardTokenizer());
446447
}
447448

448449
@Override

0 commit comments

Comments (0)