Commit e87afdf
[ML] Move analyzer dependencies out of categorization config (#32123)
The ML config classes will shortly be moved to the X-Pack protocol library to allow the ML APIs to be moved to the high-level REST client. Dependencies on server functionality should be removed from the config classes before this happens. This change is entirely about moving code between packages; it does not add or remove any functionality or tests.
1 parent 5018c47 commit e87afdf
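
To make the shape of the refactoring concrete: the validation entry point moves from the config's Builder into the server-side ML plugin. A minimal before/after sketch of the call site, using only names that appear in the diffs below:

    // Before this commit: the config's Builder reached into server analysis code itself.
    new CategorizationAnalyzerConfig.Builder(config).verify(analysisRegistry, environment);

    // After this commit: a server-side class owns the verification logic, leaving the
    // config class as a plain data holder that can move to the protocol library.
    CategorizationAnalyzer.verifyConfigBuilder(new CategorizationAnalyzerConfig.Builder(config),
            analysisRegistry, environment);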

File tree: 6 files changed, +382 -381 lines

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java (+1, -203)
@@ -5,14 +5,8 @@
  */
 package org.elasticsearch.xpack.core.ml.job.config;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.elasticsearch.Version;
-import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.UUIDs;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
@@ -22,15 +16,6 @@
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 
@@ -42,12 +27,11 @@
 import java.util.Map;
 import java.util.Objects;
 
-
 /**
  * Configuration for the categorization analyzer.
  *
  * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction <code>_analyze</code> endpoint}.
- * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
+ * To summarize, the first option is to specify the name of an out-of-the-box analyzer:
  * <code>
  * "categorization_analyzer" : "standard"
  * </code>
@@ -66,11 +50,6 @@
 * { "type" : "pattern_replace", "pattern": "^[0-9].*" }
 * ]
 * </code>
- *
- * Unfortunately there is no easy to to reuse a subset of the <code>_analyze</code> action implementation, so much
- * of the code in this file is copied from {@link TransportAnalyzeAction}. Unfortunately the logic required here is
- * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
- * TODO: consider refactoring ES core to allow more reuse.
 */
 public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {
 
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         return builder;
     }
 
-    /**
-     * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
-     * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
-     * a newly created custom analyzer the caller is responsible for closing it.
-     * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
-     *         for closing it.
-     */
-    public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-        if (analyzer != null) {
-            Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
-            if (globalAnalyzer == null) {
-                throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
-            }
-            return new Tuple<>(globalAnalyzer, Boolean.FALSE);
-        } else {
-            List<CharFilterFactory> charFilterFactoryList =
-                    parseCharFilterFactories(analysisRegistry, environment);
-
-            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
-                    environment);
-
-            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
-                    environment, tokenizerFactory, charFilterFactoryList);
-
-            return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
-                    charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
-                    tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
-        }
-    }
-
-
-    /**
-     * Get char filter factories for each configured char filter. Each configuration
-     * element can be the name of an out-of-the-box char filter, or a custom definition.
-     */
-    private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
-                                                             Environment environment) throws IOException {
-        final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition charFilter : charFilters) {
-            final CharFilterFactory charFilterFactory;
-            if (charFilter.name != null) {
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilter.name);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
-                }
-                charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
-            } else {
-                String charFilterTypeName = charFilter.definition.get("type");
-                if (charFilterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilterTypeName);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
-                }
-                Settings settings = augmentSettings(charFilter.definition);
-                // Need to set anonymous "name" of char_filter
-                charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_charfilter", settings);
-            }
-            if (charFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
-            }
-            charFilterFactoryList.add(charFilterFactory);
-        }
-        return charFilterFactoryList;
-    }
-
-    /**
-     * Get the tokenizer factory for the configured tokenizer. The configuration
-     * can be the name of an out-of-the-box tokenizer, or a custom definition.
-     */
-    private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
-                                                                  Environment environment) throws IOException {
-        final String name;
-        final TokenizerFactory tokenizerFactory;
-        if (tokenizer.name != null) {
-            name = tokenizer.name;
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
-            }
-            tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
-        } else {
-            String tokenizerTypeName = tokenizer.definition.get("type");
-            if (tokenizerTypeName == null) {
-                throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
-            }
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
-                    analysisRegistry.getTokenizerProvider(tokenizerTypeName);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
-            }
-            Settings settings = augmentSettings(tokenizer.definition);
-            // Need to set anonymous "name" of tokenizer
-            name = "_anonymous_tokenizer";
-            tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
-        }
-        return new Tuple<>(name, tokenizerFactory);
-    }
-
-    /**
-     * Get token filter factories for each configured token filter. Each configuration
-     * element can be the name of an out-of-the-box token filter, or a custom definition.
-     */
-    private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
-                                                               Tuple<String, TokenizerFactory> tokenizerFactory,
-                                                               List<CharFilterFactory> charFilterFactoryList) throws IOException {
-        final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition tokenFilter : tokenFilters) {
-            TokenFilterFactory tokenFilterFactory;
-            if (tokenFilter.name != null) {
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
-                tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
-                }
-                tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
-            } else {
-                String filterTypeName = tokenFilter.definition.get("type");
-                if (filterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
-                        analysisRegistry.getTokenFilterProvider(filterTypeName);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
-                }
-                Settings settings = augmentSettings(tokenFilter.definition);
-                // Need to set anonymous "name" of token_filter
-                tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_tokenfilter", settings);
-                tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
-                        tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
-            }
-            if (tokenFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
-            }
-            tokenFilterFactoryList.add(tokenFilterFactory);
-        }
-        return tokenFilterFactoryList;
-    }
-
-    /**
-     * The Elasticsearch analysis functionality is designed to work with indices. For
-     * categorization we have to pretend we've got some index settings.
-     */
-    private IndexSettings buildDummyIndexSettings(Settings settings) {
-        IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
-        return new IndexSettings(metaData, Settings.EMPTY);
-    }
-
-    /**
-     * The behaviour of Elasticsearch analyzers can vary between versions.
-     * For categorization we'll always use the latest version of the text analysis.
-     * The other settings are just to stop classes that expect to be associated with
-     * an index from complaining.
-     */
-    private Settings augmentSettings(Settings settings) {
-        return Settings.builder().put(settings)
-                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-                .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-                .build();
-    }
-
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() {
             }
             return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
         }
-
-        /**
-         * Verify that the builder will build a valid config. This is not done as part of the basic build
-         * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
-         * known, and the validity of these names could change over time.
-         */
-        public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
-            if (tuple.v2()) {
-                tuple.v1().close();
-            }
-        }
     }
 }
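
A note for readers of the deleted toAnalyzer method above: it returned a Tuple<Analyzer, Boolean> that encoded close-ownership, because global analyzers are shared and must not be closed, while newly built custom analyzers must be. A sketch of a caller honouring that contract, mirroring the removed Builder.verify implementation:

    Tuple<Analyzer, Boolean> tuple = config.toAnalyzer(analysisRegistry, environment);
    try {
        // ... use tuple.v1() to tokenize the text to be categorized ...
    } finally {
        if (tuple.v2()) {           // true only for a newly created custom analyzer
            tuple.v1().close();     // a shared global analyzer must NOT be closed here
        }
    }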

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/Job.java (+2, -16)
@@ -21,8 +21,6 @@
 import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser.Token;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 import org.elasticsearch.xpack.core.ml.job.messages.Messages;
 import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndexFields;
@@ -777,8 +775,8 @@ public Builder setAnalysisConfig(AnalysisConfig.Builder configBuilder) {
             return this;
         }
 
-        public AnalysisLimits getAnalysisLimits() {
-            return analysisLimits;
+        public AnalysisConfig getAnalysisConfig() {
+            return analysisConfig;
         }
 
         public Builder setAnalysisLimits(AnalysisLimits analysisLimits) {
@@ -1081,18 +1079,6 @@ public void validateAnalysisLimitsAndSetDefaults(@Nullable ByteSizeValue maxMode
                     AnalysisLimits.DEFAULT_MODEL_MEMORY_LIMIT_MB);
         }
 
-        /**
-         * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
-         * The overall structure can be validated at parse time, but the exact names need to be checked separately,
-         * as plugins that provide the functionality can be installed/uninstalled.
-         */
-        public void validateCategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            CategorizationAnalyzerConfig categorizationAnalyzerConfig = analysisConfig.getCategorizationAnalyzerConfig();
-            if (categorizationAnalyzerConfig != null) {
-                new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig).verify(analysisRegistry, environment);
-            }
-        }
-
         private void validateGroups() {
             for (String group : this.groups) {
                 if (MlStrings.isValidId(group) == false) {
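
The Javadoc on the removed method explains why this check cannot happen at parse time. For instance, a config like the following (the tokenizer name is hypothetical, purely for illustration) is structurally valid and parses anywhere, but the name can only be resolved against the AnalysisRegistry of a node that actually has the providing analysis plugin loaded:

    "categorization_analyzer" : {
        "tokenizer" : "some_plugin_tokenizer",
        "filter" : [ "lowercase" ]
    }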

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/JobManager.java (+19, -1)
@@ -39,6 +39,7 @@
 import org.elasticsearch.xpack.core.ml.action.UpdateJobAction;
 import org.elasticsearch.xpack.core.ml.action.util.QueryPage;
 import org.elasticsearch.xpack.core.ml.job.config.AnalysisLimits;
+import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
 import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
 import org.elasticsearch.xpack.core.ml.job.config.Job;
 import org.elasticsearch.xpack.core.ml.job.config.JobState;
@@ -50,6 +51,7 @@
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
 import org.elasticsearch.xpack.ml.job.persistence.JobProvider;
 import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
 import org.elasticsearch.xpack.ml.job.process.autodetect.UpdateParams;
@@ -170,14 +172,30 @@ public JobState getJobState(String jobId) {
         return MlMetadata.getJobState(jobId, tasks);
     }
 
+    /**
+     * Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
+     * This validation has to be done server-side; it cannot be done in a client as that won't have loaded the
+     * appropriate analysis modules/plugins.
+     * The overall structure can be validated at parse time, but the exact names need to be checked separately,
+     * as plugins that provide the functionality can be installed/uninstalled.
+     */
+    static void validateCategorizationAnalyzer(Job.Builder jobBuilder, AnalysisRegistry analysisRegistry, Environment environment)
+            throws IOException {
+        CategorizationAnalyzerConfig categorizationAnalyzerConfig = jobBuilder.getAnalysisConfig().getCategorizationAnalyzerConfig();
+        if (categorizationAnalyzerConfig != null) {
+            CategorizationAnalyzer.verifyConfigBuilder(new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig),
+                    analysisRegistry, environment);
+        }
+    }
+
     /**
      * Stores a job in the cluster state
      */
     public void putJob(PutJobAction.Request request, AnalysisRegistry analysisRegistry, ClusterState state,
                        ActionListener<PutJobAction.Response> actionListener) throws IOException {
 
         request.getJobBuilder().validateAnalysisLimitsAndSetDefaults(maxModelMemoryLimit);
-        request.getJobBuilder().validateCategorizationAnalyzer(analysisRegistry, environment);
+        validateCategorizationAnalyzer(request.getJobBuilder(), analysisRegistry, environment);
 
         Job job = request.getJobBuilder().build(new Date());
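
The "dependencies on server functionality" named in the commit message are easy to see in the helpers deleted from CategorizationAnalyzerConfig above: IndexMetaData, IndexSettings, AnalysisRegistry and AnalysisModule are all server classes. Condensed from the removed buildDummyIndexSettings/augmentSettings pair (with "definition" standing for a custom filter or tokenizer's settings), this is the "pretend index" trick they implemented:

    // Fabricate just enough index settings for the analysis factories to accept,
    // always requesting the latest text-analysis behaviour.
    Settings settings = Settings.builder().put(definition)
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
            .build();
    IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
    IndexSettings dummyIndexSettings = new IndexSettings(metaData, Settings.EMPTY);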