  */
 package org.elasticsearch.xpack.core.ml.job.config;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.elasticsearch.Version;
-import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.UUIDs;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
...
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 
...
 import java.util.Map;
 import java.util.Objects;
 
-
 /**
  * Configuration for the categorization analyzer.
  *
  * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction <code>_analyze</code> endpoint}.
- * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
+ * To summarize, the first option is to specify the name of an out-of-the-box analyzer:
  * <code>
  * "categorization_analyzer" : "standard"
  * </code>
...
  * { "type" : "pattern_replace", "pattern": "^[0-9].*" }
  * ]
  * </code>
- *
- * Unfortunately there is no easy to to reuse a subset of the <code>_analyze</code> action implementation, so much
- * of the code in this file is copied from {@link TransportAnalyzeAction}. Unfortunately the logic required here is
- * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
- * TODO: consider refactoring ES core to allow more reuse.
  */
 public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {
 
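The retained javadoc above documents two shapes for "categorization_analyzer": a bare analyzer name, or an inline custom definition made up of char filters, a tokenizer, and token filters. The following is a minimal sketch (not part of this change) of assembling the custom-definition form with the XContentFactory builder this class already imports; the "pattern_replace" entry mirrors the javadoc example, while "html_strip", "whitespace", and the Strings.toString helper are assumptions for illustration only.

import java.io.IOException;

import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

public final class CategorizationAnalyzerJsonSketch {
    public static void main(String[] args) throws IOException {
        // Build the inline custom-definition form described by the class javadoc.
        XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint();
        builder.startObject()
                .startObject("categorization_analyzer")
                    .startArray("char_filter")
                        .value("html_strip")                  // assumed out-of-the-box name
                        .startObject()
                            .field("type", "pattern_replace") // from the javadoc example
                            .field("pattern", "^[0-9].*")
                        .endObject()
                    .endArray()
                    .field("tokenizer", "whitespace")         // assumed out-of-the-box name
                .endObject()
            .endObject();
        System.out.println(Strings.toString(builder));        // assumed helper; APIs vary by version
    }
}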
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         return builder;
     }
 
-    /**
-     * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
-     * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
-     * a newly created custom analyzer the caller is responsible for closing it.
-     * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
-     *         for closing it.
-     */
-    public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-        if (analyzer != null) {
-            Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
-            if (globalAnalyzer == null) {
-                throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
-            }
-            return new Tuple<>(globalAnalyzer, Boolean.FALSE);
-        } else {
-            List<CharFilterFactory> charFilterFactoryList =
-                    parseCharFilterFactories(analysisRegistry, environment);
-
-            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
-                    environment);
-
-            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
-                    environment, tokenizerFactory, charFilterFactoryList);
-
-            return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
-                    charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
-                    tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
-        }
-    }
-
-
-    /**
-     * Get char filter factories for each configured char filter. Each configuration
-     * element can be the name of an out-of-the-box char filter, or a custom definition.
-     */
-    private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
-                                                             Environment environment) throws IOException {
-        final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition charFilter : charFilters) {
-            final CharFilterFactory charFilterFactory;
-            if (charFilter.name != null) {
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilter.name);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
-                }
-                charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
-            } else {
-                String charFilterTypeName = charFilter.definition.get("type");
-                if (charFilterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilterTypeName);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
-                }
-                Settings settings = augmentSettings(charFilter.definition);
-                // Need to set anonymous "name" of char_filter
-                charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_charfilter", settings);
-            }
-            if (charFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
-            }
-            charFilterFactoryList.add(charFilterFactory);
-        }
-        return charFilterFactoryList;
-    }
-
-    /**
-     * Get the tokenizer factory for the configured tokenizer. The configuration
-     * can be the name of an out-of-the-box tokenizer, or a custom definition.
-     */
-    private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
-                                                                  Environment environment) throws IOException {
-        final String name;
-        final TokenizerFactory tokenizerFactory;
-        if (tokenizer.name != null) {
-            name = tokenizer.name;
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
-            }
-            tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
-        } else {
-            String tokenizerTypeName = tokenizer.definition.get("type");
-            if (tokenizerTypeName == null) {
-                throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
-            }
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
-                    analysisRegistry.getTokenizerProvider(tokenizerTypeName);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
-            }
-            Settings settings = augmentSettings(tokenizer.definition);
-            // Need to set anonymous "name" of tokenizer
-            name = "_anonymous_tokenizer";
-            tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
-        }
-        return new Tuple<>(name, tokenizerFactory);
-    }
-
-    /**
-     * Get token filter factories for each configured token filter. Each configuration
-     * element can be the name of an out-of-the-box token filter, or a custom definition.
-     */
-    private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
-                                                               Tuple<String, TokenizerFactory> tokenizerFactory,
-                                                               List<CharFilterFactory> charFilterFactoryList) throws IOException {
-        final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition tokenFilter : tokenFilters) {
-            TokenFilterFactory tokenFilterFactory;
-            if (tokenFilter.name != null) {
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
-                tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
-                }
-                tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
-            } else {
-                String filterTypeName = tokenFilter.definition.get("type");
-                if (filterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
-                        analysisRegistry.getTokenFilterProvider(filterTypeName);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
-                }
-                Settings settings = augmentSettings(tokenFilter.definition);
-                // Need to set anonymous "name" of token_filter
-                tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_tokenfilter", settings);
-                tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
-                        tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
-            }
-            if (tokenFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
-            }
-            tokenFilterFactoryList.add(tokenFilterFactory);
-        }
-        return tokenFilterFactoryList;
-    }
-
-    /**
-     * The Elasticsearch analysis functionality is designed to work with indices. For
-     * categorization we have to pretend we've got some index settings.
-     */
-    private IndexSettings buildDummyIndexSettings(Settings settings) {
-        IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
-        return new IndexSettings(metaData, Settings.EMPTY);
-    }
-
-    /**
-     * The behaviour of Elasticsearch analyzers can vary between versions.
-     * For categorization we'll always use the latest version of the text analysis.
-     * The other settings are just to stop classes that expect to be associated with
-     * an index from complaining.
-     */
-    private Settings augmentSettings(Settings settings) {
-        return Settings.builder().put(settings)
-                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-                .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-                .build();
-    }
-
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
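The removed toAnalyzer() carried a subtle ownership contract: a global analyzer fetched from the AnalysisRegistry is shared state and must NOT be closed, whereas a freshly constructed CustomAnalyzer must be closed by whoever asked for it, and the Boolean in the returned Tuple carries that flag. A minimal caller-side sketch of honouring the contract (the enclosing method and its arguments are assumed for illustration, not taken from this change):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;

final class ToAnalyzerUsageSketch {
    static void withAnalyzer(CategorizationAnalyzerConfig config, AnalysisRegistry registry,
                             Environment environment) throws IOException {
        Tuple<Analyzer, Boolean> result = config.toAnalyzer(registry, environment);
        Analyzer analyzer = result.v1();
        try {
            // ... tokenize categorization field values here ...
        } finally {
            if (result.v2()) {    // TRUE only when a new custom analyzer was created
                analyzer.close(); // global analyzers are shared and must stay open
            }
        }
    }
}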
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() {
             }
             return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
         }
-
-        /**
-         * Verify that the builder will build a valid config. This is not done as part of the basic build
-         * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
-         * known, and the validity of these names could change over time.
-         */
-        public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
-            if (tuple.v2()) {
-                tuple.v1().close();
-            }
-        }
     }
 }
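The removed verify() rounds out the design: build() stays cheap and purely structural, while verify() resolves every referenced analyzer, tokenizer, and filter name against the live registry by actually constructing an analyzer, closing it again when the builder owns it. A hedged sketch of the calling pattern, assuming the enclosing nested class is named Builder (which this diff only implies) and eliding builder population because the setters fall outside the diff:

import java.io.IOException;

import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;

final class VerifySketch {
    // Throws IllegalArgumentException (from toAnalyzer) when a referenced name is unknown,
    // e.g. because the set of installed analysis components has changed since the job was created.
    static void validateBeforeUse(CategorizationAnalyzerConfig.Builder builder,
                                  AnalysisRegistry registry, Environment environment) throws IOException {
        builder.verify(registry, environment); // builds and, if owned, closes a throwaway analyzer
    }
}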