  */
 package org.elasticsearch.xpack.core.ml.job.config;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.elasticsearch.Version;
-import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.UUIDs;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
...
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
 import org.elasticsearch.xpack.core.ml.MlParserType;
 
...
 import java.util.Map;
 import java.util.Objects;
 
-
 /**
  * Configuration for the categorization analyzer.
  *
  * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction <code>_analyze</code> endpoint}.
- * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
+ * To summarize, the first option is to specify the name of an out-of-the-box analyzer:
  * <code>
  * "categorization_analyzer" : "standard"
  * </code>
...
  * { "type" : "pattern_replace", "pattern": "^[0-9].*" }
  * ]
  * </code>
- *
- * Unfortunately there is no easy to to reuse a subset of the <code>_analyze</code> action implementation, so much
- * of the code in this file is copied from {@link TransportAnalyzeAction}. Unfortunately the logic required here is
- * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
- * TODO: consider refactoring ES core to allow more reuse.
  */
 public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {
 
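The retained javadoc above documents two shapes for "categorization_analyzer": a bare analyzer name, or an inline custom definition made up of char filters, a tokenizer, and token filters. The following is a minimal sketch (not part of this change) of assembling the custom-definition form with the XContentFactory builder this class already imports; the "pattern_replace" entry mirrors the javadoc example, while "html_strip", "whitespace", and the Strings.toString helper are assumptions for illustration only.

import java.io.IOException;

import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

public final class CategorizationAnalyzerJsonSketch {
    public static void main(String[] args) throws IOException {
        // Build the inline custom-definition form described by the class javadoc.
        XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint();
        builder.startObject()
                .startObject("categorization_analyzer")
                    .startArray("char_filter")
                        .value("html_strip")                  // assumed out-of-the-box name
                        .startObject()
                            .field("type", "pattern_replace") // from the javadoc example
                            .field("pattern", "^[0-9].*")
                        .endObject()
                    .endArray()
                    .field("tokenizer", "whitespace")         // assumed out-of-the-box name
                .endObject()
            .endObject();
        System.out.println(Strings.toString(builder));        // assumed helper; APIs vary by version
    }
}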
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         return builder;
     }
 
-    /**
-     * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
-     * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
-     * a newly created custom analyzer the caller is responsible for closing it.
-     * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
-     *         for closing it.
-     */
-    public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-        if (analyzer != null) {
-            Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
-            if (globalAnalyzer == null) {
-                throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
-            }
-            return new Tuple<>(globalAnalyzer, Boolean.FALSE);
-        } else {
-            List<CharFilterFactory> charFilterFactoryList =
-                    parseCharFilterFactories(analysisRegistry, environment);
-
-            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
-                    environment);
-
-            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
-                    environment, tokenizerFactory, charFilterFactoryList);
-
-            return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
-                    charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
-                    tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
-        }
-    }
-
-
-    /**
-     * Get char filter factories for each configured char filter. Each configuration
-     * element can be the name of an out-of-the-box char filter, or a custom definition.
-     */
-    private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
-                                                             Environment environment) throws IOException {
-        final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition charFilter : charFilters) {
-            final CharFilterFactory charFilterFactory;
-            if (charFilter.name != null) {
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilter.name);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
-                }
-                charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
-            } else {
-                String charFilterTypeName = charFilter.definition.get("type");
-                if (charFilterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
-                        analysisRegistry.getCharFilterProvider(charFilterTypeName);
-                if (charFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
-                }
-                Settings settings = augmentSettings(charFilter.definition);
-                // Need to set anonymous "name" of char_filter
-                charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_charfilter", settings);
-            }
-            if (charFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
-            }
-            charFilterFactoryList.add(charFilterFactory);
-        }
-        return charFilterFactoryList;
-    }
-
-    /**
-     * Get the tokenizer factory for the configured tokenizer. The configuration
-     * can be the name of an out-of-the-box tokenizer, or a custom definition.
-     */
-    private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
-                                                                  Environment environment) throws IOException {
-        final String name;
-        final TokenizerFactory tokenizerFactory;
-        if (tokenizer.name != null) {
-            name = tokenizer.name;
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
-            }
-            tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
-        } else {
-            String tokenizerTypeName = tokenizer.definition.get("type");
-            if (tokenizerTypeName == null) {
-                throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
-            }
-            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
-                    analysisRegistry.getTokenizerProvider(tokenizerTypeName);
-            if (tokenizerFactoryFactory == null) {
-                throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
-            }
-            Settings settings = augmentSettings(tokenizer.definition);
-            // Need to set anonymous "name" of tokenizer
-            name = "_anonymous_tokenizer";
-            tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
-        }
-        return new Tuple<>(name, tokenizerFactory);
-    }
-
-    /**
-     * Get token filter factories for each configured token filter. Each configuration
-     * element can be the name of an out-of-the-box token filter, or a custom definition.
-     */
-    private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
-                                                               Tuple<String, TokenizerFactory> tokenizerFactory,
-                                                               List<CharFilterFactory> charFilterFactoryList) throws IOException {
-        final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
-        for (NameOrDefinition tokenFilter : tokenFilters) {
-            TokenFilterFactory tokenFilterFactory;
-            if (tokenFilter.name != null) {
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
-                tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
-                }
-                tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
-            } else {
-                String filterTypeName = tokenFilter.definition.get("type");
-                if (filterTypeName == null) {
-                    throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
-                }
-                AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
-                        analysisRegistry.getTokenFilterProvider(filterTypeName);
-                if (tokenFilterFactoryFactory == null) {
-                    throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
-                }
-                Settings settings = augmentSettings(tokenFilter.definition);
-                // Need to set anonymous "name" of token_filter
-                tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
-                        "_anonymous_tokenfilter", settings);
-                tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
-                        tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
-            }
-            if (tokenFilterFactory == null) {
-                throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
-            }
-            tokenFilterFactoryList.add(tokenFilterFactory);
-        }
-        return tokenFilterFactoryList;
-    }
-
-    /**
-     * The Elasticsearch analysis functionality is designed to work with indices. For
-     * categorization we have to pretend we've got some index settings.
-     */
-    private IndexSettings buildDummyIndexSettings(Settings settings) {
-        IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
-        return new IndexSettings(metaData, Settings.EMPTY);
-    }
-
-    /**
-     * The behaviour of Elasticsearch analyzers can vary between versions.
-     * For categorization we'll always use the latest version of the text analysis.
-     * The other settings are just to stop classes that expect to be associated with
-     * an index from complaining.
-     */
-    private Settings augmentSettings(Settings settings) {
-        return Settings.builder().put(settings)
-                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-                .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-                .build();
-    }
-
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
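The removed toAnalyzer() carried a subtle ownership contract: a global analyzer fetched from the AnalysisRegistry is shared state and must NOT be closed, whereas a freshly constructed CustomAnalyzer must be closed by whoever asked for it, and the Boolean in the returned Tuple carries that flag. A minimal caller-side sketch of honouring the contract (the enclosing method and its arguments are assumed for illustration, not taken from this change):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;

final class ToAnalyzerUsageSketch {
    static void withAnalyzer(CategorizationAnalyzerConfig config, AnalysisRegistry registry,
                             Environment environment) throws IOException {
        Tuple<Analyzer, Boolean> result = config.toAnalyzer(registry, environment);
        Analyzer analyzer = result.v1();
        try {
            // ... tokenize categorization field values here ...
        } finally {
            if (result.v2()) {    // TRUE only when a new custom analyzer was created
                analyzer.close(); // global analyzers are shared and must stay open
            }
        }
    }
}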
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() {
             }
             return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
         }
-
-        /**
-         * Verify that the builder will build a valid config. This is not done as part of the basic build
-         * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
-         * known, and the validity of these names could change over time.
-         */
-        public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
-            Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
-            if (tuple.v2()) {
-                tuple.v1().close();
-            }
-        }
     }
 }
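The removed verify() rounds out the design: build() stays cheap and purely structural, while verify() resolves every referenced analyzer, tokenizer, and filter name against the live registry by actually constructing an analyzer, closing it again when the builder owns it. A hedged sketch of the calling pattern, assuming the enclosing nested class is named Builder (which this diff only implies) and eliding builder population because the setters fall outside the diff:

import java.io.IOException;

import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;

final class VerifySketch {
    // Throws IllegalArgumentException (from toAnalyzer) when a referenced name is unknown,
    // e.g. because the set of installed analysis components has changed since the job was created.
    static void validateBeforeUse(CategorizationAnalyzerConfig.Builder builder,
                                  AnalysisRegistry registry, Environment environment) throws IOException {
        builder.verify(registry, environment); // builds and, if owned, closes a throwaway analyzer
    }
}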