From f0a14ad67c071233f37d931d2ef69202d7d24ac3 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 1 Jul 2019 08:29:39 +0100 Subject: [PATCH 1/3] Allow all token/char filters in normalizers --- .../index/analysis/AnalysisRegistry.java | 26 +++--- .../index/analysis/AnalyzerComponents.java | 10 +- .../analysis/CustomAnalyzerProvider.java | 5 +- .../analysis/CustomNormalizerProvider.java | 93 ------------------- .../analysis/ReloadableCustomAnalyzer.java | 2 +- .../index/analysis/CustomNormalizerTests.java | 29 ------ .../ReloadableCustomAnalyzerTests.java | 6 +- 7 files changed, 24 insertions(+), 147 deletions(-) delete mode 100644 server/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 755266604add5..b682cefe9f9c0 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -44,6 +44,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.function.BiFunction; import java.util.function.Function; +import java.util.function.Supplier; import java.util.stream.Collectors; import static java.util.Collections.unmodifiableMap; @@ -409,8 +410,11 @@ private Map buildMapping(Component component, IndexSettings setti continue; } } else if (component == Component.NORMALIZER) { + if (currentSettings.hasValue("tokenizer")) { + throw new IllegalArgumentException("Custom normalizer [" + name + "] cannot configure a tokenizer"); + } if (typeName == null || typeName.equals("custom")) { - T factory = (T) new CustomNormalizerProvider(settings, name, currentSettings); + T factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings); factories.put(name, factory); continue; } @@ -531,10 +535,10 @@ public IndexAnalyzers build(IndexSettings indexSettings, 
}); } for (Map.Entry> entry : normalizerProviders.entrySet()) { - processNormalizerFactory(entry.getKey(), entry.getValue(), normalizers, "keyword", - tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories); + processNormalizerFactory(entry.getKey(), entry.getValue(), normalizers, + () -> tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories); processNormalizerFactory(entry.getKey(), entry.getValue(), whitespaceNormalizers, - "whitespace", () -> new WhitespaceTokenizer(), tokenFilterFactoryFactories, charFilterFactoryFactories); + () -> WhitespaceTokenizer::new, tokenFilterFactoryFactories, charFilterFactoryFactories); } if (!analyzers.containsKey(DEFAULT_ANALYZER_NAME)) { @@ -575,7 +579,7 @@ private static NamedAnalyzer produceAnalyzer(String name, */ int overridePositionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP; if (analyzerFactory instanceof CustomAnalyzerProvider) { - ((CustomAnalyzerProvider) analyzerFactory).build(tokenizers, charFilters, tokenFilters); + ((CustomAnalyzerProvider) analyzerFactory).build(tokenizers::get, charFilters, tokenFilters); /* * Custom analyzers already default to the correct, version * dependent positionIncrementGap and the user is be able to @@ -603,20 +607,16 @@ private static NamedAnalyzer produceAnalyzer(String name, return analyzer; } - private void processNormalizerFactory( + private static void processNormalizerFactory( String name, AnalyzerProvider normalizerFactory, Map normalizers, - String tokenizerName, - TokenizerFactory tokenizerFactory, + Supplier tokenizerSupplier, Map tokenFilters, Map charFilters) { - if (tokenizerFactory == null) { - throw new IllegalStateException("keyword tokenizer factory is null, normalizers require analysis-common module"); - } - if (normalizerFactory instanceof CustomNormalizerProvider) { - ((CustomNormalizerProvider) normalizerFactory).build(tokenizerName, tokenizerFactory, charFilters, 
tokenFilters); + if (normalizerFactory instanceof CustomAnalyzerProvider) { + ((CustomAnalyzerProvider) normalizerFactory).build(n -> tokenizerSupplier.get(), charFilters, tokenFilters); } if (normalizers.containsKey(name)) { throw new IllegalStateException("already registered analyzer with name: " + name); diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerComponents.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerComponents.java index f150ac54558e0..d7388b13bbdd7 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerComponents.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerComponents.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.function.Function; /** * A class that groups analysis components necessary to produce a custom analyzer. @@ -49,14 +50,11 @@ public final class AnalyzerComponents { this.analysisMode = mode; } - static AnalyzerComponents createComponents(String name, Settings analyzerSettings, final Map tokenizers, + static AnalyzerComponents createComponents(String name, Settings analyzerSettings, final Function tokenizers, final Map charFilters, final Map tokenFilters) { String tokenizerName = analyzerSettings.get("tokenizer"); - if (tokenizerName == null) { - throw new IllegalArgumentException("Custom Analyzer [" + name + "] must be configured with a tokenizer"); - } - TokenizerFactory tokenizer = tokenizers.get(tokenizerName); + TokenizerFactory tokenizer = tokenizers.apply(tokenizerName); if (tokenizer == null) { throw new IllegalArgumentException( "Custom Analyzer [" + name + "] failed to find tokenizer under name " + "[" + tokenizerName + "]"); @@ -108,4 +106,4 @@ public CharFilterFactory[] getCharFilters() { public AnalysisMode analysisMode() { return this.analysisMode; } -} \ No newline at end of file +} diff --git 
a/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java b/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java index d8a50838e9df4..d84b18b854c41 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java @@ -25,6 +25,7 @@ import org.elasticsearch.index.mapper.TextFieldMapper; import java.util.Map; +import java.util.function.Function; import static org.elasticsearch.index.analysis.AnalyzerComponents.createComponents; @@ -44,7 +45,7 @@ public CustomAnalyzerProvider(IndexSettings indexSettings, this.analyzerSettings = settings; } - void build(final Map tokenizers, + void build(final Function tokenizers, final Map charFilters, final Map tokenFilters) { customAnalyzer = create(name(), analyzerSettings, tokenizers, charFilters, tokenFilters); @@ -54,7 +55,7 @@ void build(final Map tokenizers, * Factory method that either returns a plain {@link ReloadableCustomAnalyzer} if the components used for creation are supporting index * and search time use, or a {@link ReloadableCustomAnalyzer} if the components are intended for search time use only. */ - private static Analyzer create(String name, Settings analyzerSettings, Map tokenizers, + private static Analyzer create(String name, Settings analyzerSettings, Function tokenizers, Map charFilters, Map tokenFilters) { int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP; diff --git a/server/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java b/server/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java deleted file mode 100644 index 13ee76e8d6265..0000000000000 --- a/server/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.analysis; - -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.IndexSettings; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -/** - * A custom normalizer that is built out of a char and token filters. On the - * contrary to analyzers, it does not support tokenizers and only supports a - * subset of char and token filters. 
- */ -public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvider { - - private final Settings analyzerSettings; - - private CustomAnalyzer customAnalyzer; - - public CustomNormalizerProvider(IndexSettings indexSettings, - String name, Settings settings) { - super(indexSettings, name, settings); - this.analyzerSettings = settings; - } - - public void build(final String tokenizerName, final TokenizerFactory tokenizerFactory, final Map charFilters, - final Map tokenFilters) { - if (analyzerSettings.get("tokenizer") != null) { - throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer"); - } - - List charFilterNames = analyzerSettings.getAsList("char_filter"); - List charFiltersList = new ArrayList<>(charFilterNames.size()); - for (String charFilterName : charFilterNames) { - CharFilterFactory charFilter = charFilters.get(charFilterName); - if (charFilter == null) { - throw new IllegalArgumentException("Custom normalizer [" + name() + "] failed to find char_filter under name [" - + charFilterName + "]"); - } - if (charFilter instanceof NormalizingCharFilterFactory == false) { - throw new IllegalArgumentException("Custom normalizer [" + name() + "] may not use char filter [" - + charFilterName + "]"); - } - charFiltersList.add(charFilter); - } - - List tokenFilterNames = analyzerSettings.getAsList("filter"); - List tokenFilterList = new ArrayList<>(tokenFilterNames.size()); - for (String tokenFilterName : tokenFilterNames) { - TokenFilterFactory tokenFilter = tokenFilters.get(tokenFilterName); - if (tokenFilter == null) { - throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" - + tokenFilterName + "]"); - } - if (tokenFilter instanceof NormalizingTokenFilterFactory == false) { - throw new IllegalArgumentException("Custom normalizer [" + name() + "] may not use filter [" + tokenFilterName + "]"); - } - tokenFilterList.add(tokenFilter); - } - - 
this.customAnalyzer = new CustomAnalyzer( - tokenizerName, - tokenizerFactory, - charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]), - tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()]) - ); - } - - @Override - public CustomAnalyzer get() { - return this.customAnalyzer; - } -} diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzer.java b/server/src/main/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzer.java index 7d3b8532caeb0..6c1f48a7fe181 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzer.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzer.java @@ -120,7 +120,7 @@ public synchronized void reload(String name, final Map tokenizers, final Map charFilters, final Map tokenFilters) { - AnalyzerComponents components = AnalyzerComponents.createComponents(name, settings, tokenizers, charFilters, tokenFilters); + AnalyzerComponents components = AnalyzerComponents.createComponents(name, settings, tokenizers::get, charFilters, tokenFilters); this.components = components; } diff --git a/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index 697c39bb90d30..a047bafd6ce3b 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -103,36 +103,7 @@ public void testCharFilters() throws IOException { assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc")); } - public void testIllegalFilters() throws IOException { - Settings settings = Settings.builder() - .putList("index.analysis.normalizer.my_normalizer.filter", "mock_forbidden") - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .build(); - IllegalArgumentException e = 
expectThrows(IllegalArgumentException.class, - () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); - assertEquals("Custom normalizer [my_normalizer] may not use filter [mock_forbidden]", e.getMessage()); - } - - public void testIllegalCharFilters() throws IOException { - Settings settings = Settings.builder() - .putList("index.analysis.normalizer.my_normalizer.char_filter", "mock_forbidden") - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .build(); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); - assertEquals("Custom normalizer [my_normalizer] may not use char filter [mock_forbidden]", e.getMessage()); - } - private static class MockAnalysisPlugin implements AnalysisPlugin { - @Override - public List getPreConfiguredTokenFilters() { - return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new)); - } - - @Override - public List getPreConfiguredCharFilters() { - return singletonList(PreConfiguredCharFilter.singleton("mock_forbidden", false, Function.identity())); - } @Override public Map> getCharFilters() { diff --git a/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java b/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java index e60df7e2ce1a4..31c3d57b1167f 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java @@ -88,7 +88,7 @@ public void testBasicCtor() { .putList("filter", "my_filter") .build(); - AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, testAnalysis.tokenizer, testAnalysis.charFilter, + AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, 
testAnalysis.tokenizer::get, testAnalysis.charFilter, Collections.singletonMap("my_filter", NO_OP_SEARCH_TIME_FILTER)); try (ReloadableCustomAnalyzer analyzer = new ReloadableCustomAnalyzer(components, positionIncrementGap, offsetGap)) { @@ -106,7 +106,7 @@ public void testBasicCtor() { .put("tokenizer", "standard") .putList("filter", "lowercase") .build(); - AnalyzerComponents indexAnalyzerComponents = createComponents("my_analyzer", indexAnalyzerSettings, testAnalysis.tokenizer, + AnalyzerComponents indexAnalyzerComponents = createComponents("my_analyzer", indexAnalyzerSettings, testAnalysis.tokenizer::get, testAnalysis.charFilter, testAnalysis.tokenFilter); IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> new ReloadableCustomAnalyzer(indexAnalyzerComponents, positionIncrementGap, offsetGap)); @@ -123,7 +123,7 @@ public void testReloading() throws IOException, InterruptedException { .putList("filter", "my_filter") .build(); - AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, testAnalysis.tokenizer, testAnalysis.charFilter, + AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, testAnalysis.tokenizer::get, testAnalysis.charFilter, Collections.singletonMap("my_filter", NO_OP_SEARCH_TIME_FILTER)); int numThreads = randomIntBetween(5, 10); From 0d85b9b1718644f34e88d1b53fe3b7eedcbd47a3 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 1 Jul 2019 08:49:22 +0100 Subject: [PATCH 2/3] imports; checkstyle --- .../index/analysis/CustomNormalizerTests.java | 4 ---- .../analysis/ReloadableCustomAnalyzerTests.java | 12 ++++++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index a047bafd6ce3b..77229d6b5caa7 100644 --- 
a/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -19,7 +19,6 @@ package org.elasticsearch.index.analysis; -import org.apache.lucene.analysis.MockLowerCaseFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.settings.Settings; @@ -31,11 +30,8 @@ import java.io.IOException; import java.io.Reader; -import java.util.List; import java.util.Map; -import java.util.function.Function; -import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; public class CustomNormalizerTests extends ESTokenStreamTestCase { diff --git a/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java b/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java index 31c3d57b1167f..5bf2aec2e3c57 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/ReloadableCustomAnalyzerTests.java @@ -88,8 +88,8 @@ public void testBasicCtor() { .putList("filter", "my_filter") .build(); - AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, testAnalysis.tokenizer::get, testAnalysis.charFilter, - Collections.singletonMap("my_filter", NO_OP_SEARCH_TIME_FILTER)); + AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, testAnalysis.tokenizer::get, + testAnalysis.charFilter, Collections.singletonMap("my_filter", NO_OP_SEARCH_TIME_FILTER)); try (ReloadableCustomAnalyzer analyzer = new ReloadableCustomAnalyzer(components, positionIncrementGap, offsetGap)) { assertEquals(positionIncrementGap, analyzer.getPositionIncrementGap(randomAlphaOfLength(5))); @@ -106,8 +106,8 @@ public void testBasicCtor() { .put("tokenizer", "standard") .putList("filter", "lowercase") 
.build(); - AnalyzerComponents indexAnalyzerComponents = createComponents("my_analyzer", indexAnalyzerSettings, testAnalysis.tokenizer::get, - testAnalysis.charFilter, testAnalysis.tokenFilter); + AnalyzerComponents indexAnalyzerComponents = createComponents("my_analyzer", indexAnalyzerSettings, + testAnalysis.tokenizer::get, testAnalysis.charFilter, testAnalysis.tokenFilter); IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> new ReloadableCustomAnalyzer(indexAnalyzerComponents, positionIncrementGap, offsetGap)); assertEquals("ReloadableCustomAnalyzer must only be initialized with analysis components in AnalysisMode.SEARCH_TIME mode", @@ -123,8 +123,8 @@ public void testReloading() throws IOException, InterruptedException { .putList("filter", "my_filter") .build(); - AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, testAnalysis.tokenizer::get, testAnalysis.charFilter, - Collections.singletonMap("my_filter", NO_OP_SEARCH_TIME_FILTER)); + AnalyzerComponents components = createComponents("my_analyzer", analyzerSettings, testAnalysis.tokenizer::get, + testAnalysis.charFilter, Collections.singletonMap("my_filter", NO_OP_SEARCH_TIME_FILTER)); int numThreads = randomIntBetween(5, 10); ExecutorService executorService = Executors.newFixedThreadPool(numThreads); From ef3002e177b2f4877275fe3f84cec37cab9bf9f1 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 1 Jul 2019 14:57:23 +0100 Subject: [PATCH 3/3] docs; explicitly use KeywordTokenizer constructor --- docs/reference/analysis/normalizers.asciidoc | 11 +---------- .../index/analysis/AnalysisRegistry.java | 3 ++- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/docs/reference/analysis/normalizers.asciidoc b/docs/reference/analysis/normalizers.asciidoc index ddf37acd67b12..02be7987695aa 100644 --- a/docs/reference/analysis/normalizers.asciidoc +++ b/docs/reference/analysis/normalizers.asciidoc @@ -2,16 +2,7 @@ == Normalizers Normalizers are 
similar to analyzers except that they may only emit a single -token. As a consequence, they do not have a tokenizer and only accept a subset -of the available char filters and token filters. Only the filters that work on -a per-character basis are allowed. For instance a lowercasing filter would be -allowed, but not a stemming filter, which needs to look at the keyword as a -whole. The current list of filters that can be used in a normalizer is -following: `arabic_normalization`, `asciifolding`, `bengali_normalization`, -`cjk_width`, `decimal_digit`, `elision`, `german_normalization`, -`hindi_normalization`, `indic_normalization`, `lowercase`, -`persian_normalization`, `scandinavian_folding`, `serbian_normalization`, -`sorani_normalization`, `uppercase`. +token. As a consequence, they do not have a tokenizer. [float] === Custom normalizers diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index b682cefe9f9c0..b5f88bc8fbd21 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; @@ -536,7 +537,7 @@ public IndexAnalyzers build(IndexSettings indexSettings, } for (Map.Entry> entry : normalizerProviders.entrySet()) { processNormalizerFactory(entry.getKey(), entry.getValue(), normalizers, - () -> tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories); + () -> KeywordTokenizer::new, tokenFilterFactoryFactories, charFilterFactoryFactories); processNormalizerFactory(entry.getKey(), entry.getValue(), 
whitespaceNormalizers, () -> WhitespaceTokenizer::new, tokenFilterFactoryFactories, charFilterFactoryFactories); }