-
Notifications
You must be signed in to change notification settings - Fork 25.2k
Allow all token/char filters in normalizers #43803
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.Function; | ||
|
||
/** | ||
* A class that groups analysis components necessary to produce a custom analyzer. | ||
|
@@ -49,14 +50,11 @@ public final class AnalyzerComponents { | |
this.analysisMode = mode; | ||
} | ||
|
||
static AnalyzerComponents createComponents(String name, Settings analyzerSettings, final Map<String, TokenizerFactory> tokenizers, | ||
static AnalyzerComponents createComponents(String name, Settings analyzerSettings, final Function<String, TokenizerFactory> tokenizers, | ||
final Map<String, CharFilterFactory> charFilters, final Map<String, TokenFilterFactory> tokenFilters) { | ||
String tokenizerName = analyzerSettings.get("tokenizer"); | ||
if (tokenizerName == null) { | ||
throw new IllegalArgumentException("Custom Analyzer [" + name + "] must be configured with a tokenizer"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we make sure the name is set elsewhere or is this not necessary anymore? Otherwise it might be good to keep, I haven'r checked what the follow-up exception would be if the name is "null" and the function is called. Might be more cryptic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The check has been moved to |
||
} | ||
|
||
TokenizerFactory tokenizer = tokenizers.get(tokenizerName); | ||
TokenizerFactory tokenizer = tokenizers.apply(tokenizerName); | ||
if (tokenizer == null) { | ||
throw new IllegalArgumentException( | ||
"Custom Analyzer [" + name + "] failed to find tokenizer under name " + "[" + tokenizerName + "]"); | ||
|
@@ -108,4 +106,4 @@ public CharFilterFactory[] getCharFilters() { | |
public AnalysisMode analysisMode() { | ||
return this.analysisMode; | ||
} | ||
} | ||
} |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,6 @@ | |
|
||
package org.elasticsearch.index.analysis; | ||
|
||
import org.apache.lucene.analysis.MockLowerCaseFilter; | ||
import org.apache.lucene.analysis.MockTokenizer; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.elasticsearch.common.settings.Settings; | ||
|
@@ -31,11 +30,8 @@ | |
|
||
import java.io.IOException; | ||
import java.io.Reader; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.Function; | ||
|
||
import static java.util.Collections.singletonList; | ||
import static java.util.Collections.singletonMap; | ||
|
||
public class CustomNormalizerTests extends ESTokenStreamTestCase { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need this test when CustomNormalizerProvider is gone? I saw there is also no CustomAnalyzerTests, maybe this could be renamed and a few tests added that test the CustomAnalyzer parts that are not covered here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's still a distinction in settings between analyzers and normalizers, so I think this test is still useful? |
||
|
@@ -103,36 +99,7 @@ public void testCharFilters() throws IOException { | |
assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc")); | ||
} | ||
|
||
public void testIllegalFilters() throws IOException { | ||
Settings settings = Settings.builder() | ||
.putList("index.analysis.normalizer.my_normalizer.filter", "mock_forbidden") | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.build(); | ||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, | ||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); | ||
assertEquals("Custom normalizer [my_normalizer] may not use filter [mock_forbidden]", e.getMessage()); | ||
} | ||
|
||
public void testIllegalCharFilters() throws IOException { | ||
Settings settings = Settings.builder() | ||
.putList("index.analysis.normalizer.my_normalizer.char_filter", "mock_forbidden") | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.build(); | ||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, | ||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); | ||
assertEquals("Custom normalizer [my_normalizer] may not use char filter [mock_forbidden]", e.getMessage()); | ||
} | ||
|
||
private static class MockAnalysisPlugin implements AnalysisPlugin { | ||
@Override | ||
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() { | ||
return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new)); | ||
} | ||
|
||
@Override | ||
public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() { | ||
return singletonList(PreConfiguredCharFilter.singleton("mock_forbidden", false, Function.identity())); | ||
} | ||
|
||
@Override | ||
public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we keep this check and move it inside the function further down that calls tokenizerSupplier.get()? I'm not really sure its useful, but it looks like it might provide useful hints about a misconfigured cluster (thats the only way I can see analysis-common missing). In case you re-add this, this should get a test too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we're missing analysis-common then I don't think anything is going to work? The error message here is out of date anyway, as we build normalizers with both keyword and whitespace tokenizers, but I'll change the caller to pass
KeywordTokenizer::new
rather than calling into the tokenizer map to ensure that this would never be a problem.