From d42128cf33de8d6c2c40437a0fc39c0be930811e Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 20 Mar 2020 11:03:30 +0000 Subject: [PATCH 1/9] Add support for normalisation to wildcard field --- .../index/query/QueryBuilders.java | 2 +- .../test/wildcard/10_wildcard_basic.yml | 66 ++++++++++--- .../wildcard/mapper/WildcardFieldMapper.java | 94 ++++++++++++++++++- 3 files changed, 146 insertions(+), 16 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java b/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java index 3aa6118936e4d..bb9a007c60ca4 100644 --- a/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java +++ b/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java @@ -239,7 +239,7 @@ public static RangeQueryBuilder rangeQuery(String name) { * which matches any single character. Note this query can be slow, as it * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries, * a Wildcard term should not start with one of the wildcards {@code *} or - * {@code ?}. + * {@code ?}. 
(The wildcard field type however, is optimised for leading wildcards) * * @param name The field name * @param query The wildcard query string diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml index c67a79c8218da..486082ef3ab59 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -1,8 +1,8 @@ setup: - skip: features: headers - version: " - 7.9.99" - reason: "wildcard fields were added from 8.0" + version: " - 7.6.99" + reason: "wildcard fields were added from 7.7" - do: indices.create: @@ -10,10 +10,20 @@ setup: body: settings: number_of_replicas: 0 + analysis: + normalizer: + lowercase: + type: custom + char_filter: [] + filter: ["lowercase"] mappings: properties: my_wildcard: type: wildcard + normalizer: lowercase + fields: + case_sensitive: + type: wildcard - do: index: index: test-index @@ -26,6 +36,12 @@ setup: id: 2 body: my_wildcard: goodbye world + - do: + index: + index: test-index + id: 3 + body: + my_wildcard: cAsE iNsEnSiTiVe World - do: indices.refresh: {} @@ -80,6 +96,31 @@ setup: my_wildcard: {value: "*ello worl*" } + - match: {hits.total.value: 1} +--- +"Case insensitive query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*Worl*" } + + + - match: {hits.total.value: 3} + +--- +"Case sensitive query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard.case_sensitive: {value: "*Worl*" } + + - match: {hits.total.value: 1} --- @@ -93,7 +134,7 @@ setup: my_wildcard: {value: "*ld" } - - match: {hits.total.value: 2} + - match: {hits.total.value: 3} --- "Long suffix query": @@ -188,8 +229,8 @@ setup: terms: {field: "my_wildcard" } - - match: {hits.total.value: 2} - - length: { aggregations.top_vals.buckets: 2 } + 
- match: {hits.total.value: 3} + - length: { aggregations.top_vals.buckets: 3 } --- "Sort works": @@ -199,10 +240,11 @@ setup: track_total_hits: true sort: [ { "my_wildcard": "desc" } ] - - match: { hits.total.value: 2 } - - length: { hits.hits: 2 } + - match: { hits.total.value: 3 } + - length: { hits.hits: 3 } - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "2" } + - match: { hits.hits.2._id: "3" } - do: search: @@ -210,9 +252,9 @@ setup: track_total_hits: true sort: [ { "my_wildcard": "asc" } ] - - match: { hits.total.value: 2 } - - length: { hits.hits: 2 } - - match: { hits.hits.0._id: "2" } - - match: { hits.hits.1._id: "1" } - + - match: { hits.total.value: 3 } + - length: { hits.hits: 3 } + - match: { hits.hits.0._id: "3" } + - match: { hits.hits.1._id: "2" } + - match: { hits.hits.2._id: "1" } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index e489d8a35bb9f..c94187ac05a74 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -10,6 +10,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Field; @@ -39,6 +40,7 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; import 
org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested; @@ -46,6 +48,8 @@ import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; import org.elasticsearch.index.fielddata.plain.BytesBinaryDVIndexFieldData; import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField; +import org.elasticsearch.index.mapper.KeywordFieldMapper.Builder; +import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; @@ -64,6 +68,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import static org.elasticsearch.index.mapper.TypeParsers.parseField; @@ -100,6 +105,9 @@ public static class Defaults { public static class Builder extends FieldMapper.Builder { protected int ignoreAbove = Defaults.IGNORE_ABOVE; + private IndexAnalyzers indexAnalyzers; + private String normalizerName; + public Builder(String name) { super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); @@ -164,10 +172,23 @@ protected void setupFieldType(BuilderContext context) { public WildcardFieldType fieldType() { return (WildcardFieldType) super.fieldType(); } + + public Builder normalizer(IndexAnalyzers indexAnalyzers, String name) { + this.indexAnalyzers = indexAnalyzers; + this.normalizerName = name; + return builder; + } @Override public WildcardFieldMapper build(BuilderContext context) { - setupFieldType(context); + setupFieldType(context); + if (normalizerName != null) { + NamedAnalyzer normalizer = indexAnalyzers.getNormalizer(normalizerName); + if (normalizer == null) { + throw new MapperParsingException("normalizer [" + normalizerName + "] not found for field [" + name + "]"); + } + fieldType().setNormalizer(normalizer); + } return new WildcardFieldMapper( name, 
fieldType, defaultFieldType, ignoreAbove, context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); @@ -188,6 +209,11 @@ public static class TypeParser implements Mapper.TypeParser { if (propName.equals("ignore_above")) { builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1)); iterator.remove(); + } else if (propName.equals("normalizer")) { + if (propNode != null) { + builder.normalizer(parserContext.getIndexAnalyzers(), propNode.toString()); + } + iterator.remove(); } } @@ -198,6 +224,8 @@ public static class TypeParser implements Mapper.TypeParser { public static final char TOKEN_START_OR_END_CHAR = 0; public static final class WildcardFieldType extends MappedFieldType { + + private NamedAnalyzer normalizer = null; public WildcardFieldType() { setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); @@ -206,6 +234,7 @@ public WildcardFieldType() { protected WildcardFieldType(WildcardFieldType ref) { super(ref); + this.normalizer = ref.normalizer; } public WildcardFieldType clone() { @@ -213,6 +242,29 @@ public WildcardFieldType clone() { return result; } + + @Override + public boolean equals(Object o) { + if (super.equals(o) == false) { + return false; + } + WildcardFieldType other = (WildcardFieldType) o; + return Objects.equals(normalizer, other.normalizer); + } + + @Override + public int hashCode() { + return 31 * super.hashCode() + Objects.hash(normalizer); + } + + private NamedAnalyzer normalizer() { + return normalizer; + } + + public void setNormalizer(NamedAnalyzer normalizer) { + checkIfFrozen(); + this.normalizer = normalizer; + } // Holds parsed information about the wildcard pattern static class PatternStructure { @@ -327,6 +379,12 @@ public boolean equals(Object obj) { @Override public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { + try { + wildcardPattern = normalize(wildcardPattern); + } catch (IOException e) { + throw new IllegalStateException("The field [" + name() + + "] hit an 
IOException normalizing the value [" + wildcardPattern+ "]."); + } PatternStructure patternStructure = new PatternStructure(wildcardPattern); ArrayList tokens = new ArrayList<>(); @@ -467,7 +525,31 @@ public IndexFieldData build(IndexSettings indexSettings, MappedFieldType fiel CircuitBreakerService breakerService, MapperService mapperService) { return new WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name()); }}; - } + } + + + String normalize(String value) throws IOException { + if (normalizer != null) { + try (TokenStream ts = normalizer.tokenStream(name(), value)) { + final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + if (ts.incrementToken() == false) { + throw new IllegalStateException("The normalization token stream is " + + "expected to produce exactly 1 token, but got 0 for analyzer " + + normalizer + " and input \"" + value + "\""); + } + final String newValue = termAtt.toString(); + if (ts.incrementToken()) { + throw new IllegalStateException("The normalization token stream is " + + "expected to produce exactly 1 token, but got 2+ for analyzer " + + normalizer + " and input \"" + value + "\""); + } + ts.end(); + return newValue; + } + } + return value; + } } static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{ @@ -521,6 +603,11 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { builder.field("ignore_above", ignoreAbove); } + if (fieldType().normalizer() != null) { + builder.field("normalizer", fieldType().normalizer().name()); + } else if (includeDefaults) { + builder.nullField("normalizer"); + } } @Override @@ -544,10 +631,11 @@ protected void parseCreateField(ParseContext context, List field // For internal use by Lucene only - used to define ngram index final MappedFieldType ngramFieldType; - void createFields(String value, Document parseDoc, Listfields) { + 
void createFields(String value, Document parseDoc, Listfields) throws IOException { if (value == null || value.length() > ignoreAbove) { return; } + value = fieldType().normalize(value); String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR; Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType); fields.add(ngramField); From 2cf000001c7052a7aa928e0dc859c4a983c85b49 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 20 Mar 2020 11:04:14 +0000 Subject: [PATCH 2/9] Tidied imports --- .../xpack/wildcard/mapper/WildcardFieldMapper.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index c94187ac05a74..9000f48948ad7 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -10,7 +10,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Field; @@ -48,8 +47,6 @@ import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; import org.elasticsearch.index.fielddata.plain.BytesBinaryDVIndexFieldData; import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField; -import org.elasticsearch.index.mapper.KeywordFieldMapper.Builder; -import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType; import org.elasticsearch.index.mapper.FieldMapper; import 
org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; From ccefc677169317c941a88f19678ba0df05bed371 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 20 Mar 2020 13:26:48 +0000 Subject: [PATCH 3/9] Added docs about params --- docs/reference/mapping/types/wildcard.asciidoc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc index 9d01f4820a894..aa2de7db87afc 100644 --- a/docs/reference/mapping/types/wildcard.asciidoc +++ b/docs/reference/mapping/types/wildcard.asciidoc @@ -48,6 +48,23 @@ POST my_index/_doc/_search -------------------------------------------------- +[[wildcard-params]] +==== Parameters for wildcard fields + +The following parameters are accepted by `wildcard` fields: + +[horizontal] + +<<ignore-above,`ignore_above`>>:: + + Do not index any string longer than this value. Defaults to `2147483647` + so that all values would be accepted. + +<<normalizer,`normalizer`>>:: + + How to pre-process the value prior to indexing. Defaults to `null`, + meaning the value is kept as-is. + ==== Limitations * `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. 
From ea6bd9a0f69867f2fb5165d6ab6e3d98d698acc4 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 20 Mar 2020 14:30:23 +0000 Subject: [PATCH 4/9] Fix outdated error message --- .../java/org/elasticsearch/index/mapper/MappedFieldType.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java index 67bbdaf79ce76..1a80aa80a30b0 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java @@ -368,14 +368,14 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int } public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) { - throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name + throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name + "] which is of type [" + typeName() + "]"); } public Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) { - throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name + throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name + "] which is of type [" + typeName() + "]"); } From e8817a4bec13c0f5d5cf1e86827b9f2d8a9e2fd2 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 20 Mar 2020 15:00:17 +0000 Subject: [PATCH 5/9] Avoid normaliser butchering wildcard query special characters --- .../wildcard/mapper/WildcardFieldMapper.java | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git 
a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 9000f48948ad7..209365f5b22fd 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -28,6 +28,8 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.automaton.Automaton; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.lucene.BytesRefs; @@ -66,6 +68,8 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import static org.elasticsearch.index.mapper.TypeParsers.parseField; @@ -377,7 +381,7 @@ public boolean equals(Object obj) { @Override public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { try { - wildcardPattern = normalize(wildcardPattern); + wildcardPattern = normalizeWildcardPattern(wildcardPattern); } catch (IOException e) { throw new IllegalStateException("The field [" + name() + "] hit an IOException normalizing the value [" + wildcardPattern+ "]."); @@ -547,6 +551,38 @@ String normalize(String value) throws IOException { } return value; } + + private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); + + String normalizeWildcardPattern(String value) throws IOException { + if (normalizer == null) { + return value; + } + // we want to normalize everything except wildcard characters, e.g. 
F?o Ba* to f?o ba*, even if e.g there + // is a char_filter that would otherwise remove them + Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); + BytesRefBuilder sb = new BytesRefBuilder(); + int last = 0; + + while (wildcardMatcher.find()) { + if (wildcardMatcher.start() > 0) { + String chunk = value.substring(last, wildcardMatcher.start()); + + BytesRef normalized = normalizer.normalize(name(), chunk); + sb.append(normalized); + } + // append the matched group - without normalizing + sb.append(new BytesRef(wildcardMatcher.group())); + + last = wildcardMatcher.end(); + } + if (last < value.length()) { + String chunk = value.substring(last); + BytesRef normalized = searchAnalyzer().normalize(name(), chunk); + sb.append(normalized); + } + return sb.toString(); + } } static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{ From 567a3fb0b032efd59081532a511df31c5b2e8cef Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 20 Mar 2020 16:02:06 +0000 Subject: [PATCH 6/9] Fix broken test expectations --- .../org/elasticsearch/index/query/PrefixQueryBuilderTests.java | 2 +- .../elasticsearch/index/query/QueryStringQueryBuilderTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java index 94596ffd6c58d..4253eaded3056 100644 --- a/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java @@ -116,7 +116,7 @@ public void testNumeric() throws Exception { QueryShardContext context = createShardContext(); QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context)); - assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]", + assertEquals("Can only use prefix 
queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]", e.getMessage()); } diff --git a/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java index 85453e774fe6e..7724e82a9e083 100644 --- a/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java @@ -813,7 +813,7 @@ public void testPrefixNumeric() throws Exception { QueryShardContext context = createShardContext(); QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context)); - assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]", + assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]", e.getMessage()); query.lenient(true); query.toQuery(context); // no exception From c7b934b5c428162b9c4996d28f4d576b272de462 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 20 Mar 2020 17:23:45 +0000 Subject: [PATCH 7/9] Fix wrong toString method --- .../xpack/wildcard/mapper/WildcardFieldMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 209365f5b22fd..feaac5ed207e7 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -581,7 +581,7 @@ String normalizeWildcardPattern(String value) throws IOException { BytesRef normalized = searchAnalyzer().normalize(name(), chunk); 
sb.append(normalized); } - return sb.toString(); + return sb.toBytesRef().utf8ToString(); } } From 8dc50492759e936579b16773d9ed1ac9e73aa884 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 23 Mar 2020 15:47:15 +0000 Subject: [PATCH 8/9] Address review comments - common method for normalising wildcard patterns and checkCompatibility --- .../index/mapper/StringFieldType.java | 58 +++++++++++-------- .../wildcard/mapper/WildcardFieldMapper.java | 54 +++++------------ 2 files changed, 47 insertions(+), 65 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index 05bf6b61d1de1..7da521870fef5 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiTermQuery; @@ -36,6 +37,7 @@ import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.support.QueryParsers; +import java.io.IOException; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -93,6 +95,36 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer return query; } + public static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer) { + if (normalizer == null) { + return value; + } + // we want to normalize everything except wildcard characters, e.g. 
F?o Ba* to f?o ba*, even if e.g there + // is a char_filter that would otherwise remove them + Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); + BytesRefBuilder sb = new BytesRefBuilder(); + int last = 0; + + while (wildcardMatcher.find()) { + if (wildcardMatcher.start() > 0) { + String chunk = value.substring(last, wildcardMatcher.start()); + + BytesRef normalized = normalizer.normalize(fieldname, chunk); + sb.append(normalized); + } + // append the matched group - without normalizing + sb.append(new BytesRef(wildcardMatcher.group())); + + last = wildcardMatcher.end(); + } + if (last < value.length()) { + String chunk = value.substring(last); + BytesRef normalized = normalizer.normalize(fieldname, chunk); + sb.append(normalized); + } + return sb.toBytesRef().utf8ToString(); + } + @Override public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { failIfNotIndexed(); @@ -103,30 +135,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, Qu Term term; if (searchAnalyzer() != null) { - // we want to normalize everything except wildcard characters, e.g. 
F?o Ba* to f?o ba*, even if e.g there - // is a char_filter that would otherwise remove them - Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); - BytesRefBuilder sb = new BytesRefBuilder(); - int last = 0; - - while (wildcardMatcher.find()) { - if (wildcardMatcher.start() > 0) { - String chunk = value.substring(last, wildcardMatcher.start()); - - BytesRef normalized = searchAnalyzer().normalize(name(), chunk); - sb.append(normalized); - } - // append the matched group - without normalizing - sb.append(new BytesRef(wildcardMatcher.group())); - - last = wildcardMatcher.end(); - } - if (last < value.length()) { - String chunk = value.substring(last); - BytesRef normalized = searchAnalyzer().normalize(name(), chunk); - sb.append(normalized); - } - term = new Term(name(), sb.toBytesRef()); + value = normalizeWildcardPattern(name(), value, searchAnalyzer()); + term = new Term(name(), value); } else { term = new Term(name(), indexedValueForSearch(value)); } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index feaac5ed207e7..568e4f502dbe0 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -28,8 +28,6 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.automaton.Automaton; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.lucene.BytesRefs; @@ -56,6 +54,7 @@ import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.ParseContext; import 
org.elasticsearch.index.mapper.ParseContext.Document; +import org.elasticsearch.index.mapper.StringFieldType; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.similarity.SimilarityProvider; import org.elasticsearch.indices.breaker.CircuitBreakerService; @@ -68,8 +67,6 @@ import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import static org.elasticsearch.index.mapper.TypeParsers.parseField; @@ -266,7 +263,16 @@ public void setNormalizer(NamedAnalyzer normalizer) { checkIfFrozen(); this.normalizer = normalizer; } - + + @Override + public void checkCompatibility(MappedFieldType otherFT, List conflicts) { + super.checkCompatibility(otherFT, conflicts); + WildcardFieldType other = (WildcardFieldType) otherFT; + if (Objects.equals(normalizer, other.normalizer) == false) { + conflicts.add("mapper [" + name() + "] has different [normalizer]"); + } + } + // Holds parsed information about the wildcard pattern static class PatternStructure { boolean openStart, openEnd, hasSymbols; @@ -380,11 +386,8 @@ public boolean equals(Object obj) { @Override public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { - try { - wildcardPattern = normalizeWildcardPattern(wildcardPattern); - } catch (IOException e) { - throw new IllegalStateException("The field [" + name() + - "] hit an IOException normalizing the value [" + wildcardPattern+ "]."); + if (normalizer != null) { + wildcardPattern = StringFieldType.normalizeWildcardPattern(name(), wildcardPattern, normalizer); } PatternStructure patternStructure = new PatternStructure(wildcardPattern); ArrayList tokens = new ArrayList<>(); @@ -552,37 +555,6 @@ String normalize(String value) throws IOException { return value; } - private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); - - String normalizeWildcardPattern(String value) throws IOException { - 
if (normalizer == null) { - return value; - } - // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there - // is a char_filter that would otherwise remove them - Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); - BytesRefBuilder sb = new BytesRefBuilder(); - int last = 0; - - while (wildcardMatcher.find()) { - if (wildcardMatcher.start() > 0) { - String chunk = value.substring(last, wildcardMatcher.start()); - - BytesRef normalized = normalizer.normalize(name(), chunk); - sb.append(normalized); - } - // append the matched group - without normalizing - sb.append(new BytesRef(wildcardMatcher.group())); - - last = wildcardMatcher.end(); - } - if (last < value.length()) { - String chunk = value.substring(last); - BytesRef normalized = searchAnalyzer().normalize(name(), chunk); - sb.append(normalized); - } - return sb.toBytesRef().utf8ToString(); - } } static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{ From 4f242fc9f23294400bcc5593a7b24c941868e399 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 23 Mar 2020 16:03:30 +0000 Subject: [PATCH 9/9] Remove unused import --- .../java/org/elasticsearch/index/mapper/StringFieldType.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index 7da521870fef5..37c20b236ed56 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -37,7 +37,6 @@ import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.support.QueryParsers; -import java.io.IOException; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern;