Skip to content

Commit 2537e02

Browse files
authored
Wildcard field - add normalizer support (#53851)
* Add support for normalisation to wildcard field
* Tidied imports
* Added docs about params
* Fix outdated error message
* Avoid normaliser butchering wildcard query special characters
* Fix broken test expectations
* Fix wrong toString method
* Address review comments - common method for normalising wildcard patterns and checkCompatibility
* Remove unused import
1 parent 5e637c4 commit 2537e02

File tree

8 files changed

+206
-45
lines changed

8 files changed

+206
-45
lines changed

docs/reference/mapping/types/wildcard.asciidoc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,23 @@ POST my_index/_doc/_search
4848
--------------------------------------------------
4949

5050

51+
[[wildcard-params]]
52+
==== Parameters for wildcard fields
53+
54+
The following parameters are accepted by `wildcard` fields:
55+
56+
[horizontal]
57+
58+
<<ignore-above,`ignore_above`>>::
59+
60+
Do not index any string longer than this value. Defaults to `2147483647`
61+
so that all values are accepted.
62+
63+
<<normalizer,`normalizer`>>::
64+
65+
How to pre-process the value prior to indexing. Defaults to `null`,
66+
meaning the value is kept as-is.
67+
5168
==== Limitations
5269

5370
* `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries.

server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,14 +368,14 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
368368
}
369369

370370
public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
371-
throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name
371+
throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name
372372
+ "] which is of type [" + typeName() + "]");
373373
}
374374

375375
public Query wildcardQuery(String value,
376376
@Nullable MultiTermQuery.RewriteMethod method,
377377
QueryShardContext context) {
378-
throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name
378+
throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name
379379
+ "] which is of type [" + typeName() + "]");
380380
}
381381

server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.index.mapper;
2121

22+
import org.apache.lucene.analysis.Analyzer;
2223
import org.apache.lucene.index.Term;
2324
import org.apache.lucene.search.FuzzyQuery;
2425
import org.apache.lucene.search.MultiTermQuery;
@@ -93,6 +94,36 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer
9394
return query;
9495
}
9596

97+
public static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer) {
98+
if (normalizer == null) {
99+
return value;
100+
}
101+
// we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
102+
// is a char_filter that would otherwise remove them
103+
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
104+
BytesRefBuilder sb = new BytesRefBuilder();
105+
int last = 0;
106+
107+
while (wildcardMatcher.find()) {
108+
if (wildcardMatcher.start() > 0) {
109+
String chunk = value.substring(last, wildcardMatcher.start());
110+
111+
BytesRef normalized = normalizer.normalize(fieldname, chunk);
112+
sb.append(normalized);
113+
}
114+
// append the matched group - without normalizing
115+
sb.append(new BytesRef(wildcardMatcher.group()));
116+
117+
last = wildcardMatcher.end();
118+
}
119+
if (last < value.length()) {
120+
String chunk = value.substring(last);
121+
BytesRef normalized = normalizer.normalize(fieldname, chunk);
122+
sb.append(normalized);
123+
}
124+
return sb.toBytesRef().utf8ToString();
125+
}
126+
96127
@Override
97128
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
98129
failIfNotIndexed();
@@ -103,30 +134,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, Qu
103134

104135
Term term;
105136
if (searchAnalyzer() != null) {
106-
// we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
107-
// is a char_filter that would otherwise remove them
108-
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
109-
BytesRefBuilder sb = new BytesRefBuilder();
110-
int last = 0;
111-
112-
while (wildcardMatcher.find()) {
113-
if (wildcardMatcher.start() > 0) {
114-
String chunk = value.substring(last, wildcardMatcher.start());
115-
116-
BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
117-
sb.append(normalized);
118-
}
119-
// append the matched group - without normalizing
120-
sb.append(new BytesRef(wildcardMatcher.group()));
121-
122-
last = wildcardMatcher.end();
123-
}
124-
if (last < value.length()) {
125-
String chunk = value.substring(last);
126-
BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
127-
sb.append(normalized);
128-
}
129-
term = new Term(name(), sb.toBytesRef());
137+
value = normalizeWildcardPattern(name(), value, searchAnalyzer());
138+
term = new Term(name(), value);
130139
} else {
131140
term = new Term(name(), indexedValueForSearch(value));
132141
}

server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ public static RangeQueryBuilder rangeQuery(String name) {
239239
* which matches any single character. Note this query can be slow, as it
240240
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
241241
* a Wildcard term should not start with one of the wildcards {@code *} or
242-
* {@code ?}.
242+
* {@code ?}. (The wildcard field type, however, is optimised for leading wildcards.)
243243
*
244244
* @param name The field name
245245
* @param query The wildcard query string

server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ public void testNumeric() throws Exception {
116116
QueryShardContext context = createShardContext();
117117
QueryShardException e = expectThrows(QueryShardException.class,
118118
() -> query.toQuery(context));
119-
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
119+
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
120120
e.getMessage());
121121
}
122122

server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -813,7 +813,7 @@ public void testPrefixNumeric() throws Exception {
813813
QueryShardContext context = createShardContext();
814814
QueryShardException e = expectThrows(QueryShardException.class,
815815
() -> query.toQuery(context));
816-
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
816+
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
817817
e.getMessage());
818818
query.lenient(true);
819819
query.toQuery(context); // no exception

x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,29 @@
11
setup:
22
- skip:
33
features: headers
4-
version: " - 7.9.99"
5-
reason: "wildcard fields were added from 8.0"
4+
version: " - 7.6.99"
5+
reason: "wildcard fields were added from 7.7"
66

77
- do:
88
indices.create:
99
index: test-index
1010
body:
1111
settings:
1212
number_of_replicas: 0
13+
analysis:
14+
normalizer:
15+
lowercase:
16+
type: custom
17+
char_filter: []
18+
filter: ["lowercase"]
1319
mappings:
1420
properties:
1521
my_wildcard:
1622
type: wildcard
23+
normalizer: lowercase
24+
fields:
25+
case_sensitive:
26+
type: wildcard
1727
- do:
1828
index:
1929
index: test-index
@@ -26,6 +36,12 @@ setup:
2636
id: 2
2737
body:
2838
my_wildcard: goodbye world
39+
- do:
40+
index:
41+
index: test-index
42+
id: 3
43+
body:
44+
my_wildcard: cAsE iNsEnSiTiVe World
2945

3046
- do:
3147
indices.refresh: {}
@@ -80,6 +96,31 @@ setup:
8096
my_wildcard: {value: "*ello worl*" }
8197

8298

99+
- match: {hits.total.value: 1}
100+
---
101+
"Case insensitive query":
102+
- do:
103+
search:
104+
body:
105+
track_total_hits: true
106+
query:
107+
wildcard:
108+
my_wildcard: {value: "*Worl*" }
109+
110+
111+
- match: {hits.total.value: 3}
112+
113+
---
114+
"Case sensitive query":
115+
- do:
116+
search:
117+
body:
118+
track_total_hits: true
119+
query:
120+
wildcard:
121+
my_wildcard.case_sensitive: {value: "*Worl*" }
122+
123+
83124
- match: {hits.total.value: 1}
84125

85126
---
@@ -93,7 +134,7 @@ setup:
93134
my_wildcard: {value: "*ld" }
94135

95136

96-
- match: {hits.total.value: 2}
137+
- match: {hits.total.value: 3}
97138

98139
---
99140
"Long suffix query":
@@ -188,8 +229,8 @@ setup:
188229
terms: {field: "my_wildcard" }
189230

190231

191-
- match: {hits.total.value: 2}
192-
- length: { aggregations.top_vals.buckets: 2 }
232+
- match: {hits.total.value: 3}
233+
- length: { aggregations.top_vals.buckets: 3 }
193234

194235
---
195236
"Sort works":
@@ -199,20 +240,21 @@ setup:
199240
track_total_hits: true
200241
sort: [ { "my_wildcard": "desc" } ]
201242

202-
- match: { hits.total.value: 2 }
203-
- length: { hits.hits: 2 }
243+
- match: { hits.total.value: 3 }
244+
- length: { hits.hits: 3 }
204245
- match: { hits.hits.0._id: "1" }
205246
- match: { hits.hits.1._id: "2" }
247+
- match: { hits.hits.2._id: "3" }
206248

207249
- do:
208250
search:
209251
body:
210252
track_total_hits: true
211253
sort: [ { "my_wildcard": "asc" } ]
212254

213-
- match: { hits.total.value: 2 }
214-
- length: { hits.hits: 2 }
215-
- match: { hits.hits.0._id: "2" }
216-
- match: { hits.hits.1._id: "1" }
217-
255+
- match: { hits.total.value: 3 }
256+
- length: { hits.hits: 3 }
257+
- match: { hits.hits.0._id: "3" }
258+
- match: { hits.hits.1._id: "2" }
259+
- match: { hits.hits.2._id: "1" }
218260

0 commit comments

Comments
 (0)