Skip to content

Commit 2c74f3e

Browse files
authored
Backport of new wildcard field type (#53590)
* New wildcard field optimised for wildcard queries (#49993) Indexes values using size 3 ngrams and also stores the full original as a binary doc value. Wildcard queries operate by using a cheap approximation query on the ngram field followed up by a more expensive verification query using an automaton on the binary doc values. Also supports aggregations and sorting.
1 parent a906f8a commit 2c74f3e

File tree

11 files changed

+1365
-17
lines changed

11 files changed

+1365
-17
lines changed

docs/reference/mapping/types.asciidoc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ document:
77
[float]
88
=== Core datatypes
99

10-
string:: <<text,`text`>> and <<keyword,`keyword`>>
10+
string:: <<text,`text`>>, <<keyword,`keyword`>> and <<wildcard,`wildcard`>>
1111
<<number>>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float`
1212
<<date>>:: `date`
1313
<<date_nanos>>:: `date_nanos`
@@ -135,3 +135,5 @@ include::types/token-count.asciidoc[]
135135
include::types/shape.asciidoc[]
136136

137137
include::types/constant-keyword.asciidoc[]
138+
139+
include::types/wildcard.asciidoc[]
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
[role="xpack"]
2+
[testenv="basic"]
3+
[[wildcard]]
4+
=== Wildcard datatype
5+
++++
6+
<titleabbrev>Wildcard</titleabbrev>
7+
++++
8+
9+
A `wildcard` field stores values optimised for wildcard grep-like queries.
10+
Wildcard queries are possible on other field types but suffer from constraints:
11+
* `text` fields limit matching of any wildcard expressions to individual tokens rather than the original whole value held in a field.
12+
* `keyword` fields are untokenized but slow at performing wildcard queries (especially patterns with leading wildcards).
13+
14+
Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string.
15+
The index is used as a rough filter to cut down the number of candidate values, which are then verified by retrieving and checking the full values.
16+
This field is especially well suited to run grep-like queries on log lines. Storage costs are typically lower than those of `keyword`
17+
fields but search speeds for exact matches on full terms are slower.
18+
19+
You index and search a wildcard field as follows:
20+
21+
[source,console]
22+
--------------------------------------------------
23+
PUT my_index
24+
{
25+
"mappings": {
26+
"properties": {
27+
"my_wildcard": {
28+
"type": "wildcard"
29+
}
30+
}
31+
}
32+
}
33+
34+
PUT my_index/_doc/1
35+
{
36+
"my_wildcard" : "This string can be quite lengthy"
37+
}
38+
39+
POST my_index/_search
40+
{
41+
"query": {
42+
"wildcard" : {
43+
"my_wildcard": { "value": "*quite*lengthy" }
44+
}
45+
}
46+
}
47+
48+
49+
--------------------------------------------------
50+
51+
52+
==== Limitations
53+
54+
* `wildcard` fields are untokenized like `keyword` fields, so they do not support queries that rely on word positions, such as phrase queries.
55+

server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121

2222
import org.apache.lucene.index.LeafReaderContext;
2323
import org.apache.lucene.search.SortField;
24-
import org.apache.lucene.search.SortedSetSortField;
25-
import org.apache.lucene.search.SortedSetSelector;
2624
import org.elasticsearch.common.Nullable;
2725
import org.elasticsearch.common.util.BigArrays;
2826
import org.elasticsearch.index.Index;
@@ -54,20 +52,7 @@ public BinaryDVAtomicFieldData loadDirect(LeafReaderContext context) throws Exce
5452
public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, XFieldComparatorSource.Nested nested,
5553
boolean reverse) {
5654
XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested);
57-
/**
58-
* Check if we can use a simple {@link SortedSetSortField} compatible with index sorting and
59-
* returns a custom sort field otherwise.
60-
*/
61-
if (nested != null ||
62-
(sortMode != MultiValueMode.MAX && sortMode != MultiValueMode.MIN) ||
63-
(source.sortMissingFirst(missingValue) == false && source.sortMissingLast(missingValue) == false)) {
64-
return new SortField(getFieldName(), source, reverse);
65-
}
66-
SortField sortField = new SortedSetSortField(fieldName, reverse,
67-
sortMode == MultiValueMode.MAX ? SortedSetSelector.Type.MAX : SortedSetSelector.Type.MIN);
68-
sortField.setMissingValue(source.sortMissingLast(missingValue) ^ reverse ?
69-
SortedSetSortField.STRING_LAST : SortedSetSortField.STRING_FIRST);
70-
return sortField;
55+
return new SortField(getFieldName(), source, reverse);
7156
}
7257

7358
@Override

x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,16 @@ public boolean isFlattenedAllowed() {
613613
public boolean isVectorsAllowed() {
614614
return allowForAllLicenses();
615615
}
616+
617+
618+
/**
619+
* Determine if Wildcard support should be enabled.
620+
* <p>
621+
* Wildcard is available for all license types except {@link OperationMode#MISSING}
622+
*/
623+
public synchronized boolean isWildcardAllowed() {
624+
return status.active;
625+
}
616626

617627
public boolean isOdbcAllowed() {
618628
return isAllowedByLicense(OperationMode.PLATINUM);
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
setup:
2+
- skip:
3+
features: headers
4+
version: " - 7.6.99"
5+
reason: "wildcard fields were added from 7.7"
6+
7+
- do:
8+
indices.create:
9+
index: test-index
10+
body:
11+
settings:
12+
number_of_replicas: 0
13+
mappings:
14+
properties:
15+
my_wildcard:
16+
type: wildcard
17+
- do:
18+
index:
19+
index: test-index
20+
id: 1
21+
body:
22+
my_wildcard: hello world
23+
- do:
24+
index:
25+
index: test-index
26+
id: 2
27+
body:
28+
my_wildcard: goodbye world
29+
30+
- do:
31+
indices.refresh: {}
32+
33+
---
34+
"Short prefix query":
35+
- do:
36+
search:
37+
body:
38+
track_total_hits: true
39+
query:
40+
wildcard:
41+
my_wildcard: {value: "hel*" }
42+
43+
44+
- match: {hits.total.value: 1}
45+
46+
---
47+
"Long prefix query":
48+
- do:
49+
search:
50+
body:
51+
track_total_hits: true
52+
query:
53+
wildcard:
54+
my_wildcard: {value: "hello wor*" }
55+
56+
57+
- match: {hits.total.value: 1}
58+
59+
---
60+
"Short unrooted query":
61+
- do:
62+
search:
63+
body:
64+
track_total_hits: true
65+
query:
66+
wildcard:
67+
my_wildcard: {value: "*ello*" }
68+
69+
70+
- match: {hits.total.value: 1}
71+
72+
---
73+
"Long unrooted query":
74+
- do:
75+
search:
76+
body:
77+
track_total_hits: true
78+
query:
79+
wildcard:
80+
my_wildcard: {value: "*ello worl*" }
81+
82+
83+
- match: {hits.total.value: 1}
84+
85+
---
86+
"Short suffix query":
87+
- do:
88+
search:
89+
body:
90+
track_total_hits: true
91+
query:
92+
wildcard:
93+
my_wildcard: {value: "*ld" }
94+
95+
96+
- match: {hits.total.value: 2}
97+
98+
---
99+
"Long suffix query":
100+
- do:
101+
search:
102+
body:
103+
track_total_hits: true
104+
query:
105+
wildcard:
106+
my_wildcard: {value: "*ello world" }
107+
108+
109+
- match: {hits.total.value: 1}
110+
111+
---
112+
"No wildcard wildcard query":
113+
- do:
114+
search:
115+
body:
116+
track_total_hits: true
117+
query:
118+
wildcard:
119+
my_wildcard: {value: "hello world" }
120+
121+
122+
- match: {hits.total.value: 1}
123+
124+
---
125+
"Term query on wildcard field":
126+
- do:
127+
search:
128+
body:
129+
track_total_hits: true
130+
query:
131+
term:
132+
my_wildcard: "hello world"
133+
134+
135+
- match: {hits.total.value: 1}
136+
137+
---
138+
"Terms query on wildcard field":
139+
- do:
140+
search:
141+
body:
142+
track_total_hits: true
143+
query:
144+
terms:
145+
my_wildcard: ["hello world", "does not exist"]
146+
147+
148+
- match: {hits.total.value: 1}
149+
150+
---
151+
"Prefix query on wildcard field":
152+
- do:
153+
search:
154+
body:
155+
track_total_hits: true
156+
query:
157+
prefix:
158+
my_wildcard:
159+
value: "hell*"
160+
161+
162+
- match: {hits.total.value: 1}
163+
164+
---
165+
"Sequence fail":
166+
- do:
167+
search:
168+
body:
169+
track_total_hits: true
170+
query:
171+
wildcard:
172+
my_wildcard: {value: "*world*hello*" }
173+
174+
175+
- match: {hits.total.value: 0}
176+
177+
---
178+
"Aggs work":
179+
- do:
180+
search:
181+
body:
182+
track_total_hits: true
183+
query:
184+
wildcard:
185+
my_wildcard: {value: "*world*" }
186+
aggs:
187+
top_vals:
188+
terms: {field: "my_wildcard" }
189+
190+
191+
- match: {hits.total.value: 2}
192+
- length: { aggregations.top_vals.buckets: 2 }
193+
194+
---
195+
"Sort works":
196+
- do:
197+
search:
198+
body:
199+
track_total_hits: true
200+
sort: [ { "my_wildcard": "desc" } ]
201+
202+
- match: { hits.total.value: 2 }
203+
- length: { hits.hits: 2 }
204+
- match: { hits.hits.0._id: "1" }
205+
- match: { hits.hits.1._id: "2" }
206+
207+
- do:
208+
search:
209+
body:
210+
track_total_hits: true
211+
sort: [ { "my_wildcard": "asc" } ]
212+
213+
- match: { hits.total.value: 2 }
214+
- length: { hits.hits: 2 }
215+
- match: { hits.hits.0._id: "2" }
216+
- match: { hits.hits.1._id: "1" }
217+
218+

x-pack/plugin/wildcard/build.gradle

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
evaluationDependsOn(xpackModule('core'))
2+
3+
apply plugin: 'elasticsearch.esplugin'
4+
5+
esplugin {
6+
name 'wildcard'
7+
description 'A plugin for a keyword field type with efficient wildcard search'
8+
classname 'org.elasticsearch.xpack.wildcard.Wildcard'
9+
extendedPlugins = ['x-pack-core']
10+
}
11+
archivesBaseName = 'x-pack-wildcard'
12+
13+
dependencies {
14+
compileOnly project(path: xpackModule('core'), configuration: 'default')
15+
testCompile project(path: xpackModule('core'), configuration: 'testArtifacts')
16+
}
17+
18+
integTest.enabled = false
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
package org.elasticsearch.xpack.wildcard;
8+
9+
import org.elasticsearch.common.settings.Settings;
10+
import org.elasticsearch.index.mapper.Mapper;
11+
import org.elasticsearch.plugins.MapperPlugin;
12+
import org.elasticsearch.plugins.Plugin;
13+
import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper;
14+
15+
import java.util.Collections;
16+
import java.util.LinkedHashMap;
17+
import java.util.Map;
18+
19+
public class Wildcard extends Plugin implements MapperPlugin {
20+
21+
22+
public Wildcard(Settings settings) {
23+
}
24+
25+
@Override
26+
public Map<String, Mapper.TypeParser> getMappers() {
27+
Map<String, Mapper.TypeParser> mappers = new LinkedHashMap<>();
28+
mappers.put(WildcardFieldMapper.CONTENT_TYPE, new WildcardFieldMapper.TypeParser());
29+
return Collections.unmodifiableMap(mappers);
30+
}
31+
}

0 commit comments

Comments
 (0)