Skip to content

Wildcard field - added case insensitive search option #53814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -368,14 +368,14 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
}

public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}

public Query wildcardQuery(String value,
@Nullable MultiTermQuery.RewriteMethod method,
QueryShardContext context) {
throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ public static RangeQueryBuilder rangeQuery(String name) {
* which matches any single character. Note this query can be slow, as it
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
* a Wildcard term should not start with one of the wildcards {@code *} or
* {@code ?}.
 * {@code ?}. (The new wildcard field type, however, is optimised for leading wildcards.)
*
* @param name The field name
* @param query The wildcard query string
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
setup:
- skip:
features: headers
version: " - 7.9.99"
reason: "wildcard fields were added from 8.0"
version: " - 7.6.99"
reason: "wildcard fields were added from 7.7"

- do:
indices.create:
Expand All @@ -26,6 +26,12 @@ setup:
id: 2
body:
my_wildcard: goodbye world
- do:
index:
index: test-index
id: 3
body:
my_wildcard: cAsE iNsEnSiTiVe World

- do:
indices.refresh: {}
Expand Down Expand Up @@ -82,6 +88,19 @@ setup:

- match: {hits.total.value: 1}

---
"Case insensitive query":
- do:
search:
body:
track_total_hits: true
query:
wildcard:
my_wildcard._case_insensitive: {value: "*Worl*" }


- match: {hits.total.value: 3}

---
"Short suffix query":
- do:
Expand All @@ -93,7 +112,7 @@ setup:
my_wildcard: {value: "*ld" }


- match: {hits.total.value: 2}
- match: {hits.total.value: 3}

---
"Long suffix query":
Expand Down Expand Up @@ -199,20 +218,22 @@ setup:
track_total_hits: true
sort: [ { "my_wildcard": "desc" } ]

- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "1" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "3" }

- do:
search:
body:
track_total_hits: true
sort: [ { "my_wildcard": "asc" } ]

- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.hits.0._id: "2" }
- match: { hits.hits.1._id: "1" }
- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "3" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "1" }


Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

package org.elasticsearch.xpack.wildcard.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
Expand All @@ -23,6 +24,7 @@
import org.apache.lucene.util.automaton.ByteRunAutomaton;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Objects;

/**
Expand All @@ -34,11 +36,13 @@ public class AutomatonQueryOnBinaryDv extends Query {
private final String field;
private final String matchPattern;
private final Automaton automaton;
private Analyzer normalizer;

public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton, Analyzer normalizer) {
this.field = field;
this.matchPattern = matchPattern;
this.automaton = automaton;
this.normalizer = normalizer;
}

@Override
Expand All @@ -62,14 +66,24 @@ public boolean matches() throws IOException {
int size = badi.readVInt();
for (int i=0; i< size; i++) {
int valLength = badi.readVInt();
if (bytesMatcher.run(arrayOfValues.bytes, badi.getPosition(), valLength)) {
if (valueMatches(arrayOfValues.bytes, badi.getPosition(), valLength)) {
return true;
}
}
badi.skipBytes(valLength);
}
return false;
}

/**
 * Tests whether one serialized value, stored at {@code [position, position + valLength)}
 * inside the doc-values blob, is accepted by the compiled automaton.
 * <p>
 * When a {@code normalizer} is configured (case-insensitive mode) the raw bytes are
 * decoded as UTF-8 and normalized (e.g. lower-cased) before matching, so the stored
 * value is compared the same way the query pattern was normalized.
 *
 * @param bytes     backing array of the doc-values blob
 * @param position  start offset of this value within {@code bytes}
 * @param valLength length in bytes of this value
 * @return true if the (possibly normalized) value matches the automaton
 */
private boolean valueMatches(byte[] bytes, int position, int valLength) {
    if (normalizer == null) {
        // Fix: use the position parameter rather than re-reading badi.getPosition();
        // the method must match the span it was handed, not the stream's current state.
        return bytesMatcher.run(bytes, position, valLength);
    } else {
        String s = new String(bytes, position, valLength, StandardCharsets.UTF_8);
        // NOTE(review): fieldName is passed as null to Analyzer.normalize — confirm the
        // wildcard normalizer ignores the field name.
        BytesRef normalized = normalizer.normalize(null, s);
        return bytesMatcher.run(normalized.bytes, normalized.offset, normalized.length);
    }
}

@Override
public float matchCost() {
// TODO: how can we compute this?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
package org.elasticsearch.xpack.wildcard.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
Expand All @@ -28,8 +29,10 @@
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
Expand Down Expand Up @@ -79,9 +82,18 @@ public class WildcardFieldMapper extends FieldMapper {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE);
return new TokenStreamComponents(tokenizer);
// Lower case all ngram content
TokenStream tok = new LowerCaseFilter(tokenizer);
return new TokenStreamComponents(r -> {
tokenizer.setReader(r);
},tok);
}
});

@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
});

public static class Defaults {
public static final MappedFieldType FIELD_TYPE = new WildcardFieldType();
Expand Down Expand Up @@ -167,12 +179,117 @@ public WildcardFieldType fieldType() {

@Override
public WildcardFieldMapper build(BuilderContext context) {
setupFieldType(context);
setupFieldType(context);

String fullName = buildFullName(context);
CaseInsensitiveFieldType caseInsensitiveFieldType =
new CaseInsensitiveFieldType(fullName, fullName + "._case_insensitive");
CaseInsensitiveFieldMapper caseInsensitiveFieldMapper =
new CaseInsensitiveFieldMapper(caseInsensitiveFieldType, context.indexSettings());

return new WildcardFieldMapper(
name, fieldType, defaultFieldType, ignoreAbove,
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo, caseInsensitiveFieldMapper);
}
}


@SuppressWarnings("unchecked")
@Override
public Iterator<Mapper> iterator() {
    // Surface the hidden "._case_insensitive" sub-field mapper alongside the
    // regular sub-mappers so mapper traversal can discover it.
    if (caseInsensitiveFieldMapper == null) {
        return super.iterator();
    }
    List<Mapper> extra = new ArrayList<>(1);
    extra.add(caseInsensitiveFieldMapper);
    return Iterators.concat(super.iterator(), extra.iterator());
}

/**
 * Thin mapper wrapper around {@link CaseInsensitiveFieldType} so the synthetic
 * "._case_insensitive" sub-field participates in mapper iteration/merging.
 * It never indexes anything itself: content is indexed by the parent wildcard
 * field, and this mapper only exists to route queries.
 */
private static final class CaseInsensitiveFieldMapper extends FieldMapper {

    protected CaseInsensitiveFieldMapper(CaseInsensitiveFieldType fieldType, Settings indexSettings) {
        // No multi-fields or copy_to: this sub-field is query-only.
        super(fieldType.name(), fieldType, fieldType, indexSettings, MultiFields.empty(), CopyTo.empty());
    }

    @Override
    protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
        // Documents are never parsed into this sub-field; the parent wildcard
        // field owns all indexing.
        throw new UnsupportedOperationException();
    }

    @Override
    protected String contentType() {
        return "caseInsensitive";
    }

    @Override
    public String toString() {
        return fieldType().toString();
    }
}

/**
 * Field type for the synthetic "&lt;field&gt;._case_insensitive" sub-field.
 * Holds no index data of its own; every query is delegated to the parent
 * wildcard field's {@code WildcardFieldType} with case sensitivity disabled.
 */
static final class CaseInsensitiveFieldType extends MappedFieldType {

    // Full path of the owning wildcard field, used to look the parent type
    // up from the QueryShardContext at query time.
    // NOTE(review): parentField is not part of equals/hashCode here — confirm the
    // base MappedFieldType comparison is sufficient for mapping-merge checks.
    final String parentField;

    CaseInsensitiveFieldType(String parentField, String name) {
        setName(name);
        this.parentField = parentField;
    }

    void doXContent(XContentBuilder builder) throws IOException {
        builder.startObject("index_caseInsensitive");
        builder.endObject();
    }

    // Resolve the parent wildcard field type at query time; presumably the
    // parent is always mapped when this sub-field exists — TODO confirm.
    private WildcardFieldType getParent(QueryShardContext context) {
        return (WildcardFieldType) context.fieldMapper(this.parentField);
    }


    @Override
    public Query wildcardQuery(String value, RewriteMethod method, QueryShardContext context) {
        // Delegate to parent with case sensitivity turned off.
        return getParent(context).wildcardQuery(value, method, context, false);
    }

    @Override
    public Query termsQuery(List<?> values, QueryShardContext context) {
        // OR together one case-insensitive term query per value; constant score
        // since scoring is meaningless for exact-match semantics.
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        for (Object value : values) {
            bq.add(termQuery(value, context), Occur.SHOULD);
        }
        return new ConstantScoreQuery(bq.build());
    }

    @Override
    public Query termQuery(Object value, QueryShardContext context) {
        // NOTE(review): the term value is passed to wildcardQuery unescaped, so a
        // literal '*' or '?' in the term is treated as a wildcard — confirm intended.
        return wildcardQuery(BytesRefs.toString(value), MultiTermQuery.CONSTANT_SCORE_REWRITE, context);
    }

    @Override
    public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
        // Prefix query is just a trailing-wildcard query on this field type.
        return wildcardQuery(value + "*", method, context);
    }

    @Override
    public CaseInsensitiveFieldType clone() {
        return new CaseInsensitiveFieldType(parentField, name());
    }

    @Override
    public String typeName() {
        return "caseInsensitive";
    }

    @Override
    public String toString() {
        return super.toString() + ",caseInsensitive";
    }

    @Override
    public Query existsQuery(QueryShardContext context) {
        // A document "has" this sub-field exactly when it has the parent field.
        return getParent(context).existsQuery(context);
    }
}

public static class TypeParser implements Mapper.TypeParser {
@Override
Expand Down Expand Up @@ -320,15 +437,29 @@ public boolean equals(Object obj) {
PatternStructure other = (PatternStructure) obj;
return pattern.equals(other.pattern);
}


}


/**
 * Lower-cases the given value through the wildcard field's analyzer so that
 * query patterns are normalized exactly like indexed ngram content.
 */
public static BytesRef toLower(BytesRef value) {
    String asString = value.utf8ToString();
    return WILDCARD_ANALYZER.normalize(null, asString);
}

/** String-in/String-out variant of {@link #toLower(BytesRef)}. */
public static String toLower(String value) {
    BytesRef normalized = WILDCARD_ANALYZER.normalize(null, value);
    return normalized.utf8ToString();
}

// Default entry point is case-sensitive; the "._case_insensitive" sub-field
// routes through the 4-arg overload with caseSensitive=false instead.
@Override
public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
    return wildcardQuery(wildcardPattern, method, context, true);
}

public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context, boolean caseSensitive) {
if (caseSensitive == false) {
wildcardPattern = toLower(wildcardPattern);
}

PatternStructure patternStructure = new PatternStructure(wildcardPattern);
ArrayList<String> tokens = new ArrayList<>();


for (int i = 0; i < patternStructure.fragments.length; i++) {
String fragment = patternStructure.fragments[i];
Expand Down Expand Up @@ -389,7 +520,9 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST));
Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST));
Analyzer normalizer = caseSensitive ? null: WILDCARD_ANALYZER;
verifyingBuilder.add(new BooleanClause(
new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton, normalizer), Occur.MUST));
return verifyingBuilder.build();
}
return approximation;
Expand Down Expand Up @@ -486,16 +619,19 @@ public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested
}

private int ignoreAbove;
private CaseInsensitiveFieldMapper caseInsensitiveFieldMapper;

private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo,
CaseInsensitiveFieldMapper caseInsensitiveFieldMapper) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
this.ignoreAbove = ignoreAbove;
assert fieldType.indexOptions() == IndexOptions.DOCS;

ngramFieldType = fieldType.clone();
ngramFieldType.setTokenized(true);
ngramFieldType.freeze();
this.caseInsensitiveFieldMapper = caseInsensitiveFieldMapper;
}

/** Values that have more chars than the return value of this method will
Expand Down Expand Up @@ -570,6 +706,8 @@ protected String contentType() {
/**
 * Merges mapping updates from {@code mergeWith}, including the hidden
 * case-insensitive sub-field mapper and the ignore_above setting.
 */
@Override
protected void doMerge(Mapper mergeWith) {
    super.doMerge(mergeWith);
    WildcardFieldMapper mw = (WildcardFieldMapper) mergeWith;
    // Fix: iterator() treats caseInsensitiveFieldMapper as optional (null-checked),
    // so the merge must tolerate either side being absent instead of NPE-ing.
    if (this.caseInsensitiveFieldMapper == null) {
        this.caseInsensitiveFieldMapper = mw.caseInsensitiveFieldMapper;
    } else if (mw.caseInsensitiveFieldMapper != null) {
        this.caseInsensitiveFieldMapper =
            (CaseInsensitiveFieldMapper) this.caseInsensitiveFieldMapper.merge(mw.caseInsensitiveFieldMapper);
    }
    this.ignoreAbove = mw.ignoreAbove;
}
}