From c47de57414aa9e3ad9094984f6e87e8ae390ed73 Mon Sep 17 00:00:00 2001
From: Michele Rastelli
Date: Thu, 29 Feb 2024 13:01:28 +0100
Subject: [PATCH] wildcard analyzer

---
Notes (ignored by `git am`, not part of the commit message):

Adds driver-side support for the "wildcard" ArangoSearch analyzer.

 * AnalyzerType: new enum constant `wildcard`.
 * SearchAnalyzer: registers WildcardAnalyzer under the "wildcard" name in
   the @JsonSubTypes list.
 * WildcardAnalyzer: new SearchAnalyzer subclass whose constructor sets the
   type to AnalyzerType.wildcard and which carries a
   WildcardAnalyzerProperties value.
 * WildcardAnalyzerProperties: new bean with `ngramSize` (Integer) and
   `analyzer` (a nested SearchAnalyzer), plus equals/hashCode.
 * ArangoSearchTest / ArangoSearchAsyncTest: new WildcardAnalyzer test that
   wraps a NormAnalyzer, skipped unless isAtLeastVersion(3, 12) holds.

 .../entity/arangosearch/AnalyzerType.java     |  3 +-
 .../arangosearch/analyzer/SearchAnalyzer.java |  3 +-
 .../analyzer/WildcardAnalyzer.java            | 66 ++++++++++++++++++
 .../analyzer/WildcardAnalyzerProperties.java  | 68 +++++++++++++++++++
 .../com/arangodb/ArangoSearchAsyncTest.java   | 29 ++++++++
 .../java/com/arangodb/ArangoSearchTest.java   | 29 ++++++++
 6 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzer.java
 create mode 100644 core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzerProperties.java

diff --git a/core/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java b/core/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java
index e92b0b191..873a0d41b 100644
--- a/core/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java
+++ b/core/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java
@@ -40,5 +40,6 @@ public enum AnalyzerType {
     collation,
     classification,
     nearest_neighbors,
-    minhash
+    minhash,
+    wildcard
 }
diff --git a/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzer.java b/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzer.java
index 3c3a60ba6..a1be95127 100644
--- a/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzer.java
+++ b/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzer.java
@@ -53,7 +53,8 @@
         @JsonSubTypes.Type(name = "collation", value = CollationAnalyzer.class),
         @JsonSubTypes.Type(name = "classification", value = ClassificationAnalyzer.class),
         @JsonSubTypes.Type(name = "nearest_neighbors", value = NearestNeighborsAnalyzer.class),
-        @JsonSubTypes.Type(name = "minhash", value = MinHashAnalyzer.class)
+        @JsonSubTypes.Type(name = "minhash", value = MinHashAnalyzer.class),
+        @JsonSubTypes.Type(name = "wildcard", value = WildcardAnalyzer.class)
 })
 public abstract class SearchAnalyzer {
     private String name;
diff --git a/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzer.java b/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzer.java
new file mode 100644
index 000000000..5e81b68ac
--- /dev/null
+++ b/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzer.java
@@ -0,0 +1,66 @@
+/*
+ * DISCLAIMER
+ *
+ * Copyright 2016 ArangoDB GmbH, Cologne, Germany
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright holder is ArangoDB GmbH, Cologne, Germany
+ */
+
+package com.arangodb.entity.arangosearch.analyzer;
+
+
+import com.arangodb.entity.arangosearch.AnalyzerType;
+
+import java.util.Objects;
+
+/**
+ * An Analyzer that creates n-grams to enable fast partial matching for wildcard queries if you have large string
+ * values, especially if you want to search for suffixes or substrings in the middle of strings (infixes) as opposed to
+ * prefixes.
+ * It can apply an Analyzer of your choice before creating the n-grams, for example, to normalize text for
+ * case-insensitive and accent-insensitive search.
+ *
+ * @author Michele Rastelli
+ * @see <a href="https://docs.arangodb.com/stable/index-and-search/analyzers/#wildcard">API Documentation</a>
+ */
+public final class WildcardAnalyzer extends SearchAnalyzer {
+    private WildcardAnalyzerProperties properties;
+
+    public WildcardAnalyzer() {
+        setType(AnalyzerType.wildcard);
+    }
+
+    public WildcardAnalyzerProperties getProperties() {
+        return properties;
+    }
+
+    public void setProperties(WildcardAnalyzerProperties properties) {
+        this.properties = properties;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        if (!super.equals(o)) return false;
+        WildcardAnalyzer that = (WildcardAnalyzer) o;
+        return Objects.equals(properties, that.properties);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(super.hashCode(), properties);
+    }
+}
diff --git a/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzerProperties.java b/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzerProperties.java
new file mode 100644
index 000000000..84042de08
--- /dev/null
+++ b/core/src/main/java/com/arangodb/entity/arangosearch/analyzer/WildcardAnalyzerProperties.java
@@ -0,0 +1,68 @@
+/*
+ * DISCLAIMER
+ *
+ * Copyright 2016 ArangoDB GmbH, Cologne, Germany
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright holder is ArangoDB GmbH, Cologne, Germany
+ */
+
+package com.arangodb.entity.arangosearch.analyzer;
+
+
+import java.util.Objects;
+
+/**
+ * @author Michele Rastelli
+ */
+public final class WildcardAnalyzerProperties {
+
+    private Integer ngramSize;
+    private SearchAnalyzer analyzer;
+
+    /**
+     * @return unsigned integer for the n-gram length, needs to be at least 2
+     */
+    public Integer getNgramSize() {
+        return ngramSize;
+    }
+
+    /**
+     * @param ngramSize unsigned integer for the n-gram length, needs to be at least 2
+     */
+    public void setNgramSize(Integer ngramSize) {
+        this.ngramSize = ngramSize;
+    }
+
+    public SearchAnalyzer getAnalyzer() {
+        return analyzer;
+    }
+
+    public void setAnalyzer(SearchAnalyzer analyzer) {
+        this.analyzer = analyzer;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        WildcardAnalyzerProperties that = (WildcardAnalyzerProperties) o;
+        return Objects.equals(ngramSize, that.ngramSize) && Objects.equals(analyzer, that.analyzer);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(ngramSize, analyzer);
+    }
+}
diff --git a/driver/src/test/java/com/arangodb/ArangoSearchAsyncTest.java b/driver/src/test/java/com/arangodb/ArangoSearchAsyncTest.java
index 39aa7ca2e..bcff04c04 100644
--- a/driver/src/test/java/com/arangodb/ArangoSearchAsyncTest.java
+++ b/driver/src/test/java/com/arangodb/ArangoSearchAsyncTest.java
@@ -1034,6 +1034,35 @@ void MinHashAnalyzer(ArangoDatabaseAsync db) throws ExecutionException, Interrup
         createGetAndDeleteTypedAnalyzer(db, analyzer);
     }
 
+    @ParameterizedTest
+    @MethodSource("asyncDbs")
+    void WildcardAnalyzer(ArangoDatabaseAsync db) throws ExecutionException, InterruptedException {
+        assumeTrue(isAtLeastVersion(3, 12));
+
+        NormAnalyzerProperties properties = new NormAnalyzerProperties();
+        properties.setLocale("ru");
+        properties.setAnalyzerCase(SearchAnalyzerCase.lower);
+        properties.setAccent(true);
+
+        NormAnalyzer normAnalyzer = new NormAnalyzer();
+        normAnalyzer.setProperties(properties);
+
+        WildcardAnalyzerProperties wildcardProperties = new WildcardAnalyzerProperties();
+        wildcardProperties.setNgramSize(3);
+        wildcardProperties.setAnalyzer(normAnalyzer);
+
+        Set<AnalyzerFeature> features = new HashSet<>();
+        features.add(AnalyzerFeature.frequency);
+        features.add(AnalyzerFeature.position);
+
+        WildcardAnalyzer wildcardAnalyzer = new WildcardAnalyzer();
+        wildcardAnalyzer.setName("test-" + UUID.randomUUID());
+        wildcardAnalyzer.setProperties(wildcardProperties);
+        wildcardAnalyzer.setFeatures(features);
+
+        createGetAndDeleteTypedAnalyzer(db, wildcardAnalyzer);
+    }
+
     @ParameterizedTest
     @MethodSource("asyncDbs")
     void offsetFeature(ArangoDatabaseAsync db) throws ExecutionException, InterruptedException {
diff --git a/driver/src/test/java/com/arangodb/ArangoSearchTest.java b/driver/src/test/java/com/arangodb/ArangoSearchTest.java
index a4216477b..7c6eaa996 100644
--- a/driver/src/test/java/com/arangodb/ArangoSearchTest.java
+++ b/driver/src/test/java/com/arangodb/ArangoSearchTest.java
@@ -1033,6 +1033,35 @@ void MinHashAnalyzer(ArangoDatabase db) {
         createGetAndDeleteTypedAnalyzer(db, analyzer);
     }
 
+    @ParameterizedTest
+    @MethodSource("dbs")
+    void WildcardAnalyzer(ArangoDatabase db) {
+        assumeTrue(isAtLeastVersion(3, 12));
+
+        NormAnalyzerProperties properties = new NormAnalyzerProperties();
+        properties.setLocale("ru");
+        properties.setAnalyzerCase(SearchAnalyzerCase.lower);
+        properties.setAccent(true);
+
+        NormAnalyzer normAnalyzer = new NormAnalyzer();
+        normAnalyzer.setProperties(properties);
+
+        WildcardAnalyzerProperties wildcardProperties = new WildcardAnalyzerProperties();
+        wildcardProperties.setNgramSize(3);
+        wildcardProperties.setAnalyzer(normAnalyzer);
+
+        Set<AnalyzerFeature> features = new HashSet<>();
+        features.add(AnalyzerFeature.frequency);
+        features.add(AnalyzerFeature.position);
+
+        WildcardAnalyzer wildcardAnalyzer = new WildcardAnalyzer();
+        wildcardAnalyzer.setName("test-" + UUID.randomUUID());
+        wildcardAnalyzer.setProperties(wildcardProperties);
+        wildcardAnalyzer.setFeatures(features);
+
+        createGetAndDeleteTypedAnalyzer(db, wildcardAnalyzer);
+    }
+
     @ParameterizedTest
     @MethodSource("dbs")
     void offsetFeature(ArangoDatabase db) {