From 61dcb31bbfccdc15767fd5aeece189df2a8aa548 Mon Sep 17 00:00:00 2001 From: Michele Rastelli Date: Sun, 28 Nov 2021 20:08:00 +0100 Subject: [PATCH 1/2] SegmentationAnalyzer --- .../entity/arangosearch/AnalyzerType.java | 2 +- .../analyzer/NormAnalyzerProperties.java | 3 + .../analyzer/SearchAnalyzerCase.java | 2 +- .../analyzer/SegmentationAnalyzer.java | 66 ++++++++++++++++ .../SegmentationAnalyzerProperties.java | 78 +++++++++++++++++++ .../analyzer/StopwordsAnalyzer.java | 8 -- .../analyzer/TextAnalyzerProperties.java | 3 + .../velocypack/VPackDeserializers.java | 15 +--- .../java/com/arangodb/ArangoSearchTest.java | 49 ++++++------ 9 files changed, 178 insertions(+), 48 deletions(-) create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzer.java create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzerProperties.java diff --git a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java index e37b48528..6484770e5 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java +++ b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java @@ -24,5 +24,5 @@ * @author Michele Rastelli */ public enum AnalyzerType { - identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint + identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint, segmentation } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java index 4ea42db17..cb1bcfbb8 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java @@ -66,6 +66,9 @@ public SearchAnalyzerCase getAnalyzerCase() { return analyzerCase; } + /** + * 
@param analyzerCase defaults to {@link SearchAnalyzerCase#none} + */ public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) { this.analyzerCase = analyzerCase; } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java index b01d35dbf..32049f882 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java @@ -36,7 +36,7 @@ public enum SearchAnalyzerCase { upper, /** - * to not change character case (default) + * to not change character case */ none } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzer.java new file mode 100644 index 000000000..70ce2f4cf --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzer.java @@ -0,0 +1,66 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer capable of breaking up the input text into tokens in a language-agnostic manner, making it suitable for + * mixed language strings. + * It can optionally preserve all non-whitespace or all characters instead of keeping alphanumeric characters only, as + * well as apply case conversion. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.9 + */ +public class SegmentationAnalyzer extends SearchAnalyzer { + public SegmentationAnalyzer() { + setType(AnalyzerType.segmentation); + } + + private SegmentationAnalyzerProperties properties; + + public SegmentationAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(SegmentationAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + SegmentationAnalyzer that = (SegmentationAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzerProperties.java new file mode 100644 index 000000000..221cda81e --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SegmentationAnalyzerProperties.java @@ -0,0 +1,78 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.velocypack.annotations.SerializedName; + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.9 + */ +public class SegmentationAnalyzerProperties { + + @SerializedName("break") + private BreakMode breakMode; + + @SerializedName("case") + private SearchAnalyzerCase analyzerCase; + + public BreakMode getBreakMode() { + return breakMode; + } + + /** + * @param breakMode defaults to {@link BreakMode#alpha} + */ + public void setBreakMode(BreakMode breakMode) { + this.breakMode = breakMode; + } + + public SearchAnalyzerCase getAnalyzerCase() { + return analyzerCase; + } + + /** + * @param analyzerCase defaults to {@link SearchAnalyzerCase#lower} + */ + public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) { + this.analyzerCase = analyzerCase; + } + + public enum BreakMode { + all, alpha, graphic + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + SegmentationAnalyzerProperties that = (SegmentationAnalyzerProperties) o; + return breakMode == that.breakMode && analyzerCase == that.analyzerCase; + } + + @Override + public int hashCode() { + return Objects.hash(breakMode, analyzerCase); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/StopwordsAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/StopwordsAnalyzer.java index 
060d6743f..264b48d54 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/analyzer/StopwordsAnalyzer.java +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/StopwordsAnalyzer.java @@ -26,14 +26,6 @@ import java.util.Objects; /** - * WARNING: - * The implementation of Stopwords analyzer is not final in ArangoDB 3.8.0, so using it might result in unpredictable behavior. - * This will be fixed in ArangoDB 3.8.1 and will have a different API. - * Any usage of the current Java driver API related to it is therefore discouraged. - * See related bug report - *

- *

- *

* An Analyzer capable of removing specified tokens from the input. * * @author Michele Rastelli diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java index 8e1ee63fc..b22f7bc34 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java @@ -80,6 +80,9 @@ public SearchAnalyzerCase getAnalyzerCase() { return analyzerCase; } + /** + * @param analyzerCase defaults to {@link SearchAnalyzerCase#lower} + */ public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) { this.analyzerCase = analyzerCase; } diff --git a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java index 8308c4b7a..7047eea43 100644 --- a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java +++ b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java @@ -43,18 +43,7 @@ import com.arangodb.entity.arangosearch.PrimarySort; import com.arangodb.entity.arangosearch.StoreValuesType; import com.arangodb.entity.arangosearch.StoredValue; -import com.arangodb.entity.arangosearch.analyzer.AQLAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.DelimiterAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.GeoJSONAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.GeoPointAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.IdentityAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.NGramAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.NormAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.PipelineAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.SearchAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.StemAnalyzer; -import 
com.arangodb.entity.arangosearch.analyzer.StopwordsAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.TextAnalyzer; +import com.arangodb.entity.arangosearch.analyzer.*; import com.arangodb.model.CollectionSchema; import com.arangodb.velocypack.VPackDeserializer; import com.arangodb.velocypack.VPackParser; @@ -119,6 +108,8 @@ public class VPackDeserializers { return context.deserialize(vpack, GeoJSONAnalyzer.class); case geopoint: return context.deserialize(vpack, GeoPointAnalyzer.class); + case segmentation: + return context.deserialize(vpack, SegmentationAnalyzer.class); default: throw new IllegalArgumentException("Unknown analyzer type: " + type); } diff --git a/src/test/java/com/arangodb/ArangoSearchTest.java b/src/test/java/com/arangodb/ArangoSearchTest.java index 76e9aa953..f7229b4ce 100644 --- a/src/test/java/com/arangodb/ArangoSearchTest.java +++ b/src/test/java/com/arangodb/ArangoSearchTest.java @@ -34,32 +34,7 @@ import com.arangodb.entity.arangosearch.PrimarySort; import com.arangodb.entity.arangosearch.StoreValuesType; import com.arangodb.entity.arangosearch.StoredValue; -import com.arangodb.entity.arangosearch.analyzer.AQLAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.AQLAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.DelimiterAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.DelimiterAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.EdgeNgram; -import com.arangodb.entity.arangosearch.analyzer.GeoJSONAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.GeoAnalyzerOptions; -import com.arangodb.entity.arangosearch.analyzer.GeoJSONAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.GeoPointAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.GeoPointAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.IdentityAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.NGramAnalyzer; -import 
com.arangodb.entity.arangosearch.analyzer.NGramAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.NormAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.NormAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.PipelineAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.PipelineAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.SearchAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.SearchAnalyzerCase; -import com.arangodb.entity.arangosearch.analyzer.StemAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.StemAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.StopwordsAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.StopwordsAnalyzerProperties; -import com.arangodb.entity.arangosearch.analyzer.StreamType; -import com.arangodb.entity.arangosearch.analyzer.TextAnalyzer; -import com.arangodb.entity.arangosearch.analyzer.TextAnalyzerProperties; +import com.arangodb.entity.arangosearch.analyzer.*; import com.arangodb.model.arangosearch.AnalyzerDeleteOptions; import com.arangodb.model.arangosearch.ArangoSearchCreateOptions; import com.arangodb.model.arangosearch.ArangoSearchPropertiesOptions; @@ -967,4 +942,26 @@ public void geoPointAnalyzer() { } + @Test + public void segmentationAnalyzer() { + assumeTrue(isAtLeastVersion(3, 9)); + + SegmentationAnalyzerProperties properties = new SegmentationAnalyzerProperties(); + properties.setBreakMode(SegmentationAnalyzerProperties.BreakMode.graphic); + properties.setAnalyzerCase(SearchAnalyzerCase.upper); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + SegmentationAnalyzer segmentationAnalyzer = new SegmentationAnalyzer(); + segmentationAnalyzer.setName("test-" + UUID.randomUUID().toString()); + segmentationAnalyzer.setProperties(properties); + segmentationAnalyzer.setFeatures(features); + + 
createGetAndDeleteTypedAnalyzer(segmentationAnalyzer); + } + + } From 45900deabcd249167ebf59a555badc2709fcca0f Mon Sep 17 00:00:00 2001 From: Michele Rastelli Date: Sun, 28 Nov 2021 20:21:14 +0100 Subject: [PATCH 2/2] CollationAnalyzer --- .../entity/arangosearch/AnalyzerType.java | 2 +- .../analyzer/CollationAnalyzer.java | 64 +++++++++++++++++++ .../analyzer/CollationAnalyzerProperties.java | 60 +++++++++++++++++ .../velocypack/VPackDeserializers.java | 2 + .../java/com/arangodb/ArangoSearchTest.java | 20 ++++++ 5 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzer.java create mode 100644 src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzerProperties.java diff --git a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java index 6484770e5..7a5ea5918 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java +++ b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java @@ -24,5 +24,5 @@ * @author Michele Rastelli */ public enum AnalyzerType { - identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint, segmentation + identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint, segmentation, collation } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzer.java new file mode 100644 index 000000000..e07d820a3 --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzer.java @@ -0,0 +1,64 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer capable of converting the input into a set of language-specific tokens. This makes comparisons follow the + * rules of the respective language, most notably in range queries against Views. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.9 + */ +public class CollationAnalyzer extends SearchAnalyzer { + public CollationAnalyzer() { + setType(AnalyzerType.collation); + } + + private CollationAnalyzerProperties properties; + + public CollationAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(CollationAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + CollationAnalyzer that = (CollationAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzerProperties.java new file mode 100644 index 000000000..d056dbaa2 --- /dev/null +++ 
b/src/main/java/com/arangodb/entity/arangosearch/analyzer/CollationAnalyzerProperties.java @@ -0,0 +1,60 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.9 + */ +public class CollationAnalyzerProperties { + + private String locale; + + /** + * @return a locale in the format `language[_COUNTRY][.encoding][@variant]` (square brackets denote optional parts), + * e.g. `de.utf-8` or `en_US.utf-8`. Only UTF-8 encoding is meaningful in ArangoDB. + * The locale is forwarded to ICU without checks. An invalid locale does not prevent the creation of the Analyzer. 
+ * @see Supported Languages + */ + public String getLocale() { + return locale; + } + + public void setLocale(String locale) { + this.locale = locale; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + CollationAnalyzerProperties that = (CollationAnalyzerProperties) o; + return Objects.equals(locale, that.locale); + } + + @Override + public int hashCode() { + return Objects.hash(locale); + } +} diff --git a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java index 7047eea43..933c8dc2e 100644 --- a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java +++ b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java @@ -110,6 +110,8 @@ public class VPackDeserializers { return context.deserialize(vpack, GeoPointAnalyzer.class); case segmentation: return context.deserialize(vpack, SegmentationAnalyzer.class); + case collation: + return context.deserialize(vpack, CollationAnalyzer.class); default: throw new IllegalArgumentException("Unknown analyzer type: " + type); } diff --git a/src/test/java/com/arangodb/ArangoSearchTest.java b/src/test/java/com/arangodb/ArangoSearchTest.java index f7229b4ce..671fc0389 100644 --- a/src/test/java/com/arangodb/ArangoSearchTest.java +++ b/src/test/java/com/arangodb/ArangoSearchTest.java @@ -963,5 +963,25 @@ public void segmentationAnalyzer() { createGetAndDeleteTypedAnalyzer(segmentationAnalyzer); } + @Test + public void collationAnalyzer() { + assumeTrue(isAtLeastVersion(3, 9)); + + CollationAnalyzerProperties properties = new CollationAnalyzerProperties(); + properties.setLocale("ru"); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + CollationAnalyzer collationAnalyzer = new CollationAnalyzer(); + 
collationAnalyzer.setName("test-" + UUID.randomUUID().toString()); + collationAnalyzer.setProperties(properties); + collationAnalyzer.setFeatures(features); + + createGetAndDeleteTypedAnalyzer(collationAnalyzer); + } + }