Skip to content

Commit 609c113

Browse files
authored
[DE-77] Segmentation and Collation Analyzers (#418)
* SegmentationAnalyzer * CollationAnalyzer
1 parent 0f99d4e commit 609c113

11 files changed

+324
-48
lines changed

src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,5 @@
2424
* @author Michele Rastelli
2525
*/
2626
public enum AnalyzerType {
27-
identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint
27+
identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint, segmentation, collation
2828
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* DISCLAIMER
3+
*
4+
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
* Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
*/
20+
21+
package com.arangodb.entity.arangosearch.analyzer;
22+
23+
24+
import com.arangodb.entity.arangosearch.AnalyzerType;
25+
26+
import java.util.Objects;
27+
28+
/**
29+
* An Analyzer capable of converting the input into a set of language-specific tokens. This makes comparisons follow the
30+
* rules of the respective language, most notable in range queries against Views.
31+
*
32+
* @author Michele Rastelli
33+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#collation">API Documentation</a>
34+
* @since ArangoDB 3.9
35+
*/
36+
public class CollationAnalyzer extends SearchAnalyzer {
37+
public CollationAnalyzer() {
38+
setType(AnalyzerType.collation);
39+
}
40+
41+
private CollationAnalyzerProperties properties;
42+
43+
public CollationAnalyzerProperties getProperties() {
44+
return properties;
45+
}
46+
47+
public void setProperties(CollationAnalyzerProperties properties) {
48+
this.properties = properties;
49+
}
50+
51+
@Override
52+
public boolean equals(Object o) {
53+
if (this == o) return true;
54+
if (o == null || getClass() != o.getClass()) return false;
55+
if (!super.equals(o)) return false;
56+
CollationAnalyzer that = (CollationAnalyzer) o;
57+
return Objects.equals(properties, that.properties);
58+
}
59+
60+
@Override
61+
public int hashCode() {
62+
return Objects.hash(super.hashCode(), properties);
63+
}
64+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* DISCLAIMER
3+
*
4+
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
* Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
*/
20+
21+
package com.arangodb.entity.arangosearch.analyzer;
22+
23+
24+
import java.util.Objects;
25+
26+
/**
27+
* @author Michele Rastelli
28+
* @since ArangoDB 3.9
29+
*/
30+
public class CollationAnalyzerProperties {
31+
32+
private String locale;
33+
34+
/**
35+
* @return a locale in the format `language[_COUNTRY][.encoding][@variant]` (square brackets denote optional parts),
36+
* e.g. `de.utf-8` or `en_US.utf-8`. Only UTF-8 encoding is meaningful in ArangoDB.
37+
* The locale is forwarded to ICU without checks. An invalid locale does not prevent the creation of the Analyzer.
38+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
39+
*/
40+
public String getLocale() {
41+
return locale;
42+
}
43+
44+
public void setLocale(String locale) {
45+
this.locale = locale;
46+
}
47+
48+
@Override
49+
public boolean equals(Object o) {
50+
if (this == o) return true;
51+
if (o == null || getClass() != o.getClass()) return false;
52+
CollationAnalyzerProperties that = (CollationAnalyzerProperties) o;
53+
return Objects.equals(locale, that.locale);
54+
}
55+
56+
@Override
57+
public int hashCode() {
58+
return Objects.hash(locale);
59+
}
60+
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ public SearchAnalyzerCase getAnalyzerCase() {
6666
return analyzerCase;
6767
}
6868

69+
/**
70+
* @param analyzerCase defaults to {@link SearchAnalyzerCase#none}
71+
*/
6972
public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
7073
this.analyzerCase = analyzerCase;
7174
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public enum SearchAnalyzerCase {
3636
upper,
3737

3838
/**
39-
* to not change character case (default)
39+
* to not change character case
4040
*/
4141
none
4242
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* DISCLAIMER
3+
*
4+
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
* Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
*/
20+
21+
package com.arangodb.entity.arangosearch.analyzer;
22+
23+
24+
import com.arangodb.entity.arangosearch.AnalyzerType;
25+
26+
import java.util.Objects;
27+
28+
/**
29+
* An Analyzer capable of breaking up the input text into tokens in a language-agnostic manner, making it suitable for
30+
* mixed language strings.
31+
* It can optionally preserve all non-whitespace or all characters instead of keeping alphanumeric characters only, as
32+
* well as apply case conversion.
33+
*
34+
* @author Michele Rastelli
35+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#segmentation">API Documentation</a>
36+
* @since ArangoDB 3.9
37+
*/
38+
public class SegmentationAnalyzer extends SearchAnalyzer {
39+
public SegmentationAnalyzer() {
40+
setType(AnalyzerType.segmentation);
41+
}
42+
43+
private SegmentationAnalyzerProperties properties;
44+
45+
public SegmentationAnalyzerProperties getProperties() {
46+
return properties;
47+
}
48+
49+
public void setProperties(SegmentationAnalyzerProperties properties) {
50+
this.properties = properties;
51+
}
52+
53+
@Override
54+
public boolean equals(Object o) {
55+
if (this == o) return true;
56+
if (o == null || getClass() != o.getClass()) return false;
57+
if (!super.equals(o)) return false;
58+
SegmentationAnalyzer that = (SegmentationAnalyzer) o;
59+
return Objects.equals(properties, that.properties);
60+
}
61+
62+
@Override
63+
public int hashCode() {
64+
return Objects.hash(super.hashCode(), properties);
65+
}
66+
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* DISCLAIMER
3+
*
4+
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
* Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
*/
20+
21+
package com.arangodb.entity.arangosearch.analyzer;
22+
23+
24+
import com.arangodb.velocypack.annotations.SerializedName;
25+
26+
import java.util.Objects;
27+
28+
/**
29+
* @author Michele Rastelli
30+
* @since ArangoDB 3.9
31+
*/
32+
public class SegmentationAnalyzerProperties {
33+
34+
@SerializedName("break")
35+
private BreakMode breakMode;
36+
37+
@SerializedName("case")
38+
private SearchAnalyzerCase analyzerCase;
39+
40+
public BreakMode getBreakMode() {
41+
return breakMode;
42+
}
43+
44+
/**
45+
* @param breakMode defaults to {@link BreakMode#alpha}
46+
*/
47+
public void setBreakMode(BreakMode breakMode) {
48+
this.breakMode = breakMode;
49+
}
50+
51+
public SearchAnalyzerCase getAnalyzerCase() {
52+
return analyzerCase;
53+
}
54+
55+
/**
56+
* @param analyzerCase defaults to {@link SearchAnalyzerCase#lower}
57+
*/
58+
public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
59+
this.analyzerCase = analyzerCase;
60+
}
61+
62+
public enum BreakMode {
63+
all, alpha, graphic
64+
}
65+
66+
@Override
67+
public boolean equals(Object o) {
68+
if (this == o) return true;
69+
if (o == null || getClass() != o.getClass()) return false;
70+
SegmentationAnalyzerProperties that = (SegmentationAnalyzerProperties) o;
71+
return breakMode == that.breakMode && analyzerCase == that.analyzerCase;
72+
}
73+
74+
@Override
75+
public int hashCode() {
76+
return Objects.hash(breakMode, analyzerCase);
77+
}
78+
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/StopwordsAnalyzer.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,6 @@
2626
import java.util.Objects;
2727

2828
/**
29-
* WARNING:
30-
* The implementation of Stopwords analyzer is not final in ArangoDB 3.8.0, so using it might result in unpredictable behavior.
31-
* This will be fixed in ArangoDB 3.8.1 and will have a different API.
32-
* Any usage of the current Java driver API related to it is therefore discouraged.
33-
* See related <a href="https://github.com/arangodb/arangodb-java-driver/issues/394">bug report</a>
34-
* <p>
35-
* <p>
36-
* <p>
3729
* An Analyzer capable of removing specified tokens from the input.
3830
*
3931
* @author Michele Rastelli

src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ public SearchAnalyzerCase getAnalyzerCase() {
8080
return analyzerCase;
8181
}
8282

83+
/**
84+
* @param analyzerCase defaults to {@link SearchAnalyzerCase#lower}
85+
*/
8386
public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
8487
this.analyzerCase = analyzerCase;
8588
}

src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,7 @@
4343
import com.arangodb.entity.arangosearch.PrimarySort;
4444
import com.arangodb.entity.arangosearch.StoreValuesType;
4545
import com.arangodb.entity.arangosearch.StoredValue;
46-
import com.arangodb.entity.arangosearch.analyzer.AQLAnalyzer;
47-
import com.arangodb.entity.arangosearch.analyzer.DelimiterAnalyzer;
48-
import com.arangodb.entity.arangosearch.analyzer.GeoJSONAnalyzer;
49-
import com.arangodb.entity.arangosearch.analyzer.GeoPointAnalyzer;
50-
import com.arangodb.entity.arangosearch.analyzer.IdentityAnalyzer;
51-
import com.arangodb.entity.arangosearch.analyzer.NGramAnalyzer;
52-
import com.arangodb.entity.arangosearch.analyzer.NormAnalyzer;
53-
import com.arangodb.entity.arangosearch.analyzer.PipelineAnalyzer;
54-
import com.arangodb.entity.arangosearch.analyzer.SearchAnalyzer;
55-
import com.arangodb.entity.arangosearch.analyzer.StemAnalyzer;
56-
import com.arangodb.entity.arangosearch.analyzer.StopwordsAnalyzer;
57-
import com.arangodb.entity.arangosearch.analyzer.TextAnalyzer;
46+
import com.arangodb.entity.arangosearch.analyzer.*;
5847
import com.arangodb.model.CollectionSchema;
5948
import com.arangodb.model.ZKDIndexOptions;
6049
import com.arangodb.velocypack.VPackDeserializer;
@@ -120,6 +109,10 @@ public class VPackDeserializers {
120109
return context.deserialize(vpack, GeoJSONAnalyzer.class);
121110
case geopoint:
122111
return context.deserialize(vpack, GeoPointAnalyzer.class);
112+
case segmentation:
113+
return context.deserialize(vpack, SegmentationAnalyzer.class);
114+
case collation:
115+
return context.deserialize(vpack, CollationAnalyzer.class);
123116
default:
124117
throw new IllegalArgumentException("Unknown analyzer type: " + type);
125118
}

0 commit comments

Comments
 (0)