Skip to content

[DE-77] Segmentation and Collation Analyzers #418

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@
* @author Michele Rastelli
*/
public enum AnalyzerType {
// Analyzer kinds. For example, CollationAnalyzer and SegmentationAnalyzer pass their own constant
// (collation / segmentation) to setType(...) in their constructors.
identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint
identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint, segmentation, collation
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* DISCLAIMER
*
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright holder is ArangoDB GmbH, Cologne, Germany
*/

package com.arangodb.entity.arangosearch.analyzer;


import com.arangodb.entity.arangosearch.AnalyzerType;

import java.util.Objects;

/**
 * An Analyzer capable of converting the input into a set of language-specific tokens. Comparisons then follow the
 * rules of the respective language, most notably in range queries against Views.
 *
 * @author Michele Rastelli
 * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#collation">API Documentation</a>
 * @since ArangoDB 3.9
 */
public class CollationAnalyzer extends SearchAnalyzer {

    private CollationAnalyzerProperties properties;

    /**
     * Creates the analyzer with its type preset to {@link AnalyzerType#collation}.
     */
    public CollationAnalyzer() {
        setType(AnalyzerType.collation);
    }

    /**
     * @return the properties currently stored on this analyzer (whatever was last passed to
     * {@link #setProperties(CollationAnalyzerProperties)})
     */
    public CollationAnalyzerProperties getProperties() {
        return properties;
    }

    /**
     * @param properties the properties to store on this analyzer
     */
    public void setProperties(CollationAnalyzerProperties properties) {
        this.properties = properties;
    }

    /**
     * Two instances are equal when they have the same runtime class, the superclass considers them equal and their
     * properties are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null || !getClass().equals(o.getClass()) || !super.equals(o)) {
            return false;
        }
        final CollationAnalyzer other = (CollationAnalyzer) o;
        return Objects.equals(properties, other.properties);
    }

    /**
     * Combines the superclass hash with the hash of the properties, consistently with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        final int parentHash = super.hashCode();
        return Objects.hash(parentHash, properties);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* DISCLAIMER
*
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright holder is ArangoDB GmbH, Cologne, Germany
*/

package com.arangodb.entity.arangosearch.analyzer;


import java.util.Objects;

/**
* @author Michele Rastelli
* @since ArangoDB 3.9
*/
public class CollationAnalyzerProperties {

private String locale;

/**
* @return a locale in the format `language[_COUNTRY][.encoding][@variant]` (square brackets denote optional parts),
* e.g. `de.utf-8` or `en_US.utf-8`. Only UTF-8 encoding is meaningful in ArangoDB.
* The locale is forwarded to ICU without checks. An invalid locale does not prevent the creation of the Analyzer.
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
*/
public String getLocale() {
return locale;
}

public void setLocale(String locale) {
this.locale = locale;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
CollationAnalyzerProperties that = (CollationAnalyzerProperties) o;
return Objects.equals(locale, that.locale);
}

@Override
public int hashCode() {
return Objects.hash(locale);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ public SearchAnalyzerCase getAnalyzerCase() {
return analyzerCase;
}

/**
* Stores the given {@link SearchAnalyzerCase} in the {@code analyzerCase} field.
*
* @param analyzerCase defaults to {@link SearchAnalyzerCase#none}
*/
public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
this.analyzerCase = analyzerCase;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public enum SearchAnalyzerCase {
upper,

/**
* to not change character case (default)
* to not change character case
*/
none
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* DISCLAIMER
*
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright holder is ArangoDB GmbH, Cologne, Germany
*/

package com.arangodb.entity.arangosearch.analyzer;


import com.arangodb.entity.arangosearch.AnalyzerType;

import java.util.Objects;

/**
 * An Analyzer capable of breaking up the input text into tokens in a language-agnostic manner, which makes it
 * suitable for mixed language strings.
 * Instead of keeping alphanumeric characters only, it can optionally preserve all non-whitespace characters or all
 * characters, and it can apply case conversion.
 *
 * @author Michele Rastelli
 * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#segmentation">API Documentation</a>
 * @since ArangoDB 3.9
 */
public class SegmentationAnalyzer extends SearchAnalyzer {

    private SegmentationAnalyzerProperties properties;

    /**
     * Creates the analyzer with its type preset to {@link AnalyzerType#segmentation}.
     */
    public SegmentationAnalyzer() {
        setType(AnalyzerType.segmentation);
    }

    /**
     * @return the properties currently stored on this analyzer (whatever was last passed to
     * {@link #setProperties(SegmentationAnalyzerProperties)})
     */
    public SegmentationAnalyzerProperties getProperties() {
        return properties;
    }

    /**
     * @param properties the properties to store on this analyzer
     */
    public void setProperties(SegmentationAnalyzerProperties properties) {
        this.properties = properties;
    }

    /**
     * Two instances are equal when they have the same runtime class, the superclass considers them equal and their
     * properties are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null || !getClass().equals(o.getClass()) || !super.equals(o)) {
            return false;
        }
        final SegmentationAnalyzer other = (SegmentationAnalyzer) o;
        return Objects.equals(properties, other.properties);
    }

    /**
     * Combines the superclass hash with the hash of the properties, consistently with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        final int parentHash = super.hashCode();
        return Objects.hash(parentHash, properties);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* DISCLAIMER
*
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright holder is ArangoDB GmbH, Cologne, Germany
*/

package com.arangodb.entity.arangosearch.analyzer;


import com.arangodb.velocypack.annotations.SerializedName;

import java.util.Objects;

/**
 * Properties holder used by {@code SegmentationAnalyzer}: a break mode and a case setting, annotated with the
 * serialized names {@code break} and {@code case} respectively.
 *
 * @author Michele Rastelli
 * @since ArangoDB 3.9
 */
public class SegmentationAnalyzerProperties {

    /**
     * The break modes that can be selected via {@link #setBreakMode(BreakMode)}.
     */
    public enum BreakMode {
        all, alpha, graphic
    }

    @SerializedName("break")
    private BreakMode breakMode;

    @SerializedName("case")
    private SearchAnalyzerCase analyzerCase;

    /**
     * @return the break mode currently stored (whatever was last passed to {@link #setBreakMode(BreakMode)})
     */
    public BreakMode getBreakMode() {
        return breakMode;
    }

    /**
     * @param breakMode defaults to {@link BreakMode#alpha}
     */
    public void setBreakMode(BreakMode breakMode) {
        this.breakMode = breakMode;
    }

    /**
     * @return the case setting currently stored (whatever was last passed to
     * {@link #setAnalyzerCase(SearchAnalyzerCase)})
     */
    public SearchAnalyzerCase getAnalyzerCase() {
        return analyzerCase;
    }

    /**
     * @param analyzerCase defaults to {@link SearchAnalyzerCase#lower}
     */
    public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
        this.analyzerCase = analyzerCase;
    }

    /**
     * Two instances are equal when they have the same runtime class, the same break mode and the same case setting.
     */
    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null || !getClass().equals(o.getClass())) {
            return false;
        }
        final SegmentationAnalyzerProperties other = (SegmentationAnalyzerProperties) o;
        // Both fields are enums, so identity comparison is sufficient (and null-safe).
        if (breakMode != other.breakMode) {
            return false;
        }
        return analyzerCase == other.analyzerCase;
    }

    /**
     * Hash derived from the break mode and the case setting, consistently with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        return Objects.hash(breakMode, analyzerCase);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,6 @@
import java.util.Objects;

/**
* WARNING:
* The implementation of Stopwords analyzer is not final in ArangoDB 3.8.0, so using it might result in unpredictable behavior.
* This will be fixed in ArangoDB 3.8.1 and will have a different API.
* Any usage of the current Java driver API related to it is therefore discouraged.
* See related <a href="https://github.com/arangodb/arangodb-java-driver/issues/394">bug report</a>
* <p>
* <p>
* <p>
* An Analyzer capable of removing specified tokens from the input.
*
* @author Michele Rastelli
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ public SearchAnalyzerCase getAnalyzerCase() {
return analyzerCase;
}

/**
* Stores the given {@link SearchAnalyzerCase} in the {@code analyzerCase} field.
*
* @param analyzerCase defaults to {@link SearchAnalyzerCase#lower}
*/
public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
this.analyzerCase = analyzerCase;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,7 @@
import com.arangodb.entity.arangosearch.PrimarySort;
import com.arangodb.entity.arangosearch.StoreValuesType;
import com.arangodb.entity.arangosearch.StoredValue;
import com.arangodb.entity.arangosearch.analyzer.AQLAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.DelimiterAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.GeoJSONAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.GeoPointAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.IdentityAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.NGramAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.NormAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.PipelineAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.SearchAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.StemAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.StopwordsAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.TextAnalyzer;
import com.arangodb.entity.arangosearch.analyzer.*;
import com.arangodb.model.CollectionSchema;
import com.arangodb.model.ZKDIndexOptions;
import com.arangodb.velocypack.VPackDeserializer;
Expand Down Expand Up @@ -120,6 +109,10 @@ public class VPackDeserializers {
return context.deserialize(vpack, GeoJSONAnalyzer.class);
case geopoint:
return context.deserialize(vpack, GeoPointAnalyzer.class);
case segmentation:
return context.deserialize(vpack, SegmentationAnalyzer.class);
case collation:
return context.deserialize(vpack, CollationAnalyzer.class);
default:
throw new IllegalArgumentException("Unknown analyzer type: " + type);
}
Expand Down
Loading