typed analyzers javadoc

rashtao · rashtao · commit d6e7a5b6bd42 · 2020-04-01T17:37:12.000+02:00
diff --git a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerFeature.java b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerFeature.java
@@ -22,7 +22,23 @@
 
 /**
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#analyzer-features">API Documentation</a>
  */
 public enum AnalyzerFeature {
-    frequency, norm, position
+
+    /**
+     * how often a term is seen, required for PHRASE()
+     */
+    frequency,
+
+    /**
+     * the field normalization factor
+     */
+    norm,
+
+    /**
+     * sequentially increasing term position, required for PHRASE(). If present, then the frequency feature is also required.
+     */
+    position
+
 }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/DelimiterAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/DelimiterAnalyzer.java
@@ -26,7 +26,10 @@
 import java.util.Objects;
 
 /**
+ * An Analyzer capable of breaking up delimited text into tokens as per RFC 4180 (without starting new records on newlines).
+ *
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#delimiter">API Documentation</a>
  */
 public class DelimiterAnalyzer extends SearchAnalyzer {
     public DelimiterAnalyzer() {
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/DelimiterAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/DelimiterAnalyzerProperties.java
@@ -30,6 +30,9 @@ public class DelimiterAnalyzerProperties {
 
     private String delimiter;
 
+    /**
+     * @return the delimiting character(s)
+     */
     public String getDelimiter() {
         return delimiter;
     }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/EdgeNgram.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/EdgeNgram.java
@@ -31,6 +31,9 @@ public class EdgeNgram {
     private long max;
     private boolean preserveOriginal;
 
+    /**
+     * @return minimal n-gram length
+     */
     public long getMin() {
         return min;
     }
@@ -39,6 +42,9 @@ public void setMin(long min) {
         this.min = min;
     }
 
+    /**
+     * @return maximal n-gram length
+     */
     public long getMax() {
         return max;
     }
@@ -47,6 +53,9 @@ public void setMax(long max) {
         this.max = max;
     }
 
+    /**
+     * @return whether to include the original token even if its length is less than min or greater than max
+     */
     public boolean isPreserveOriginal() {
         return preserveOriginal;
     }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/IdentityAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/IdentityAnalyzer.java
@@ -24,7 +24,10 @@
 import com.arangodb.entity.arangosearch.AnalyzerType;
 
 /**
+ * An Analyzer applying the identity transformation, i.e. returning the input unmodified.
+ *
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#identity">API Documentation</a>
  */
 public class IdentityAnalyzer extends SearchAnalyzer {
     public IdentityAnalyzer() {
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NGramAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NGramAnalyzer.java
@@ -26,7 +26,15 @@
 import java.util.Objects;
 
 /**
+ * An Analyzer capable of producing n-grams from a specified input in a range of min..max (inclusive). Can optionally
+ * preserve the original input.
+ * <p>
+ * This Analyzer type can be used to implement substring matching. Note that it slices the input based on bytes and not
+ * characters by default (streamType). The “binary” mode supports single-byte characters only; multi-byte UTF-8
+ * characters raise an Invalid UTF-8 sequence query error.
+ *
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#n-gram">API Documentation</a>
  */
 public class NGramAnalyzer extends SearchAnalyzer {
     public NGramAnalyzer() {
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NGramAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NGramAnalyzerProperties.java
@@ -24,7 +24,15 @@
 import java.util.Objects;
 
 /**
+ * An Analyzer capable of producing n-grams from a specified input in a range of min..max (inclusive). Can optionally
+ * preserve the original input.
+ * <p>
+ * This Analyzer type can be used to implement substring matching. Note that it slices the input based on bytes and not
+ * characters by default (streamType). The “binary” mode supports single-byte characters only; multi-byte UTF-8
+ * characters raise an Invalid UTF-8 sequence query error.
+ *
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#n-gram">API Documentation</a>
  */
 public class NGramAnalyzerProperties {
 
@@ -41,6 +49,9 @@ public NGramAnalyzerProperties() {
         streamType = StreamType.binary;
     }
 
+    /**
+     * @return minimum n-gram length
+     */
     public long getMin() {
         return min;
     }
@@ -49,6 +60,9 @@ public void setMin(long min) {
         this.min = min;
     }
 
+    /**
+     * @return maximum n-gram length
+     */
     public long getMax() {
         return max;
     }
@@ -57,6 +71,10 @@ public void setMax(long max) {
         this.max = max;
     }
 
+    /**
+     * @return <code>true</code> to include the original value as well,
+     *         <code>false</code> to produce the n-grams based on min and max only
+     */
     public boolean isPreserveOriginal() {
         return preserveOriginal;
     }
@@ -65,6 +83,10 @@ public void setPreserveOriginal(boolean preserveOriginal) {
         this.preserveOriginal = preserveOriginal;
     }
 
+    /**
+     * @return this value will be prepended to n-grams which include the beginning of the input. Can be used for
+     * matching prefixes. Choose a character or sequence as marker which does not occur in the input.
+     */
     public String getStartMarker() {
         return startMarker;
     }
@@ -73,6 +95,10 @@ public void setStartMarker(String startMarker) {
         this.startMarker = startMarker;
     }
 
+    /**
+     * @return this value will be appended to n-grams which include the end of the input. Can be used for matching
+     * suffixes. Choose a character or sequence as marker which does not occur in the input.
+     */
     public String getEndMarker() {
         return endMarker;
     }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzer.java
@@ -26,7 +26,10 @@
 import java.util.Objects;
 
 /**
+ * An Analyzer capable of normalizing the text, treated as a single token, i.e. case conversion and accent removal.
+ *
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#norm">API Documentation</a>
  */
 public class NormAnalyzer extends SearchAnalyzer {
     public NormAnalyzer() {
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java
@@ -37,6 +37,11 @@ public class NormAnalyzerProperties {
     @SerializedName("case")
     private SearchAnalyzerCase analyzerCase;
 
+    /**
+     * @return a locale in the format {@code language[_COUNTRY][.encoding][@variant]} (square brackets denote optional parts),
+     * e.g. {@code de.utf-8} or {@code en_US.utf-8}. Only UTF-8 encoding is meaningful in ArangoDB.
+     * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
+     */
     public String getLocale() {
         return locale;
     }
@@ -45,6 +50,10 @@ public void setLocale(String locale) {
         this.locale = locale;
     }
 
+    /**
+     * @return <code>true</code> to preserve accented characters (default),
+     *         <code>false</code> to convert accented characters to their base characters
+     */
     public boolean isAccent() {
         return accent;
     }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzer.java
@@ -35,6 +35,9 @@ public abstract class SearchAnalyzer {
     private AnalyzerType type;
     private Set<AnalyzerFeature> features;
 
+    /**
+     * @return The Analyzer name.
+     */
     public String getName() {
         return name;
     }
@@ -43,6 +46,9 @@ public void setName(String name) {
         this.name = name;
     }
 
+    /**
+     * @return The Analyzer type.
+     */
     public AnalyzerType getType() {
         return type;
     }
@@ -51,6 +57,9 @@ public void setType(AnalyzerType type) {
         this.type = type;
     }
 
+    /**
+     * @return The set of features to set on the Analyzer generated fields.
+     */
     public Set<AnalyzerFeature> getFeatures() {
         return features;
     }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java
@@ -25,5 +25,18 @@
  * @author Michele Rastelli
  */
 public enum SearchAnalyzerCase {
-    lower, upper, none
+    /**
+     * convert to all lower-case characters
+     */
+    lower,
+
+    /**
+     * convert to all upper-case characters
+     */
+    upper,
+
+    /**
+     * do not change character case (default)
+     */
+    none
 }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/StemAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/StemAnalyzer.java
@@ -26,7 +26,10 @@
 import java.util.Objects;
 
 /**
+ * An Analyzer capable of stemming the text, treated as a single token, for supported languages.
+ *
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#stem">API Documentation</a>
  */
 public class StemAnalyzer extends SearchAnalyzer {
     public StemAnalyzer() {
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/StemAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/StemAnalyzerProperties.java
@@ -30,6 +30,11 @@ public class StemAnalyzerProperties {
 
     private String locale;
 
+    /**
+     * @return a locale in the format {@code language[_COUNTRY][.encoding][@variant]} (square brackets denote optional parts),
+     * e.g. {@code de.utf-8} or {@code en_US.utf-8}. Only UTF-8 encoding is meaningful in ArangoDB.
+     * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
+     */
     public String getLocale() {
         return locale;
     }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/StreamType.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/StreamType.java
@@ -25,5 +25,13 @@
  * @author Michele Rastelli
  */
 public enum StreamType {
-    binary, utf8
+    /**
+     * one byte is considered as one character (default)
+     */
+    binary,
+
+    /**
+     * one Unicode codepoint is treated as one character
+     */
+    utf8
 }
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzer.java
@@ -26,7 +26,11 @@
 import java.util.Objects;
 
 /**
+ * An Analyzer capable of breaking up strings into individual words while also optionally filtering out stop-words,
+ * extracting word stems, applying case conversion and accent removal.
+ *
  * @author Michele Rastelli
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#text">API Documentation</a>
  */
 public class TextAnalyzer extends SearchAnalyzer {
     public TextAnalyzer() {
diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java
@@ -51,6 +51,11 @@ public TextAnalyzerProperties() {
 
     private String stopwordsPath;
 
+    /**
+     * @return a locale in the format {@code language[_COUNTRY][.encoding][@variant]} (square brackets denote optional parts),
+     * e.g. {@code de.utf-8} or {@code en_US.utf-8}. Only UTF-8 encoding is meaningful in ArangoDB.
+     * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
+     */
     public String getLocale() {
         return locale;
     }
@@ -59,6 +64,10 @@ public void setLocale(String locale) {
         this.locale = locale;
     }
 
+    /**
+     * @return <code>true</code> to preserve accented characters (default),
+     * <code>false</code> to convert accented characters to their base characters
+     */
     public boolean isAccent() {
         return accent;
     }
@@ -75,6 +84,10 @@ public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
         this.analyzerCase = analyzerCase;
     }
 
+    /**
+     * @return <code>true</code> to apply stemming on returned words (default),
+     * <code>false</code> to leave the tokenized words as-is
+     */
     public boolean isStemming() {
         return stemming;
     }
@@ -83,6 +96,15 @@ public void setStemming(boolean stemming) {
         this.stemming = stemming;
     }
 
+    /**
+     * @return if present, then edge n-grams are generated for each token (word). That is, the start of the n-gram is
+     * anchored to the beginning of the token, whereas the ngram Analyzer would produce all possible substrings from a
+     * single input token (within the defined length restrictions). Edge n-grams can be used to cover word-based
+     * auto-completion queries with an index, for which you should set the following other options:
+     * <ul><li>accent: false</li>
+     * <li>case: {@link SearchAnalyzerCase#lower}</li>
+     * <li>stemming: false</li></ul>
+     */
     public EdgeNgram getEdgeNgram() {
         return edgeNgram;
     }
@@ -91,6 +113,11 @@ public void setEdgeNgram(EdgeNgram edgeNgram) {
         this.edgeNgram = edgeNgram;
     }
 
+    /**
+     * @return a list of words to omit from the result. Default: load words from stopwordsPath. To disable
+     * stop-word filtering provide an empty list. If both stopwords and stopwordsPath are provided then both word
+     * sources are combined.
+     */
     public List<String> getStopwords() {
         return stopwords;
     }
@@ -99,6 +126,19 @@ public void setStopwords(List<String> stopwords) {
         this.stopwords = stopwords;
     }
 
+    /**
+     * @return path with a language sub-directory (e.g. en for a locale en_US.utf-8) containing files with words to omit.
+     * Each word has to be on a separate line. Everything after the first whitespace character on a line will be ignored
+     * and can be used for comments. The files can be named arbitrarily and have any file extension (or none).
+     * <p>
+     * Default: if no path is provided then the value of the environment variable IRESEARCH_TEXT_STOPWORD_PATH is used
+     * to determine the path, or if it is undefined then the current working directory is assumed. If the stopwords
+     * attribute is provided then no stop-words are loaded from files, unless an explicit stopwordsPath is also provided.
+     * <p>
+     * Note that if the stopwordsPath cannot be accessed, is missing language sub-directories or has no files for a
+     * language required by an Analyzer, then the creation of a new Analyzer is refused. If such an issue is discovered
+     * for an existing Analyzer during startup then the server will abort with a fatal error.
+     */
     public String getStopwordsPath() {
         return stopwordsPath;
     }

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,9 @@ public class DelimiterAnalyzerProperties {`
`30`	`30`
`31`	`31`	`private String delimiter;`
`32`	`32`
	`33`	`+ /**`
	`34`	`+ * @return the delimiting character(s)`
	`35`	`+ */`
`33`	`36`	`public String getDelimiter() {`
`34`	`37`	`return delimiter;`
`35`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,9 @@ public class EdgeNgram {`
`31`	`31`	`private long max;`
`32`	`32`	`private boolean preserveOriginal;`
`33`	`33`
	`34`	`+ /**`
	`35`	`+ * @return minimal n-gram length`
	`36`	`+ */`
`34`	`37`	`public long getMin() {`
`35`	`38`	`return min;`
`36`	`39`	`}`
`@@ -39,6 +42,9 @@ public void setMin(long min) {`
`39`	`42`	`this.min = min;`
`40`	`43`	`}`
`41`	`44`
	`45`	`+ /**`
	`46`	`+ * @return maximal n-gram length`
	`47`	`+ */`
`42`	`48`	`public long getMax() {`
`43`	`49`	`return max;`
`44`	`50`	`}`
`@@ -47,6 +53,9 @@ public void setMax(long max) {`
`47`	`53`	`this.max = max;`
`48`	`54`	`}`
`49`	`55`
	`56`	`+ /**`
	`57`	`+ * @return whether to include the original token even if its length is less than min or greater than max`
	`58`	`+ */`
`50`	`59`	`public boolean isPreserveOriginal() {`
`51`	`60`	`return preserveOriginal;`
`52`	`61`	`}`
Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,9 @@ public abstract class SearchAnalyzer {`
`35`	`35`	`private AnalyzerType type;`
`36`	`36`	`private Set<AnalyzerFeature> features;`
`37`	`37`
	`38`	`+ /**`
	`39`	`+ * @return The Analyzer name.`
	`40`	`+ */`
`38`	`41`	`public String getName() {`
`39`	`42`	`return name;`
`40`	`43`	`}`
`@@ -43,6 +46,9 @@ public void setName(String name) {`
`43`	`46`	`this.name = name;`
`44`	`47`	`}`
`45`	`48`
	`49`	`+ /**`
	`50`	`+ * @return The Analyzer type.`
	`51`	`+ */`
`46`	`52`	`public AnalyzerType getType() {`
`47`	`53`	`return type;`
`48`	`54`	`}`
`@@ -51,6 +57,9 @@ public void setType(AnalyzerType type) {`
`51`	`57`	`this.type = type;`
`52`	`58`	`}`
`53`	`59`
	`60`	`+ /**`
	`61`	`+ * @return The set of features to set on the Analyzer generated fields.`
	`62`	`+ */`
`54`	`63`	`public Set<AnalyzerFeature> getFeatures() {`
`55`	`64`	`return features;`
`56`	`65`	`}`