Skip to content

Commit d6e7a5b

Browse files
committed
typed analyzers javadoc
1 parent 8081c6e commit d6e7a5b

16 files changed

+165
-3
lines changed

src/main/java/com/arangodb/entity/arangosearch/AnalyzerFeature.java

+17-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,23 @@
2222

2323
/**
2424
* @author Michele Rastelli
25+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#analyzer-features">API Documentation</a>
2526
*/
2627
public enum AnalyzerFeature {
27-
frequency, norm, position
28+
29+
/**
30+
* how often a term is seen, required for PHRASE()
31+
*/
32+
frequency,
33+
34+
/**
35+
* the field normalization factor
36+
*/
37+
norm,
38+
39+
/**
40+
* sequentially increasing term position, required for PHRASE(). If present then the frequency feature is also required
41+
*/
42+
position
43+
2844
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/DelimiterAnalyzer.java

+3
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@
2626
import java.util.Objects;
2727

2828
/**
29+
* An Analyzer capable of breaking up delimited text into tokens as per RFC 4180 (without starting new records on newlines).
30+
*
2931
* @author Michele Rastelli
32+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#delimiter">API Documentation</a>
3033
*/
3134
public class DelimiterAnalyzer extends SearchAnalyzer {
3235
public DelimiterAnalyzer() {

src/main/java/com/arangodb/entity/arangosearch/analyzer/DelimiterAnalyzerProperties.java

+3
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ public class DelimiterAnalyzerProperties {
3030

3131
private String delimiter;
3232

33+
/**
34+
* @return the delimiting character(s)
35+
*/
3336
public String getDelimiter() {
3437
return delimiter;
3538
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/EdgeNgram.java

+9
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ public class EdgeNgram {
3131
private long max;
3232
private boolean preserveOriginal;
3333

34+
/**
35+
* @return minimal n-gram length
36+
*/
3437
public long getMin() {
3538
return min;
3639
}
@@ -39,6 +42,9 @@ public void setMin(long min) {
3942
this.min = min;
4043
}
4144

45+
/**
46+
* @return maximal n-gram length
47+
*/
4248
public long getMax() {
4349
return max;
4450
}
@@ -47,6 +53,9 @@ public void setMax(long max) {
4753
this.max = max;
4854
}
4955

56+
/**
57+
* @return whether to include the original token even if its length is less than min or greater than max
58+
*/
5059
public boolean isPreserveOriginal() {
5160
return preserveOriginal;
5261
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/IdentityAnalyzer.java

+3
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@
2424
import com.arangodb.entity.arangosearch.AnalyzerType;
2525

2626
/**
27+
* An Analyzer applying the identity transformation, i.e. returning the input unmodified.
28+
*
2729
* @author Michele Rastelli
30+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#identity">API Documentation</a>
2831
*/
2932
public class IdentityAnalyzer extends SearchAnalyzer {
3033
public IdentityAnalyzer() {

src/main/java/com/arangodb/entity/arangosearch/analyzer/NGramAnalyzer.java

+8
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,15 @@
2626
import java.util.Objects;
2727

2828
/**
29+
* An Analyzer capable of producing n-grams from a specified input in a range of min..max (inclusive). Can optionally
30+
* preserve the original input.
31+
* <p>
32+
* This Analyzer type can be used to implement substring matching. Note that it slices the input based on bytes and not
33+
* characters by default (streamType). The “binary” mode supports single-byte characters only; multi-byte UTF-8
34+
* characters raise an Invalid UTF-8 sequence query error.
35+
*
2936
* @author Michele Rastelli
37+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#n-gram">API Documentation</a>
3038
*/
3139
public class NGramAnalyzer extends SearchAnalyzer {
3240
public NGramAnalyzer() {

src/main/java/com/arangodb/entity/arangosearch/analyzer/NGramAnalyzerProperties.java

+26
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,15 @@
2424
import java.util.Objects;
2525

2626
/**
27+
* An Analyzer capable of producing n-grams from a specified input in a range of min..max (inclusive). Can optionally
28+
* preserve the original input.
29+
* <p>
30+
* This Analyzer type can be used to implement substring matching. Note that it slices the input based on bytes and not
31+
* characters by default (streamType). The “binary” mode supports single-byte characters only; multi-byte UTF-8
32+
* characters raise an Invalid UTF-8 sequence query error.
33+
*
2734
* @author Michele Rastelli
35+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#n-gram">API Documentation</a>
2836
*/
2937
public class NGramAnalyzerProperties {
3038

@@ -41,6 +49,9 @@ public NGramAnalyzerProperties() {
4149
streamType = StreamType.binary;
4250
}
4351

52+
/**
53+
* @return minimum n-gram length
54+
*/
4455
public long getMin() {
4556
return min;
4657
}
@@ -49,6 +60,9 @@ public void setMin(long min) {
4960
this.min = min;
5061
}
5162

63+
/**
64+
* @return maximum n-gram length
65+
*/
5266
public long getMax() {
5367
return max;
5468
}
@@ -57,6 +71,10 @@ public void setMax(long max) {
5771
this.max = max;
5872
}
5973

74+
/**
75+
* @return <code>true</code> to include the original value as well,
76+
* <code>false</code> to produce the n-grams based on min and max only
77+
*/
6078
public boolean isPreserveOriginal() {
6179
return preserveOriginal;
6280
}
@@ -65,6 +83,10 @@ public void setPreserveOriginal(boolean preserveOriginal) {
6583
this.preserveOriginal = preserveOriginal;
6684
}
6785

86+
/**
87+
* @return this value will be prepended to n-grams which include the beginning of the input. Can be used for
88+
* matching prefixes. Choose a character or sequence as marker which does not occur in the input.
89+
*/
6890
public String getStartMarker() {
6991
return startMarker;
7092
}
@@ -73,6 +95,10 @@ public void setStartMarker(String startMarker) {
7395
this.startMarker = startMarker;
7496
}
7597

98+
/**
99+
* @return this value will be appended to n-grams which include the end of the input. Can be used for matching
100+
* suffixes. Choose a character or sequence as marker which does not occur in the input.
101+
*/
76102
public String getEndMarker() {
77103
return endMarker;
78104
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzer.java

+3
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@
2626
import java.util.Objects;
2727

2828
/**
29+
* An Analyzer capable of normalizing the text, treated as a single token, i.e. case conversion and accent removal.
30+
*
2931
* @author Michele Rastelli
32+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#norm">API Documentation</a>
3033
*/
3134
public class NormAnalyzer extends SearchAnalyzer {
3235
public NormAnalyzer() {

src/main/java/com/arangodb/entity/arangosearch/analyzer/NormAnalyzerProperties.java

+9
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ public class NormAnalyzerProperties {
3737
@SerializedName("case")
3838
private SearchAnalyzerCase analyzerCase;
3939

40+
/**
41+
* @return a locale in the format {@code language[_COUNTRY][.encoding][@variant]} (square brackets denote optional parts),
42+
* e.g. {@code de.utf-8} or {@code en_US.utf-8}. Only UTF-8 encoding is meaningful in ArangoDB.
43+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
44+
*/
4045
public String getLocale() {
4146
return locale;
4247
}
@@ -45,6 +50,10 @@ public void setLocale(String locale) {
4550
this.locale = locale;
4651
}
4752

53+
/**
54+
* @return <code>true</code> to preserve accented characters (default),
55+
* <code>false</code> to convert accented characters to their base characters
56+
*/
4857
public boolean isAccent() {
4958
return accent;
5059
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzer.java

+9
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ public abstract class SearchAnalyzer {
3535
private AnalyzerType type;
3636
private Set<AnalyzerFeature> features;
3737

38+
/**
39+
* @return The Analyzer name.
40+
*/
3841
public String getName() {
3942
return name;
4043
}
@@ -43,6 +46,9 @@ public void setName(String name) {
4346
this.name = name;
4447
}
4548

49+
/**
50+
* @return The Analyzer type.
51+
*/
4652
public AnalyzerType getType() {
4753
return type;
4854
}
@@ -51,6 +57,9 @@ public void setType(AnalyzerType type) {
5157
this.type = type;
5258
}
5359

60+
/**
61+
* @return The set of features to set on the Analyzer generated fields.
62+
*/
5463
public Set<AnalyzerFeature> getFeatures() {
5564
return features;
5665
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/SearchAnalyzerCase.java

+14-1
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,18 @@
2525
* @author Michele Rastelli
2626
*/
2727
public enum SearchAnalyzerCase {
28-
lower, upper, none
28+
/**
29+
* convert to all lower-case characters
30+
*/
31+
lower,
32+
33+
/**
34+
* convert to all upper-case characters
35+
*/
36+
upper,
37+
38+
/**
39+
* do not change character case (default)
40+
*/
41+
none
2942
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/StemAnalyzer.java

+3
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@
2626
import java.util.Objects;
2727

2828
/**
29+
* An Analyzer capable of stemming the text, treated as a single token, for supported languages.
30+
*
2931
* @author Michele Rastelli
32+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#stem">API Documentation</a>
3033
*/
3134
public class StemAnalyzer extends SearchAnalyzer {
3235
public StemAnalyzer() {

src/main/java/com/arangodb/entity/arangosearch/analyzer/StemAnalyzerProperties.java

+5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ public class StemAnalyzerProperties {
3030

3131
private String locale;
3232

33+
/**
34+
* @return a locale in the format {@code language[_COUNTRY][.encoding][@variant]} (square brackets denote optional parts),
35+
* e.g. {@code de.utf-8} or {@code en_US.utf-8}. Only UTF-8 encoding is meaningful in ArangoDB.
36+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
37+
*/
3338
public String getLocale() {
3439
return locale;
3540
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/StreamType.java

+9-1
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,13 @@
2525
* @author Michele Rastelli
2626
*/
2727
public enum StreamType {
28-
binary, utf8
28+
/**
29+
* one byte is considered as one character (default)
30+
*/
31+
binary,
32+
33+
/**
34+
* one Unicode codepoint is treated as one character
35+
*/
36+
utf8
2937
}

src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzer.java

+4
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@
2626
import java.util.Objects;
2727

2828
/**
29+
* An Analyzer capable of breaking up strings into individual words while also optionally filtering out stop-words,
30+
* extracting word stems, applying case conversion and accent removal.
31+
*
2932
* @author Michele Rastelli
33+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#text">API Documentation</a>
3034
*/
3135
public class TextAnalyzer extends SearchAnalyzer {
3236
public TextAnalyzer() {

src/main/java/com/arangodb/entity/arangosearch/analyzer/TextAnalyzerProperties.java

+40
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ public TextAnalyzerProperties() {
5151

5252
private String stopwordsPath;
5353

54+
/**
55+
* @return a locale in the format {@code language[_COUNTRY][.encoding][@variant]} (square brackets denote optional parts),
56+
* e.g. {@code de.utf-8} or {@code en_US.utf-8}. Only UTF-8 encoding is meaningful in ArangoDB.
57+
* @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
58+
*/
5459
public String getLocale() {
5560
return locale;
5661
}
@@ -59,6 +64,10 @@ public void setLocale(String locale) {
5964
this.locale = locale;
6065
}
6166

67+
/**
68+
* @return <code>true</code> to preserve accented characters (default),
69+
* <code>false</code> to convert accented characters to their base characters
70+
*/
6271
public boolean isAccent() {
6372
return accent;
6473
}
@@ -75,6 +84,10 @@ public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
7584
this.analyzerCase = analyzerCase;
7685
}
7786

87+
/**
88+
* @return <code>true</code> to apply stemming on returned words (default),
89+
* <code>false</code> to leave the tokenized words as-is
90+
*/
7891
public boolean isStemming() {
7992
return stemming;
8093
}
@@ -83,6 +96,15 @@ public void setStemming(boolean stemming) {
8396
this.stemming = stemming;
8497
}
8598

99+
/**
100+
* @return if present, then edge n-grams are generated for each token (word). That is, the start of the n-gram is
101+
* anchored to the beginning of the token, whereas the ngram Analyzer would produce all possible substrings from a
102+
* single input token (within the defined length restrictions). Edge n-grams can be used to cover word-based
103+
* auto-completion queries with an index, for which you should set the following other options:
104+
* - accent: false
105+
* - case: {@link SearchAnalyzerCase#lower}
106+
* - stemming: false
107+
*/
86108
public EdgeNgram getEdgeNgram() {
87109
return edgeNgram;
88110
}
@@ -91,6 +113,11 @@ public void setEdgeNgram(EdgeNgram edgeNgram) {
91113
this.edgeNgram = edgeNgram;
92114
}
93115

116+
/**
117+
* @return a list of strings with words to omit from the result. Default: load words from stopwordsPath. To disable
118+
* stop-word filtering provide an empty array []. If both stopwords and stopwordsPath are provided then both word
119+
* sources are combined.
120+
*/
94121
public List<String> getStopwords() {
95122
return stopwords;
96123
}
@@ -99,6 +126,19 @@ public void setStopwords(List<String> stopwords) {
99126
this.stopwords = stopwords;
100127
}
101128

129+
/**
130+
* @return path with a language sub-directory (e.g. en for a locale en_US.utf-8) containing files with words to omit.
131+
* Each word has to be on a separate line. Everything after the first whitespace character on a line will be ignored
132+
* and can be used for comments. The files can be named arbitrarily and have any file extension (or none).
133+
* <p>
134+
* Default: if no path is provided then the value of the environment variable IRESEARCH_TEXT_STOPWORD_PATH is used
135+
* to determine the path, or if it is undefined then the current working directory is assumed. If the stopwords
136+
* attribute is provided then no stop-words are loaded from files, unless an explicit stopwordsPath is also provided.
137+
* <p>
138+
* Note that if the stopwordsPath cannot be accessed, is missing language sub-directories or has no files for a
139+
* language required by an Analyzer, then the creation of a new Analyzer is refused. If such an issue is discovered
140+
* for an existing Analyzer during startup then the server will abort with a fatal error.
141+
*/
102142
public String getStopwordsPath() {
103143
return stopwordsPath;
104144
}

0 commit comments

Comments
 (0)