@@ -51,6 +51,11 @@ public TextAnalyzerProperties() {
51
51
52
52
// Path to the stop-word files (with per-language sub-directories); exposed through getStopwordsPath().
private String stopwordsPath ;
53
53
54
+ /**
55
+ * @return a locale in the format `language[_COUNTRY][.encoding][@variant]` (square brackets denote optional parts),
56
+ * e.g. `de.utf-8` or `en_US.utf-8`. Only UTF-8 encoding is meaningful in ArangoDB.
57
+ * @see <a href= "https://www.arangodb.com/docs/stable/arangosearch-analyzers.html#supported-languages">Supported Languages</a>
58
+ */
54
59
public String getLocale () {
55
60
return locale ;
56
61
}
@@ -59,6 +64,10 @@ public void setLocale(String locale) {
59
64
this .locale = locale ;
60
65
}
61
66
67
+ /**
68
+ * @return <code>true</code> to preserve accented characters (default)
69
+ * <code>false</code> to convert accented characters to their base characters
70
+ */
62
71
public boolean isAccent () {
63
72
return accent ;
64
73
}
@@ -75,6 +84,10 @@ public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
75
84
this .analyzerCase = analyzerCase ;
76
85
}
77
86
87
+ /**
88
+ * @return <code>true</code> to apply stemming on returned words (default)
89
+ * <code>false</code> to leave the tokenized words as-is
90
+ */
78
91
public boolean isStemming () {
79
92
return stemming ;
80
93
}
@@ -83,6 +96,15 @@ public void setStemming(boolean stemming) {
83
96
this .stemming = stemming ;
84
97
}
85
98
99
+ /**
100
+ * @return if present, then edge n-grams are generated for each token (word). That is, the start of the n-gram is
101
+ * anchored to the beginning of the token, whereas the ngram Analyzer would produce all possible substrings from a
102
+ * single input token (within the defined length restrictions). Edge n-grams can be used to cover word-based
103
+ * auto-completion queries with an index, for which you should set the following other options:
104
+ * - accent: false
105
+ * - case: {@link SearchAnalyzerCase#lower}
106
+ * - stemming: false
107
+ */
86
108
public EdgeNgram getEdgeNgram () {
87
109
return edgeNgram ;
88
110
}
@@ -91,6 +113,11 @@ public void setEdgeNgram(EdgeNgram edgeNgram) {
91
113
this .edgeNgram = edgeNgram ;
92
114
}
93
115
116
+ /**
117
+ * @return an array of strings with words to omit from result. Default: load words from stopwordsPath. To disable
118
+ * stop-word filtering provide an empty array []. If both stopwords and stopwordsPath are provided then both word
119
+ * sources are combined.
120
+ */
94
121
public List <String > getStopwords () {
95
122
return stopwords ;
96
123
}
@@ -99,6 +126,19 @@ public void setStopwords(List<String> stopwords) {
99
126
this .stopwords = stopwords ;
100
127
}
101
128
129
+ /**
130
+ * @return path with a language sub-directory (e.g. en for a locale en_US.utf-8) containing files with words to omit.
131
+ * Each word has to be on a separate line. Everything after the first whitespace character on a line will be ignored
132
+ * and can be used for comments. The files can be named arbitrarily and have any file extension (or none).
133
+ * <p>
134
+ * Default: if no path is provided then the value of the environment variable IRESEARCH_TEXT_STOPWORD_PATH is used
135
+ * to determine the path, or if it is undefined then the current working directory is assumed. If the stopwords
136
+ * attribute is provided then no stop-words are loaded from files, unless an explicit stopwordsPath is also provided.
137
+ * <p>
138
+ * Note that if the stopwordsPath can not be accessed, is missing language sub-directories or has no files for a
139
+ * language required by an Analyzer, then the creation of a new Analyzer is refused. If such an issue is discovered
140
+ * for an existing Analyzer during startup then the server will abort with a fatal error.
141
+ */
102
142
public String getStopwordsPath () {
103
143
return stopwordsPath ;
104
144
}
0 commit comments