Skip to content

Commit 59d7f5c

Browse files
committed
Exposed ICU collator options in IcuCollationTokenFilterFactory
Closes #6
1 parent e7d045e commit 59d7f5c

File tree

5 files changed

+416
-8
lines changed

5 files changed

+416
-8
lines changed

README.md

+25
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,31 @@ And here is a sample of custom collation:
103103
}
104104
}
105105

106+
Optional options:
107+
* `strength` - The strength property determines the minimum level of difference considered significant during comparison.
108+
The default strength for the Collator is `tertiary`, unless specified otherwise by the locale used to create the Collator.
109+
Possible values: `primary`, `secondary`, `tertiary`, `quaternary` or `identical`.
110+
See the [ICU Collation](http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html) documentation for a more detailed
111+
explanation for the specific values.
112+
* `decomposition` - Possible values: `no` or `canonical`. Defaults to `no`. Setting this decomposition property with
113+
`canonical` allows the Collator to handle un-normalized text properly, producing the same results as if the text were
114+
normalized. If `no` is set, it is the user's responsibility to ensure that all text is already in the appropriate form
115+
before a comparison or before getting a CollationKey. Adjusting decomposition mode allows the user to select between
116+
faster and more complete collation behavior. Since a great many of the world's languages do not require text
117+
normalization, most locales set `no` as the default decomposition mode.
118+
119+
Expert options:
120+
* `alternate` - Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary`
121+
to be either shifted or non-ignorable. This boils down to whether punctuation and whitespace are ignored.
122+
* `caseLevel` - Possible values: `true` or `false`. Default is `false`. Whether case level sorting is required. When
123+
strength is set to `primary` this will ignore accent differences.
124+
* `caseFirst` - Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored
125+
for strength `tertiary`.
126+
* `numeric` - Possible values: `true` or `false`. Whether digits are sorted according to numeric representation. For
127+
example, the value `egg-9` is sorted before the value `egg-21`. Defaults to `false`.
128+
* `variableTop` - Single character or contraction. Controls what is variable for `alternate`.
129+
* `hiraganaQuaternaryMode` - Possible values: `true` or `false`. Defaults to `false`. Whether to distinguish between Katakana
130+
and Hiragana characters in `quaternary` strength.
106131

107132
ICU Tokenizer
108133
-------------

pom.xml

+10
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,16 @@
6868
<artifactId>testng</artifactId>
6969
<version>6.8</version>
7070
<scope>test</scope>
71+
<exclusions>
72+
<exclusion>
73+
<groupId>org.hamcrest</groupId>
74+
<artifactId>hamcrest-core</artifactId>
75+
</exclusion>
76+
<exclusion>
77+
<groupId>junit</groupId>
78+
<artifactId>junit</artifactId>
79+
</exclusion>
80+
</exclusions>
7181
</dependency>
7282

7383
<dependency>

src/main/java/org/elasticsearch/index/analysis/IcuCollationTokenFilterFactory.java

+75-2
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,6 @@
4545
* <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
4646
* Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
4747
* in the settings or refer to an external location (preferable located under the <tt>config</tt> location, relative to it).
48-
*
49-
*
5048
*/
5149
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
5250

@@ -96,6 +94,81 @@ public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings index
9694
collator = Collator.getInstance();
9795
}
9896
}
97+
98+
// set the strength flag, otherwise it will be the default.
99+
String strength = settings.get("strength");
100+
if (strength != null) {
101+
if (strength.equalsIgnoreCase("primary")) {
102+
collator.setStrength(Collator.PRIMARY);
103+
} else if (strength.equalsIgnoreCase("secondary")) {
104+
collator.setStrength(Collator.SECONDARY);
105+
} else if (strength.equalsIgnoreCase("tertiary")) {
106+
collator.setStrength(Collator.TERTIARY);
107+
} else if (strength.equalsIgnoreCase("quaternary")) {
108+
collator.setStrength(Collator.QUATERNARY);
109+
} else if (strength.equalsIgnoreCase("identical")) {
110+
collator.setStrength(Collator.IDENTICAL);
111+
} else {
112+
throw new ElasticSearchIllegalArgumentException("Invalid strength: " + strength);
113+
}
114+
}
115+
116+
// set the decomposition flag, otherwise it will be the default.
117+
String decomposition = settings.get("decomposition");
118+
if (decomposition != null) {
119+
if (decomposition.equalsIgnoreCase("no")) {
120+
collator.setDecomposition(Collator.NO_DECOMPOSITION);
121+
} else if (decomposition.equalsIgnoreCase("canonical")) {
122+
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
123+
} else {
124+
throw new ElasticSearchIllegalArgumentException("Invalid decomposition: " + decomposition);
125+
}
126+
}
127+
128+
// expert options: concrete subclasses are always a RuleBasedCollator
129+
RuleBasedCollator rbc = (RuleBasedCollator) collator;
130+
String alternate = settings.get("alternate");
131+
if (alternate != null) {
132+
if (alternate.equalsIgnoreCase("shifted")) {
133+
rbc.setAlternateHandlingShifted(true);
134+
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
135+
rbc.setAlternateHandlingShifted(false);
136+
} else {
137+
throw new ElasticSearchIllegalArgumentException("Invalid alternate: " + alternate);
138+
}
139+
}
140+
141+
Boolean caseLevel = settings.getAsBoolean("caseLevel", null);
142+
if (caseLevel != null) {
143+
rbc.setCaseLevel(caseLevel);
144+
}
145+
146+
String caseFirst = settings.get("caseFirst");
147+
if (caseFirst != null) {
148+
if (caseFirst.equalsIgnoreCase("lower")) {
149+
rbc.setLowerCaseFirst(true);
150+
} else if (caseFirst.equalsIgnoreCase("upper")) {
151+
rbc.setUpperCaseFirst(true);
152+
} else {
153+
throw new ElasticSearchIllegalArgumentException("Invalid caseFirst: " + caseFirst);
154+
}
155+
}
156+
157+
Boolean numeric = settings.getAsBoolean("numeric", null);
158+
if (numeric != null) {
159+
rbc.setNumericCollation(numeric);
160+
}
161+
162+
String variableTop = settings.get("variableTop");
163+
if (variableTop != null) {
164+
rbc.setVariableTop(variableTop);
165+
}
166+
167+
Boolean hiraganaQuaternaryMode = settings.getAsBoolean("hiraganaQuaternaryMode", null);
168+
if (hiraganaQuaternaryMode != null) {
169+
rbc.setHiraganaQuaternary(hiraganaQuaternaryMode);
170+
}
171+
99172
this.collator = collator;
100173
}
101174

src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@
2929
import org.elasticsearch.index.settings.IndexSettingsModule;
3030
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
3131
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
32-
import org.hamcrest.MatcherAssert;
3332
import org.testng.annotations.Test;
3433

3534
import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
35+
import static org.hamcrest.MatcherAssert.assertThat;
3636
import static org.hamcrest.Matchers.instanceOf;
3737

3838
/**
@@ -53,18 +53,18 @@ public void testDefaultsIcuAnalysis() {
5353
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
5454

5555
TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
56-
MatcherAssert.assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
56+
assertThat(tokenizerFactory, instanceOf(IcuTokenizerFactory.class));
5757

5858
TokenFilterFactory filterFactory = analysisService.tokenFilter("icu_normalizer");
59-
MatcherAssert.assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
59+
assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
6060

6161
filterFactory = analysisService.tokenFilter("icu_folding");
62-
MatcherAssert.assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
62+
assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
6363

6464
filterFactory = analysisService.tokenFilter("icu_collation");
65-
MatcherAssert.assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
65+
assertThat(filterFactory, instanceOf(IcuCollationTokenFilterFactory.class));
6666

6767
filterFactory = analysisService.tokenFilter("icu_transform");
68-
MatcherAssert.assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
68+
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
6969
}
7070
}

0 commit comments

Comments
 (0)