Skip to content

Commit 14f540e

Browse files
Ke Li, romseygeek
Ke Li
authored and committed
Deprecate unicodeSetFilter in favour of unicode_set_filter (#29215)
1 parent b665984 commit 14f540e

File tree

5 files changed

+77
-9
lines changed

5 files changed

+77
-9
lines changed

docs/plugins/analysis-icu.asciidoc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ normalization can be specified with the `name` parameter, which accepts `nfc`,
3838
convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively:
3939

4040
Which letters are normalized can be controlled by specifying the
41-
`unicodeSetFilter` parameter, which accepts a
41+
`unicode_set_filter` parameter, which accepts a
4242
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
4343

4444
Here are two examples, the default usage and a customised character filter:
@@ -194,7 +194,7 @@ with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf`
194194
(default).
195195

196196
Which letters are normalized can be controlled by specifying the
197-
`unicodeSetFilter` parameter, which accepts a
197+
`unicode_set_filter` parameter, which accepts a
198198
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
199199

200200
You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>.
@@ -273,7 +273,7 @@ The ICU folding token filter already does Unicode normalization, so there is
273273
no need to use Normalize character or token filter as well.
274274

275275
Which letters are folded can be controlled by specifying the
276-
`unicodeSetFilter` parameter, which accepts a
276+
`unicode_set_filter` parameter, which accepts a
277277
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].
278278

279279
The following example exempts Swedish characters from folding. It is important
@@ -300,7 +300,7 @@ PUT icu_sample
300300
"filter": {
301301
"swedish_folding": {
302302
"type": "icu_folding",
303-
"unicodeSetFilter": "[^åäöÅÄÖ]"
303+
"unicode_set_filter": "[^åäöÅÄÖ]"
304304
}
305305
}
306306
}

plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory imp
5050

5151
public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
5252
super(indexSettings, name, settings);
53-
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(ICU_FOLDING_NORMALIZER, settings);
53+
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, ICU_FOLDING_NORMALIZER, settings);
5454
}
5555

5656
@Override

plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment e
4949
}
5050
Normalizer2 normalizer = Normalizer2.getInstance(
5151
null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
52-
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(normalizer, settings);
52+
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
5353
}
5454

5555
@Override

plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323
import com.ibm.icu.text.Normalizer2;
2424
import com.ibm.icu.text.UnicodeSet;
2525

26+
import org.apache.logging.log4j.LogManager;
2627
import org.apache.lucene.analysis.TokenStream;
28+
import org.elasticsearch.Version;
29+
import org.elasticsearch.common.logging.DeprecationLogger;
2730
import org.elasticsearch.common.settings.Settings;
2831
import org.elasticsearch.env.Environment;
2932
import org.elasticsearch.index.IndexSettings;
@@ -35,14 +38,15 @@
3538
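* <p>The {@code unicode_set_filter} attribute can be used to provide the UnicodeSet for filtering on indices created on or after 7.0.0; the older {@code unicodeSetFilter} spelling is still accepted but deprecated.</p>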
3639
*/
3740
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
38-
41+
private final static DeprecationLogger deprecationLogger =
42+
new DeprecationLogger(LogManager.getLogger(IcuNormalizerTokenFilterFactory.class));
3943
private final Normalizer2 normalizer;
4044

4145
public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
4246
super(indexSettings, name, settings);
4347
String method = settings.get("name", "nfkc_cf");
4448
Normalizer2 normalizer = Normalizer2.getInstance(null, method, Normalizer2.Mode.COMPOSE);
45-
this.normalizer = wrapWithUnicodeSetFilter(normalizer, settings);
49+
this.normalizer = wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
4650
}
4751

4852
@Override
@@ -55,8 +59,17 @@ public Object getMultiTermComponent() {
5559
return this;
5660
}
5761

58-
static Normalizer2 wrapWithUnicodeSetFilter(final Normalizer2 normalizer, Settings settings) {
62+
static Normalizer2 wrapWithUnicodeSetFilter(final IndexSettings indexSettings,
63+
final Normalizer2 normalizer,
64+
final Settings settings) {
5965
String unicodeSetFilter = settings.get("unicodeSetFilter");
66+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
67+
if (unicodeSetFilter != null) {
68+
deprecationLogger.deprecated("[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]");
69+
} else {
70+
unicodeSetFilter = settings.get("unicode_set_filter");
71+
}
72+
}
6073
if (unicodeSetFilter != null) {
6174
UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);
6275

plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,61 @@
4848
---
4949
"Normalization with a UnicodeSet Filter":
5050
- do:
51+
indices.create:
52+
index: test
53+
body:
54+
settings:
55+
index:
56+
analysis:
57+
char_filter:
58+
charfilter_icu_normalizer:
59+
type: icu_normalizer
60+
unicode_set_filter: "[^ß]"
61+
filter:
62+
tokenfilter_icu_normalizer:
63+
type: icu_normalizer
64+
unicode_set_filter: "[^ßB]"
65+
tokenfilter_icu_folding:
66+
type: icu_folding
67+
unicode_set_filter: "[^â]"
68+
- do:
69+
indices.analyze:
70+
index: test
71+
body:
72+
char_filter: ["charfilter_icu_normalizer"]
73+
tokenizer: keyword
74+
text: charfilter Föo Bâr Ruß
75+
- length: { tokens: 1 }
76+
- match: { tokens.0.token: charfilter föo bâr ruß }
77+
- do:
78+
indices.analyze:
79+
index: test
80+
body:
81+
tokenizer: keyword
82+
filter: ["tokenfilter_icu_normalizer"]
83+
text: tokenfilter Föo Bâr Ruß
84+
- length: { tokens: 1 }
85+
- match: { tokens.0.token: tokenfilter föo Bâr ruß }
86+
- do:
87+
indices.analyze:
88+
index: test
89+
body:
90+
tokenizer: keyword
91+
filter: ["tokenfilter_icu_folding"]
92+
text: icufolding Föo Bâr Ruß
93+
- length: { tokens: 1 }
94+
- match: { tokens.0.token: icufolding foo bâr russ }
95+
96+
---
97+
"Normalization with a CamcelCase UnicodeSet Filter":
98+
- skip:
99+
version: " - 6.99.99"
100+
reason: unicodeSetFilter deprecated in 7.0.0, replaced by unicode_set_filter
101+
features: "warnings"
102+
103+
- do:
104+
warnings:
105+
- "[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]"
51106
indices.create:
52107
index: test
53108
body:

0 commit comments

Comments
 (0)