@@ -46,25 +46,33 @@ public void testParseTokenChars() {
46
46
final Index index = new Index ("test" , "_na_" );
47
47
final String name = "ngr" ;
48
48
final Settings indexSettings = newAnalysisSettingsBuilder ().build ();
49
- IndexSettings indexProperties = IndexSettingsModule .newIndexSettings (index , indexSettings );
50
- for (String tokenChars : Arrays .asList ("letters " , "number " , "DIRECTIONALITY_UNDEFINED " )) {
49
+ final IndexSettings indexProperties = IndexSettingsModule .newIndexSettings (index , indexSettings );
50
+ for (String tokenChars : Arrays .asList ("letter " , " digit " , "punctuation" , "DIGIT" , "CoNtRoL" , "dash_punctuation " )) {
51
51
final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 )
52
52
.put ("token_chars" , tokenChars ).build ();
53
- try {
54
- new NGramTokenizerFactory (indexProperties , null , name , settings ).create ();
55
- fail ();
56
- } catch (IllegalArgumentException expected ) {
57
- // OK
58
- }
53
+ new NGramTokenizerFactory (indexProperties , null , name , settings ).create ();
54
+ // no exception
59
55
}
60
- for ( String tokenChars : Arrays . asList ( "letter" , " digit " , "punctuation" , "DIGIT" , "CoNtRoL" , "dash_punctuation" )) {
56
+ {
61
57
final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 )
62
- .put ("token_chars" , tokenChars ).build ();
63
- indexProperties = IndexSettingsModule .newIndexSettings (index , indexSettings );
64
-
58
+ .put ("token_chars" , "DIRECTIONALITY_UNDEFINED" ).build ();
59
+ IllegalArgumentException ex = expectThrows (IllegalArgumentException .class ,
60
+ () -> new NGramTokenizerFactory (indexProperties , null , name , settings ).create ());
61
+ assertEquals ("Unknown token type: 'directionality_undefined'" , ex .getMessage ().substring (0 , 46 ));
62
+ }
63
+ {
64
+ final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 ).put ("token_chars" , "custom" )
65
+ .put ("custom_token_chars" , "_-" ).build ();
65
66
new NGramTokenizerFactory (indexProperties , null , name , settings ).create ();
66
67
// no exception
67
68
}
69
+ {
70
+ final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 ).put ("token_chars" , "custom" )
71
+ .build ();
72
+ IllegalArgumentException ex = expectThrows (IllegalArgumentException .class ,
73
+ () -> new NGramTokenizerFactory (indexProperties , null , name , settings ).create ());
74
+ assertEquals ("Token type: 'custom' requires setting `custom_token_chars`" , ex .getMessage ());
75
+ }
68
76
}
69
77
70
78
public void testNoTokenChars () throws IOException {
@@ -80,6 +88,19 @@ public void testNoTokenChars() throws IOException {
80
88
assertTokenStreamContents (tokenizer , new String [] {"1." , "1.3" , "1.34" , ".3" , ".34" , "34" });
81
89
}
82
90
91
+ public void testCustomTokenChars () throws IOException {
92
+ final Index index = new Index ("test" , "_na_" );
93
+ final String name = "ngr" ;
94
+ final Settings indexSettings = newAnalysisSettingsBuilder ().put (IndexSettings .MAX_NGRAM_DIFF_SETTING .getKey (), 2 ).build ();
95
+
96
+ final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 )
97
+ .putList ("token_chars" , "letter" , "custom" ).put ("custom_token_chars" ,"_-" ).build ();
98
+ Tokenizer tokenizer = new NGramTokenizerFactory (IndexSettingsModule .newIndexSettings (index , indexSettings ), null , name , settings )
99
+ .create ();
100
+ tokenizer .setReader (new StringReader ("Abc -gh _jk =lm" ));
101
+ assertTokenStreamContents (tokenizer , new String [] {"Ab" , "Abc" , "bc" , "-g" , "-gh" , "gh" , "_j" , "_jk" , "jk" , "lm" });
102
+ }
103
+
83
104
public void testPreTokenization () throws IOException {
84
105
// Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
85
106
final Index index = new Index ("test" , "_na_" );
0 commit comments