@@ -46,25 +46,34 @@ public void testParseTokenChars() {
46
46
final Index index = new Index ("test" , "_na_" );
47
47
final String name = "ngr" ;
48
48
final Settings indexSettings = newAnalysisSettingsBuilder ().build ();
49
- IndexSettings indexProperties = IndexSettingsModule .newIndexSettings (index , indexSettings );
50
- for (String tokenChars : Arrays .asList ("letters " , "number " , "DIRECTIONALITY_UNDEFINED " )) {
49
+ final IndexSettings indexProperties = IndexSettingsModule .newIndexSettings (index , indexSettings );
50
+ for (String tokenChars : Arrays .asList ("letter " , " digit " , "punctuation" , "DIGIT" , "CoNtRoL" , "dash_punctuation " )) {
51
51
final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 )
52
52
.put ("token_chars" , tokenChars ).build ();
53
- try {
54
- new NGramTokenizerFactory (indexProperties , null , name , settings ).create ();
55
- fail ();
56
- } catch (IllegalArgumentException expected ) {
57
- // OK
58
- }
53
+ new NGramTokenizerFactory (indexProperties , null , name , settings ).create ();
54
+ // no exception
59
55
}
60
- for ( String tokenChars : Arrays . asList ( "letter" , " digit " , "punctuation" , "DIGIT" , "CoNtRoL" , "dash_punctuation" )) {
56
+ {
61
57
final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 )
62
- .put ("token_chars" , tokenChars ).build ();
63
- indexProperties = IndexSettingsModule .newIndexSettings (index , indexSettings );
64
-
58
+ .put ("token_chars" , "DIRECTIONALITY_UNDEFINED" ).build ();
59
+ IllegalArgumentException ex = expectThrows (IllegalArgumentException .class ,
60
+ () -> new NGramTokenizerFactory (indexProperties , null , name , settings ).create ());
61
+ assertEquals ("Unknown token type: 'directionality_undefined'" , ex .getMessage ().substring (0 , 46 ));
62
+ assertTrue (ex .getMessage ().contains ("custom" ));
63
+ }
64
+ {
65
+ final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 ).put ("token_chars" , "custom" )
66
+ .put ("custom_token_chars" , "_-" ).build ();
65
67
new NGramTokenizerFactory (indexProperties , null , name , settings ).create ();
66
68
// no exception
67
69
}
70
+ {
71
+ final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 ).put ("token_chars" , "custom" )
72
+ .build ();
73
+ IllegalArgumentException ex = expectThrows (IllegalArgumentException .class ,
74
+ () -> new NGramTokenizerFactory (indexProperties , null , name , settings ).create ());
75
+ assertEquals ("Token type: 'custom' requires setting `custom_token_chars`" , ex .getMessage ());
76
+ }
68
77
}
69
78
70
79
public void testNoTokenChars () throws IOException {
@@ -80,6 +89,19 @@ public void testNoTokenChars() throws IOException {
80
89
assertTokenStreamContents (tokenizer , new String [] {"1." , "1.3" , "1.34" , ".3" , ".34" , "34" });
81
90
}
82
91
92
+ public void testCustomTokenChars () throws IOException { // Checks n-gram output when token_chars combines "letter" with "custom".
93
+ final Index index = new Index ("test" , "_na_" );
94
+ final String name = "ngr" ;
95
+ final Settings indexSettings = newAnalysisSettingsBuilder ().put (IndexSettings .MAX_NGRAM_DIFF_SETTING .getKey (), 2 ).build (); // index-level settings; min_gram/max_gram below differ by 1
96
+ // tokenizer-level settings: grams of length 2..3
97
+ final Settings settings = newAnalysisSettingsBuilder ().put ("min_gram" , 2 ).put ("max_gram" , 3 )
98
+ .putList ("token_chars" , "letter" , "custom" ).put ("custom_token_chars" ,"_-" ).build (); // "custom" is paired with custom_token_chars, here '_' and '-'
99
+ Tokenizer tokenizer = new NGramTokenizerFactory (IndexSettingsModule .newIndexSettings (index , indexSettings ), null , name , settings )
100
+ .create ();
101
+ tokenizer .setReader (new StringReader ("Abc -gh _jk =lm" )); // input mixes letters, '-', '_', '=' and spaces
102
+ assertTokenStreamContents (tokenizer , new String [] {"Ab" , "Abc" , "bc" , "-g" , "-gh" , "gh" , "_j" , "_jk" , "jk" , "lm" }); // '-' and '_' appear inside grams; no gram contains '=' or a space
103
+ }
104
+
83
105
public void testPreTokenization () throws IOException {
84
106
// Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
85
107
final Index index = new Index ("test" , "_na_" );
0 commit comments