
Commit 2615df9

Support simple_pattern and simple_pattern_split tokenizers (#5789) (#5791)
* Support simple pattern tokenizer
* Remove incorrect serializer attribute
* Support simple_pattern_split tokenizer

Co-authored-by: Steve Gordon <[email protected]>
1 parent 27e1a18 commit 2615df9
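For orientation: the two tokenizers added here correspond to Elasticsearch's simple_pattern and simple_pattern_split tokenizers. simple_pattern emits the text that matches its Lucene regular expression as terms, while simple_pattern_split splits the input into terms at pattern matches. A sketch of the difference (the inputs and patterns below are illustrative, drawn from the Elasticsearch reference, and are not part of this commit):

// simple_pattern with pattern "[0123456789]{3}"
//   input "fd-786-335-514-x" -> terms "786", "335", "514" (the matches themselves)

// simple_pattern_split with pattern "_"
//   input "an_underscored_phrase" -> terms "an", "underscored", "phrase" (the text between matches)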

5 files changed (+151 −103 lines)
src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs

+41

@@ -0,0 +1,41 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Runtime.Serialization;
+
+namespace Nest
+{
+    /// <summary>
+    /// The simple_pattern_split tokenizer uses a regular expression to split the input into terms at pattern matches.
+    /// </summary>
+    public interface ISimplePatternSplitTokenizer : ITokenizer
+    {
+        /// <summary>
+        /// Lucene regular expression, defaults to the empty string.
+        /// </summary>
+        [DataMember(Name = "pattern")]
+        string Pattern { get; set; }
+    }
+
+    /// <inheritdoc />
+    public class SimplePatternSplitTokenizer : TokenizerBase, ISimplePatternSplitTokenizer
+    {
+        public SimplePatternSplitTokenizer() => Type = "simple_pattern_split";
+
+        /// <inheritdoc />
+        public string Pattern { get; set; }
+    }
+
+    /// <inheritdoc />
+    public class SimplePatternSplitTokenizerDescriptor
+        : TokenizerDescriptorBase<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer>, ISimplePatternSplitTokenizer
+    {
+        protected override string Type => "simple_pattern_split";
+
+        string ISimplePatternSplitTokenizer.Pattern { get; set; }
+
+        /// <inheritdoc cref="ISimplePatternSplitTokenizer.Pattern" />
+        public SimplePatternSplitTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
+    }
+}
src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs

+41

@@ -0,0 +1,41 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Runtime.Serialization;
+
+namespace Nest
+{
+    /// <summary>
+    /// The simple_pattern tokenizer uses a regular expression to capture matching text as terms.
+    /// </summary>
+    public interface ISimplePatternTokenizer : ITokenizer
+    {
+        /// <summary>
+        /// Lucene regular expression, defaults to the empty string.
+        /// </summary>
+        [DataMember(Name = "pattern")]
+        string Pattern { get; set; }
+    }
+
+    /// <inheritdoc />
+    public class SimplePatternTokenizer : TokenizerBase, ISimplePatternTokenizer
+    {
+        public SimplePatternTokenizer() => Type = "simple_pattern";
+
+        /// <inheritdoc />
+        public string Pattern { get; set; }
+    }
+
+    /// <inheritdoc />
+    public class SimplePatternTokenizerDescriptor
+        : TokenizerDescriptorBase<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer>, ISimplePatternTokenizer
+    {
+        protected override string Type => "simple_pattern";
+
+        string ISimplePatternTokenizer.Pattern { get; set; }
+
+        /// <inheritdoc cref="ISimplePatternTokenizer.Pattern" />
+        public SimplePatternTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
+    }
+}
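Both new types follow the established NEST tokenizer pattern: an interface with a [DataMember]-annotated property, a concrete class that sets Type in its constructor, and a descriptor for the fluent API. A minimal sketch of the object-initializer form and the JSON it serializes to (patterns are illustrative; the JSON shape matches the test assertions later in this commit):

var simplePattern = new SimplePatternTokenizer { Pattern = "[0-9]{3}" };
// serializes as { "type": "simple_pattern", "pattern": "[0-9]{3}" }

var simplePatternSplit = new SimplePatternSplitTokenizer { Pattern = "_" };
// serializes as { "type": "simple_pattern_split", "pattern": "_" }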

src/Nest/Analysis/Tokenizers/Tokenizers.cs

+8
@@ -132,5 +132,13 @@ public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INor
     /// >
     public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
         Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
+
+    /// <inheritdoc cref="ISimplePatternTokenizer" />
+    public TokenizersDescriptor SimplePattern(string name, Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+        Assign(name, selector?.Invoke(new SimplePatternTokenizerDescriptor()));
+
+    /// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+    public TokenizersDescriptor SimplePatternSplit(string name, Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+        Assign(name, selector?.Invoke(new SimplePatternSplitTokenizerDescriptor()));
 }
}
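For context, a sketch of how the two new TokenizersDescriptor methods would be called when declaring analysis settings at index creation (assumes an existing ElasticClient instance named client; the index name, tokenizer names, and patterns are illustrative, not part of this commit):

var createIndexResponse = client.Indices.Create("my-index", c => c
    .Settings(s => s
        .Analysis(a => a
            .Tokenizers(t => t
                // capture runs of three digits as terms
                .SimplePattern("three_digits", sp => sp.Pattern("[0-9]{3}"))
                // split the input on underscores
                .SimplePatternSplit("on_underscores", sps => sps.Pattern("_"))
            )
        )
    )
);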

src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs

+8 −1
@@ -104,8 +104,15 @@ public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =
         selector.Invoke(new NoriTokenizerDescriptor());

     /// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
-    /// >
     public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
         selector?.Invoke(new CharGroupTokenizerDescriptor());
+
+    /// <inheritdoc cref="ISimplePatternTokenizer" />
+    public ITokenizer SimplePattern(Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+        selector?.Invoke(new SimplePatternTokenizerDescriptor());
+
+    /// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+    public ITokenizer SimplePatternSplit(Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+        selector?.Invoke(new SimplePatternSplitTokenizerDescriptor());
 }
}
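Correspondingly, a sketch of exercising one of the new tokenizers ad hoc through the _analyze API (again assuming an ElasticClient instance named client; the text and pattern are illustrative):

var analyzeResponse = client.Indices.Analyze(a => a
    .Tokenizer(t => t.SimplePattern(sp => sp.Pattern("[0-9]{3}")))
    .Text("fd-786-335-514-x")
);
// analyzeResponse.Tokens would be expected to yield "786", "335" and "514"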

tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

+53 −102
@@ -22,18 +22,10 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>

     public override ITokenizer Initializer => new EdgeNGramTokenizer
     {
-        MaxGram = 2,
-        MinGram = 1,
-        TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+        MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
     };

-    public override object Json => new
-    {
-        min_gram = 1,
-        max_gram = 2,
-        token_chars = new[] { "digit", "letter" },
-        type = "edge_ngram"
-    };
+    public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "edge_ngram" };

     public override string Name => "endgen";
 }
@@ -50,10 +42,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCu

     public override ITokenizer Initializer => new EdgeNGramTokenizer
     {
-        MaxGram = 2,
-        MinGram = 1,
-        TokenChars = new[] { TokenChar.Custom },
-        CustomTokenChars = "+-_"
+        MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
     };

     public override object Json => new
@@ -62,7 +51,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCu
         max_gram = 2,
         token_chars = new[] { "custom" },
         custom_token_chars = "+-_",
-        type = "edge_ngram"
+        type = "edge_ngram"
     };

     public override string Name => "endgen_custom";
@@ -78,18 +67,10 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>

     public override ITokenizer Initializer => new NGramTokenizer
     {
-        MaxGram = 2,
-        MinGram = 1,
-        TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+        MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
     };

-    public override object Json => new
-    {
-        min_gram = 1,
-        max_gram = 2,
-        token_chars = new[] { "digit", "letter" },
-        type = "ngram"
-    };
+    public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "ngram" };

     public override string Name => "ng";
 }
@@ -106,10 +87,7 @@ public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomToke

     public override ITokenizer Initializer => new NGramTokenizer
     {
-        MaxGram = 2,
-        MinGram = 1,
-        TokenChars = new[] { TokenChar.Custom },
-        CustomTokenChars = "+-_"
+        MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
     };

     public override object Json => new
@@ -164,16 +142,9 @@ public class IcuTests : TokenizerAssertionBase<IcuTests>
         .RuleFiles(RuleFiles)
     );

-    public override ITokenizer Initializer => new IcuTokenizer
-    {
-        RuleFiles = RuleFiles,
-    };
+    public override ITokenizer Initializer => new IcuTokenizer { RuleFiles = RuleFiles, };

-    public override object Json => new
-    {
-        rule_files = RuleFiles,
-        type = "icu_tokenizer"
-    };
+    public override object Json => new { rule_files = RuleFiles, type = "icu_tokenizer" };

     public override string Name => "icu";
 }
@@ -198,7 +169,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
         DiscardPunctuation = true,
         NBestExamples = Example,
         NBestCost = 1000,
-        UserDictionaryRules = new [] { Inline }
+        UserDictionaryRules = new[] { Inline }
     };

     public override object Json => new
@@ -208,7 +179,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
         nbest_cost = 1000,
         nbest_examples = Example,
         type = "kuromoji_tokenizer",
-        user_dictionary_rules = new [] { Inline }
+        user_dictionary_rules = new[] { Inline }
     };

     public override string Name => "kuro";
@@ -228,18 +199,9 @@ public class KuromojiDiscardCompoundTokenTests : TokenizerAssertionBase<Kuromoji
         .DiscardCompoundToken()
     );

-    public override ITokenizer Initializer => new KuromojiTokenizer
-    {
-        Mode = KuromojiTokenizationMode.Search,
-        DiscardCompoundToken = true,
-    };
+    public override ITokenizer Initializer => new KuromojiTokenizer { Mode = KuromojiTokenizationMode.Search, DiscardCompoundToken = true, };

-    public override object Json => new
-    {
-        discard_compound_token = true,
-        mode = "search",
-        type = "kuromoji_tokenizer",
-    };
+    public override object Json => new { discard_compound_token = true, mode = "search", type = "kuromoji_tokenizer", };

     public override string Name => "kuro_discard_compound_token";
 }
@@ -252,11 +214,7 @@ public class UaxTests : TokenizerAssertionBase<UaxTests>

     public override ITokenizer Initializer => new UaxEmailUrlTokenizer { MaxTokenLength = 12 };

-    public override object Json => new
-    {
-        max_token_length = 12,
-        type = "uax_url_email"
-    };
+    public override object Json => new { max_token_length = 12, type = "uax_url_email" };

     public override string Name => "uax";
 }
@@ -269,20 +227,9 @@ public class PatternTests : TokenizerAssertionBase<PatternTests>
         .Pattern(@"\W+")
     );

-    public override ITokenizer Initializer => new PatternTokenizer
-    {
-        Flags = "CASE_INSENSITIVE",
-        Group = 1,
-        Pattern = @"\W+"
-    };
+    public override ITokenizer Initializer => new PatternTokenizer { Flags = "CASE_INSENSITIVE", Group = 1, Pattern = @"\W+" };

-    public override object Json => new
-    {
-        pattern = @"\W+",
-        flags = "CASE_INSENSITIVE",
-        group = 1,
-        type = "pattern"
-    };
+    public override object Json => new { pattern = @"\W+", flags = "CASE_INSENSITIVE", group = 1, type = "pattern" };

     public override string Name => "pat";
 }
@@ -312,10 +259,7 @@ public class NoriTests : TokenizerAssertionBase<NoriTests>
         .DecompoundMode(NoriDecompoundMode.Mixed)
     );

-    public override ITokenizer Initializer => new NoriTokenizer
-    {
-        DecompoundMode = NoriDecompoundMode.Mixed
-    };
+    public override ITokenizer Initializer => new NoriTokenizer { DecompoundMode = NoriDecompoundMode.Mixed };

     public override object Json => new { type = "nori_tokenizer", decompound_mode = "mixed" };
     public override string Name => "nori";
@@ -331,16 +275,14 @@ public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDi

     public override ITokenizer Initializer => new NoriTokenizer
     {
-        DecompoundMode = NoriDecompoundMode.Mixed,
-        UserDictionaryRules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+        DecompoundMode = NoriDecompoundMode.Mixed, UserDictionaryRules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
     };

     public override object Json => new
     {
-        type = "nori_tokenizer",
-        decompound_mode = "mixed",
-        user_dictionary_rules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+        type = "nori_tokenizer", decompound_mode = "mixed", user_dictionary_rules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
     };
+
     public override string Name => "nori_userdictionary";
 }

@@ -353,16 +295,9 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
         .TokenizeOnCharacters(_chars)
     );

-    public override ITokenizer Initializer => new CharGroupTokenizer
-    {
-        TokenizeOnCharacters = _chars
-    };
+    public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars };

-    public override object Json => new
-    {
-        tokenize_on_chars = _chars,
-        type = "char_group"
-    };
+    public override object Json => new { tokenize_on_chars = _chars, type = "char_group" };

     public override string Name => "char_group";
 }
@@ -377,18 +312,9 @@ public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase<CharGroupMaxT
         .MaxTokenLength(255)
     );

-    public override ITokenizer Initializer => new CharGroupTokenizer
-    {
-        TokenizeOnCharacters = _chars,
-        MaxTokenLength = 255
-    };
+    public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars, MaxTokenLength = 255 };

-    public override object Json => new
-    {
-        tokenize_on_chars = _chars,
-        type = "char_group",
-        max_token_length = 255
-    };
+    public override object Json => new { tokenize_on_chars = _chars, type = "char_group", max_token_length = 255 };

     public override string Name => "char_group_max_token_length";
 }
@@ -400,13 +326,38 @@ public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuation
         .DiscardPunctuation()
     );

-    public override ITokenizer Initializer => new NoriTokenizer
-    {
-        DiscardPunctuation = true
-    };
+    public override ITokenizer Initializer => new NoriTokenizer { DiscardPunctuation = true };

     public override object Json => new { type = "nori_tokenizer", discard_punctuation = true };
     public override string Name => "nori-discard";
 }
+
+[SkipVersion("<7.7.0", "simple_pattern experimental until 7.7.0")]
+public class SimplePatternTests : TokenizerAssertionBase<SimplePatternTests>
+{
+    public override FuncTokenizer Fluent => (n, t) => t.SimplePattern(n, e => e
+        .Pattern(@"\W+")
+    );
+
+    public override ITokenizer Initializer => new SimplePatternTokenizer { Pattern = @"\W+" };
+
+    public override object Json => new { pattern = @"\W+", type = "simple_pattern" };
+
+    public override string Name => "simple-pattern";
+}
+
+[SkipVersion("<7.7.0", "simple_pattern_split experimental until 7.7.0")]
+public class SimplePatternSplitTests : TokenizerAssertionBase<SimplePatternSplitTests>
+{
+    public override FuncTokenizer Fluent => (n, t) => t.SimplePatternSplit(n, e => e
+        .Pattern(@"\W+")
+    );
+
+    public override ITokenizer Initializer => new SimplePatternSplitTokenizer { Pattern = @"\W+" };
+
+    public override object Json => new { pattern = @"\W+", type = "simple_pattern_split" };
+
+    public override string Name => "simple-pattern-split";
+}
 }
}
