Skip to content

Commit d88b057

Browse files
Mpdreamz authored and russcam committed
Add support for the char_group tokenizer (#3427)
(cherry picked from commit 9ab4384)
1 parent 89f9e5d commit d88b057

File tree

4 files changed

+75
-0
lines changed

4 files changed

+75
-0
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
using System.Collections.Generic;
2+
using Newtonsoft.Json;
3+
4+
namespace Nest
5+
{
6+
/// <summary>
7+
/// A tokenizer that breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful
8+
/// for cases where a simple custom tokenization is desired, and the overhead of using <see cref="PatternTokenizer"/> is not acceptable.
9+
/// </summary>
10+
public interface ICharGroupTokenizer : ITokenizer
11+
{
12+
/// <summary>
13+
/// A list of characters to tokenize the string on. Whenever a character from this list is encountered, a
14+
/// new token is started. This accepts either single characters, e.g. <c>-</c>, or character groups: whitespace, letter, digit,
15+
/// punctuation, symbol.
16+
/// </summary>
17+
[JsonProperty("tokenize_on_chars")]
18+
IEnumerable<string> TokenizeOnCharacters { get; set; }
19+
}
20+
21+
/// <inheritdoc cref="ICharGroupTokenizer"/>
22+
public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
23+
{
24+
internal const string TokenizerType = "char_group";
25+
26+
public CharGroupTokenizer() => this.Type = TokenizerType;
27+
28+
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
29+
public IEnumerable<string> TokenizeOnCharacters { get; set; }
30+
}
31+
32+
/// <inheritdoc cref="ICharGroupTokenizer"/>
33+
public class CharGroupTokenizerDescriptor
34+
: TokenizerDescriptorBase<CharGroupTokenizerDescriptor, ICharGroupTokenizer>, ICharGroupTokenizer
35+
{
36+
protected override string Type => CharGroupTokenizer.TokenizerType;
37+
38+
IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }
39+
40+
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
41+
public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
42+
Assign(a => a.TokenizeOnCharacters = characters);
43+
44+
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
45+
public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters) =>
46+
Assign(a => a.TokenizeOnCharacters = characters);
47+
}
48+
}

src/Nest/Analysis/Tokenizers/Tokenizers.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,5 +112,9 @@ public TokenizersDescriptor Kuromoji(string name, Func<KuromojiTokenizerDescript
112112
/// </summary>
113113
public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
114114
Assign(name, selector?.Invoke(new IcuTokenizerDescriptor()));
115+
116+
/// <inheritdoc cref="ICharGroupTokenizer"/>
117+
public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
118+
Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
115119
}
116120
}

src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,5 +91,8 @@ public ITokenizer Kuromoji(Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer>
9191
/// </summary>
9292
public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
9393
(selector?.Invoke(new IcuTokenizerDescriptor()));
94+
95+
/// <inheritdoc cref="ICharGroupTokenizer"/>
96+
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) => selector?.Invoke(new CharGroupTokenizerDescriptor());
9497
}
9598
}

src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,5 +203,25 @@ public class StandardTests : TokenizerAssertionBase<StandardTests>
203203

204204
public override object Json => new {type = "standard"};
205205
}
206+
207+
public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
208+
{
209+
private readonly string[] _chars = {"whitespace", "-", "\n"};
210+
// Name under which this fixture registers its tokenizer; kept in line with the "char_group" type asserted in Json.
public override string Name => "char_group";
211+
public override ITokenizer Initializer => new CharGroupTokenizer
212+
{
213+
TokenizeOnCharacters = _chars
214+
};
215+
216+
public override FuncTokenizer Fluent => (n, t) => t.CharGroup(n, e => e
217+
.TokenizeOnCharacters(_chars)
218+
);
219+
220+
public override object Json => new
221+
{
222+
tokenize_on_chars = _chars,
223+
type = "char_group"
224+
};
225+
}
206226
}
207227
}

0 commit comments

Comments
 (0)