Skip to content

Commit d0ef9fe

Browse files
committed
Support custom token chars in (edge)ngram tokenizer (#4384)
Relates: #4341 This commit adds support for custom token characters for edgengram and ngram tokenizers. (cherry picked from commit 38ec44d)
1 parent f27d114 commit d0ef9fe

File tree

4 files changed

+118
-10
lines changed

4 files changed

+118
-10
lines changed

src/Nest/Analysis/Tokenizers/NGram/EdgeNGramTokenizer.cs

+25-5
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,18 @@ public interface IEdgeNGramTokenizer : ITokenizer
2929
/// </summary>
3030
[DataMember(Name ="token_chars")]
3131
IEnumerable<TokenChar> TokenChars { get; set; }
32+
33+
/// <summary>
34+
/// Custom characters that should be treated as part of a token. For example,
35+
/// setting this to +-_ will make the tokenizer treat the plus, minus and
36+
/// underscore sign as part of a token.
37+
/// <para />
38+
/// Requires setting <see cref="TokenChar.Custom"/> as part of <see cref="TokenChars"/>
39+
/// <para />
40+
/// Available in Elasticsearch 7.6.0+.
41+
/// </summary>
42+
[DataMember(Name = "custom_token_chars")]
43+
string CustomTokenChars { get; set; }
3244
}
3345

3446
/// <inheritdoc />
@@ -44,6 +56,9 @@ public class EdgeNGramTokenizer : TokenizerBase, IEdgeNGramTokenizer
4456

4557
/// <inheritdoc />
4658
public IEnumerable<TokenChar> TokenChars { get; set; }
59+
60+
/// <inheritdoc />
61+
public string CustomTokenChars { get; set; }
4762
}
4863

4964
/// <inheritdoc />
@@ -52,22 +67,27 @@ public class EdgeNGramTokenizerDescriptor
5267
{
5368
protected override string Type => "edge_ngram";
5469
int? IEdgeNGramTokenizer.MaxGram { get; set; }
55-
5670
int? IEdgeNGramTokenizer.MinGram { get; set; }
5771
IEnumerable<TokenChar> IEdgeNGramTokenizer.TokenChars { get; set; }
5872

59-
/// <inheritdoc />
73+
string IEdgeNGramTokenizer.CustomTokenChars { get; set; }
74+
75+
/// <inheritdoc cref="IEdgeNGramTokenizer.MinGram" />
6076
public EdgeNGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);
6177

62-
/// <inheritdoc />
78+
/// <inheritdoc cref="IEdgeNGramTokenizer.MaxGram" />
6379
public EdgeNGramTokenizerDescriptor MaxGram(int? maxGram) => Assign(maxGram, (a, v) => a.MaxGram = v);
6480

65-
/// <inheritdoc />
81+
/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
6682
public EdgeNGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
6783
Assign(tokenChars, (a, v) => a.TokenChars = v);
6884

69-
/// <inheritdoc />
85+
/// <inheritdoc cref="IEdgeNGramTokenizer.TokenChars" />
7086
public EdgeNGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) =>
7187
Assign(tokenChars, (a, v) => a.TokenChars = v);
88+
89+
/// <inheritdoc cref="IEdgeNGramTokenizer.CustomTokenChars" />
90+
public EdgeNGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
91+
Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
7292
}
7393
}

src/Nest/Analysis/Tokenizers/NGram/NGramTokenizer.cs

+25-5
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,18 @@ public interface INGramTokenizer : ITokenizer
2929
/// </summary>
3030
[DataMember(Name ="token_chars")]
3131
IEnumerable<TokenChar> TokenChars { get; set; }
32+
33+
/// <summary>
34+
/// Custom characters that should be treated as part of a token. For example,
35+
/// setting this to +-_ will make the tokenizer treat the plus, minus and
36+
/// underscore sign as part of a token.
37+
/// <para />
38+
/// Requires setting <see cref="TokenChar.Custom"/> as part of <see cref="TokenChars"/>
39+
/// <para />
40+
/// Available in Elasticsearch 7.6.0+.
41+
/// </summary>
42+
[DataMember(Name = "custom_token_chars")]
43+
string CustomTokenChars { get; set; }
3244
}
3345

3446
/// <inheritdoc />
@@ -44,6 +56,9 @@ public class NGramTokenizer : TokenizerBase, INGramTokenizer
4456

4557
/// <inheritdoc />
4658
public IEnumerable<TokenChar> TokenChars { get; set; }
59+
60+
/// <inheritdoc />
61+
public string CustomTokenChars { get; set; }
4762
}
4863

4964
/// <inheritdoc />
@@ -52,21 +67,26 @@ public class NGramTokenizerDescriptor
5267
{
5368
protected override string Type => "ngram";
5469
int? INGramTokenizer.MaxGram { get; set; }
55-
5670
int? INGramTokenizer.MinGram { get; set; }
5771
IEnumerable<TokenChar> INGramTokenizer.TokenChars { get; set; }
5872

59-
/// <inheritdoc />
73+
string INGramTokenizer.CustomTokenChars { get; set; }
74+
75+
/// <inheritdoc cref="INGramTokenizer.MinGram" />
6076
public NGramTokenizerDescriptor MinGram(int? minGram) => Assign(minGram, (a, v) => a.MinGram = v);
6177

62-
/// <inheritdoc />
78+
/// <inheritdoc cref="INGramTokenizer.MaxGram" />
6379
public NGramTokenizerDescriptor MaxGram(int? minGram) => Assign(minGram, (a, v) => a.MaxGram = v);
6480

65-
/// <inheritdoc />
81+
/// <inheritdoc cref="INGramTokenizer.TokenChars" />
6682
public NGramTokenizerDescriptor TokenChars(IEnumerable<TokenChar> tokenChars) =>
6783
Assign(tokenChars, (a, v) => a.TokenChars = v);
6884

69-
/// <inheritdoc />
85+
/// <inheritdoc cref="INGramTokenizer.TokenChars" />
7086
public NGramTokenizerDescriptor TokenChars(params TokenChar[] tokenChars) => Assign(tokenChars, (a, v) => a.TokenChars = v);
87+
88+
/// <inheritdoc cref="INGramTokenizer.CustomTokenChars" />
89+
public NGramTokenizerDescriptor CustomTokenChars(string customTokenChars) =>
90+
Assign(customTokenChars, (a, v) => a.CustomTokenChars = v);
7191
}
7292
}

src/Nest/Analysis/Tokenizers/NGram/TokenChar.cs

+8
Original file line number | Diff line number | Diff line change
@@ -20,5 +20,13 @@ public enum TokenChar
2020

2121
[EnumMember(Value = "symbol")]
2222
Symbol,
23+
24+
/// <summary>
25+
/// Custom token characters.
26+
/// <para></para>
27+
/// Available in Elasticsearch 7.6.0+
28+
/// </summary>
29+
[EnumMember(Value = "custom")]
30+
Custom,
2331
}
2432
}

tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

+60
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,36 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>
3434
public override string Name => "endgen";
3535
}
3636

37+
[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
38+
public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
39+
{
40+
public override FuncTokenizer Fluent => (n, t) => t.EdgeNGram(n, e => e
41+
.MaxGram(2)
42+
.MinGram(1)
43+
.TokenChars(TokenChar.Custom)
44+
.CustomTokenChars("+-_")
45+
);
46+
47+
public override ITokenizer Initializer => new EdgeNGramTokenizer
48+
{
49+
MaxGram = 2,
50+
MinGram = 1,
51+
TokenChars = new[] { TokenChar.Custom },
52+
CustomTokenChars = "+-_"
53+
};
54+
55+
public override object Json => new
56+
{
57+
min_gram = 1,
58+
max_gram = 2,
59+
token_chars = new[] { "custom" },
60+
custom_token_chars = "+-_",
61+
type = "edge_ngram"
62+
};
63+
64+
public override string Name => "endgen_custom";
65+
}
66+
3767
public class NGramTests : TokenizerAssertionBase<NGramTests>
3868
{
3969
public override FuncTokenizer Fluent => (n, t) => t.NGram(n, e => e
@@ -60,6 +90,36 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>
6090
public override string Name => "ng";
6191
}
6292

93+
[SkipVersion("<7.6.0", "CustomTokenChars introduced in 7.6.0")]
94+
public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomTokenCharsTests>
95+
{
96+
public override FuncTokenizer Fluent => (n, t) => t.NGram(n, e => e
97+
.MaxGram(2)
98+
.MinGram(1)
99+
.TokenChars(TokenChar.Custom)
100+
.CustomTokenChars("+-_")
101+
);
102+
103+
public override ITokenizer Initializer => new NGramTokenizer
104+
{
105+
MaxGram = 2,
106+
MinGram = 1,
107+
TokenChars = new[] { TokenChar.Custom },
108+
CustomTokenChars = "+-_"
109+
};
110+
111+
public override object Json => new
112+
{
113+
min_gram = 1,
114+
max_gram = 2,
115+
token_chars = new[] { "custom" },
116+
custom_token_chars = "+-_",
117+
type = "ngram"
118+
};
119+
120+
public override string Name => "ngram_custom";
121+
}
122+
63123
public class PathHierarchyTests : TokenizerAssertionBase<PathHierarchyTests>
64124
{
65125
public override FuncTokenizer Fluent => (n, t) => t.PathHierarchy(n, e => e

0 commit comments

Comments (0)