Skip to content

Commit 258800d

Browse files
Mpdreamz authored and russcam committed
Add support for nori plugin (#3430)
* add analysis-nori plugin to writable cluster and added nori_tokenizer
* add nori_part_of_speech token filter
* add nori analyzer
* add Nori() to AnalyzeTokenizersDescriptor, CodeStandard tests caught this

(cherry picked from commit c7e51cb)
1 parent e44bb69 commit 258800d

File tree

13 files changed

+248
-3
lines changed

13 files changed

+248
-3
lines changed

src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
2929
case "simple": return o.ToObject<SimpleAnalyzer>(ElasticContractResolver.Empty);
3030
case "fingerprint": return o.ToObject<FingerprintAnalyzer>(ElasticContractResolver.Empty);
3131
case "kuromoji": return o.ToObject<KuromojiAnalyzer>(ElasticContractResolver.Empty);
32+
case "nori": return o.ToObject<NoriAnalyzer>(ElasticContractResolver.Empty);
3233
default:
3334
if (o.Property("tokenizer") != null)
3435
return o.ToObject<CustomAnalyzer>(ElasticContractResolver.Empty);

src/Nest/Analysis/Analyzers/Analyzers.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,5 +97,9 @@ public AnalyzersDescriptor Fingerprint(string name, Func<FingerprintAnalyzerDesc
9797
/// </summary>
9898
public AnalyzersDescriptor Kuromoji(string name, Func<KuromojiAnalyzerDescriptor, IKuromojiAnalyzer> selector = null) =>
9999
Assign(name, selector.InvokeOrDefault(new KuromojiAnalyzerDescriptor()));
100+
101+
/// <inheritdoc cref="INoriAnalyzer"/>
102+
public AnalyzersDescriptor Nori(string name, Func<NoriAnalyzerDescriptor, INoriAnalyzer> selector) =>
103+
Assign(name, selector?.Invoke(new NoriAnalyzerDescriptor()));
100104
}
101105
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
using System.Collections.Generic;
2+
using Newtonsoft.Json;
3+
4+
namespace Nest
5+
{
6+
/// <summary>
7+
///The nori analyzer consists of the following tokenizer and token filters:
8+
///<para> - nori_tokenizer</para>
9+
///<para> - nori_part_of_speech token filter</para>
10+
///<para> - nori_readingform token filter</para>
11+
///<para> - lowercase token filter</para>
12+
/// </summary>
13+
public interface INoriAnalyzer : IAnalyzer
14+
{
15+
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/>
16+
[JsonProperty("decompound_mode")]
17+
NoriDecompoundMode? DecompoundMode { get; set; }
18+
19+
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/>
20+
[JsonProperty("user_dictionary")]
21+
string UserDictionary { get; set; }
22+
23+
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags"/>
24+
[JsonProperty("stoptags")]
25+
IEnumerable<string> StopTags { get; set; }
26+
}
27+
28+
/// <inheritdoc cref="INoriAnalyzer"/>
29+
public class NoriAnalyzer : AnalyzerBase, INoriAnalyzer
30+
{
31+
public NoriAnalyzer() : base("nori") {}
32+
33+
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/>
34+
public NoriDecompoundMode? DecompoundMode { get; set; }
35+
36+
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/>
37+
public string UserDictionary { get; set; }
38+
39+
///<inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" />
40+
public IEnumerable<string> StopTags { get; set; }
41+
}
42+
43+
/// <inheritdoc cref="INoriAnalyzer"/>
44+
public class NoriAnalyzerDescriptor : AnalyzerDescriptorBase<NoriAnalyzerDescriptor, INoriAnalyzer>, INoriAnalyzer
45+
{
46+
protected override string Type => "nori";
47+
48+
NoriDecompoundMode? INoriAnalyzer.DecompoundMode { get; set; }
49+
string INoriAnalyzer.UserDictionary { get; set; }
50+
IEnumerable<string> INoriAnalyzer.StopTags { get; set; }
51+
52+
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/>
53+
public NoriAnalyzerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(a => a.DecompoundMode = mode);
54+
55+
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/>
56+
public NoriAnalyzerDescriptor UserDictionary(string path) => Assign(a => a.UserDictionary = path);
57+
58+
///<inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" />
59+
public NoriAnalyzerDescriptor StopTags(IEnumerable<string> stopTags) => Assign(a => a.StopTags = stopTags);
60+
61+
///<inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" />
62+
public NoriAnalyzerDescriptor StopTags(params string[] stopTags) => Assign(a => a.StopTags = stopTags);
63+
64+
}
65+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
using System.Collections.Generic;
2+
using Newtonsoft.Json;
3+
4+
namespace Nest
5+
{
6+
/// <summary> The nori_part_of_speech token filter removes tokens that match a set of part-of-speech tags. </summary>
7+
public interface INoriPartOfSpeechTokenFilter : ITokenFilter
8+
{
9+
/// <summary> An array of part-of-speech tags that should be removed. </summary>
10+
[JsonProperty("stoptags")]
11+
IEnumerable<string> StopTags { get; set; }
12+
}
13+
14+
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/>
15+
public class NoriPartOfSpeechTokenFilter : TokenFilterBase, INoriPartOfSpeechTokenFilter
16+
{
17+
public NoriPartOfSpeechTokenFilter() : base("nori_part_of_speech") { }
18+
19+
///<inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" />
20+
public IEnumerable<string> StopTags { get; set; }
21+
}
22+
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/>
23+
public class NoriPartOfSpeechTokenFilterDescriptor
24+
: TokenFilterDescriptorBase<NoriPartOfSpeechTokenFilterDescriptor, INoriPartOfSpeechTokenFilter>, INoriPartOfSpeechTokenFilter
25+
{
26+
protected override string Type => "nori_part_of_speech";
27+
28+
IEnumerable<string> INoriPartOfSpeechTokenFilter.StopTags { get; set; }
29+
30+
///<inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" />
31+
public NoriPartOfSpeechTokenFilterDescriptor StopTags(IEnumerable<string> stopTags) => Assign(a => a.StopTags = stopTags);
32+
33+
///<inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" />
34+
public NoriPartOfSpeechTokenFilterDescriptor StopTags(params string[] stopTags) => Assign(a => a.StopTags = stopTags);
35+
36+
}
37+
38+
}

src/Nest/Analysis/TokenFilters/TokenFilters.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,10 @@ public TokenFiltersDescriptor IcuNormalization(string name, Func<IcuNormalizatio
318318
public TokenFiltersDescriptor IcuTransform(string name, Func<IcuTransformTokenFilterDescriptor, IIcuTransformTokenFilter> selector) =>
319319
Assign(name, selector.Invoke(new IcuTransformTokenFilterDescriptor()));
320320

321+
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/>
322+
public TokenFiltersDescriptor NoriPartOfSpeech(string name, Func<NoriPartOfSpeechTokenFilterDescriptor, INoriPartOfSpeechTokenFilter> selector) =>
323+
Assign(name, selector.Invoke(new NoriPartOfSpeechTokenFilterDescriptor()));
324+
321325
/// <summary>
322326
/// A token filter of type multiplexer will emit multiple tokens at the same position, each version of the token
323327
/// having been run through a different filter. Identical output tokens at the same position will be removed.
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
using System.Runtime.Serialization;
2+
using Newtonsoft.Json;
3+
using Newtonsoft.Json.Converters;
4+
5+
namespace Nest
6+
{
7+
/// <summary> The decompound mode determines how the tokenizer handles compound tokens. </summary>
8+
[JsonConverter(typeof(StringEnumConverter))]
9+
public enum NoriDecompoundMode
10+
{
11+
/// <summary> Decomposes compounds and discards the original form (default). </summary>
12+
[EnumMember(Value="discard")]
13+
Discard,
14+
/// <summary> No decomposition for compounds </summary>
15+
[EnumMember(Value="none")]
16+
None,
17+
/// <summary> Decomposes compounds and keeps the original form </summary>
18+
[EnumMember(Value="mixed")]
19+
Mixed
20+
}
21+
22+
/// <summary> Tokenizer that ships with the analysis-nori plugin</summary>
23+
public interface INoriTokenizer : ITokenizer
24+
{
25+
/// <summary>
26+
/// The decompound mode determines how the tokenizer handles compound tokens. See <see cref="NoriDecompoundMode"/>.
27+
/// </summary>
28+
[JsonProperty("decompound_mode")]
29+
NoriDecompoundMode? DecompoundMode { get; set; }
30+
31+
/// <summary>
32+
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be appended to
33+
/// the default dictionary. This property allows you to specify this file on disk
34+
/// </summary>
35+
[JsonProperty("user_dictionary")]
36+
string UserDictionary { get; set; }
37+
}
38+
39+
/// <inheritdoc cref="INoriTokenizer"/>
40+
public class NoriTokenizer : TokenizerBase, INoriTokenizer
41+
{
42+
public NoriTokenizer() => this.Type = "nori_tokenizer";
43+
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/>
44+
public NoriDecompoundMode? DecompoundMode { get; set; }
45+
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/>
46+
public string UserDictionary { get; set; }
47+
}
48+
/// <inheritdoc cref="INoriTokenizer"/>
49+
public class NoriTokenizerDescriptor
50+
: TokenizerDescriptorBase<NoriTokenizerDescriptor, INoriTokenizer>, INoriTokenizer
51+
{
52+
protected override string Type => "nori_tokenizer";
53+
54+
NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
55+
string INoriTokenizer.UserDictionary { get; set; }
56+
57+
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/>
58+
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(a => a.DecompoundMode = mode);
59+
60+
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/>
61+
public NoriTokenizerDescriptor UserDictionary(string path) => Assign(a => a.UserDictionary = path);
62+
}
63+
}

src/Nest/Analysis/Tokenizers/Tokenizers.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ public TokenizersDescriptor Kuromoji(string name, Func<KuromojiTokenizerDescript
113113
public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
114114
Assign(name, selector?.Invoke(new IcuTokenizerDescriptor()));
115115

116+
/// <inheritdoc cref="INoriTokenizer"/>
117+
public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
118+
Assign(name, selector?.Invoke(new NoriTokenizerDescriptor()));
119+
116120
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
117121
public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
118122
Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));

src/Nest/Indices/Analyze/AnalyzeTokenFilters.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,10 @@ public AnalyzeTokenFiltersDescriptor IcuNormalization(Func<IcuNormalizationToken
324324
public AnalyzeTokenFiltersDescriptor IcuTransform(Func<IcuTransformTokenFilterDescriptor, IIcuTransformTokenFilter> selector) =>
325325
AssignIfNotNull(selector.Invoke(new IcuTransformTokenFilterDescriptor()));
326326

327+
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/>
328+
public AnalyzeTokenFiltersDescriptor NoriPartOfSpeech(Func<NoriPartOfSpeechTokenFilterDescriptor, INoriPartOfSpeechTokenFilter> selector) =>
329+
AssignIfNotNull(selector.Invoke(new NoriPartOfSpeechTokenFilterDescriptor()));
330+
327331
///<inheritdoc cref="IMultiplexerTokenFilter"/>
328332
public AnalyzeTokenFiltersDescriptor Multiplexer(Func<MultiplexerTokenFilterDescriptor, IMultiplexerTokenFilter> selector) =>
329333
AssignIfNotNull(selector.Invoke(new MultiplexerTokenFilterDescriptor()));

src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ public ITokenizer Kuromoji(Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer>
9292
public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
9393
(selector?.Invoke(new IcuTokenizerDescriptor()));
9494

95+
/// <inheritdoc cref="INoriTokenizer"/>
96+
public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
97+
// Null-conditional invoke: a null selector yields null instead of throwing, matching the Icu and CharGroup methods in this class.
selector?.Invoke(new NoriTokenizerDescriptor());
98+
9599
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
96100
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) => selector?.Invoke(new CharGroupTokenizerDescriptor());
97101
}

src/Tests/Tests.Core/ManagedElasticsearch/Clusters/WritableCluster.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using Tests.Core.ManagedElasticsearch.NodeSeeders;
1+
using Elastic.Managed.Ephemeral.Plugins;
2+
using Tests.Core.ManagedElasticsearch.NodeSeeders;
23
using static Elastic.Managed.Ephemeral.Plugins.ElasticsearchPlugin;
34

45
namespace Tests.Core.ManagedElasticsearch.Clusters
@@ -12,7 +13,9 @@ public WritableCluster() : base(new ClientTestClusterConfiguration(
1213
AnalysisKuromoji,
1314
AnalysisIcu,
1415
AnalysisPhonetic,
15-
MapperMurmur3
16+
MapperMurmur3,
17+
//TODO move this to elasticsearch-net abstractions
18+
new ElasticsearchPlugin("analysis-nori", v => v >= "6.4.0")
1619
)
1720
{
1821
MaxConcurrency = 4

src/Tests/Tests/Analysis/Analyzers/AnalyzerTests.cs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
using System;
2+
using Elastic.Xunit.XunitPlumbing;
23
using Nest;
4+
using Tests.Analysis.TokenFilters;
35

46
namespace Tests.Analysis.Analyzers
57
{
@@ -197,5 +199,28 @@ public class KuromojuTests : AnalyzerAssertionBase<KuromojuTests>
197199
};
198200
}
199201

202+
[SkipVersion("<6.4.0", "analysis-nori plugin introduced in 6.4.0")]
203+
public class NoriTests : AnalyzerAssertionBase<NoriTests>
204+
{
205+
public override string Name => "nori";
206+
private readonly string[] _stopTags = {"NR", "SP"};
207+
public override IAnalyzer Initializer => new NoriAnalyzer
208+
{
209+
StopTags = _stopTags,
210+
DecompoundMode = NoriDecompoundMode.Mixed
211+
};
212+
213+
public override FuncTokenizer Fluent => (n, t) => t.Nori(n, e => e
214+
.StopTags(_stopTags)
215+
.DecompoundMode(NoriDecompoundMode.Mixed)
216+
);
217+
218+
public override object Json => new
219+
{
220+
type = "nori",
221+
decompound_mode = "mixed",
222+
stoptags =_stopTags
223+
};
224+
}
200225
}
201226
}

src/Tests/Tests/Analysis/TokenFilters/TokenFilterTests.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,19 @@ public class PhoneticTests : TokenFilterAssertionBase<PhoneticTests>
919919

920920
}
921921

922+
[SkipVersion("<6.4.0", "analysis-nori plugin introduced in 6.4.0")]
923+
public class NoriPartOfSpeechTests : TokenFilterAssertionBase<NoriPartOfSpeechTests>
924+
{
925+
public override string Name => "nori_pos";
926+
private readonly string[] _stopTags = {"NR", "SP"};
927+
928+
public override ITokenFilter Initializer => new NoriPartOfSpeechTokenFilter {StopTags = _stopTags};
929+
930+
public override FuncTokenFilters Fluent => (n, tf) => tf.NoriPartOfSpeech(n, t => t.StopTags(_stopTags));
931+
932+
public override object Json => new { type = "nori_part_of_speech", stoptags = _stopTags };
933+
}
934+
922935
[SkipVersion("<6.4.0", "Introduced in 6.4.0")]
923936
public class MultiplexerTests : TokenFilterAssertionBase<PhoneticTests>
924937
{

src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System;
1+
using System;
2+
using Elastic.Xunit.XunitPlumbing;
23
using Nest;
34

45
namespace Tests.Analysis.Tokenizers
@@ -204,6 +205,22 @@ public class StandardTests : TokenizerAssertionBase<StandardTests>
204205
public override object Json => new {type = "standard"};
205206
}
206207

208+
[SkipVersion("<6.4.0", "analysis-nori plugin introduced in 6.4.0")]
209+
public class NoriTests : TokenizerAssertionBase<NoriTests>
210+
{
211+
public override string Name => "nori";
212+
public override ITokenizer Initializer => new NoriTokenizer
213+
{
214+
DecompoundMode = NoriDecompoundMode.Mixed
215+
};
216+
217+
public override FuncTokenizer Fluent => (n, t) => t.Nori(n, e => e
218+
.DecompoundMode(NoriDecompoundMode.Mixed)
219+
);
220+
221+
public override object Json => new {type = "nori_tokenizer", decompound_mode = "mixed"};
222+
}
223+
207224
public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
208225
{
209226
private readonly string[] _chars = {"whitespace", "-", "\n"};

0 commit comments

Comments
 (0)