Skip to content

Extend tokenfilters support in NEST #652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 2, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/CommonGramsTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// Token filter of type <c>common_grams</c>: produces bigrams for frequently occurring terms,
/// while the single terms are still indexed.
/// <para>Note: either the common_words or the common_words_path field is required.</para>
/// </summary>
public class CommonGramsTokenFilter : TokenFilterBase
{
    public CommonGramsTokenFilter() : base("common_grams") { }

    /// <summary>
    /// The common words to use, supplied inline as a list.
    /// </summary>
    [JsonProperty(PropertyName = "common_words")]
    public IEnumerable<string> CommonWords { get; set; }

    /// <summary>
    /// Path to a list of common words; either absolute or relative to the config location.
    /// </summary>
    [JsonProperty(PropertyName = "common_words_path")]
    public string CommonWordsPath { get; set; }

    /// <summary>
    /// When true, common words are matched case-insensitively.
    /// </summary>
    [JsonProperty(PropertyName = "ignore_case")]
    public bool? IgnoreCase { get; set; }

    /// <summary>
    /// When enabled, bigrams are generated and then common words, as well as single terms
    /// followed by a common word, are removed.
    /// </summary>
    [JsonProperty(PropertyName = "query_mode")]
    public bool? QueryMode { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// Token filter of type <c>delimited_payload_filter</c>: splits tokens into tokens and payload
/// whenever a delimiter character is found.
/// </summary>
public class DelimitedPayloadTokenFilter : TokenFilterBase
{
    public DelimitedPayloadTokenFilter()
        : base("delimited_payload_filter")
    {
    }

    /// <summary>
    /// Character used for splitting the tokens.
    /// </summary>
    /// <remarks>
    /// The property is a non-nullable <see cref="char"/>, so an unset value is <c>'\0'</c>.
    /// <c>DefaultValueHandling.Ignore</c> keeps that unset value out of the serialized settings,
    /// so a NUL character is not sent as the delimiter when the caller never assigned one.
    /// </remarks>
    [JsonProperty("delimiter", DefaultValueHandling = DefaultValueHandling.Ignore)]
    public char Delimiter { get; set; }

    /// <summary>
    /// The type of the payload. int for integer, float for float and identity for characters.
    /// </summary>
    [JsonProperty("encoding")]
    public string Encoding { get; set; }
}
}
49 changes: 49 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/HunspellTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// Token filter of type <c>hunspell</c>, providing basic support for hunspell stemming.
/// <para>Hunspell dictionaries are picked up from a dedicated hunspell directory on the filesystem.</para>
/// </summary>
public class HunspellTokenFilter : TokenFilterBase
{
    public HunspellTokenFilter() : base("hunspell") { }

    /// <summary>
    /// When true, dictionary matching is case insensitive.
    /// </summary>
    [JsonProperty(PropertyName = "ignore_case")]
    public bool? IgnoreCase { get; set; }

    /// <summary>
    /// The locale for this filter. When unset, the lang or language settings are used instead,
    /// so one of these has to be set.
    /// </summary>
    [JsonProperty(PropertyName = "locale")]
    public string Locale { get; set; }

    /// <summary>
    /// Name of a dictionary.
    /// </summary>
    [JsonProperty(PropertyName = "dictionary")]
    public string Dictionary { get; set; }

    /// <summary>
    /// Set to true if only unique terms should be returned.
    /// </summary>
    [JsonProperty(PropertyName = "dedup")]
    public bool? Dedup { get; set; }

    /// <summary>
    /// Set to true if only the longest term should be returned.
    /// </summary>
    [JsonProperty(PropertyName = "longest_only")]
    public bool? LongestOnly { get; set; }
}
}
36 changes: 36 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/KeepWordsTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// A token filter of type keep that only keeps tokens with text contained in a predefined set of words.
/// </summary>
public class KeepWordsTokenFilter : TokenFilterBase
{
    public KeepWordsTokenFilter()
        : base("keep")
    {
    }

    /// <summary>
    /// A list of words to keep.
    /// </summary>
    [JsonProperty("keep_words")]
    public IEnumerable<string> KeepWords { get; set; }

    /// <summary>
    /// A path to a words file.
    /// </summary>
    // The keep filter reads its word file from "keep_words_path"; "rules_path" belongs to
    // the stemmer override filter, so a path serialized under that key would not be picked up.
    [JsonProperty("keep_words_path")]
    public string KeepWordsPath { get; set; }

    /// <summary>
    /// A boolean indicating whether to lower case the words.
    /// </summary>
    [JsonProperty("keep_words_case")]
    public bool? KeepWordsCase { get; set; }
}
}
13 changes: 13 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/KeywordRepeatTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace Nest
{
/// <summary>
/// The keyword_repeat token filter emits each incoming token twice, once as a keyword and once
/// as a non-keyword, so that an unstemmed version of a term can be indexed side by side with
/// the stemmed version of the term.
/// </summary>
public class KeywordRepeatTokenFilter : TokenFilterBase
{
    public KeywordRepeatTokenFilter() : base("keyword_repeat") { }
}
}
28 changes: 28 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/LimitTokenCountTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// Token filter of type <c>limit</c>: limits the number of tokens that are indexed per document and field.
/// </summary>
public class LimitTokenCountTokenFilter : TokenFilterBase
{
    public LimitTokenCountTokenFilter() : base("limit") { }

    /// <summary>
    /// Maximum number of tokens that should be indexed per document and field.
    /// </summary>
    [JsonProperty(PropertyName = "max_token_count")]
    public int? MaxTokenCount { get; set; }

    /// <summary>
    /// When set to true, the filter exhausts the stream even if max_token_count tokens
    /// have already been consumed.
    /// </summary>
    [JsonProperty(PropertyName = "consume_all_tokens")]
    public bool? ConsumeAllTokens { get; set; }
}
}
6 changes: 4 additions & 2 deletions src/Nest/Domain/Analysis/TokenFilter/LowercaseTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ public LowercaseTokenFilter()
: base("lowercase")
{

}

}

/// <summary>
/// Value serialized under the <c>language</c> key of this filter's settings.
/// NOTE(review): presumably selects language-specific lowercasing rules; confirm against the Elasticsearch documentation.
/// </summary>
[JsonProperty(PropertyName = "language")]
public string Language { get; set; }
}
}
25 changes: 25 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/PatternCaptureTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// The pattern_capture token filter. Unlike the pattern tokenizer, it emits a token for every
/// capture group in the regular expression.
/// </summary>
public class PatternCaptureTokenFilter : TokenFilterBase
{
    public PatternCaptureTokenFilter() : base("pattern_capture") { }

    /// <summary>
    /// The patterns, serialized under the <c>patterns</c> key.
    /// </summary>
    [JsonProperty(PropertyName = "patterns")]
    public IEnumerable<string> Patterns { get; set; }

    /// <summary>
    /// When preserve_original is set to true, the original token is emitted as well.
    /// </summary>
    [JsonProperty(PropertyName = "preserve_original")]
    public bool? PreserveOriginal { get; set; }
}
}
30 changes: 30 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/StemmerOverrideTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// Token filter of type <c>stemmer_override</c>: overrides stemming algorithms by applying a
/// custom mapping and then protecting these terms from being modified by stemmers.
/// Must be placed before any stemming filters.
/// </summary>
public class StemmerOverrideTokenFilter : TokenFilterBase
{
    public StemmerOverrideTokenFilter() : base("stemmer_override") { }

    /// <summary>
    /// The mapping rules to use, supplied inline as a list.
    /// </summary>
    [JsonProperty(PropertyName = "rules")]
    public IEnumerable<string> Rules { get; set; }

    /// <summary>
    /// Path to a list of mappings; either absolute or relative to the config location.
    /// </summary>
    [JsonProperty(PropertyName = "rules_path")]
    public string RulesPath { get; set; }
}
}
17 changes: 17 additions & 0 deletions src/Nest/Domain/Analysis/TokenFilter/UppercaseTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// Token filter of type <c>uppercase</c>, which normalizes token text to upper case.
/// </summary>
public class UppercaseTokenFilter : TokenFilterBase
{
    public UppercaseTokenFilter() : base("uppercase") { }
}
}
Loading