-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Polish char- and word-level tokenizers & stopword removers #2916
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
c560136
ef85fa8
13e55d2
f8096c0
8e5c515
883784a
d99e192
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,8 +55,9 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text | |
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> | ||
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param> | ||
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param> | ||
public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog, | ||
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <c>0x02</c>, to the beginning, | ||
/// and append another marker character, <c>0x03</c>, to the end of the output vector of characters.</param> | ||
public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, | ||
string outputColumnName, | ||
string inputColumnName = null, | ||
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters) | ||
|
@@ -67,10 +68,11 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms | |
/// Tokenize incoming text in input columns and output the tokens as output columns. | ||
/// </summary> | ||
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param> | ||
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <c>0x02</c>, to the beginning, | ||
/// and append another marker character, <c>0x03</c>, to the end of the output vector of characters.</param> | ||
/// <param name="columns">Pairs of columns to run the tokenization on.</param> | ||
|
||
public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog, | ||
public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, | ||
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters, | ||
params ColumnOptions[] columns) | ||
=> new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns)); | ||
|
@@ -157,29 +159,18 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T | |
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> | ||
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param> | ||
/// <param name="separators">The separators to use (uses space character by default).</param> | ||
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog, | ||
public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I would suggest keeping the `Tokenize` prefix. How about `TokenizeWords`? I believe the usual behavior of word tokenizers is that they produce text tokens, which is why I would keep the name unchanged for the word-level tokenizer.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I will do that.
In reply to: 265253207 [](ancestors = 265253207) |
||
string outputColumnName, | ||
string inputColumnName = null, | ||
char[] separators = null) | ||
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators); | ||
|
||
/// <summary> | ||
/// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators. | ||
/// </summary> | ||
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="columns">Pairs of columns to run the tokenization on.</param> | ||
/// <param name="separators">The separators to use (uses space character by default).</param> | ||
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog, | ||
(string outputColumnName, string inputColumnName)[] columns, | ||
char[] separators = null) | ||
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators); | ||
|
||
/// <summary> | ||
/// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens. | ||
/// </summary> | ||
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="columns">Pairs of columns to run the tokenization on.</param> | ||
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog, | ||
public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog, | ||
params WordTokenizingEstimator.ColumnOptions[] columns) | ||
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); | ||
|
||
|
@@ -243,24 +234,6 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC | |
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) | ||
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, language); | ||
|
||
/// <summary> | ||
/// Removes stop words from incoming token streams in input columns | ||
/// and outputs the token streams without stop words as output columns. | ||
/// </summary> | ||
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="columns">Pairs of columns to remove stop words on.</param> | ||
/// <param name="language">Language of the input text columns <paramref name="columns"/>.</param> | ||
/// <example> | ||
/// <format type="text/markdown"> | ||
/// <![CDATA[ | ||
/// ]]></format> | ||
/// </example> | ||
public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog, | ||
(string outputColumnName, string inputColumnName)[] columns, | ||
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) | ||
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, language); | ||
|
||
/// <summary> | ||
/// Removes stop words from incoming token streams in <paramref name="inputColumnName"/> | ||
/// and outputs the token streams without stopwords as <paramref name="outputColumnName"/>. | ||
|
@@ -281,24 +254,6 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa | |
params string[] stopwords) | ||
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords); | ||
|
||
/// <summary> | ||
/// Removes stop words from incoming token streams in input columns | ||
/// and outputs the token streams without stop words as output columns. | ||
/// </summary> | ||
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="columns">Pairs of columns to remove stop words on.</param> | ||
/// <param name="stopwords">Array of words to remove.</param> | ||
/// <example> | ||
/// <format type="text/markdown"> | ||
/// <![CDATA[ | ||
/// ]]></format> | ||
/// </example> | ||
public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog, | ||
(string outputColumnName, string inputColumnName)[] columns, | ||
params string[] stopwords) | ||
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, stopwords); | ||
|
||
/// <summary> | ||
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/> | ||
/// and outputs bag of word vector as <paramref name="outputColumnName"/> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See my comment on the `TokenizeWords` extension. I would suggest `TokenizeCharactersAsKeys` as the new name. #Resolved