-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Made 'StopWordsRemover' in TextFeaturizer configurable again. #2962
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
a91bbbf
770e8be
46ad17d
d3e6435
309c941
377e579
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -25,6 +25,7 @@ | |||
namespace Microsoft.ML.Transforms.Text | ||||
{ | ||||
using CaseMode = TextNormalizingEstimator.CaseMode; | ||||
using StopWordsCol = StopWordsRemovingTransformer.Column; | ||||
// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts | ||||
// of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature | ||||
// integer index mapping through hashing) as an option. | ||||
|
@@ -85,6 +86,27 @@ internal bool TryUnparse(StringBuilder sb) | |||
} | ||||
} | ||||
|
||||
/// <summary> | ||||
/// Defines the different type of stop words remover supported. | ||||
/// </summary> | ||||
public interface IStopWordsRemoverOptions { } | ||||
|
||||
/// <summary> | ||||
/// Use stop words remover that can removes language-specific list of stop words (most common words) already defined in the system. | ||||
/// </summary> | ||||
public sealed class PredefinedStopWordsRemoverOptions : IStopWordsRemoverOptions { } | ||||
|
||||
/// <summary> | ||||
/// Use stop words remover that can removes language-specific list of stop words (most common words) already defined in the system. | ||||
/// </summary> | ||||
public sealed class CustomStopWordsRemoverOptions : IStopWordsRemoverOptions | ||||
{ | ||||
/// <summary> | ||||
/// List of stop words to remove. | ||||
/// </summary> | ||||
public string[] StopWords; | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Why not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I doubt that EntryPoint will break because of this. The actual interface is still using machinelearning/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs Line 643 in 91a8703
I have mentioned that in the comments in the my PR. I am waiting for the people to comment on it. So that I can decide what to do. In reply to: 266099423 [](ancestors = 266099423) |
||||
} | ||||
|
||||
/// <summary> | ||||
/// Advanced options for the <see cref="TextFeaturizingEstimator"/>. | ||||
/// </summary> | ||||
|
@@ -96,8 +118,49 @@ public sealed class Options : TransformInputBase | |||
[Argument(ArgumentType.AtMostOnce, HelpText = "Dataset language or 'AutoDetect' to detect language per row.", ShortName = "lang", SortOrder = 3)] | ||||
public Language Language = DefaultLanguage; | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
it needs to be part of StopWordsRemovingTransformer #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think In reply to: 265808271 [](ancestors = 265808271) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know what kind of future lays ahead of us, but for now, this option just look silly. In reply to: 265812513 [](ancestors = 265812513,265808271) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm...so you think that it should not be here? it should instead be in In reply to: 266564835 [](ancestors = 266564835,265812513,265808271) |
||||
|
||||
[Argument(ArgumentType.Multiple, HelpText = "Use stop remover or not.", ShortName = "remover", SortOrder = 4)] | ||||
public bool UsePredefinedStopWordRemover = false; | ||||
[Argument(ArgumentType.Multiple, Name = "StopWordsRemover", HelpText = "Stopwords remover.", ShortName = "remover", NullName = "<None>", SortOrder = 4)] | ||||
internal IStopWordsRemoverFactory StopWordsRemover; | ||||
|
||||
/// <summary> | ||||
/// The underlying state of <see cref="StopWordsRemover"/> and <see cref="StopWordsRemoverOptions"/>. | ||||
/// </summary> | ||||
private IStopWordsRemoverOptions _stopWordsRemoverOptions; | ||||
|
||||
/// <summary> | ||||
/// Option to set type of stop word remover to use. | ||||
/// The following options are available | ||||
/// <list type="bullet"> | ||||
/// <item> | ||||
/// <description>The <see cref="PredefinedStopWordsRemoverOptions"/> removes the language specific list of stop words from the input.</description> | ||||
/// </item> | ||||
/// <item> | ||||
/// <description>The <see cref="CustomStopWordsRemoverOptions"/> uses user provided list of stop words.</description> | ||||
/// </item> | ||||
/// </list> | ||||
/// Setting this to 'null' does not remove stop words from the input. | ||||
/// </summary> | ||||
public IStopWordsRemoverOptions StopWordsRemoverOptions | ||||
{ | ||||
get { return _stopWordsRemoverOptions; } | ||||
set | ||||
{ | ||||
_stopWordsRemoverOptions = value; | ||||
IStopWordsRemoverFactory options = null; | ||||
if (_stopWordsRemoverOptions != null) | ||||
{ | ||||
if (_stopWordsRemoverOptions is PredefinedStopWordsRemoverOptions) | ||||
options = new PredefinedStopWordsRemoverFactory(); | ||||
else if (_stopWordsRemoverOptions is CustomStopWordsRemoverOptions) | ||||
{ | ||||
options = new CustomStopWordsRemovingTransformer.LoaderArguments() | ||||
{ | ||||
Stopwords = (_stopWordsRemoverOptions as CustomStopWordsRemoverOptions).StopWords | ||||
}; | ||||
} | ||||
} | ||||
StopWordsRemover = options; | ||||
} | ||||
} | ||||
|
||||
[Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", Name="TextCase", ShortName = "case", SortOrder = 5)] | ||||
public CaseMode CaseMode = TextNormalizingEstimator.Defaults.Mode; | ||||
|
@@ -203,6 +266,7 @@ public Options() | |||
|
||||
// These parameters are hardcoded for now. | ||||
// REVIEW: expose them once sub-transforms are estimators. | ||||
private IStopWordsRemoverFactory _stopWordsRemover; | ||||
private TermLoaderArguments _dictionary; | ||||
private INgramExtractorFactoryFactory _wordFeatureExtractor; | ||||
private INgramExtractorFactoryFactory _charFeatureExtractor; | ||||
|
@@ -220,7 +284,7 @@ private sealed class TransformApplierParams | |||
|
||||
public readonly NormFunction Norm; | ||||
public readonly Language Language; | ||||
public readonly bool UsePredefinedStopWordRemover; | ||||
public readonly IStopWordsRemoverFactory StopWordsRemover; | ||||
public readonly CaseMode TextCase; | ||||
public readonly bool KeepDiacritics; | ||||
public readonly bool KeepPunctuations; | ||||
|
@@ -252,7 +316,9 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm | |||
|
||||
// These properties encode the logic needed to determine which transforms to apply. | ||||
#region NeededTransforms | ||||
public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || OutputTextTokens; } } | ||||
public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || NeedsRemoveStopwordsTransform || OutputTextTokens; } } | ||||
|
||||
public bool NeedsRemoveStopwordsTransform { get { return StopWordsRemover != null; } } | ||||
|
||||
public bool NeedsNormalizeTransform | ||||
{ | ||||
|
@@ -298,7 +364,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent) | |||
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary); | ||||
Norm = parent.OptionalSettings.Norm; | ||||
Language = parent.OptionalSettings.Language; | ||||
UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover; | ||||
StopWordsRemover = parent._stopWordsRemover; | ||||
TextCase = parent.OptionalSettings.CaseMode; | ||||
KeepDiacritics = parent.OptionalSettings.KeepDiacritics; | ||||
KeepPunctuations = parent.OptionalSettings.KeepPunctuations; | ||||
|
@@ -340,6 +406,7 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable | |||
if (options != null) | ||||
OptionalSettings = options; | ||||
|
||||
_stopWordsRemover = null; | ||||
_dictionary = null; | ||||
_wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory; | ||||
_charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory; | ||||
|
@@ -402,21 +469,23 @@ public ITransformer Fit(IDataView input) | |||
view = new WordTokenizingEstimator(h, xfCols).Fit(view).Transform(view); | ||||
} | ||||
|
||||
if (tparams.UsePredefinedStopWordRemover) | ||||
if (tparams.NeedsRemoveStopwordsTransform) | ||||
{ | ||||
Contracts.Assert(wordTokCols != null, "StopWords transform requires that word tokenization has been applied to the input text."); | ||||
var xfCols = new StopWordsRemovingEstimator.ColumnOptions[wordTokCols.Length]; | ||||
var xfCols = new StopWordsCol[wordTokCols.Length]; | ||||
var dstCols = new string[wordTokCols.Length]; | ||||
for (int i = 0; i < wordTokCols.Length; i++) | ||||
{ | ||||
var tempName = GenerateColumnName(view.Schema, wordTokCols[i], "StopWordsRemoverTransform"); | ||||
var col = new StopWordsRemovingEstimator.ColumnOptions(tempName, wordTokCols[i], tparams.StopwordsLanguage); | ||||
dstCols[i] = tempName; | ||||
tempCols.Add(tempName); | ||||
var col = new StopWordsCol(); | ||||
col.Source = wordTokCols[i]; | ||||
col.Name = GenerateColumnName(view.Schema, wordTokCols[i], "StopWordsRemoverTransform"); | ||||
dstCols[i] = col.Name; | ||||
tempCols.Add(col.Name); | ||||
col.Language = tparams.StopwordsLanguage; | ||||
|
||||
xfCols[i] = col; | ||||
} | ||||
view = new StopWordsRemovingEstimator(h, xfCols).Fit(view).Transform(view); | ||||
view = tparams.StopWordsRemover.CreateComponent(h, view, xfCols); | ||||
wordTokCols = dstCols; | ||||
} | ||||
|
||||
|
@@ -443,7 +512,7 @@ public ITransformer Fit(IDataView input) | |||
if (tparams.CharExtractorFactory != null) | ||||
{ | ||||
{ | ||||
var srcCols = tparams.UsePredefinedStopWordRemover ? wordTokCols : textCols; | ||||
var srcCols = tparams.NeedsRemoveStopwordsTransform ? wordTokCols : textCols; | ||||
charTokCols = new string[srcCols.Length]; | ||||
var xfCols = new (string outputColumnName, string inputColumnName)[srcCols.Length]; | ||||
for (int i = 0; i < srcCols.Length; i++) | ||||
|
@@ -568,6 +637,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) | |||
internal static IDataTransform Create(IHostEnvironment env, Options args, IDataView data) | ||||
{ | ||||
var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, args); | ||||
estimator._stopWordsRemover = args.StopWordsRemover; | ||||
estimator._dictionary = args.Dictionary; | ||||
// Review: I don't think the following two lines are needed. | ||||
estimator._wordFeatureExtractor = args.WordFeatureExtractorFactory; | ||||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why this interface and class is part of text transform and not part of StopWordRemover itself? #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My question is why it's not implemented same way as word ngram and char ngram implemented?
In reply to: 266565265 [](ancestors = 266565265)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It can be part of StopWordRemover. I will move it.
Word ngram and char ngram are do different options that can be exclusively applied in Text transform (two parameter for two different options). On the contrary, only one type of StopWordRemover can be applied (one parameter to take two different options). Any idea if it can be done better than this?
In reply to: 266565861 [](ancestors = 266565861,266565265)