Made 'StopWordsRemover' in TextFeaturizer configurable again. (#2962)

zeahmed · web-flow · commit 6733515cb5ca · 2019-03-18T15:24:03.000-07:00
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs
@@ -36,7 +36,7 @@ public static void Example()
                 KeepPunctuations = false,
                 KeepNumbers = false,
                 OutputTokens = true,
-                Language = TextFeaturizingEstimator.Language.English, // supports  English, French, German, Dutch, Italian, Spanish, Japanese
+                StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English }, // supports  English, French, German, Dutch, Italian, Spanish, Japanese
             }, "SentimentText");
 
             // The transformed data for both pipelines.
diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs
@@ -489,6 +489,22 @@ private protected override Func<int, bool> GetDependenciesCore(Func<int, bool> a
     /// </summary>
     public sealed class StopWordsRemovingEstimator : TrivialEstimator<StopWordsRemovingTransformer>
     {
+        /// <summary>
+        /// Use a stop words remover that removes a language-specific list of stop words (most common words) already defined in the system.
+        /// </summary>
+        public sealed class Options : IStopWordsRemoverOptions
+        {
+            /// <summary>
+            /// Language of the text dataset. 'English' is default.
+            /// </summary>
+            public TextFeaturizingEstimator.Language Language;
+
+            public Options()
+            {
+                Language = TextFeaturizingEstimator.DefaultLanguage;
+            }
+        }
+
         /// <summary>
         /// Describes how the transformer handles one column pair.
         /// </summary>
@@ -1065,6 +1081,17 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
     /// </summary>
     public sealed class CustomStopWordsRemovingEstimator : TrivialEstimator<CustomStopWordsRemovingTransformer>
     {
+        /// <summary>
+        /// Use stop words remover that can remove a custom, user-provided list of stop words.
+        /// </summary>
+        public sealed class Options : IStopWordsRemoverOptions
+        {
+            /// <summary>
+            /// List of stop words to remove.
+            /// </summary>
+            public string[] StopWords;
+        }
+
         internal const string ExpectedColumnType = "vector of Text type";
 
         /// <summary>
diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
@@ -24,6 +24,13 @@
 namespace Microsoft.ML.Transforms.Text
 {
     using CaseMode = TextNormalizingEstimator.CaseMode;
+    using StopWordsCol = StopWordsRemovingTransformer.Column;
+
+    /// <summary>
+    /// Defines the different types of stop words removers supported.
+    /// </summary>
+    public interface IStopWordsRemoverOptions { }
+
     // A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts
     // of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature
     // integer index mapping through hashing) as an option.
@@ -93,10 +100,56 @@ public sealed class Options : TransformInputBase
             internal Column Columns;
 
             [Argument(ArgumentType.AtMostOnce, HelpText = "Dataset language or 'AutoDetect' to detect language per row.", ShortName = "lang", SortOrder = 3)]
-            public Language Language = DefaultLanguage;
+            internal Language Language = DefaultLanguage;
+
+            [Argument(ArgumentType.Multiple, Name = "StopWordsRemover", HelpText = "Stopwords remover.", ShortName = "remover", NullName = "<None>", SortOrder = 4)]
+            internal IStopWordsRemoverFactory StopWordsRemover;
 
-            [Argument(ArgumentType.Multiple, HelpText = "Use stop remover or not.", ShortName = "remover", SortOrder = 4)]
-            public bool UsePredefinedStopWordRemover = false;
+            /// <summary>
+            /// The underlying state of <see cref="StopWordsRemover"/> and <see cref="StopWordsRemoverOptions"/>.
+            /// </summary>
+            private IStopWordsRemoverOptions _stopWordsRemoverOptions;
+
+            /// <summary>
+            /// Option to set type of stop word remover to use.
+            /// The following options are available:
+            /// <list type="bullet">
+            ///     <item>
+            ///         <description>The <see cref="StopWordsRemovingEstimator.Options"/> removes a language-specific list of stop words from the input.</description>
+            ///     </item>
+            ///     <item>
+            ///        <description>The <see cref="CustomStopWordsRemovingEstimator.Options"/> uses a user-provided list of stop words.</description>
+            ///     </item>
+            /// </list>
+            /// Setting this to 'null' does not remove stop words from the input.
+            /// </summary>
+            public IStopWordsRemoverOptions StopWordsRemoverOptions
+            {
+                get { return _stopWordsRemoverOptions; }
+                set
+                {
+                    _stopWordsRemoverOptions = value;
+                    IStopWordsRemoverFactory options = null;
+                    if (_stopWordsRemoverOptions != null)
+                    {
+                        if (_stopWordsRemoverOptions is StopWordsRemovingEstimator.Options)
+                        {
+                            options = new PredefinedStopWordsRemoverFactory();
+                            Language = (_stopWordsRemoverOptions as StopWordsRemovingEstimator.Options).Language;
+                        }
+                        else if (_stopWordsRemoverOptions is CustomStopWordsRemovingEstimator.Options)
+                        {
+                            var stopwords = (_stopWordsRemoverOptions as CustomStopWordsRemovingEstimator.Options).StopWords;
+                            options = new CustomStopWordsRemovingTransformer.LoaderArguments()
+                            {
+                                Stopwords = stopwords,
+                                Stopword = string.Join(",", stopwords)
+                            };
+                        }
+                    }
+                    StopWordsRemover = options;
+                }
+            }
 
             [Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", Name="TextCase", ShortName = "case", SortOrder = 5)]
             public CaseMode CaseMode = TextNormalizingEstimator.Defaults.Mode;
@@ -202,6 +255,7 @@ public Options()
 
         // These parameters are hardcoded for now.
         // REVIEW: expose them once sub-transforms are estimators.
+        private IStopWordsRemoverFactory _stopWordsRemover;
         private TermLoaderArguments _dictionary;
         private INgramExtractorFactoryFactory _wordFeatureExtractor;
         private INgramExtractorFactoryFactory _charFeatureExtractor;
@@ -219,7 +273,7 @@ private sealed class TransformApplierParams
 
             public readonly NormFunction Norm;
             public readonly Language Language;
-            public readonly bool UsePredefinedStopWordRemover;
+            public readonly IStopWordsRemoverFactory StopWordsRemover;
             public readonly CaseMode TextCase;
             public readonly bool KeepDiacritics;
             public readonly bool KeepPunctuations;
@@ -251,7 +305,9 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm
 
             // These properties encode the logic needed to determine which transforms to apply.
             #region NeededTransforms
-            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || OutputTextTokens; } }
+            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || NeedsRemoveStopwordsTransform || OutputTextTokens; } }
+
+            public bool NeedsRemoveStopwordsTransform { get { return StopWordsRemover != null; } }
 
             public bool NeedsNormalizeTransform
             {
@@ -297,7 +353,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
                 CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
                 Norm = parent.OptionalSettings.Norm;
                 Language = parent.OptionalSettings.Language;
-                UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover;
+                StopWordsRemover = parent._stopWordsRemover;
                 TextCase = parent.OptionalSettings.CaseMode;
                 KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
                 KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
@@ -339,6 +395,7 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable
             if (options != null)
                 OptionalSettings = options;
 
+            _stopWordsRemover = null;
             _dictionary = null;
             _wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory;
             _charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory;
@@ -401,21 +458,23 @@ public ITransformer Fit(IDataView input)
                 view = new WordTokenizingEstimator(h, xfCols).Fit(view).Transform(view);
             }
 
-            if (tparams.UsePredefinedStopWordRemover)
+            if (tparams.NeedsRemoveStopwordsTransform)
             {
                 Contracts.Assert(wordTokCols != null, "StopWords transform requires that word tokenization has been applied to the input text.");
-                var xfCols = new StopWordsRemovingEstimator.ColumnOptions[wordTokCols.Length];
+                var xfCols = new StopWordsCol[wordTokCols.Length];
                 var dstCols = new string[wordTokCols.Length];
                 for (int i = 0; i < wordTokCols.Length; i++)
                 {
-                    var tempName = GenerateColumnName(view.Schema, wordTokCols[i], "StopWordsRemoverTransform");
-                    var col = new StopWordsRemovingEstimator.ColumnOptions(tempName, wordTokCols[i], tparams.StopwordsLanguage);
-                    dstCols[i] = tempName;
-                    tempCols.Add(tempName);
+                    var col = new StopWordsCol();
+                    col.Source = wordTokCols[i];
+                    col.Name = GenerateColumnName(view.Schema, wordTokCols[i], "StopWordsRemoverTransform");
+                    dstCols[i] = col.Name;
+                    tempCols.Add(col.Name);
+                    col.Language = tparams.StopwordsLanguage;
 
                     xfCols[i] = col;
                 }
-                view = new StopWordsRemovingEstimator(h, xfCols).Fit(view).Transform(view);
+                view = tparams.StopWordsRemover.CreateComponent(h, view, xfCols);
                 wordTokCols = dstCols;
             }
 
@@ -442,7 +501,7 @@ public ITransformer Fit(IDataView input)
             if (tparams.CharExtractorFactory != null)
             {
                 {
-                    var srcCols = tparams.UsePredefinedStopWordRemover ? wordTokCols : textCols;
+                    var srcCols = tparams.NeedsRemoveStopwordsTransform ? wordTokCols : textCols;
                     charTokCols = new string[srcCols.Length];
                     var xfCols = new (string outputColumnName, string inputColumnName)[srcCols.Length];
                     for (int i = 0; i < srcCols.Length; i++)
@@ -567,6 +626,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
         internal static IDataTransform Create(IHostEnvironment env, Options args, IDataView data)
         {
             var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, args);
+            estimator._stopWordsRemover = args.StopWordsRemover;
             estimator._dictionary = args.Dictionary;
             // Review: I don't think the following two lines are needed.
             estimator._wordFeatureExtractor = args.WordFeatureExtractorFactory;
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -22358,16 +22358,19 @@
           "Default": "English"
         },
         {
-          "Name": "UsePredefinedStopWordRemover",
-          "Type": "Bool",
-          "Desc": "Use stop remover or not.",
+          "Name": "StopWordsRemover",
+          "Type": {
+            "Kind": "Component",
+            "ComponentKind": "StopWordsRemover"
+          },
+          "Desc": "Stopwords remover.",
           "Aliases": [
             "remover"
           ],
           "Required": false,
           "SortOrder": 4.0,
           "IsNullable": false,
-          "Default": false
+          "Default": null
         },
         {
           "Name": "TextCase",
diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
@@ -102,7 +102,7 @@ public void TrainSentiment()
             {
                 OutputTokens = true,
                 KeepPunctuations = false,
-                UsePredefinedStopWordRemover = true,
+                StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
                 Norm = TextFeaturizingEstimator.NormFunction.None,
                 CharFeatureExtractor = null,
                 WordFeatureExtractor = null,
diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
@@ -972,7 +972,7 @@ public void EntryPointPipelineEnsembleText()
                 {
                     data = new TextFeaturizingEstimator(Env, "Features", new List<string> { "Text" }, 
                         new TextFeaturizingEstimator.Options { 
-                            UsePredefinedStopWordRemover = true,
+                            StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
                         }).Fit(data).Transform(data);
                 }
                 else

Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ public void TrainSentiment()`
`102`	`102`	`{`
`103`	`103`	`OutputTokens = true,`
`104`	`104`	`KeepPunctuations = false,`
`105`		`- UsePredefinedStopWordRemover = true,`
	`105`	`+ StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),`
`106`	`106`	`Norm = TextFeaturizingEstimator.NormFunction.None,`
`107`	`107`	`CharFeatureExtractor = null,`
`108`	`108`	`WordFeatureExtractor = null,`
Original file line number	Diff line number	Diff line change
`@@ -972,7 +972,7 @@ public void EntryPointPipelineEnsembleText()`
`972`	`972`	`{`
`973`	`973`	`data = new TextFeaturizingEstimator(Env, "Features", new List<string> { "Text" },`
`974`	`974`	`new TextFeaturizingEstimator.Options {`
`975`		`- UsePredefinedStopWordRemover = true,`
	`975`	`+ StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),`
`976`	`976`	`}).Fit(data).Transform(data);`
`977`	`977`	`}`
`978`	`978`	`else`