dotnet · zeahmed · Mar 18, 2019 · Mar 14, 2019 · Mar 15, 2019 · Mar 16, 2019
diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
@@ -25,6 +25,7 @@
 namespace Microsoft.ML.Transforms.Text
 {
     using CaseMode = TextNormalizingEstimator.CaseMode;
+    using StopWordsCol = StopWordsRemovingTransformer.Column;
     // A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts
     // of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature
     // integer index mapping through hashing) as an option.
@@ -85,6 +86,27 @@ internal bool TryUnparse(StringBuilder sb)
             }
         }
 
+        /// <summary>
+        /// Defines the different type of stop words remover supported.
+        /// </summary>
+        public interface IStopWordsRemoverOptions { }
+
+        /// <summary>
+        /// Use stop words remover that can removes language-specific list of stop words (most common words) already defined in the system.
+        /// </summary>
+        public sealed class PredefinedStopWordsRemoverOptions : IStopWordsRemoverOptions { }
+
+        /// <summary>
+        /// Use stop words remover that can removes language-specific list of stop words (most common words) already defined in the system.
+        /// </summary>
+        public sealed class CustomStopWordsRemoverOptions : IStopWordsRemoverOptions
+        {
+            /// <summary>
+            /// List of stop words to remove.
+            /// </summary>
+            public string[] StopWords;
 public string[] Stopwords; 
 public string[] Stopwords; 
+        }
+
         /// <summary>
         /// Advanced options for the <see cref="TextFeaturizingEstimator"/>.
         /// </summary>
@@ -96,8 +118,49 @@ public sealed class Options : TransformInputBase
             [Argument(ArgumentType.AtMostOnce, HelpText = "Dataset language or 'AutoDetect' to detect language per row.", ShortName = "lang", SortOrder = 3)]
             public Language Language = DefaultLanguage;
 
-            [Argument(ArgumentType.Multiple, HelpText = "Use stop remover or not.", ShortName = "remover", SortOrder = 4)]
-            public bool UsePredefinedStopWordRemover = false;
+            [Argument(ArgumentType.Multiple, Name = "StopWordsRemover", HelpText = "Stopwords remover.", ShortName = "remover", NullName = "<None>", SortOrder = 4)]
+            internal IStopWordsRemoverFactory StopWordsRemover;
+
+            /// <summary>
+            /// The underlying state of <see cref="StopWordsRemover"/> and <see cref="StopWordsRemoverOptions"/>.
+            /// </summary>
+            private IStopWordsRemoverOptions _stopWordsRemoverOptions;
+
+            /// <summary>
+            /// Option to set type of stop word remover to use.
+            /// The following options are available
+            /// <list type="bullet">
+            ///     <item>
+            ///         <description>The <see cref="PredefinedStopWordsRemoverOptions"/> removes the language specific list of stop words from the input.</description>
+            ///     </item>
+            ///     <item>
+            ///        <description>The <see cref="CustomStopWordsRemoverOptions"/> uses user provided list of stop words.</description>
+            ///     </item>
+            /// </list>
+            /// Setting this to 'null' does not remove stop words from the input.
+            /// </summary>
+            public IStopWordsRemoverOptions StopWordsRemoverOptions
+            {
+                get { return _stopWordsRemoverOptions; }
+                set
+                {
+                    _stopWordsRemoverOptions = value;
+                    IStopWordsRemoverFactory options = null;
+                    if (_stopWordsRemoverOptions != null)
+                    {
+                        if (_stopWordsRemoverOptions is PredefinedStopWordsRemoverOptions)
+                            options = new PredefinedStopWordsRemoverFactory();
+                        else if (_stopWordsRemoverOptions is CustomStopWordsRemoverOptions)
+                        {
+                            options = new CustomStopWordsRemovingTransformer.LoaderArguments()
+                            {
+                                Stopwords = (_stopWordsRemoverOptions as CustomStopWordsRemoverOptions).StopWords
+                            };
+                        }
+                    }
+                    StopWordsRemover = options;
+                }
+            }
 
             [Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", Name="TextCase", ShortName = "case", SortOrder = 5)]
             public CaseMode CaseMode = TextNormalizingEstimator.Defaults.Mode;
@@ -203,6 +266,7 @@ public Options()
 
         // These parameters are hardcoded for now.
         // REVIEW: expose them once sub-transforms are estimators.
+        private IStopWordsRemoverFactory _stopWordsRemover;
         private TermLoaderArguments _dictionary;
         private INgramExtractorFactoryFactory _wordFeatureExtractor;
         private INgramExtractorFactoryFactory _charFeatureExtractor;
@@ -220,7 +284,7 @@ private sealed class TransformApplierParams
 
             public readonly NormFunction Norm;
             public readonly Language Language;
-            public readonly bool UsePredefinedStopWordRemover;
+            public readonly IStopWordsRemoverFactory StopWordsRemover;
             public readonly CaseMode TextCase;
             public readonly bool KeepDiacritics;
             public readonly bool KeepPunctuations;
@@ -252,7 +316,9 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm
 
             // These properties encode the logic needed to determine which transforms to apply.
             #region NeededTransforms
-            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || OutputTextTokens; } }
+            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || NeedsRemoveStopwordsTransform || OutputTextTokens; } }
+
+            public bool NeedsRemoveStopwordsTransform { get { return StopWordsRemover != null; } }
 
             public bool NeedsNormalizeTransform
             {
@@ -298,7 +364,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
                 CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
                 Norm = parent.OptionalSettings.Norm;
                 Language = parent.OptionalSettings.Language;
-                UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover;
+                StopWordsRemover = parent._stopWordsRemover;
                 TextCase = parent.OptionalSettings.CaseMode;
                 KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
                 KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
@@ -340,6 +406,7 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable
             if (options != null)
                 OptionalSettings = options;
 
+            _stopWordsRemover = null;
             _dictionary = null;
             _wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory;
             _charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory;
@@ -402,21 +469,23 @@ public ITransformer Fit(IDataView input)
                 view = new WordTokenizingEstimator(h, xfCols).Fit(view).Transform(view);
             }
 
-            if (tparams.UsePredefinedStopWordRemover)
+            if (tparams.NeedsRemoveStopwordsTransform)
             {
                 Contracts.Assert(wordTokCols != null, "StopWords transform requires that word tokenization has been applied to the input text.");
-                var xfCols = new StopWordsRemovingEstimator.ColumnOptions[wordTokCols.Length];
+                var xfCols = new StopWordsCol[wordTokCols.Length];
                 var dstCols = new string[wordTokCols.Length];
                 for (int i = 0; i < wordTokCols.Length; i++)
                 {
-                    var tempName = GenerateColumnName(view.Schema, wordTokCols[i], "StopWordsRemoverTransform");
-                    var col = new StopWordsRemovingEstimator.ColumnOptions(tempName, wordTokCols[i], tparams.StopwordsLanguage);
-                    dstCols[i] = tempName;
-                    tempCols.Add(tempName);
+                    var col = new StopWordsCol();
+                    col.Source = wordTokCols[i];
+                    col.Name = GenerateColumnName(view.Schema, wordTokCols[i], "StopWordsRemoverTransform");
+                    dstCols[i] = col.Name;
+                    tempCols.Add(col.Name);
+                    col.Language = tparams.StopwordsLanguage;
 
                     xfCols[i] = col;
                 }
-                view = new StopWordsRemovingEstimator(h, xfCols).Fit(view).Transform(view);
+                view = tparams.StopWordsRemover.CreateComponent(h, view, xfCols);
                 wordTokCols = dstCols;
             }
 
@@ -443,7 +512,7 @@ public ITransformer Fit(IDataView input)
             if (tparams.CharExtractorFactory != null)
             {
                 {
-                    var srcCols = tparams.UsePredefinedStopWordRemover ? wordTokCols : textCols;
+                    var srcCols = tparams.NeedsRemoveStopwordsTransform ? wordTokCols : textCols;
                     charTokCols = new string[srcCols.Length];
                     var xfCols = new (string outputColumnName, string inputColumnName)[srcCols.Length];
                     for (int i = 0; i < srcCols.Length; i++)
@@ -568,6 +637,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
         internal static IDataTransform Create(IHostEnvironment env, Options args, IDataView data)
         {
             var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, args);
+            estimator._stopWordsRemover = args.StopWordsRemover;
             estimator._dictionary = args.Dictionary;
             // Review: I don't think the following two lines are needed.
             estimator._wordFeatureExtractor = args.WordFeatureExtractorFactory;

diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -22369,16 +22369,19 @@
           "Default": "English"
         },
         {
-          "Name": "UsePredefinedStopWordRemover",
-          "Type": "Bool",
-          "Desc": "Use stop remover or not.",
+          "Name": "StopWordsRemover",
+          "Type": {
+            "Kind": "Component",
+            "ComponentKind": "StopWordsRemover"
+          },
+          "Desc": "Stopwords remover.",
           "Aliases": [
             "remover"
           ],
           "Required": false,
           "SortOrder": 4.0,
           "IsNullable": false,
-          "Default": false
+          "Default": null
         },
         {
           "Name": "TextCase",

diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
@@ -103,7 +103,7 @@ public void TrainSentiment()
             {
                 OutputTokens = true,
                 KeepPunctuations = false,
-                UsePredefinedStopWordRemover = true,
+                StopWordsRemoverOptions = new TextFeaturizingEstimator.PredefinedStopWordsRemoverOptions(),
                 Norm = TextFeaturizingEstimator.NormFunction.None,
                 CharFeatureExtractor = null,
                 WordFeatureExtractor = null,

diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
@@ -973,7 +973,7 @@ public void EntryPointPipelineEnsembleText()
                 {
                     data = new TextFeaturizingEstimator(Env, "Features", new List<string> { "Text" }, 
                         new TextFeaturizingEstimator.Options { 
-                            UsePredefinedStopWordRemover = true,
+                            StopWordsRemoverOptions = new TextFeaturizingEstimator.PredefinedStopWordsRemoverOptions(),
                         }).Fit(data).Transform(data);
                 }
                 else