Exposed ngram extraction options in TextFeaturizer (#2911)

zeahmed · web-flow · commit ec6ff086edf6 · 2019-03-12T17:08:36.000-07:00
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs
@@ -36,7 +36,7 @@ public static void Example()
                 KeepPunctuations = false,
                 KeepNumbers = false,
                 OutputTokens = true,
-                TextLanguage = TextFeaturizingEstimator.Language.English, // supports  English, French, German, Dutch, Italian, Spanish, Japanese
+                Language = TextFeaturizingEstimator.Language.English, // supports  English, French, German, Dutch, Italian, Spanish, Japanese
             }, "SentimentText");
 
             // The transformed data for both pipelines.
diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
@@ -21,7 +21,7 @@ internal static class TextAnalytics
             Desc = TextFeaturizingEstimator.Summary,
             UserName = TextFeaturizingEstimator.UserName,
             ShortName = TextFeaturizingEstimator.LoaderSignature)]
-        public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Arguments input)
+        public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Options input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "FeaturizeTextEstimator", input);
             var xf = TextFeaturizingEstimator.Create(h, input, input.Data);
diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
@@ -16,7 +16,7 @@
 using Microsoft.ML.Runtime;
 using Microsoft.ML.Transforms.Text;
 
-[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Arguments), typeof(SignatureDataTransform),
+[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Options), typeof(SignatureDataTransform),
     TextFeaturizingEstimator.UserName, "TextTransform", TextFeaturizingEstimator.LoaderSignature)]
 
 [assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(ITransformer), typeof(TextFeaturizingEstimator), null, typeof(SignatureLoadModel),
@@ -86,12 +86,12 @@ internal bool TryUnparse(StringBuilder sb)
         }
 
         /// <summary>
-        /// This class exposes <see cref="NgramExtractorTransform"/>/<see cref="NgramHashExtractingTransformer"/> arguments.
+        /// Advanced options for the <see cref="TextFeaturizingEstimator"/>.
         /// </summary>
-        internal sealed class Arguments : TransformInputBase
+        public sealed class Options : TransformInputBase
         {
             [Argument(ArgumentType.Required, HelpText = "New column definition (optional form: name:srcs).", Name = "Column", ShortName = "col", SortOrder = 1)]
-            public Column Columns;
+            internal Column Columns;
 
             [Argument(ArgumentType.AtMostOnce, HelpText = "Dataset language or 'AutoDetect' to detect language per row.", ShortName = "lang", SortOrder = 3)]
             public Language Language = DefaultLanguage;
@@ -115,67 +115,80 @@ internal sealed class Arguments : TransformInputBase
             public bool OutputTokens;
 
             [Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)]
-            public TermLoaderArguments Dictionary;
+            internal TermLoaderArguments Dictionary;
 
             [TGUI(Label = "Word Gram Extractor")]
-            [Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)]
-            public INgramExtractorFactoryFactory WordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments();
-
-            [TGUI(Label = "Char Gram Extractor")]
-            [Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)]
-            public INgramExtractorFactoryFactory CharFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false };
-
-            [Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
-            public NormFunction VectorNormalizer = NormFunction.L2;
-        }
+            [Argument(ArgumentType.Multiple, Name = "WordFeatureExtractor", HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)]
+            internal INgramExtractorFactoryFactory WordFeatureExtractorFactory;
 
-        /// <summary>
-        /// Advanced options for the <see cref="TextFeaturizingEstimator"/>.
-        /// </summary>
-        public sealed class Options
-        {
-#pragma warning disable MSML_NoInstanceInitializers // No initializers on instance fields or properties
-            /// <summary>
-            /// Dataset language.
-            /// </summary>
-            public Language TextLanguage { get; set; } = DefaultLanguage;
-            /// <summary>
-            /// Casing used for the text.
-            /// </summary>
-            public CaseMode TextCase { get; set; } = CaseMode.Lower;
-            /// <summary>
-            /// Whether to keep diacritical marks or remove them.
-            /// </summary>
-            public bool KeepDiacritics { get; set; } = false;
-            /// <summary>
-            /// Whether to keep punctuation marks or remove them.
-            /// </summary>
-            public bool KeepPunctuations { get; set; } = true;
-            /// <summary>
-            /// Whether to keep numbers or remove them.
-            /// </summary>
-            public bool KeepNumbers { get; set; } = true;
             /// <summary>
-            /// Whether to output the transformed text tokens as an additional column.
+            /// The underlying state of <see cref="WordFeatureExtractorFactory"/> and <see cref="WordFeatureExtractor"/>.
             /// </summary>
-            public bool OutputTokens { get; set; } = false;
-            /// <summary>
-            /// Vector Normalizer to use.
-            /// </summary>
-            public NormFunction VectorNormalizer { get; set; } = NormFunction.L2;
+            private WordBagEstimator.Options _wordFeatureExtractor;
+
             /// <summary>
-            /// Whether to use stop remover or not.
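+            /// Ngram feature extractor to use for words (WordBag/WordHashBag). Set to <see langword="null"/> to turn word ngram extraction off; settings are captured when the property is assigned, so assign it again after changing fields on the options object.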
             /// </summary>
-            public bool UseStopRemover { get; set; } = false;
+            public WordBagEstimator.Options WordFeatureExtractor
+            {
+                get { return _wordFeatureExtractor; }
+                set
+                {
+                    _wordFeatureExtractor = value;
+                    NgramExtractorTransform.NgramExtractorArguments extractor = null; // Stays null when value is null, leaving the factory unset.
+                    if (_wordFeatureExtractor != null)
+                    {
+                        extractor = new NgramExtractorTransform.NgramExtractorArguments();
+                        extractor.NgramLength = _wordFeatureExtractor.NgramLength;
+                        extractor.SkipLength = _wordFeatureExtractor.SkipLength;
+                        extractor.AllLengths = _wordFeatureExtractor.AllLengths;
+                        extractor.MaxNumTerms = _wordFeatureExtractor.MaximumNgramsCount; // Same array instance is assigned; it is not cloned.
+                        extractor.Weighting = _wordFeatureExtractor.Weighting;
+                    }
+                    WordFeatureExtractorFactory = extractor; // Keeps the internal factory field in step with the public options.
+                }
+            }
+
+            [TGUI(Label = "Char Gram Extractor")]
+            [Argument(ArgumentType.Multiple, Name = "CharFeatureExtractor", HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)]
+            internal INgramExtractorFactoryFactory CharFeatureExtractorFactory;
+
             /// <summary>
-            /// Whether to use char extractor or not.
+            /// The underlying state of <see cref="CharFeatureExtractorFactory"/> and <see cref="CharFeatureExtractor"/>.
             /// </summary>
-            public bool UseCharExtractor { get; set; } = true;
+            private WordBagEstimator.Options _charFeatureExtractor;
+
             /// <summary>
-            /// Whether to use word extractor or not.
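+            /// Ngram feature extractor to use for characters (WordBag/WordHashBag). Set to <see langword="null"/> to turn character ngram extraction off; settings are captured when the property is assigned, so assign it again after changing fields on the options object.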
             /// </summary>
-            public bool UseWordExtractor { get; set; } = true;
-#pragma warning restore MSML_NoInstanceInitializers // No initializers on instance fields or properties
+            public WordBagEstimator.Options CharFeatureExtractor
+            {
+                get { return _charFeatureExtractor; }
+                set
+                {
+                    _charFeatureExtractor = value;
+                    NgramExtractorTransform.NgramExtractorArguments extractor = null; // Stays null when value is null, leaving the factory unset.
+                    if (_charFeatureExtractor != null)
+                    {
+                        extractor = new NgramExtractorTransform.NgramExtractorArguments();
+                        extractor.NgramLength = _charFeatureExtractor.NgramLength;
+                        extractor.SkipLength = _charFeatureExtractor.SkipLength;
+                        extractor.AllLengths = _charFeatureExtractor.AllLengths;
+                        extractor.MaxNumTerms = _charFeatureExtractor.MaximumNgramsCount; // Same array instance is assigned; it is not cloned.
+                        extractor.Weighting = _charFeatureExtractor.Weighting;
+                    }
+                    CharFeatureExtractorFactory = extractor; // Keeps the internal factory field in step with the public options.
+                }
+            }
+
+            [Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
+            public NormFunction VectorNormalizer = NormFunction.L2;
+
+            public Options() // Defaults are set here rather than with field initializers.
+            {
+                WordFeatureExtractor = new WordBagEstimator.Options(); // Assigned through the property so WordFeatureExtractorFactory is populated as well.
+                CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false }; // Character ngrams default to length 3 only.
+            }
         }
 
         internal readonly string OutputColumn;
@@ -274,13 +287,13 @@ public bool NeedInitialSourceColumnConcatTransform
             public TransformApplierParams(TextFeaturizingEstimator parent)
             {
                 var host = parent._host;
-                host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage));
+                host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.Language));
                 host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase));
                 WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
                 CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
                 VectorNormalizer = parent.OptionalSettings.VectorNormalizer;
-                Language = parent.OptionalSettings.TextLanguage;
-                UsePredefinedStopWordRemover = parent.OptionalSettings.UseStopRemover;
+                Language = parent.OptionalSettings.Language;
+                UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover;
                 TextCase = parent.OptionalSettings.TextCase;
                 KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
                 KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
@@ -323,10 +336,9 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable
                 OptionalSettings = options;
 
             _dictionary = null;
-            if (OptionalSettings.UseWordExtractor)
-                _wordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments();
-            if (OptionalSettings.UseCharExtractor)
-                _charFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false };
+            _wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory;
+            _charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory;
+
         }
 
         /// <summary>
@@ -548,26 +560,12 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
         }
 
         // Factory method for SignatureDataTransform.
-        internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView data)
+        internal static IDataTransform Create(IHostEnvironment env, Options args, IDataView data)
         {
-            var settings = new Options
-            {
-                TextLanguage = args.Language,
-                TextCase = args.TextCase,
-                KeepDiacritics = args.KeepDiacritics,
-                KeepPunctuations = args.KeepPunctuations,
-                KeepNumbers = args.KeepNumbers,
-                OutputTokens = args.OutputTokens,
-                VectorNormalizer = args.VectorNormalizer,
-                UseStopRemover = args.UsePredefinedStopWordRemover,
-                UseWordExtractor = args.WordFeatureExtractor != null,
-                UseCharExtractor = args.CharFeatureExtractor != null,
-            };
-
-            var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, settings);
+            var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, args);
             estimator._dictionary = args.Dictionary;
-            estimator._wordFeatureExtractor = args.WordFeatureExtractor;
-            estimator._charFeatureExtractor = args.CharFeatureExtractor;
+            estimator._wordFeatureExtractor = args.WordFeatureExtractorFactory;
+            estimator._charFeatureExtractor = args.CharFeatureExtractorFactory;
             return estimator.Fit(data).Transform(data) as IDataTransform;
         }
 
diff --git a/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs b/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs
@@ -459,7 +459,6 @@ internal static class Defaults
             public const bool KeepDiacritics = false;
             public const bool KeepPunctuations = true;
             public const bool KeepNumbers = true;
-
         }
 
         internal static bool IsColumnTypeValid(DataViewType type) => (type.GetItemType() is TextDataViewType);
diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs
@@ -26,6 +26,47 @@ public sealed class WordBagEstimator : IEstimator<ITransformer>
         private readonly int _maxNumTerms;
         private readonly NgramExtractingEstimator.WeightingCriteria _weighting;
 
+        /// <summary>
+        /// Options controlling how ngrams are extracted; every field starts at the default assigned in the constructor.
+        /// </summary>
+        public class Options
+        {
+            /// <summary>
+            /// Maximum ngram length. Defaults to 1.
+            /// </summary>
+            public int NgramLength;
+
+            /// <summary>
+            /// Maximum number of tokens to skip when constructing an ngram.
+            /// </summary>
+            public int SkipLength;
+
+            /// <summary>
+            /// Whether to store all ngram lengths up to <see cref="NgramLength"/>, or only <see cref="NgramLength"/>.
+            /// </summary>
+            public bool AllLengths;
+
+            /// <summary>
+            /// The maximum number of grams to store in the dictionary, for each level of ngrams,
+            /// from 1 (in position 0) up to <see cref="NgramLength"/> (in position NgramLength-1). Defaults to a single-element array.
+            /// </summary>
+            public int[] MaximumNgramsCount;
+
+            /// <summary>
+            /// The weighting criteria.
+            /// </summary>
+            public NgramExtractingEstimator.WeightingCriteria Weighting;
+
+            public Options() // Defaults are assigned here instead of through field initializers.
+            {
+                NgramLength = 1; // Single-token ngrams unless the caller overrides it.
+                SkipLength = NgramExtractingEstimator.Defaults.SkipLength;
+                AllLengths = NgramExtractingEstimator.Defaults.AllLengths;
+                MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms }; // One entry, matching the default NgramLength of 1.
+                Weighting = NgramExtractingEstimator.Defaults.Weighting;
+            }
+        }
+
         /// <summary>
         /// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
         /// and outputs bag of word vector as <paramref name="outputColumnName"/>
diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
@@ -126,7 +126,7 @@ Transforms.Scorer	Turn the predictor model into a transform model	Microsoft.ML.E
 Transforms.Segregator	Un-groups vector columns into sequences of rows, inverse of Group transform	Microsoft.ML.Transforms.GroupingOperations	Ungroup	Microsoft.ML.Transforms.UngroupTransform+Options	Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
 Transforms.SentimentAnalyzer	Uses a pretrained sentiment model to score input strings	Microsoft.ML.Transforms.Text.TextAnalytics	AnalyzeSentiment	Microsoft.ML.Transforms.Text.SentimentAnalyzingTransformer+Arguments	Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
 Transforms.TensorFlowScorer	Transforms the data using the TensorFlow model.	Microsoft.ML.Transforms.TensorFlowTransformer	TensorFlowScorer	Microsoft.ML.Transforms.TensorFlowEstimator+Options	Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
-Transforms.TextFeaturizer	A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.	Microsoft.ML.Transforms.Text.TextAnalytics	TextTransform	Microsoft.ML.Transforms.Text.TextFeaturizingEstimator+Arguments	Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
+Transforms.TextFeaturizer	A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.	Microsoft.ML.Transforms.Text.TextAnalytics	TextTransform	Microsoft.ML.Transforms.Text.TextFeaturizingEstimator+Options	Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
 Transforms.TextToKeyConverter	Converts input values (words, numbers, etc.) to index in a dictionary.	Microsoft.ML.Transforms.Categorical	TextToKey	Microsoft.ML.Transforms.ValueToKeyMappingTransformer+Options	Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
 Transforms.TrainTestDatasetSplitter	Split the dataset into train and test sets	Microsoft.ML.EntryPoints.TrainTestSplit	Split	Microsoft.ML.EntryPoints.TrainTestSplit+Input	Microsoft.ML.EntryPoints.TrainTestSplit+Output
 Transforms.TreeLeafFeaturizer	Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.	Microsoft.ML.Data.TreeFeaturize	Featurizer	Microsoft.ML.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint	Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
@@ -98,10 +98,10 @@ public void TrainSentiment()
             {
                 OutputTokens = true,
                 KeepPunctuations = false,
-                UseStopRemover = true,
+                UsePredefinedStopWordRemover = true,
                 VectorNormalizer = TextFeaturizingEstimator.NormFunction.None,
-                UseCharExtractor = false,
-                UseWordExtractor = false,
+                CharFeatureExtractor = null,
+                WordFeatureExtractor = null,
             }, "SentimentText").Fit(loader).Transform(loader);
 
             var trans = mlContext.Transforms.Text.ApplyWordEmbedding("Features", "WordEmbeddings_TransformedText",
diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
diff --git a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@ internal static class TextAnalytics`
`21`	`21`	`Desc = TextFeaturizingEstimator.Summary,`
`22`	`22`	`UserName = TextFeaturizingEstimator.UserName,`
`23`	`23`	`ShortName = TextFeaturizingEstimator.LoaderSignature)]`
`24`		`- public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Arguments input)`
	`24`	`+ public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Options input)`
`25`	`25`	`{`
`26`	`26`	`var h = EntryPointUtils.CheckArgsAndCreateHost(env, "FeaturizeTextEstimator", input);`
`27`	`27`	`var xf = TextFeaturizingEstimator.Create(h, input, input.Data);`
Original file line number	Diff line number	Diff line change
`@@ -459,7 +459,6 @@ internal static class Defaults`
`459`	`459`	`public const bool KeepDiacritics = false;`
`460`	`460`	`public const bool KeepPunctuations = true;`
`461`	`461`	`public const bool KeepNumbers = true;`
`462`		`-`
`463`	`462`	`}`
`464`	`463`
`465`	`464`	`internal static bool IsColumnTypeValid(DataViewType type) => (type.GetItemType() is TextDataViewType);`