OutputTokens option in FeaturizeText API (#2985)

abgoswam · web-flow · commit fbbc2220a23f · 2019-03-19T00:47:46.000Z
* Add example for TokenizeIntoWords and ApplyWordEmbedding

* Tests pass

* Use OutputTokensColumnName to generate the column containing the tokens

* Address review comments
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs
@@ -35,7 +35,7 @@ public static void Example()
             {
                 KeepPunctuations = false,
                 KeepNumbers = false,
-                OutputTokens = true,
+                OutputTokensColumnName = "OutputTokens",
                 StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English }, // supports  English, French, German, Dutch, Italian, Spanish, Japanese
             }, "SentimentText");
 
diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
@@ -163,8 +163,8 @@ public IStopWordsRemoverOptions StopWordsRemoverOptions
             [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep numbers or remove them.", ShortName = "num", SortOrder = 8)]
             public bool KeepNumbers = TextNormalizingEstimator.Defaults.KeepNumbers;
 
-            [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the transformed text tokens as an additional column.", ShortName = "tokens,showtext,showTransformedText", SortOrder = 9)]
-            public bool OutputTokens;
+            [Argument(ArgumentType.AtMostOnce, HelpText = "Column containing the transformed text tokens.", ShortName = "tokens,showtext,showTransformedText", SortOrder = 9)]
+            public string OutputTokensColumnName;
 
             [Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)]
             internal TermLoaderArguments Dictionary;
@@ -278,7 +278,7 @@ private sealed class TransformApplierParams
             public readonly bool KeepDiacritics;
             public readonly bool KeepPunctuations;
             public readonly bool KeepNumbers;
-            public readonly bool OutputTextTokens;
+            public readonly string OutputTextTokensColumnName;
             public readonly TermLoaderArguments Dictionary;
 
             public StopWordsRemovingEstimator.Language StopwordsLanguage
@@ -305,7 +305,7 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm
 
             // These properties encode the logic needed to determine which transforms to apply.
             #region NeededTransforms
-            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || NeedsRemoveStopwordsTransform || OutputTextTokens; } }
+            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || NeedsRemoveStopwordsTransform || !string.IsNullOrEmpty(OutputTextTokensColumnName); } }
 
             public bool NeedsRemoveStopwordsTransform { get { return StopWordsRemover != null; } }
 
@@ -358,7 +358,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
                 KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
                 KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
                 KeepNumbers = parent.OptionalSettings.KeepNumbers;
-                OutputTextTokens = parent.OptionalSettings.OutputTokens;
+                OutputTextTokensColumnName = parent.OptionalSettings.OutputTokensColumnName;
                 Dictionary = parent._dictionary;
             }
         }
@@ -371,8 +371,6 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
 
         internal const Language DefaultLanguage = Language.English;
 
-        private const string TransformedTextColFormat = "{0}_TransformedText";
-
         internal TextFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null)
             : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName })
         {
@@ -492,10 +490,10 @@ public ITransformer Fit(IDataView input)
                 wordFeatureCol = dstCol;
             }
 
-            if (tparams.OutputTextTokens)
+            if (!string.IsNullOrEmpty(tparams.OutputTextTokensColumnName))
             {
                 string[] srcCols = wordTokCols ?? textCols;
-                view = new ColumnConcatenatingTransformer(h, string.Format(TransformedTextColFormat, OutputColumn), srcCols).Transform(view);
+                view = new ColumnConcatenatingTransformer(h, tparams.OutputTextTokensColumnName, srcCols).Transform(view);
             }
 
             if (tparams.CharExtractorFactory != null)
@@ -564,7 +562,7 @@ public ITransformer Fit(IDataView input)
                     // Otherwise, simply use the slot names, omitting the original source column names
                     // entirely. For the Concat transform setting the Key == Value of the TaggedColumn
                     // KVP signals this intent.
-                    Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || tparams.OutputTextTokens);
+                    Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || !string.IsNullOrEmpty(tparams.OutputTextTokensColumnName));
                     if (charFeatureCol != null)
                         srcTaggedCols.Add(new KeyValuePair<string, string>(charFeatureCol, charFeatureCol));
                     else if (wordFeatureCol != null)
@@ -613,9 +611,10 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
 
             result[OutputColumn] = new SchemaShape.Column(OutputColumn, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false,
                 new SchemaShape(metadata));
-            if (OptionalSettings.OutputTokens)
+
+            if (!string.IsNullOrEmpty(OptionalSettings.OutputTokensColumnName))
             {
-                string name = string.Format(TransformedTextColFormat, OutputColumn);
+                string name = OptionalSettings.OutputTokensColumnName;
                 result[name] = new SchemaShape.Column(name, SchemaShape.Column.VectorKind.VariableVector, TextDataViewType.Instance, false);
             }
 
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -22428,9 +22428,9 @@
           "Default": true
         },
         {
-          "Name": "OutputTokens",
-          "Type": "Bool",
-          "Desc": "Whether to output the transformed text tokens as an additional column.",
+          "Name": "OutputTokensColumnName",
+          "Type": "String",
+          "Desc": "Column containing the transformed text tokens.",
           "Aliases": [
             "tokens",
             "showtext",
@@ -22439,7 +22439,7 @@
           "Required": false,
           "SortOrder": 9.0,
           "IsNullable": false,
-          "Default": false
+          "Default": null
         },
         {
           "Name": "Dictionary",
diff --git a/test/BaselineOutput/SingleDebug/Text/featurized.tsv b/test/BaselineOutput/SingleDebug/Text/featurized.tsv
diff --git a/test/BaselineOutput/SingleRelease/Text/featurized.tsv b/test/BaselineOutput/SingleRelease/Text/featurized.tsv
diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
@@ -100,7 +100,7 @@ public void TrainSentiment()
             var loader = mlContext.Data.LoadFromTextFile(_sentimentDataPath, arguments);
             var text = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", new TextFeaturizingEstimator.Options
             {
-                OutputTokens = true,
+                OutputTokensColumnName = "WordEmbeddings_TransformedText",
                 KeepPunctuations = false,
                 StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
                 Norm = TextFeaturizingEstimator.NormFunction.None,
diff --git a/test/Microsoft.ML.Functional.Tests/Debugging.cs b/test/Microsoft.ML.Functional.Tests/Debugging.cs
@@ -47,16 +47,16 @@ void InspectIntermediatePipelineSteps()
 
             // create a training pipeline.
             var pipeline = mlContext.Transforms.Text.FeaturizeText(
-                "Features",
-                new TextFeaturizingEstimator.Options
-                {
-                    KeepPunctuations = false,
-                    OutputTokens = true,
-                    CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 },
-                    WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1},
-                    Norm = TextFeaturizingEstimator.NormFunction.None
-                },
-                "SentimentText");
+                    "Features",
+                    new TextFeaturizingEstimator.Options
+                    {
+                        KeepPunctuations = false,
+                        OutputTokensColumnName = "FeaturizeTextTokens",
+                        CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 },
+                        WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1},
+                        Norm = TextFeaturizingEstimator.NormFunction.None
+                    },
+                    "SentimentText");
 
             // Fit the pipeline to the data.
             var model = pipeline.Fit(data);
@@ -68,7 +68,7 @@ void InspectIntermediatePipelineSteps()
 
             // Verify that columns can be inspected.
             // Validate the tokens column.
-            var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["Features_TransformedText"]);
+            var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["FeaturizeTextTokens"]);
             var expectedTokens = new string[3][]
             {
                 new string[] {"i", "love", "mlnet"},
diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
@@ -42,7 +42,7 @@ public void TextFeaturizerWorkout()
                 .AsDynamic;
 
             var feat = data.MakeNewEstimator()
-                 .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokens = true, }));
+                 .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens", }));
 
             TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);
 
@@ -51,7 +51,7 @@ public void TextFeaturizerWorkout()
             {
                 var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
                 var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data).AsDynamic, 4);
-                savedData = ML.Transforms.SelectColumns("Data", "Data_TransformedText").Fit(savedData).Transform(savedData);
+                savedData = ML.Transforms.SelectColumns("Data", "OutputTokens").Fit(savedData).Transform(savedData);
 
                 using (var fs = File.Create(outputPath))
                     DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ public static void Example()`
`35`	`35`	`{`
`36`	`36`	`KeepPunctuations = false,`
`37`	`37`	`KeepNumbers = false,`
`38`		`- OutputTokens = true,`
	`38`	`+ OutputTokensColumnName = "OutputTokens",`
`39`	`39`	`StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English }, // supports English, French, German, Dutch, Italian, Spanish, Japanese`
`40`	`40`	`}, "SentimentText");`
`41`	`41`
Original file line number	Diff line number	Diff line change
`@@ -100,7 +100,7 @@ public void TrainSentiment()`
`100`	`100`	`var loader = mlContext.Data.LoadFromTextFile(_sentimentDataPath, arguments);`
`101`	`101`	`var text = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", new TextFeaturizingEstimator.Options`
`102`	`102`	`{`
`103`		`- OutputTokens = true,`
	`103`	`+ OutputTokensColumnName = "WordEmbeddings_TransformedText",`
`104`	`104`	`KeepPunctuations = false,`
`105`	`105`	`StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),`
`106`	`106`	`Norm = TextFeaturizingEstimator.NormFunction.None,`