Replace the boolean OutputTokens option with OutputTokensColumnName, which names the output column containing the transformed text tokens

abgoswam · abgoswam · commit 5d6a2f6b1cd0 · 2019-03-18T03:39:21.000Z
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs
@@ -35,7 +35,7 @@ public static void Example()
             {
                 KeepPunctuations = false,
                 KeepNumbers = false,
-                OutputTokens = true,
+                OutputTokensColumnName = "OutputTokens",
                 Language = TextFeaturizingEstimator.Language.English, // supports  English, French, German, Dutch, Italian, Spanish, Japanese
             }, "SentimentText");
 
diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
@@ -111,8 +111,8 @@ public sealed class Options : TransformInputBase
             [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep numbers or remove them.", ShortName = "num", SortOrder = 8)]
             public bool KeepNumbers = TextNormalizingEstimator.Defaults.KeepNumbers;
 
-            [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the transformed text tokens as an additional column.", ShortName = "tokens,showtext,showTransformedText", SortOrder = 9)]
-            public bool OutputTokens;
+            [Argument(ArgumentType.AtMostOnce, HelpText = "Column containing the transformed text tokens.", ShortName = "OutputTokens,tokens,showtext,showTransformedText", SortOrder = 9)]
+            public string OutputTokensColumnName;
 
             [Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)]
             internal TermLoaderArguments Dictionary;
@@ -225,7 +225,7 @@ private sealed class TransformApplierParams
             public readonly bool KeepDiacritics;
             public readonly bool KeepPunctuations;
             public readonly bool KeepNumbers;
-            public readonly bool OutputTextTokens;
+            public readonly string OutputTextTokensColumnName;
             public readonly TermLoaderArguments Dictionary;
 
             public StopWordsRemovingEstimator.Language StopwordsLanguage
@@ -252,7 +252,7 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm
 
             // These properties encode the logic needed to determine which transforms to apply.
             #region NeededTransforms
-            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || OutputTextTokens; } }
+            public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || !string.IsNullOrEmpty(OutputTextTokensColumnName); } }
 
             public bool NeedsNormalizeTransform
             {
@@ -303,7 +303,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
                 KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
                 KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
                 KeepNumbers = parent.OptionalSettings.KeepNumbers;
-                OutputTextTokens = parent.OptionalSettings.OutputTokens;
+                OutputTextTokensColumnName = parent.OptionalSettings.OutputTokensColumnName;
                 Dictionary = parent._dictionary;
             }
         }
@@ -316,8 +316,6 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
 
         internal const Language DefaultLanguage = Language.English;
 
-        private const string TransformedTextColFormat = "{0}_TransformedText";
-
         internal TextFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null)
             : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName })
         {
@@ -434,10 +432,10 @@ public ITransformer Fit(IDataView input)
                 wordFeatureCol = dstCol;
             }
 
-            if (tparams.OutputTextTokens)
+            if (!string.IsNullOrEmpty(tparams.OutputTextTokensColumnName))
             {
                 string[] srcCols = wordTokCols ?? textCols;
-                view = new ColumnConcatenatingTransformer(h, string.Format(TransformedTextColFormat, OutputColumn), srcCols).Transform(view);
+                view = new ColumnConcatenatingTransformer(h, tparams.OutputTextTokensColumnName, srcCols).Transform(view);
             }
 
             if (tparams.CharExtractorFactory != null)
@@ -506,7 +504,7 @@ public ITransformer Fit(IDataView input)
                     // Otherwise, simply use the slot names, omitting the original source column names
                     // entirely. For the Concat transform setting the Key == Value of the TaggedColumn
                     // KVP signals this intent.
-                    Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || tparams.OutputTextTokens);
+                    Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || !string.IsNullOrEmpty(tparams.OutputTextTokensColumnName));
                     if (charFeatureCol != null)
                         srcTaggedCols.Add(new KeyValuePair<string, string>(charFeatureCol, charFeatureCol));
                     else if (wordFeatureCol != null)
@@ -555,9 +553,10 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
 
             result[OutputColumn] = new SchemaShape.Column(OutputColumn, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false,
                 new SchemaShape(metadata));
-            if (OptionalSettings.OutputTokens)
+
+            if (!string.IsNullOrEmpty(OptionalSettings.OutputTokensColumnName))
             {
-                string name = string.Format(TransformedTextColFormat, OutputColumn);
+                string name = OptionalSettings.OutputTokensColumnName;
                 result[name] = new SchemaShape.Column(name, SchemaShape.Column.VectorKind.VariableVector, TextDataViewType.Instance, false);
             }
 
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -22436,18 +22436,19 @@
           "Default": true
         },
         {
-          "Name": "OutputTokens",
-          "Type": "Bool",
-          "Desc": "Whether to output the transformed text tokens as an additional column.",
+          "Name": "OutputTokensColumnName",
+          "Type": "String",
+          "Desc": "Column containing the transformed text tokens.",
           "Aliases": [
+            "OutputTokens",
             "tokens",
             "showtext",
             "showTransformedText"
           ],
           "Required": false,
           "SortOrder": 9.0,
           "IsNullable": false,
-          "Default": false
+          "Default": null
         },
         {
           "Name": "Dictionary",
diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
@@ -101,7 +101,7 @@ public void TrainSentiment()
             var loader = mlContext.Data.LoadFromTextFile(_sentimentDataPath, arguments);
             var text = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", new TextFeaturizingEstimator.Options
             {
-                OutputTokens = true,
+                OutputTokensColumnName = "WordEmbeddings_TransformedText",
                 KeepPunctuations = false,
                 UsePredefinedStopWordRemover = true,
                 Norm = TextFeaturizingEstimator.NormFunction.None,
diff --git a/test/Microsoft.ML.Functional.Tests/Debugging.cs b/test/Microsoft.ML.Functional.Tests/Debugging.cs
@@ -46,20 +46,17 @@ void InspectIntermediatePipelineSteps()
                 });
 
             // create a training pipeline.
-            var pipeline =
-                mlContext.Transforms.Text.TokenizeIntoWords("SentimentTextTokenized", "SentimentText")
-                .Append(mlContext.Transforms.Text.ApplyWordEmbedding("SentimentEmbeddingFeatures", "SentimentTextTokenized", WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding))
-                .Append(mlContext.Transforms.Text.FeaturizeText(
+            var pipeline = mlContext.Transforms.Text.FeaturizeText(
                     "Features",
                     new TextFeaturizingEstimator.Options
                     {
                         KeepPunctuations = false,
-                        OutputTokens = true,
+                        OutputTokensColumnName = "FeaturizeTextTokens",
                         CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 },
                         WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1},
                         Norm = TextFeaturizingEstimator.NormFunction.None
                     },
-                    "SentimentTextTokenized"));
+                    "SentimentText");
 
             // Fit the pipeline to the data.
             var model = pipeline.Fit(data);
@@ -69,29 +66,9 @@ void InspectIntermediatePipelineSteps()
 
             var preview = transformedData.Preview();
 
-            // Embedding Features
-            var embeddingColumn = transformedData.GetColumn<float[]>(transformedData.Schema["SentimentEmbeddingFeatures"]);
-            foreach(var embeddinFeatures in embeddingColumn)
-            {
-                Assert.Equal(150, embeddinFeatures.Length);
-            }
-
-            // Verify that columns can be inspected.
-            // Validate the tokens column.
-            var tokensColumn1 = transformedData.GetColumn<string[]>(transformedData.Schema["SentimentTextTokenized"]);
-            var expectedTokens1 = new string[3][]
-            {
-                new string[] {"I", "love", "ML.NET."},
-                new string[] {"I", "love", "TLC."},
-                new string[] {"I", "dislike", "fika."},
-            };
-            int j = 0;
-            foreach (var rowTokens in tokensColumn1)
-                Assert.Equal(expectedTokens1[j++], rowTokens);
-
             // Verify that columns can be inspected.
             // Validate the tokens column.
-            var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["Features_TransformedText"]);
+            var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["FeaturizeTextTokens"]);
             var expectedTokens = new string[3][]
             {
                 new string[] {"i", "love", "mlnet"},
diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
@@ -43,7 +43,7 @@ public void TextFeaturizerWorkout()
                 .AsDynamic;
 
             var feat = data.MakeNewEstimator()
-                 .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokens = true, }));
+                 .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokensColumnName = "Data_TransformedText", }));
 
             TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);
 

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ public static void Example()`
`35`	`35`	`{`
`36`	`36`	`KeepPunctuations = false,`
`37`	`37`	`KeepNumbers = false,`
`38`		`- OutputTokens = true,`
	`38`	`+ OutputTokensColumnName = "OutputTokens",`
`39`	`39`	`Language = TextFeaturizingEstimator.Language.English, // supports English, French, German, Dutch, Italian, Spanish, Japanese`
`40`	`40`	`}, "SentimentText");`
`41`	`41`
Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ public void TrainSentiment()`
`101`	`101`	`var loader = mlContext.Data.LoadFromTextFile(_sentimentDataPath, arguments);`
`102`	`102`	`var text = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", new TextFeaturizingEstimator.Options`
`103`	`103`	`{`
`104`		`- OutputTokens = true,`
	`104`	`+ OutputTokensColumnName = "WordEmbeddings_TransformedText",`
`105`	`105`	`KeepPunctuations = false,`
`106`	`106`	`UsePredefinedStopWordRemover = true,`
`107`	`107`	`Norm = TextFeaturizingEstimator.NormFunction.None,`