Skip to content

Commit fbbc222

Browse files
authored
OutputTokens option in FeaturizeText API (#2985)
* example for TokenizeIntoWords and ApplyWordEmbedding
* test passes
* uses OutputTokensColumnName to generate column containing tokens
* taking care of review comments
1 parent 02524a7 commit fbbc222

File tree

8 files changed

+32 -33 lines changed

8 files changed

+32 -33 lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ public static void Example()
3535
{
3636
KeepPunctuations = false,
3737
KeepNumbers = false,
38-
OutputTokens = true,
38+
OutputTokensColumnName = "OutputTokens",
3939
StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English }, // supports English, French, German, Dutch, Italian, Spanish, Japanese
4040
}, "SentimentText");
4141

src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,8 @@ public IStopWordsRemoverOptions StopWordsRemoverOptions
163163
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep numbers or remove them.", ShortName = "num", SortOrder = 8)]
164164
public bool KeepNumbers = TextNormalizingEstimator.Defaults.KeepNumbers;
165165

166-
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the transformed text tokens as an additional column.", ShortName = "tokens,showtext,showTransformedText", SortOrder = 9)]
167-
public bool OutputTokens;
166+
[Argument(ArgumentType.AtMostOnce, HelpText = "Column containing the transformed text tokens.", ShortName = "tokens,showtext,showTransformedText", SortOrder = 9)]
167+
public string OutputTokensColumnName;
168168

169169
[Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)]
170170
internal TermLoaderArguments Dictionary;
@@ -278,7 +278,7 @@ private sealed class TransformApplierParams
278278
public readonly bool KeepDiacritics;
279279
public readonly bool KeepPunctuations;
280280
public readonly bool KeepNumbers;
281-
public readonly bool OutputTextTokens;
281+
public readonly string OutputTextTokensColumnName;
282282
public readonly TermLoaderArguments Dictionary;
283283

284284
public StopWordsRemovingEstimator.Language StopwordsLanguage
@@ -305,7 +305,7 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm
305305

306306
// These properties encode the logic needed to determine which transforms to apply.
307307
#region NeededTransforms
308-
public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || NeedsRemoveStopwordsTransform || OutputTextTokens; } }
308+
public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || NeedsRemoveStopwordsTransform || !string.IsNullOrEmpty(OutputTextTokensColumnName); } }
309309

310310
public bool NeedsRemoveStopwordsTransform { get { return StopWordsRemover != null; } }
311311

@@ -358,7 +358,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
358358
KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
359359
KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
360360
KeepNumbers = parent.OptionalSettings.KeepNumbers;
361-
OutputTextTokens = parent.OptionalSettings.OutputTokens;
361+
OutputTextTokensColumnName = parent.OptionalSettings.OutputTokensColumnName;
362362
Dictionary = parent._dictionary;
363363
}
364364
}
@@ -371,8 +371,6 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
371371

372372
internal const Language DefaultLanguage = Language.English;
373373

374-
private const string TransformedTextColFormat = "{0}_TransformedText";
375-
376374
internal TextFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null)
377375
: this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName })
378376
{
@@ -492,10 +490,10 @@ public ITransformer Fit(IDataView input)
492490
wordFeatureCol = dstCol;
493491
}
494492

495-
if (tparams.OutputTextTokens)
493+
if (!string.IsNullOrEmpty(tparams.OutputTextTokensColumnName))
496494
{
497495
string[] srcCols = wordTokCols ?? textCols;
498-
view = new ColumnConcatenatingTransformer(h, string.Format(TransformedTextColFormat, OutputColumn), srcCols).Transform(view);
496+
view = new ColumnConcatenatingTransformer(h, tparams.OutputTextTokensColumnName, srcCols).Transform(view);
499497
}
500498

501499
if (tparams.CharExtractorFactory != null)
@@ -564,7 +562,7 @@ public ITransformer Fit(IDataView input)
564562
// Otherwise, simply use the slot names, omitting the original source column names
565563
// entirely. For the Concat transform setting the Key == Value of the TaggedColumn
566564
// KVP signals this intent.
567-
Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || tparams.OutputTextTokens);
565+
Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || !string.IsNullOrEmpty(tparams.OutputTextTokensColumnName));
568566
if (charFeatureCol != null)
569567
srcTaggedCols.Add(new KeyValuePair<string, string>(charFeatureCol, charFeatureCol));
570568
else if (wordFeatureCol != null)
@@ -613,9 +611,10 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
613611

614612
result[OutputColumn] = new SchemaShape.Column(OutputColumn, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false,
615613
new SchemaShape(metadata));
616-
if (OptionalSettings.OutputTokens)
614+
615+
if (!string.IsNullOrEmpty(OptionalSettings.OutputTokensColumnName))
617616
{
618-
string name = string.Format(TransformedTextColFormat, OutputColumn);
617+
string name = OptionalSettings.OutputTokensColumnName;
619618
result[name] = new SchemaShape.Column(name, SchemaShape.Column.VectorKind.VariableVector, TextDataViewType.Instance, false);
620619
}
621620

test/BaselineOutput/Common/EntryPoints/core_manifest.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22428,9 +22428,9 @@
2242822428
"Default": true
2242922429
},
2243022430
{
22431-
"Name": "OutputTokens",
22432-
"Type": "Bool",
22433-
"Desc": "Whether to output the transformed text tokens as an additional column.",
22431+
"Name": "OutputTokensColumnName",
22432+
"Type": "String",
22433+
"Desc": "Column containing the transformed text tokens.",
2243422434
"Aliases": [
2243522435
"tokens",
2243622436
"showtext",
@@ -22439,7 +22439,7 @@
2243922439
"Required": false,
2244022440
"SortOrder": 9.0,
2244122441
"IsNullable": false,
22442-
"Default": false
22442+
"Default": null
2244322443
},
2244422444
{
2244522445
"Name": "Dictionary",

test/BaselineOutput/SingleDebug/Text/featurized.tsv

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

test/BaselineOutput/SingleRelease/Text/featurized.tsv

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ public void TrainSentiment()
100100
var loader = mlContext.Data.LoadFromTextFile(_sentimentDataPath, arguments);
101101
var text = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", new TextFeaturizingEstimator.Options
102102
{
103-
OutputTokens = true,
103+
OutputTokensColumnName = "WordEmbeddings_TransformedText",
104104
KeepPunctuations = false,
105105
StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
106106
Norm = TextFeaturizingEstimator.NormFunction.None,

test/Microsoft.ML.Functional.Tests/Debugging.cs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,16 +47,16 @@ void InspectIntermediatePipelineSteps()
4747

4848
// create a training pipeline.
4949
var pipeline = mlContext.Transforms.Text.FeaturizeText(
50-
"Features",
51-
new TextFeaturizingEstimator.Options
52-
{
53-
KeepPunctuations = false,
54-
OutputTokens = true,
55-
CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 },
56-
WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1},
57-
Norm = TextFeaturizingEstimator.NormFunction.None
58-
},
59-
"SentimentText");
50+
"Features",
51+
new TextFeaturizingEstimator.Options
52+
{
53+
KeepPunctuations = false,
54+
OutputTokensColumnName = "FeaturizeTextTokens",
55+
CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 },
56+
WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1},
57+
Norm = TextFeaturizingEstimator.NormFunction.None
58+
},
59+
"SentimentText");
6060

6161
// Fit the pipeline to the data.
6262
var model = pipeline.Fit(data);
@@ -68,7 +68,7 @@ void InspectIntermediatePipelineSteps()
6868

6969
// Verify that columns can be inspected.
7070
// Validate the tokens column.
71-
var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["Features_TransformedText"]);
71+
var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["FeaturizeTextTokens"]);
7272
var expectedTokens = new string[3][]
7373
{
7474
new string[] {"i", "love", "mlnet"},

test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public void TextFeaturizerWorkout()
4242
.AsDynamic;
4343

4444
var feat = data.MakeNewEstimator()
45-
.Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokens = true, }));
45+
.Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens", }));
4646

4747
TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);
4848

@@ -51,7 +51,7 @@ public void TextFeaturizerWorkout()
5151
{
5252
var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
5353
var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data).AsDynamic, 4);
54-
savedData = ML.Transforms.SelectColumns("Data", "Data_TransformedText").Fit(savedData).Transform(savedData);
54+
savedData = ML.Transforms.SelectColumns("Data", "OutputTokens").Fit(savedData).Transform(savedData);
5555

5656
using (var fs = File.Create(outputPath))
5757
DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);

0 commit comments

Comments
 (0)