Skip to content

Commit 5d6a2f6

Browse files
committed
uses OutputTokensColumnName to generate column containing tokens
1 parent f2ed7c7 commit 5d6a2f6

File tree

6 files changed

+23
-46
lines changed

6 files changed

+23
-46
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ public static void Example()
3535
{
3636
KeepPunctuations = false,
3737
KeepNumbers = false,
38-
OutputTokens = true,
38+
OutputTokensColumnName = "OutputTokens",
3939
Language = TextFeaturizingEstimator.Language.English, // supports English, French, German, Dutch, Italian, Spanish, Japanese
4040
}, "SentimentText");
4141

src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -111,8 +111,8 @@ public sealed class Options : TransformInputBase
111111
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep numbers or remove them.", ShortName = "num", SortOrder = 8)]
112112
public bool KeepNumbers = TextNormalizingEstimator.Defaults.KeepNumbers;
113113

114-
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the transformed text tokens as an additional column.", ShortName = "tokens,showtext,showTransformedText", SortOrder = 9)]
115-
public bool OutputTokens;
114+
[Argument(ArgumentType.AtMostOnce, HelpText = "Column containing the transformed text tokens.", ShortName = "OutputTokens,tokens,showtext,showTransformedText", SortOrder = 9)]
115+
public string OutputTokensColumnName;
116116

117117
[Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)]
118118
internal TermLoaderArguments Dictionary;
@@ -225,7 +225,7 @@ private sealed class TransformApplierParams
225225
public readonly bool KeepDiacritics;
226226
public readonly bool KeepPunctuations;
227227
public readonly bool KeepNumbers;
228-
public readonly bool OutputTextTokens;
228+
public readonly string OutputTextTokensColumnName;
229229
public readonly TermLoaderArguments Dictionary;
230230

231231
public StopWordsRemovingEstimator.Language StopwordsLanguage
@@ -252,7 +252,7 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm
252252

253253
// These properties encode the logic needed to determine which transforms to apply.
254254
#region NeededTransforms
255-
public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || OutputTextTokens; } }
255+
public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || !string.IsNullOrEmpty(OutputTextTokensColumnName); } }
256256

257257
public bool NeedsNormalizeTransform
258258
{
@@ -303,7 +303,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
303303
KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
304304
KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
305305
KeepNumbers = parent.OptionalSettings.KeepNumbers;
306-
OutputTextTokens = parent.OptionalSettings.OutputTokens;
306+
OutputTextTokensColumnName = parent.OptionalSettings.OutputTokensColumnName;
307307
Dictionary = parent._dictionary;
308308
}
309309
}
@@ -316,8 +316,6 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
316316

317317
internal const Language DefaultLanguage = Language.English;
318318

319-
private const string TransformedTextColFormat = "{0}_TransformedText";
320-
321319
internal TextFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null)
322320
: this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName })
323321
{
@@ -434,10 +432,10 @@ public ITransformer Fit(IDataView input)
434432
wordFeatureCol = dstCol;
435433
}
436434

437-
if (tparams.OutputTextTokens)
435+
if (!string.IsNullOrEmpty(tparams.OutputTextTokensColumnName))
438436
{
439437
string[] srcCols = wordTokCols ?? textCols;
440-
view = new ColumnConcatenatingTransformer(h, string.Format(TransformedTextColFormat, OutputColumn), srcCols).Transform(view);
438+
view = new ColumnConcatenatingTransformer(h, tparams.OutputTextTokensColumnName, srcCols).Transform(view);
441439
}
442440

443441
if (tparams.CharExtractorFactory != null)
@@ -506,7 +504,7 @@ public ITransformer Fit(IDataView input)
506504
// Otherwise, simply use the slot names, omitting the original source column names
507505
// entirely. For the Concat transform setting the Key == Value of the TaggedColumn
508506
// KVP signals this intent.
509-
Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || tparams.OutputTextTokens);
507+
Contracts.Assert(charFeatureCol != null || wordFeatureCol != null || !string.IsNullOrEmpty(tparams.OutputTextTokensColumnName));
510508
if (charFeatureCol != null)
511509
srcTaggedCols.Add(new KeyValuePair<string, string>(charFeatureCol, charFeatureCol));
512510
else if (wordFeatureCol != null)
@@ -555,9 +553,10 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
555553

556554
result[OutputColumn] = new SchemaShape.Column(OutputColumn, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false,
557555
new SchemaShape(metadata));
558-
if (OptionalSettings.OutputTokens)
556+
557+
if (!string.IsNullOrEmpty(OptionalSettings.OutputTokensColumnName))
559558
{
560-
string name = string.Format(TransformedTextColFormat, OutputColumn);
559+
string name = OptionalSettings.OutputTokensColumnName;
561560
result[name] = new SchemaShape.Column(name, SchemaShape.Column.VectorKind.VariableVector, TextDataViewType.Instance, false);
562561
}
563562

test/BaselineOutput/Common/EntryPoints/core_manifest.json

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22436,18 +22436,19 @@
2243622436
"Default": true
2243722437
},
2243822438
{
22439-
"Name": "OutputTokens",
22440-
"Type": "Bool",
22441-
"Desc": "Whether to output the transformed text tokens as an additional column.",
22439+
"Name": "OutputTokensColumnName",
22440+
"Type": "String",
22441+
"Desc": "Column containing the transformed text tokens.",
2244222442
"Aliases": [
22443+
"OutputTokens",
2244322444
"tokens",
2244422445
"showtext",
2244522446
"showTransformedText"
2244622447
],
2244722448
"Required": false,
2244822449
"SortOrder": 9.0,
2244922450
"IsNullable": false,
22450-
"Default": false
22451+
"Default": null
2245122452
},
2245222453
{
2245322454
"Name": "Dictionary",

test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ public void TrainSentiment()
101101
var loader = mlContext.Data.LoadFromTextFile(_sentimentDataPath, arguments);
102102
var text = mlContext.Transforms.Text.FeaturizeText("WordEmbeddings", new TextFeaturizingEstimator.Options
103103
{
104-
OutputTokens = true,
104+
OutputTokensColumnName = "WordEmbeddings_TransformedText",
105105
KeepPunctuations = false,
106106
UsePredefinedStopWordRemover = true,
107107
Norm = TextFeaturizingEstimator.NormFunction.None,

test/Microsoft.ML.Functional.Tests/Debugging.cs

Lines changed: 4 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -46,20 +46,17 @@ void InspectIntermediatePipelineSteps()
4646
});
4747

4848
// create a training pipeline.
49-
var pipeline =
50-
mlContext.Transforms.Text.TokenizeIntoWords("SentimentTextTokenized", "SentimentText")
51-
.Append(mlContext.Transforms.Text.ApplyWordEmbedding("SentimentEmbeddingFeatures", "SentimentTextTokenized", WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding))
52-
.Append(mlContext.Transforms.Text.FeaturizeText(
49+
var pipeline = mlContext.Transforms.Text.FeaturizeText(
5350
"Features",
5451
new TextFeaturizingEstimator.Options
5552
{
5653
KeepPunctuations = false,
57-
OutputTokens = true,
54+
OutputTokensColumnName = "FeaturizeTextTokens",
5855
CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 },
5956
WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1},
6057
Norm = TextFeaturizingEstimator.NormFunction.None
6158
},
62-
"SentimentTextTokenized"));
59+
"SentimentText");
6360

6461
// Fit the pipeline to the data.
6562
var model = pipeline.Fit(data);
@@ -69,29 +66,9 @@ void InspectIntermediatePipelineSteps()
6966

7067
var preview = transformedData.Preview();
7168

72-
// Embedding Features
73-
var embeddingColumn = transformedData.GetColumn<float[]>(transformedData.Schema["SentimentEmbeddingFeatures"]);
74-
foreach(var embeddinFeatures in embeddingColumn)
75-
{
76-
Assert.Equal(150, embeddinFeatures.Length);
77-
}
78-
79-
// Verify that columns can be inspected.
80-
// Validate the tokens column.
81-
var tokensColumn1 = transformedData.GetColumn<string[]>(transformedData.Schema["SentimentTextTokenized"]);
82-
var expectedTokens1 = new string[3][]
83-
{
84-
new string[] {"I", "love", "ML.NET."},
85-
new string[] {"I", "love", "TLC."},
86-
new string[] {"I", "dislike", "fika."},
87-
};
88-
int j = 0;
89-
foreach (var rowTokens in tokensColumn1)
90-
Assert.Equal(expectedTokens1[j++], rowTokens);
91-
9269
// Verify that columns can be inspected.
9370
// Validate the tokens column.
94-
var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["Features_TransformedText"]);
71+
var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["FeaturizeTextTokens"]);
9572
var expectedTokens = new string[3][]
9673
{
9774
new string[] {"i", "love", "mlnet"},

test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ public void TextFeaturizerWorkout()
4343
.AsDynamic;
4444

4545
var feat = data.MakeNewEstimator()
46-
.Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokens = true, }));
46+
.Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokensColumnName = "Data_TransformedText", }));
4747

4848
TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);
4949

0 commit comments

Comments
 (0)