Skip to content

Commit d6c4872

Browse files
authored
Scrub text featurizers (#2944)
1 parent 91a8703 commit d6c4872

File tree

3 files changed

+11
-11
lines changed

3 files changed

+11
-11
lines changed

src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs

+9-9
Original file line number | Diff line number | Diff line change
@@ -99,8 +99,8 @@ public sealed class Options : TransformInputBase
9999
[Argument(ArgumentType.Multiple, HelpText = "Use stop remover or not.", ShortName = "remover", SortOrder = 4)]
100100
public bool UsePredefinedStopWordRemover = false;
101101

102-
[Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", ShortName = "case", SortOrder = 5)]
103-
public CaseMode TextCase = TextNormalizingEstimator.Defaults.Mode;
102+
[Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", Name="TextCase", ShortName = "case", SortOrder = 5)]
103+
public CaseMode CaseMode = TextNormalizingEstimator.Defaults.Mode;
104104

105105
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep diacritical marks or remove them.", ShortName = "diac", SortOrder = 6)]
106106
public bool KeepDiacritics = TextNormalizingEstimator.Defaults.KeepDiacritics;
@@ -181,8 +181,8 @@ public WordBagEstimator.Options CharFeatureExtractor
181181
}
182182
}
183183

184-
[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
185-
public NormFunction VectorNormalizer = NormFunction.L2;
184+
[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", Name = "VectorNormalizer", ShortName = "norm", SortOrder = 13)]
185+
public NormFunction Norm = NormFunction.L2;
186186

187187
public Options()
188188
{
@@ -193,7 +193,7 @@ public Options()
193193

194194
internal readonly string OutputColumn;
195195
private readonly string[] _inputColumns;
196-
internal IReadOnlyCollection<string> InputColumns => _inputColumns.AsReadOnly();
196+
private IReadOnlyCollection<string> InputColumns => _inputColumns.AsReadOnly();
197197
internal Options OptionalSettings { get; }
198198

199199
// These parameters are hardcoded for now.
@@ -288,13 +288,13 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
288288
{
289289
var host = parent._host;
290290
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.Language));
291-
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase));
291+
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.CaseMode));
292292
WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
293293
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
294-
VectorNormalizer = parent.OptionalSettings.VectorNormalizer;
294+
VectorNormalizer = parent.OptionalSettings.Norm;
295295
Language = parent.OptionalSettings.Language;
296296
UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover;
297-
TextCase = parent.OptionalSettings.TextCase;
297+
TextCase = parent.OptionalSettings.CaseMode;
298298
KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
299299
KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
300300
KeepNumbers = parent.OptionalSettings.KeepNumbers;
@@ -545,7 +545,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
545545

546546
var metadata = new List<SchemaShape.Column>(2);
547547
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextDataViewType.Instance, false));
548-
if (OptionalSettings.VectorNormalizer != NormFunction.None)
548+
if (OptionalSettings.Norm != NormFunction.None)
549549
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BooleanDataViewType.Instance, false));
550550

551551
result[OutputColumn] = new SchemaShape.Column(OutputColumn, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false,

test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs

+1-1
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ public void TrainSentiment()
9999
OutputTokens = true,
100100
KeepPunctuations = false,
101101
UsePredefinedStopWordRemover = true,
102-
VectorNormalizer = TextFeaturizingEstimator.NormFunction.None,
102+
Norm = TextFeaturizingEstimator.NormFunction.None,
103103
CharFeatureExtractor = null,
104104
WordFeatureExtractor = null,
105105
}, "SentimentText").Fit(loader).Transform(loader);

test/Microsoft.ML.Functional.Tests/DataTransformation.cs

+1-1
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,7 @@ void ExtensibilityModifyTextFeaturization()
140140
{
141141
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
142142
WordFeatureExtractor = new WordBagEstimator.Options(),
143-
VectorNormalizer = TextFeaturizingEstimator.NormFunction.L1
143+
Norm = TextFeaturizingEstimator.NormFunction.L1
144144
}, "SentimentText")
145145
.AppendCacheCheckpoint(mlContext)
146146
.Append(mlContext.BinaryClassification.Trainers.SdcaCalibrated(

0 commit comments

Comments
 (0)