Skip to content

Scrub text featurizers #2944

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ public sealed class Options : TransformInputBase
[Argument(ArgumentType.Multiple, HelpText = "Use stop remover or not.", ShortName = "remover", SortOrder = 4)]
public bool UsePredefinedStopWordRemover = false;

[Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", ShortName = "case", SortOrder = 5)]
public CaseMode TextCase = TextNormalizingEstimator.Defaults.Mode;
[Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", Name="TextCase", ShortName = "case", SortOrder = 5)]
public CaseMode CaseMode = TextNormalizingEstimator.Defaults.Mode;
Copy link
Member

@abgoswam abgoswam Mar 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CaseMode [](start = 28, length = 8)

Does it make sense to call it TextCaseMode? #WontFix

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is CaseMode inside TextFeaturizingEstimator, so I'd say a "Text" prefix would look a bit redundant.


In reply to: 265302608 [](ancestors = 265302608)


[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep diacritical marks or remove them.", ShortName = "diac", SortOrder = 6)]
public bool KeepDiacritics = TextNormalizingEstimator.Defaults.KeepDiacritics;
Expand Down Expand Up @@ -181,8 +181,8 @@ public WordBagEstimator.Options CharFeatureExtractor
}
}

[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
public NormFunction VectorNormalizer = NormFunction.L2;
[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", Name = "VectorNormalizer", ShortName = "norm", SortOrder = 13)]
public NormFunction Norm = NormFunction.L2;

public Options()
{
Expand All @@ -193,7 +193,7 @@ public Options()

internal readonly string OutputColumn;
private readonly string[] _inputColumns;
internal IReadOnlyCollection<string> InputColumns => _inputColumns.AsReadOnly();
private IReadOnlyCollection<string> InputColumns => _inputColumns.AsReadOnly();
internal Options OptionalSettings { get; }

// These parameters are hardcoded for now.
Expand Down Expand Up @@ -288,13 +288,13 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
{
var host = parent._host;
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.Language));
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase));
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.CaseMode));
WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
VectorNormalizer = parent.OptionalSettings.VectorNormalizer;
VectorNormalizer = parent.OptionalSettings.Norm;
Language = parent.OptionalSettings.Language;
UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover;
TextCase = parent.OptionalSettings.TextCase;
TextCase = parent.OptionalSettings.CaseMode;
KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
KeepNumbers = parent.OptionalSettings.KeepNumbers;
Expand Down Expand Up @@ -545,7 +545,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)

var metadata = new List<SchemaShape.Column>(2);
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextDataViewType.Instance, false));
if (OptionalSettings.VectorNormalizer != NormFunction.None)
if (OptionalSettings.Norm != NormFunction.None)
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BooleanDataViewType.Instance, false));

result[OutputColumn] = new SchemaShape.Column(OutputColumn, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public void TrainSentiment()
OutputTokens = true,
KeepPunctuations = false,
UsePredefinedStopWordRemover = true,
VectorNormalizer = TextFeaturizingEstimator.NormFunction.None,
Norm = TextFeaturizingEstimator.NormFunction.None,
CharFeatureExtractor = null,
WordFeatureExtractor = null,
}, "SentimentText").Fit(loader).Transform(loader);
Expand Down
2 changes: 1 addition & 1 deletion test/Microsoft.ML.Functional.Tests/DataTransformation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ void ExtensibilityModifyTextFeaturization()
{
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
WordFeatureExtractor = new WordBagEstimator.Options(),
VectorNormalizer = TextFeaturizingEstimator.NormFunction.L1
Norm = TextFeaturizingEstimator.NormFunction.L1
}, "SentimentText")
.AppendCacheCheckpoint(mlContext)
.Append(mlContext.BinaryClassification.Trainers.SdcaCalibrated(
Expand Down