Skip to content

Commit ec6ff08

Browse files
authored
Exposed ngram extraction options in TextFeaturizer (#2911)
1 parent 6e1291a commit ec6ff08

File tree

9 files changed

+129
-93
lines changed

9 files changed

+129
-93
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public static void Example()
3636
KeepPunctuations = false,
3737
KeepNumbers = false,
3838
OutputTokens = true,
39-
TextLanguage = TextFeaturizingEstimator.Language.English, // supports English, French, German, Dutch, Italian, Spanish, Japanese
39+
Language = TextFeaturizingEstimator.Language.English, // supports English, French, German, Dutch, Italian, Spanish, Japanese
4040
}, "SentimentText");
4141

4242
// The transformed data for both pipelines.

src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ internal static class TextAnalytics
2121
Desc = TextFeaturizingEstimator.Summary,
2222
UserName = TextFeaturizingEstimator.UserName,
2323
ShortName = TextFeaturizingEstimator.LoaderSignature)]
24-
public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Arguments input)
24+
public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Options input)
2525
{
2626
var h = EntryPointUtils.CheckArgsAndCreateHost(env, "FeaturizeTextEstimator", input);
2727
var xf = TextFeaturizingEstimator.Create(h, input, input.Data);

src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs

+77-79
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
using Microsoft.ML.Runtime;
1717
using Microsoft.ML.Transforms.Text;
1818

19-
[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Arguments), typeof(SignatureDataTransform),
19+
[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Options), typeof(SignatureDataTransform),
2020
TextFeaturizingEstimator.UserName, "TextTransform", TextFeaturizingEstimator.LoaderSignature)]
2121

2222
[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(ITransformer), typeof(TextFeaturizingEstimator), null, typeof(SignatureLoadModel),
@@ -86,12 +86,12 @@ internal bool TryUnparse(StringBuilder sb)
8686
}
8787

8888
/// <summary>
89-
/// This class exposes <see cref="NgramExtractorTransform"/>/<see cref="NgramHashExtractingTransformer"/> arguments.
89+
/// Advanced options for the <see cref="TextFeaturizingEstimator"/>.
9090
/// </summary>
91-
internal sealed class Arguments : TransformInputBase
91+
public sealed class Options : TransformInputBase
9292
{
9393
[Argument(ArgumentType.Required, HelpText = "New column definition (optional form: name:srcs).", Name = "Column", ShortName = "col", SortOrder = 1)]
94-
public Column Columns;
94+
internal Column Columns;
9595

9696
[Argument(ArgumentType.AtMostOnce, HelpText = "Dataset language or 'AutoDetect' to detect language per row.", ShortName = "lang", SortOrder = 3)]
9797
public Language Language = DefaultLanguage;
@@ -115,67 +115,80 @@ internal sealed class Arguments : TransformInputBase
115115
public bool OutputTokens;
116116

117117
[Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)]
118-
public TermLoaderArguments Dictionary;
118+
internal TermLoaderArguments Dictionary;
119119

120120
[TGUI(Label = "Word Gram Extractor")]
121-
[Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)]
122-
public INgramExtractorFactoryFactory WordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments();
123-
124-
[TGUI(Label = "Char Gram Extractor")]
125-
[Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)]
126-
public INgramExtractorFactoryFactory CharFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false };
127-
128-
[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
129-
public NormFunction VectorNormalizer = NormFunction.L2;
130-
}
121+
[Argument(ArgumentType.Multiple, Name = "WordFeatureExtractor", HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)]
122+
internal INgramExtractorFactoryFactory WordFeatureExtractorFactory;
131123

132-
/// <summary>
133-
/// Advanced options for the <see cref="TextFeaturizingEstimator"/>.
134-
/// </summary>
135-
public sealed class Options
136-
{
137-
#pragma warning disable MSML_NoInstanceInitializers // No initializers on instance fields or properties
138-
/// <summary>
139-
/// Dataset language.
140-
/// </summary>
141-
public Language TextLanguage { get; set; } = DefaultLanguage;
142-
/// <summary>
143-
/// Casing used for the text.
144-
/// </summary>
145-
public CaseMode TextCase { get; set; } = CaseMode.Lower;
146-
/// <summary>
147-
/// Whether to keep diacritical marks or remove them.
148-
/// </summary>
149-
public bool KeepDiacritics { get; set; } = false;
150-
/// <summary>
151-
/// Whether to keep punctuation marks or remove them.
152-
/// </summary>
153-
public bool KeepPunctuations { get; set; } = true;
154-
/// <summary>
155-
/// Whether to keep numbers or remove them.
156-
/// </summary>
157-
public bool KeepNumbers { get; set; } = true;
158124
/// <summary>
159-
/// Whether to output the transformed text tokens as an additional column.
125+
/// The underlying state of <see cref="WordFeatureExtractorFactory"/> and <see cref="WordFeatureExtractor"/>.
160126
/// </summary>
161-
public bool OutputTokens { get; set; } = false;
162-
/// <summary>
163-
/// Vector Normalizer to use.
164-
/// </summary>
165-
public NormFunction VectorNormalizer { get; set; } = NormFunction.L2;
127+
private WordBagEstimator.Options _wordFeatureExtractor;
128+
166129
/// <summary>
167-
/// Whether to use stop remover or not.
130+
/// Ngram feature extractor to use for words (WordBag/WordHashBag).
168131
/// </summary>
169-
public bool UseStopRemover { get; set; } = false;
132+
public WordBagEstimator.Options WordFeatureExtractor
133+
{
134+
get { return _wordFeatureExtractor; }
135+
set
136+
{
137+
_wordFeatureExtractor = value;
138+
NgramExtractorTransform.NgramExtractorArguments extractor = null;
139+
if (_wordFeatureExtractor != null)
140+
{
141+
extractor = new NgramExtractorTransform.NgramExtractorArguments();
142+
extractor.NgramLength = _wordFeatureExtractor.NgramLength;
143+
extractor.SkipLength = _wordFeatureExtractor.SkipLength;
144+
extractor.AllLengths = _wordFeatureExtractor.AllLengths;
145+
extractor.MaxNumTerms = _wordFeatureExtractor.MaximumNgramsCount;
146+
extractor.Weighting = _wordFeatureExtractor.Weighting;
147+
}
148+
WordFeatureExtractorFactory = extractor;
149+
}
150+
}
151+
152+
[TGUI(Label = "Char Gram Extractor")]
153+
[Argument(ArgumentType.Multiple, Name = "CharFeatureExtractor", HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)]
154+
internal INgramExtractorFactoryFactory CharFeatureExtractorFactory;
155+
170156
/// <summary>
171-
/// Whether to use char extractor or not.
157+
/// The underlying state of <see cref="CharFeatureExtractorFactory"/> and <see cref="CharFeatureExtractor"/>.
172158
/// </summary>
173-
public bool UseCharExtractor { get; set; } = true;
159+
private WordBagEstimator.Options _charFeatureExtractor;
160+
174161
/// <summary>
175-
/// Whether to use word extractor or not.
162+
/// Ngram feature extractor to use for characters (WordBag/WordHashBag).
176163
/// </summary>
177-
public bool UseWordExtractor { get; set; } = true;
178-
#pragma warning restore MSML_NoInstanceInitializers // No initializers on instance fields or properties
164+
public WordBagEstimator.Options CharFeatureExtractor
165+
{
166+
get { return _charFeatureExtractor; }
167+
set
168+
{
169+
_charFeatureExtractor = value;
170+
NgramExtractorTransform.NgramExtractorArguments extractor = null;
171+
if (_charFeatureExtractor != null)
172+
{
173+
extractor = new NgramExtractorTransform.NgramExtractorArguments();
174+
extractor.NgramLength = _charFeatureExtractor.NgramLength;
175+
extractor.SkipLength = _charFeatureExtractor.SkipLength;
176+
extractor.AllLengths = _charFeatureExtractor.AllLengths;
177+
extractor.MaxNumTerms = _charFeatureExtractor.MaximumNgramsCount;
178+
extractor.Weighting = _charFeatureExtractor.Weighting;
179+
}
180+
CharFeatureExtractorFactory = extractor;
181+
}
182+
}
183+
184+
[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
185+
public NormFunction VectorNormalizer = NormFunction.L2;
186+
187+
public Options()
188+
{
189+
WordFeatureExtractor = new WordBagEstimator.Options();
190+
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false };
191+
}
179192
}
180193

181194
internal readonly string OutputColumn;
@@ -274,13 +287,13 @@ public bool NeedInitialSourceColumnConcatTransform
274287
public TransformApplierParams(TextFeaturizingEstimator parent)
275288
{
276289
var host = parent._host;
277-
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage));
290+
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.Language));
278291
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase));
279292
WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
280293
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
281294
VectorNormalizer = parent.OptionalSettings.VectorNormalizer;
282-
Language = parent.OptionalSettings.TextLanguage;
283-
UsePredefinedStopWordRemover = parent.OptionalSettings.UseStopRemover;
295+
Language = parent.OptionalSettings.Language;
296+
UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover;
284297
TextCase = parent.OptionalSettings.TextCase;
285298
KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
286299
KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
@@ -323,10 +336,9 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable
323336
OptionalSettings = options;
324337

325338
_dictionary = null;
326-
if (OptionalSettings.UseWordExtractor)
327-
_wordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments();
328-
if (OptionalSettings.UseCharExtractor)
329-
_charFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false };
339+
_wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory;
340+
_charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory;
341+
330342
}
331343

332344
/// <summary>
@@ -548,26 +560,12 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
548560
}
549561

550562
// Factory method for SignatureDataTransform.
551-
internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView data)
563+
internal static IDataTransform Create(IHostEnvironment env, Options args, IDataView data)
552564
{
553-
var settings = new Options
554-
{
555-
TextLanguage = args.Language,
556-
TextCase = args.TextCase,
557-
KeepDiacritics = args.KeepDiacritics,
558-
KeepPunctuations = args.KeepPunctuations,
559-
KeepNumbers = args.KeepNumbers,
560-
OutputTokens = args.OutputTokens,
561-
VectorNormalizer = args.VectorNormalizer,
562-
UseStopRemover = args.UsePredefinedStopWordRemover,
563-
UseWordExtractor = args.WordFeatureExtractor != null,
564-
UseCharExtractor = args.CharFeatureExtractor != null,
565-
};
566-
567-
var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, settings);
565+
var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, args);
568566
estimator._dictionary = args.Dictionary;
569-
estimator._wordFeatureExtractor = args.WordFeatureExtractor;
570-
estimator._charFeatureExtractor = args.CharFeatureExtractor;
567+
estimator._wordFeatureExtractor = args.WordFeatureExtractorFactory;
568+
estimator._charFeatureExtractor = args.CharFeatureExtractorFactory;
571569
return estimator.Fit(data).Transform(data) as IDataTransform;
572570
}
573571

src/Microsoft.ML.Transforms/Text/TextNormalizing.cs

-1
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,6 @@ internal static class Defaults
459459
public const bool KeepDiacritics = false;
460460
public const bool KeepPunctuations = true;
461461
public const bool KeepNumbers = true;
462-
463462
}
464463

465464
internal static bool IsColumnTypeValid(DataViewType type) => (type.GetItemType() is TextDataViewType);

src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs

+41
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,47 @@ public sealed class WordBagEstimator : IEstimator<ITransformer>
2626
private readonly int _maxNumTerms;
2727
private readonly NgramExtractingEstimator.WeightingCriteria _weighting;
2828

29+
/// <summary>
30+
/// Options for how the ngrams are extracted.
31+
/// </summary>
32+
public class Options
33+
{
34+
/// <summary>
35+
/// Maximum ngram length.
36+
/// </summary>
37+
public int NgramLength;
38+
39+
/// <summary>
40+
/// Maximum number of tokens to skip when constructing an ngram.
41+
/// </summary>
42+
public int SkipLength;
43+
44+
/// <summary>
45+
/// Whether to store all ngram lengths up to ngramLength, or only ngramLength.
46+
/// </summary>
47+
public bool AllLengths;
48+
49+
/// <summary>
50+
/// The maximum number of grams to store in the dictionary, for each level of ngrams,
51+
/// from 1 (in position 0) up to ngramLength (in position ngramLength-1).
52+
/// </summary>
53+
public int[] MaximumNgramsCount;
54+
55+
/// <summary>
56+
/// The weighting criteria.
57+
/// </summary>
58+
public NgramExtractingEstimator.WeightingCriteria Weighting;
59+
60+
public Options()
61+
{
62+
NgramLength = 1;
63+
SkipLength = NgramExtractingEstimator.Defaults.SkipLength;
64+
AllLengths = NgramExtractingEstimator.Defaults.AllLengths;
65+
MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms };
66+
Weighting = NgramExtractingEstimator.Defaults.Weighting;
67+
}
68+
}
69+
2970
/// <summary>
3071
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
3172
/// and outputs bag of word vector as <paramref name="outputColumnName"/>

test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv

+1-1
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ Transforms.Scorer Turn the predictor model into a transform model Microsoft.ML.E
126126
Transforms.Segregator Un-groups vector columns into sequences of rows, inverse of Group transform Microsoft.ML.Transforms.GroupingOperations Ungroup Microsoft.ML.Transforms.UngroupTransform+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
127127
Transforms.SentimentAnalyzer Uses a pretrained sentiment model to score input strings Microsoft.ML.Transforms.Text.TextAnalytics AnalyzeSentiment Microsoft.ML.Transforms.Text.SentimentAnalyzingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
128128
Transforms.TensorFlowScorer Transforms the data using the TensorFlow model. Microsoft.ML.Transforms.TensorFlowTransformer TensorFlowScorer Microsoft.ML.Transforms.TensorFlowEstimator+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
129-
Transforms.TextFeaturizer A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. Microsoft.ML.Transforms.Text.TextAnalytics TextTransform Microsoft.ML.Transforms.Text.TextFeaturizingEstimator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
129+
Transforms.TextFeaturizer A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. Microsoft.ML.Transforms.Text.TextAnalytics TextTransform Microsoft.ML.Transforms.Text.TextFeaturizingEstimator+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
130130
Transforms.TextToKeyConverter Converts input values (words, numbers, etc.) to index in a dictionary. Microsoft.ML.Transforms.Categorical TextToKey Microsoft.ML.Transforms.ValueToKeyMappingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
131131
Transforms.TrainTestDatasetSplitter Split the dataset into train and test sets Microsoft.ML.EntryPoints.TrainTestSplit Split Microsoft.ML.EntryPoints.TrainTestSplit+Input Microsoft.ML.EntryPoints.TrainTestSplit+Output
132132
Transforms.TreeLeafFeaturizer Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. Microsoft.ML.Data.TreeFeaturize Featurizer Microsoft.ML.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput

test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs

+3-3
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,10 @@ public void TrainSentiment()
9898
{
9999
OutputTokens = true,
100100
KeepPunctuations = false,
101-
UseStopRemover = true,
101+
UsePredefinedStopWordRemover = true,
102102
VectorNormalizer = TextFeaturizingEstimator.NormFunction.None,
103-
UseCharExtractor = false,
104-
UseWordExtractor = false,
103+
CharFeatureExtractor = null,
104+
WordFeatureExtractor = null,
105105
}, "SentimentText").Fit(loader).Transform(loader);
106106

107107
var trans = mlContext.Transforms.Text.ApplyWordEmbedding("Features", "WordEmbeddings_TransformedText",

0 commit comments

Comments
 (0)