
Commit 753f158

Creation of components through MLContext and cleanup (text related transforms) (#2393)
* lda
* WordEmbeddingsExtractingEstimator
* TokenizingByCharactersEstimator
* WordTokenizingEstimator
* WordBagEstimator, WordHashBagEstimator
* NgramExtractingEstimator, NgramHashingEstimator
* StopWordsRemovingEstimator; CustomStopWordsRemovingEstimator; TextNormalizingEstimator
* making Column internal
* review comments regarding adding summary comments
* fixed CookBook + summary comment on InputColumnName
1 parent 8d1021d commit 753f158

31 files changed: +986 −909 lines changed
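Taken together, the diffs below replace direct estimator construction with creation through the MLContext catalog, and move nested types such as ColumnInfo, PretrainedModelKind, and Arguments/Options from the *Transformer classes onto the corresponding *Estimator classes. The following is a minimal, illustrative sketch of the resulting usage pattern, not code from this commit: the MLContext instance, column names, and input data are assumed, and only catalog calls that appear in the diffs below are used.

using Microsoft.ML;
using Microsoft.ML.Transforms.Text;

// Illustrative sketch only: create text transforms through the MLContext catalog
// rather than invoking estimator constructors such as new WordTokenizingEstimator(...).
var ml = new MLContext();

var pipeline = ml.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage")
    // The pretrained-model enum now lives on the estimator type.
    .Append(ml.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
        WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D))
    .Append(ml.Transforms.Conversion.MapValueToKey("Keys", "TokenizedMessage"));

// `data` is an assumed IDataView that already contains a "NormalizedMessage" text column.
// var model = pipeline.Fit(data);
// var transformed = model.Transform(data);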

docs/code/MlNetCookBook.md (+2 −2)

@@ -782,7 +782,7 @@ var pipeline =
     // NLP pipeline 4: word embeddings.
     .Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
     .Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
-        WordEmbeddingsExtractingTransformer.PretrainedModelKind.GloVeTwitter25D));
+        WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D));

 // Let's train our pipeline, and then apply it to the same data.
 // Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train.
@@ -1020,4 +1020,4 @@ newContext.CompositionContainer = new CompositionContainer(new TypeCatalog(typeo
 ITransformer loadedModel;
 using (var fs = File.OpenRead(modelPath))
     loadedModel = newContext.Model.Load(fs);
-```
+```

docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs (+2 −2)

@@ -31,15 +31,15 @@ public static void KeyToValueValueToKey()
     // making use of default settings.
     string defaultColumnName = "DefaultKeys";
     // REVIEW create through the catalog extension
-    var default_pipeline = new WordTokenizingEstimator(ml, "Review")
+    var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
         .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

     // Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
     // We can change the maxNumTerm to limit how many keys will get generated out of the set of words,
     // and condition the order in which they get evaluated by changing sort from the default Occurence (order in which they get encountered)
     // to value/alphabetically.
     string customizedColumnName = "CustomizedKeys";
-    var customized_pipeline = new WordTokenizingEstimator(ml, "Review")
+    var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
         .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

     // The transformed data.

docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs (+1 −1)

@@ -62,7 +62,7 @@ public static void ExtractEmbeddings()
     // Let's apply pretrained word embedding model GloVeTwitter25D.
     // 25D means each word mapped into 25 dimensional space, basically each word represented by 25 float values.
     var gloveWordEmbedding = ml.Transforms.Text.ExtractWordEmbeddings("GloveEmbeddings", "CleanWords",
-        WordEmbeddingsExtractingTransformer.PretrainedModelKind.GloVeTwitter25D);
+        WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D);

     // We also have option to apply custom word embedding models.
     // Let's first create one.

src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs (+4)

@@ -180,6 +180,10 @@ protected TrivialWrapperEstimator(IHost host, TransformWrapper transformer)
     {
     }

+    /// <summary>
+    /// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer.
+    /// Used for schema propagation and verification in a pipeline.
+    /// </summary>
     public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
     {
         Host.CheckValue(inputSchema, nameof(inputSchema));
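The summary comment added here documents the estimator-side contract: GetOutputSchema propagates a SchemaShape through the pipeline so that column names and types can be checked before any data is fitted. A rough usage sketch follows; it is illustrative only, with `pipeline` an assumed IEstimator<ITransformer>, `data` an assumed IDataView, and SchemaShape.Create assumed to be accessible in this build.

// Illustrative sketch: verify the schema a pipeline would produce without fitting it.
var inputShape = SchemaShape.Create(data.Schema);
var outputShape = pipeline.GetOutputSchema(inputShape);

// The output shape can be probed for expected columns before training starts.
if (!outputShape.TryFindColumn("Embeddings", out var embeddingsColumn))
    throw new InvalidOperationException("The pipeline will not produce the expected 'Embeddings' column.");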

src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs (+2 −2)

@@ -101,13 +101,13 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
     IReadOnlyDictionary<PipelineColumn, string> outputNames,
     IReadOnlyCollection<string> usedNames)
 {
-    var infos = new LatentDirichletAllocationTransformer.ColumnInfo[toOutput.Length];
+    var infos = new LatentDirichletAllocationEstimator.ColumnInfo[toOutput.Length];
     Action<LatentDirichletAllocationTransformer> onFit = null;
     for (int i = 0; i < toOutput.Length; ++i)
     {
         var tcol = (ILdaCol)toOutput[i];

-        infos[i] = new LatentDirichletAllocationTransformer.ColumnInfo(outputNames[toOutput[i]],
+        infos[i] = new LatentDirichletAllocationEstimator.ColumnInfo(outputNames[toOutput[i]],
             inputNames[tcol.Input],
             tcol.Config.NumTopic,
             tcol.Config.AlphaSum,

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs (+4 −4)

@@ -151,9 +151,9 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
 {
     Contracts.Assert(toOutput.Length == 1);

-    var columns = new List<StopWordsRemovingTransformer.ColumnInfo>();
+    var columns = new List<StopWordsRemovingEstimator.ColumnInfo>();
     foreach (var outCol in toOutput)
-        columns.Add(new StopWordsRemovingTransformer.ColumnInfo(outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input], _language));
+        columns.Add(new StopWordsRemovingEstimator.ColumnInfo(outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input], _language));

     return new StopWordsRemovingEstimator(env, columns.ToArray());
 }
@@ -559,9 +559,9 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
     IReadOnlyCollection<string> usedNames)
 {
     Contracts.Assert(toOutput.Length == 1);
-    var columns = new List<NgramHashingTransformer.ColumnInfo>();
+    var columns = new List<NgramHashingEstimator.ColumnInfo>();
     foreach (var outCol in toOutput)
-        columns.Add(new NgramHashingTransformer.ColumnInfo(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] },
+        columns.Add(new NgramHashingEstimator.ColumnInfo(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] },
             _ngramLength, _skipLength, _allLengths, _hashBits, _seed, _ordered, _invertHash));

     return new NgramHashingEstimator(env, columns.ToArray());

src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs (+6 −6)

@@ -15,7 +15,7 @@ public static class WordEmbeddingsStaticExtensions
     /// <param name="input">Vector of tokenized text.</param>
     /// <param name="modelKind">The pretrained word embedding model.</param>
     /// <returns></returns>
-    public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe)
+    public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
     {
         Contracts.CheckValue(input, nameof(input));
         return new OutColumn(input, modelKind);
@@ -34,7 +34,7 @@ private sealed class OutColumn : Vector<float>
 {
     public PipelineColumn Input { get; }

-    public OutColumn(VarVector<string> input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe)
+    public OutColumn(VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
         : base(new Reconciler(modelKind), input)
     {
         Input = input;
@@ -49,10 +49,10 @@ public OutColumn(VarVector<string> input, string customModelFile = null)

 private sealed class Reconciler : EstimatorReconciler
 {
-    private readonly WordEmbeddingsExtractingTransformer.PretrainedModelKind? _modelKind;
+    private readonly WordEmbeddingsExtractingEstimator.PretrainedModelKind? _modelKind;
     private readonly string _customLookupTable;

-    public Reconciler(WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe)
+    public Reconciler(WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
     {
         _modelKind = modelKind;
         _customLookupTable = null;
@@ -72,11 +72,11 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
 {
     Contracts.Assert(toOutput.Length == 1);

-    var cols = new WordEmbeddingsExtractingTransformer.ColumnInfo[toOutput.Length];
+    var cols = new WordEmbeddingsExtractingEstimator.ColumnInfo[toOutput.Length];
     for (int i = 0; i < toOutput.Length; ++i)
     {
         var outCol = (OutColumn)toOutput[i];
-        cols[i] = new WordEmbeddingsExtractingTransformer.ColumnInfo(outputNames[outCol], inputNames[outCol.Input]);
+        cols[i] = new WordEmbeddingsExtractingEstimator.ColumnInfo(outputNames[outCol], inputNames[outCol.Input]);
     }

     bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable);
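For orientation, the WordEmbeddings extension above belongs to the statically typed pipeline API and is applied to a VarVector<string> column inside an estimator lambda. A hypothetical sketch follows; `pipeline` is an assumed statically typed estimator whose row shape exposes `Label` and a tokenized `Tokens` column, and is not code from this commit.

// Hypothetical static-pipe usage: the enum argument now comes from the estimator type;
// omitting it keeps the default PretrainedModelKind.Sswe shown in the signature above.
var withEmbeddings = pipeline.Append(row => (
    row.Label,
    Embeddings: row.Tokens.WordEmbeddings(
        WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D)));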

src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs (+6 −6)

@@ -36,7 +36,7 @@ public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env,
     Desc = ML.Transforms.Text.WordTokenizingTransformer.Summary,
     UserName = ML.Transforms.Text.WordTokenizingTransformer.UserName,
     ShortName = ML.Transforms.Text.WordTokenizingTransformer.LoaderSignature)]
-public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvironment env, WordTokenizingTransformer.Arguments input)
+public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvironment env, WordTokenizingTransformer.Options input)
 {
     var h = EntryPointUtils.CheckArgsAndCreateHost(env, "DelimitedTokenizeTransform", input);
     var xf = ML.Transforms.Text.WordTokenizingTransformer.Create(h, input, input.Data);
@@ -51,7 +51,7 @@ public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvi
     Desc = NgramExtractingTransformer.Summary,
     UserName = NgramExtractingTransformer.UserName,
     ShortName = NgramExtractingTransformer.LoaderSignature)]
-public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env, NgramExtractingTransformer.Arguments input)
+public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env, NgramExtractingTransformer.Options input)
 {
     var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NGramTransform", input);
     var xf = NgramExtractingTransformer.Create(h, input, input.Data);
@@ -96,7 +96,7 @@ public static CommonOutputs.TransformOutput AnalyzeSentiment(IHostEnvironment en
     Desc = TokenizingByCharactersTransformer.Summary,
     UserName = TokenizingByCharactersTransformer.UserName,
     ShortName = TokenizingByCharactersTransformer.LoaderSignature)]
-public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, TokenizingByCharactersTransformer.Arguments input)
+public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, TokenizingByCharactersTransformer.Options input)
 {
     Contracts.CheckValue(env, nameof(env));
     env.CheckValue(input, nameof(input));
@@ -114,13 +114,13 @@ public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, T
     Desc = LatentDirichletAllocationTransformer.Summary,
     UserName = LatentDirichletAllocationTransformer.UserName,
     ShortName = LatentDirichletAllocationTransformer.ShortName)]
-public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LatentDirichletAllocationTransformer.Arguments input)
+public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LatentDirichletAllocationTransformer.Options input)
 {
     Contracts.CheckValue(env, nameof(env));
     env.CheckValue(input, nameof(input));

     var h = EntryPointUtils.CheckArgsAndCreateHost(env, "LightLda", input);
-    var cols = input.Columns.Select(colPair => new LatentDirichletAllocationTransformer.ColumnInfo(colPair, input)).ToArray();
+    var cols = input.Columns.Select(colPair => new LatentDirichletAllocationEstimator.ColumnInfo(colPair, input)).ToArray();
     var est = new LatentDirichletAllocationEstimator(h, cols);
     var view = est.Fit(input.Data).Transform(input.Data);

@@ -135,7 +135,7 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, Laten
     Desc = WordEmbeddingsExtractingTransformer.Summary,
     UserName = WordEmbeddingsExtractingTransformer.UserName,
     ShortName = WordEmbeddingsExtractingTransformer.ShortName)]
-public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsExtractingTransformer.Arguments input)
+public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsExtractingTransformer.Options input)
 {
     Contracts.CheckValue(env, nameof(env));
     env.CheckValue(input, nameof(input));
