Skip to content

Commit 14edcb0

Browse files
authored
Scrub word embedding transform (#2891)
1 parent cbf05ac commit 14edcb0

File tree

14 files changed

+112
-112
lines changed

14 files changed

+112
-112
lines changed

docs/code/MlNetCookBook.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,7 @@ var pipeline =
782782
// NLP pipeline 4: word embeddings.
783783
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
784784
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
785-
WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D));
785+
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));
786786

787787
// Let's train our pipeline, and then apply it to the same data.
788788
// Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train.

docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs

+3-3
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ public static void Example()
6161

6262
// Let's apply the pretrained word embedding model GloVeTwitter25D.
6363
// 25D means each word is mapped into a 25-dimensional space, i.e. each word is represented by 25 float values.
64-
var gloveWordEmbedding = ml.Transforms.Text.ExtractWordEmbeddings("GloveEmbeddings", "CleanWords",
65-
WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D);
64+
var gloveWordEmbedding = ml.Transforms.Text.ApplyWordEmbedding("GloveEmbeddings", "CleanWords",
65+
WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter25D);
6666

6767
// We also have the option to apply custom word embedding models.
6868
// Let's first create one.
@@ -81,7 +81,7 @@ public static void Example()
8181
file.WriteLine("best" + " " + string.Join(" ", 0f, 0f, 20f));
8282
}
8383
// Now let's add the custom embedding on top of the same words.
84-
var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ExtractWordEmbeddings("CustomEmbeddings", @".\custommodel.txt", "CleanWords"));
84+
var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ApplyWordEmbedding("CustomEmbeddings", @".\custommodel.txt", "CleanWords"));
8585

8686
// And do all required transformations.
8787
var embeddingDataview = pipeline.Fit(wordsDataview).Transform(wordsDataview);

src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs

+8-8
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ public static class WordEmbeddingsStaticExtensions
1414
/// <param name="input">Vector of tokenized text.</param>
1515
/// <param name="modelKind">The pretrained word embedding model.</param>
1616
/// <returns></returns>
17-
public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
17+
public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
1818
{
1919
Contracts.CheckValue(input, nameof(input));
2020
return new OutColumn(input, modelKind);
@@ -33,7 +33,7 @@ private sealed class OutColumn : Vector<float>
3333
{
3434
public PipelineColumn Input { get; }
3535

36-
public OutColumn(VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
36+
public OutColumn(VarVector<string> input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
3737
: base(new Reconciler(modelKind), input)
3838
{
3939
Input = input;
@@ -48,10 +48,10 @@ public OutColumn(VarVector<string> input, string customModelFile = null)
4848

4949
private sealed class Reconciler : EstimatorReconciler
5050
{
51-
private readonly WordEmbeddingsExtractingEstimator.PretrainedModelKind? _modelKind;
51+
private readonly WordEmbeddingEstimator.PretrainedModelKind? _modelKind;
5252
private readonly string _customLookupTable;
5353

54-
public Reconciler(WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
54+
public Reconciler(WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
5555
{
5656
_modelKind = modelKind;
5757
_customLookupTable = null;
@@ -71,18 +71,18 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
7171
{
7272
Contracts.Assert(toOutput.Length == 1);
7373

74-
var cols = new WordEmbeddingsExtractingEstimator.ColumnOptions[toOutput.Length];
74+
var cols = new WordEmbeddingEstimator.ColumnOptions[toOutput.Length];
7575
for (int i = 0; i < toOutput.Length; ++i)
7676
{
7777
var outCol = (OutColumn)toOutput[i];
78-
cols[i] = new WordEmbeddingsExtractingEstimator.ColumnOptions(outputNames[outCol], inputNames[outCol.Input]);
78+
cols[i] = new WordEmbeddingEstimator.ColumnOptions(outputNames[outCol], inputNames[outCol.Input]);
7979
}
8080

8181
bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable);
8282
if (customLookup)
83-
return new WordEmbeddingsExtractingEstimator(env, _customLookupTable, cols);
83+
return new WordEmbeddingEstimator(env, _customLookupTable, cols);
8484
else
85-
return new WordEmbeddingsExtractingEstimator(env, _modelKind.Value, cols);
85+
return new WordEmbeddingEstimator(env, _modelKind.Value, cols);
8686
}
8787
}
8888
}

src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs

+5-5
Original file line numberDiff line numberDiff line change
@@ -132,16 +132,16 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, Laten
132132
}
133133

134134
[TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings",
135-
Desc = WordEmbeddingsExtractingTransformer.Summary,
136-
UserName = WordEmbeddingsExtractingTransformer.UserName,
137-
ShortName = WordEmbeddingsExtractingTransformer.ShortName)]
138-
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsExtractingTransformer.Options input)
135+
Desc = WordEmbeddingTransformer.Summary,
136+
UserName = WordEmbeddingTransformer.UserName,
137+
ShortName = WordEmbeddingTransformer.ShortName)]
138+
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingTransformer.Options input)
139139
{
140140
Contracts.CheckValue(env, nameof(env));
141141
env.CheckValue(input, nameof(input));
142142

143143
var h = EntryPointUtils.CheckArgsAndCreateHost(env, "WordEmbeddings", input);
144-
var view = WordEmbeddingsExtractingTransformer.Create(h, input, input.Data);
144+
var view = WordEmbeddingTransformer.Create(h, input, input.Data);
145145
return new CommonOutputs.TransformOutput()
146146
{
147147
Model = new TransformModelImpl(h, view, input.Data),

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+11-11
Original file line numberDiff line numberDiff line change
@@ -101,19 +101,19 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
101101
/// <param name="catalog">The text-related transform's catalog.</param>
102102
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
103103
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
104-
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingsExtractingEstimator.PretrainedModelKind"/> to use. </param>
104+
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
105105
/// <example>
106106
/// <format type="text/markdown">
107107
/// <![CDATA[
108108
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs)]
109109
/// ]]>
110110
/// </format>
111111
/// </example>
112-
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
112+
public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog,
113113
string outputColumnName,
114114
string inputColumnName = null,
115-
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
116-
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);
115+
WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
116+
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);
117117

118118
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
119119
/// <param name="catalog">The text-related transform's catalog.</param>
@@ -127,16 +127,16 @@ public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this Trans
127127
/// ]]>
128128
/// </format>
129129
/// </example>
130-
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
130+
public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog,
131131
string outputColumnName,
132132
string customModelFile,
133133
string inputColumnName = null)
134-
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
134+
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
135135
outputColumnName, customModelFile, inputColumnName ?? outputColumnName);
136136

137137
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
138138
/// <param name="catalog">The text-related transform's catalog.</param>
139-
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingsExtractingEstimator.PretrainedModelKind"/> to use. </param>
139+
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
140140
/// <param name="columns">The array of columns, and per-column configurations, to extract embeddings from.</param>
141141
/// <example>
142142
/// <format type="text/markdown">
@@ -145,10 +145,10 @@ public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this Trans
145145
/// ]]>
146146
/// </format>
147147
/// </example>
148-
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
149-
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe,
150-
params WordEmbeddingsExtractingEstimator.ColumnOptions[] columns)
151-
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);
148+
public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog,
149+
WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding,
150+
params WordEmbeddingEstimator.ColumnOptions[] columns)
151+
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);
152152

153153
/// <summary>
154154
/// Tokenizes incoming text in <paramref name="inputColumnName"/>, using <paramref name="separators"/> as separators,

0 commit comments

Comments
 (0)