Skip to content

Commit 52d96b4

Browse files
Ivanidzo4ka and codemzs
authored and committed
word embedding transform (dotnet#545)
Introduce word embedding transform
1 parent 966b0f3 commit 52d96b4

File tree

8 files changed

+916
-1
lines changed

8 files changed

+916
-1
lines changed

src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs

+20
Original file line numberDiff line numberDiff line change
@@ -137,5 +137,25 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTr
137137
OutputData = view
138138
};
139139
}
140+
141+
/// <summary>
/// Entry point registered under the name "Transforms.WordEmbeddings". Constructs a
/// <see cref="WordEmbeddingsTransform"/> over <c>input.Data</c> and returns it both as the
/// output data view and wrapped in a <see cref="TransformModel"/>.
/// </summary>
/// <param name="env">Host environment; checked with <c>Contracts.CheckValue</c> before anything else uses it.</param>
/// <param name="input">Arguments for the transform, including the input data (<c>input.Data</c>); checked with <c>env.CheckValue</c>.</param>
/// <returns>
/// A <see cref="CommonOutputs.TransformOutput"/> whose <c>OutputData</c> is the new transform and whose
/// <c>Model</c> is a <see cref="TransformModel"/> built from the host, the transform and <c>input.Data</c>.
/// </returns>
[TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings",
142+
Desc = WordEmbeddingsTransform.Summary,
143+
UserName = WordEmbeddingsTransform.UserName,
144+
ShortName = WordEmbeddingsTransform.ShortName,
145+
XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordEmbeddings""]/*' />",
146+
@"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""WordEmbeddings""]/*' />" })]
147+
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsTransform.Arguments input)
148+
{
149+
// env is checked through the static Contracts helper because env itself is what performs the later checks.
Contracts.CheckValue(env, nameof(env));
150+
env.CheckValue(input, nameof(input));
151+
152+
// The host created here (named "WordEmbeddings") is passed to both the transform and the model wrapper below.
var h = EntryPointUtils.CheckArgsAndCreateHost(env, "WordEmbeddings", input);
153+
var view = new WordEmbeddingsTransform(h, input, input.Data);
154+
return new CommonOutputs.TransformOutput()
155+
{
156+
Model = new TransformModel(h, view, input.Data),
157+
OutputData = view
158+
};
159+
}
140160
}
141161
}

src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs

+444
Large diffs are not rendered by default.

src/Microsoft.ML.Transforms/Text/doc.xml

+43-1
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,49 @@
179179
<example name="LightLDA">
180180
<example>
181181
<code language="csharp">
182-
pipeline.Add(new LightLda(("InTextCol" , "OutTextCol")));
182+
pipeline.Add(new LightLda((&quot;InTextCol&quot; , &quot;OutTextCol&quot;)));
183+
</code>
184+
</example>
185+
</example>
186+
187+
<member name="WordEmbeddings">
188+
<summary>
189+
Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
190+
</summary>
191+
<remarks>
192+
WordEmbeddings wrap different embedding models, such as GloVe. Users can specify which embedding to use.
193+
The available options are various versions of <a href="https://nlp.stanford.edu/projects/glove/">GloVe Models</a>, <a href="https://en.wikipedia.org/wiki/FastText">fastText</a>, and <a href="http://anthology.aclweb.org/P/P14/P14-1146.pdf">SSWE</a>.
194+
<para>
195+
Note: As WordEmbedding requires a column with text vector, e.g. &lt;'this', 'is', 'good'&gt;, users need to create an input column by
196+
using the output_tokens=True for TextTransform to convert a column with sentences like "This is good" into &lt;'this', 'is', 'good'&gt;.
197+
The suffix of '_TransformedText' is added to the original column name to create the output token column. For instance if the input column is 'body',
198+
the output tokens column is named 'body_TransformedText'.
199+
</para>
200+
<para>
201+
License attributes for pretrained models:
202+
<list type="bullet">
203+
<item>
204+
<description>
205+
&quot;fastText Wikipedia 300D&quot; by Facebook, Inc. is licensed under <a href="https://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA 3.0</a> based on:
206+
P. Bojanowski*, E. Grave*, A. Joulin, T. Mikolov,<a href="https://arxiv.org/abs/1607.04606">Enriching Word Vectors with Subword Information</a>
207+
@article{bojanowski2016enriching,
  title={Enriching Word Vectors with Subword Information},
  author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  journal={arXiv preprint arXiv:1607.04606},
  year={2016}
}
208+
More information can be found <a href="https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md">here</a>.
209+
</description>
210+
</item>
211+
<item>
212+
<description>
213+
GloVe models by Stanford University (Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. <a href="https://nlp.stanford.edu/pubs/glove.pdf">GloVe: Global Vectors for Word Representation</a>) are licensed under <a href="https://opendatacommons.org/licenses/pddl/1.0/">PDDL</a>.
214+
More information can be found <a href="https://nlp.stanford.edu/projects/glove/">here</a>. Repository can be found <a href="https://github.com/stanfordnlp/GloVe">here</a>.
215+
</description>
216+
</item>
217+
</list>
218+
</para>
219+
</remarks>
220+
</member>
221+
<example name="WordEmbeddings">
222+
<example>
223+
<code language="csharp">
224+
pipeline.Add(new WordEmbeddings((&quot;InVectorTextCol&quot; , &quot;OutTextCol&quot;)));
183225
</code>
184226
</example>
185227
</example>

src/Microsoft.ML/CSharpApi.cs

+154
Original file line numberDiff line numberDiff line change
@@ -1546,6 +1546,18 @@ public void Add(Microsoft.ML.Transforms.TwoHeterogeneousModelCombiner input, Mic
15461546
_jsonNodes.Add(Serialize("Transforms.TwoHeterogeneousModelCombiner", input, output));
15471547
}
15481548

1549+
/// <summary>
/// Adds a WordEmbeddings node to this experiment, creating a new
/// <see cref="Microsoft.ML.Transforms.WordEmbeddings.Output"/> to hold the node's outputs.
/// </summary>
/// <param name="input">The WordEmbeddings entry-point input to add.</param>
/// <returns>The newly created output object that was registered together with <paramref name="input"/>.</returns>
public Microsoft.ML.Transforms.WordEmbeddings.Output Add(Microsoft.ML.Transforms.WordEmbeddings input)
1550+
{
1551+
var output = new Microsoft.ML.Transforms.WordEmbeddings.Output();
1552+
// Delegates to the two-argument overload, which performs the actual registration.
Add(input, output);
1553+
return output;
1554+
}
1555+
1556+
/// <summary>
/// Serializes the given input/output pair under the entry-point name "Transforms.WordEmbeddings"
/// and appends the result to <c>_jsonNodes</c>.
/// </summary>
/// <param name="input">The WordEmbeddings entry-point input.</param>
/// <param name="output">The output object passed to <c>Serialize</c> alongside <paramref name="input"/>.</param>
public void Add(Microsoft.ML.Transforms.WordEmbeddings input, Microsoft.ML.Transforms.WordEmbeddings.Output output)
1557+
{
1558+
_jsonNodes.Add(Serialize("Transforms.WordEmbeddings", input, output));
1559+
}
1560+
15491561
public Microsoft.ML.Transforms.WordTokenizer.Output Add(Microsoft.ML.Transforms.WordTokenizer input)
15501562
{
15511563
var output = new Microsoft.ML.Transforms.WordTokenizer.Output();
@@ -15664,6 +15676,148 @@ public sealed class Output
1566415676
}
1566515677
}
1566615678

15679+
namespace Transforms
15680+
{
15681+
/// <summary>
/// Selects the pre-trained model for the WordEmbeddings transform; this is the type of
/// <c>WordEmbeddings.ModelKind</c>, whose default value is <see cref="Sswe"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the numeric suffixes (25D, 50D, ...) presumably give the embedding dimension of
/// each model; that is not established by this file, so confirm against WordEmbeddingsTransform.
/// </remarks>
public enum WordEmbeddingsTransformPretrainedModelKind
15682+
{
15683+
GloVe50D = 0,
15684+
GloVe100D = 1,
15685+
GloVe200D = 2,
15686+
GloVe300D = 3,
15687+
GloVeTwitter25D = 4,
15688+
GloVeTwitter50D = 5,
15689+
GloVeTwitter100D = 6,
15690+
GloVeTwitter200D = 7,
15691+
FastTextWikipedia300D = 8,
15692+
Sswe = 9
15693+
}
15694+
15695+
15696+
/// <summary>
/// One column definition for the WordEmbeddings transform: the name of the new column and the
/// name of the source column it is produced from. Instances are stored in <c>WordEmbeddings.Column</c>.
/// </summary>
public sealed partial class WordEmbeddingsTransformColumn : OneToOneColumn<WordEmbeddingsTransformColumn>, IOneToOneColumn
15697+
{
15698+
/// <summary>
15699+
/// Name of the new column
15700+
/// </summary>
15701+
public string Name { get; set; }
15702+
15703+
/// <summary>
15704+
/// Name of the source column
15705+
/// </summary>
15706+
public string Source { get; set; }
15707+
15708+
}
15709+
15710+
/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
15711+
/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="WordEmbeddings"]/*' />
15712+
public sealed partial class WordEmbeddings : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
15713+
{
15714+
15715+
/// <summary>
/// Creates an instance with no columns configured; <see cref="Column"/> stays null until
/// <c>AddColumn</c> is called or the property is assigned.
/// </summary>
public WordEmbeddings()
15716+
{
15717+
}
15718+
15719+
/// <summary>
/// Creates an instance and calls <see cref="AddColumn(string)"/> once for each entry of
/// <paramref name="inputColumns"/>. A null array is accepted and adds nothing.
/// </summary>
/// <param name="inputColumns">Names of the columns to add.</param>
public WordEmbeddings(params string[] inputColumns)
15720+
{
15721+
if (inputColumns != null)
15722+
{
15723+
foreach (string input in inputColumns)
15724+
{
15725+
AddColumn(input);
15726+
}
15727+
}
15728+
}
15729+
15730+
/// <summary>
/// Creates an instance and calls <see cref="AddColumn(string, string)"/> once for each
/// (input, output) pair. A null array is accepted and adds nothing.
/// </summary>
/// <param name="inputOutputColumns">Pairs of input column name and output column name.</param>
public WordEmbeddings(params (string inputColumn, string outputColumn)[] inputOutputColumns)
15731+
{
15732+
if (inputOutputColumns != null)
15733+
{
15734+
foreach (var inputOutput in inputOutputColumns)
15735+
{
15736+
// The tuple is (input, output) but AddColumn takes (output, input), hence the swapped order.
AddColumn(inputOutput.outputColumn, inputOutput.inputColumn);
15737+
}
15738+
}
15739+
}
15740+
15741+
/// <summary>
/// Appends a column created by <c>OneToOneColumn&lt;WordEmbeddingsTransformColumn&gt;.Create(inputColumn)</c>
/// to <see cref="Column"/>. The existing array (if any) is copied into a new list, so
/// <see cref="Column"/> is replaced by a new array on every call.
/// </summary>
/// <param name="inputColumn">Column name passed to <c>Create</c>.</param>
public void AddColumn(string inputColumn)
15742+
{
15743+
var list = Column == null ? new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>() : new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>(Column);
15744+
list.Add(OneToOneColumn<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>.Create(inputColumn));
15745+
Column = list.ToArray();
15746+
}
15747+
15748+
/// <summary>
/// Appends a column created by <c>OneToOneColumn&lt;WordEmbeddingsTransformColumn&gt;.Create(outputColumn, inputColumn)</c>
/// to <see cref="Column"/>. The existing array (if any) is copied into a new list, so
/// <see cref="Column"/> is replaced by a new array on every call.
/// </summary>
/// <param name="outputColumn">Name of the output column, passed first to <c>Create</c>.</param>
/// <param name="inputColumn">Name of the input column, passed second to <c>Create</c>.</param>
public void AddColumn(string outputColumn, string inputColumn)
15749+
{
15750+
var list = Column == null ? new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>() : new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>(Column);
15751+
list.Add(OneToOneColumn<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>.Create(outputColumn, inputColumn));
15752+
Column = list.ToArray();
15753+
}
15754+
15755+
15756+
/// <summary>
15757+
/// New column definition(s) (optional form: name:src)
15758+
/// </summary>
15759+
public WordEmbeddingsTransformColumn[] Column { get; set; }
15760+
15761+
/// <summary>
15762+
/// Pre-trained model used to create the vocabulary
15763+
/// </summary>
15764+
public WordEmbeddingsTransformPretrainedModelKind? ModelKind { get; set; } = WordEmbeddingsTransformPretrainedModelKind.Sswe;
15765+
15766+
/// <summary>
15767+
/// Filename for custom word embedding model
15768+
/// </summary>
15769+
public string CustomLookupTable { get; set; }
15770+
15771+
/// <summary>
15772+
/// Input dataset
15773+
/// </summary>
15774+
public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();
15775+
15776+
15777+
// Output variables of the WordEmbeddings entry point: the transformed data and the transform model.
public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput
15778+
{
15779+
/// <summary>
15780+
/// Transformed dataset
15781+
/// </summary>
15782+
public Var<Microsoft.ML.Runtime.Data.IDataView> OutputData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();
15783+
15784+
/// <summary>
15785+
/// Transform model
15786+
/// </summary>
15787+
public Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel> Model { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel>();
15788+
15789+
}
15790+
/// <summary>
/// Returns the <see cref="Data"/> variable of this item.
/// </summary>
public Var<IDataView> GetInputData() => Data;
15791+
15792+
/// <summary>
/// Adds this item to <paramref name="experiment"/> and returns a pipeline step exposing the resulting
/// output data and model. When <paramref name="previousStep"/> is not null, its <c>Data</c> replaces
/// this item's <see cref="Data"/> before the item is added.
/// </summary>
/// <param name="previousStep">The preceding pipeline step, or null to keep the current <see cref="Data"/>.</param>
/// <param name="experiment">The experiment this item is added to.</param>
/// <returns>A data step whose <c>Data</c> and <c>Model</c> come from the output returned by <c>experiment.Add</c>.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="previousStep"/> is not null and is not an <c>ILearningPipelineDataStep</c>.
/// </exception>
public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
15793+
{
15794+
if (previousStep != null)
15795+
{
15796+
if (!(previousStep is ILearningPipelineDataStep dataStep))
15797+
{
15798+
throw new InvalidOperationException($"{ nameof(WordEmbeddings)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
15799+
}
15800+
15801+
Data = dataStep.Data;
15802+
}
15803+
Output output = experiment.Add(this);
15804+
return new WordEmbeddingsPipelineStep(output);
15805+
}
15806+
15807+
// Pipeline step returned by ApplyStep; it only carries the Data and Model variables of an Output.
private class WordEmbeddingsPipelineStep : ILearningPipelineDataStep
15808+
{
15809+
public WordEmbeddingsPipelineStep(Output output)
15810+
{
15811+
Data = output.OutputData;
15812+
Model = output.Model;
15813+
}
15814+
15815+
public Var<IDataView> Data { get; }
15816+
public Var<ITransformModel> Model { get; }
15817+
}
15818+
}
15819+
}
15820+
1566715821
namespace Transforms
1566815822
{
1566915823

test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv

+1
Original file line numberDiff line numberDiff line change
@@ -125,4 +125,5 @@ Transforms.TextToKeyConverter Converts input values (words, numbers, etc.) to in
125125
Transforms.TrainTestDatasetSplitter Split the dataset into train and test sets Microsoft.ML.Runtime.EntryPoints.TrainTestSplit Split Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Input Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Output
126126
Transforms.TreeLeafFeaturizer Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. Microsoft.ML.Runtime.Data.TreeFeaturize Featurizer Microsoft.ML.Runtime.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
127127
Transforms.TwoHeterogeneousModelCombiner Combines a TransformModel and a PredictorModel into a single PredictorModel. Microsoft.ML.Runtime.EntryPoints.ModelOperations CombineTwoModels Microsoft.ML.Runtime.EntryPoints.ModelOperations+SimplePredictorModelInput Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelOutput
128+
Transforms.WordEmbeddings Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model Microsoft.ML.Runtime.Transforms.TextAnalytics WordEmbeddings Microsoft.ML.Runtime.Data.WordEmbeddingsTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
128129
Transforms.WordTokenizer The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. Microsoft.ML.Runtime.Transforms.TextAnalytics DelimitedTokenizeTransform Microsoft.ML.Runtime.Data.DelimitedTokenizeTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput

test/BaselineOutput/Common/EntryPoints/core_manifest.json

+114
Original file line numberDiff line numberDiff line change
@@ -21840,6 +21840,120 @@
2184021840
}
2184121841
]
2184221842
},
21843+
{
21844+
"Name": "Transforms.WordEmbeddings",
21845+
"Desc": "Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model",
21846+
"FriendlyName": "Word Embeddings Transform",
21847+
"ShortName": "WordEmbeddings",
21848+
"Inputs": [
21849+
{
21850+
"Name": "Column",
21851+
"Type": {
21852+
"Kind": "Array",
21853+
"ItemType": {
21854+
"Kind": "Struct",
21855+
"Fields": [
21856+
{
21857+
"Name": "Name",
21858+
"Type": "String",
21859+
"Desc": "Name of the new column",
21860+
"Aliases": [
21861+
"name"
21862+
],
21863+
"Required": false,
21864+
"SortOrder": 150.0,
21865+
"IsNullable": false,
21866+
"Default": null
21867+
},
21868+
{
21869+
"Name": "Source",
21870+
"Type": "String",
21871+
"Desc": "Name of the source column",
21872+
"Aliases": [
21873+
"src"
21874+
],
21875+
"Required": false,
21876+
"SortOrder": 150.0,
21877+
"IsNullable": false,
21878+
"Default": null
21879+
}
21880+
]
21881+
}
21882+
},
21883+
"Desc": "New column definition(s) (optional form: name:src)",
21884+
"Aliases": [
21885+
"col"
21886+
],
21887+
"Required": true,
21888+
"SortOrder": 0.0,
21889+
"IsNullable": false
21890+
},
21891+
{
21892+
"Name": "ModelKind",
21893+
"Type": {
21894+
"Kind": "Enum",
21895+
"Values": [
21896+
"GloVe50D",
21897+
"GloVe100D",
21898+
"GloVe200D",
21899+
"GloVe300D",
21900+
"GloVeTwitter25D",
21901+
"GloVeTwitter50D",
21902+
"GloVeTwitter100D",
21903+
"GloVeTwitter200D",
21904+
"FastTextWikipedia300D",
21905+
"Sswe"
21906+
]
21907+
},
21908+
"Desc": "Pre-trained model used to create the vocabulary",
21909+
"Aliases": [
21910+
"model"
21911+
],
21912+
"Required": false,
21913+
"SortOrder": 1.0,
21914+
"IsNullable": true,
21915+
"Default": "Sswe"
21916+
},
21917+
{
21918+
"Name": "Data",
21919+
"Type": "DataView",
21920+
"Desc": "Input dataset",
21921+
"Required": true,
21922+
"SortOrder": 1.0,
21923+
"IsNullable": false
21924+
},
21925+
{
21926+
"Name": "CustomLookupTable",
21927+
"Type": "String",
21928+
"Desc": "Filename for custom word embedding model",
21929+
"Aliases": [
21930+
"dataFile"
21931+
],
21932+
"Required": false,
21933+
"SortOrder": 2.0,
21934+
"IsNullable": false,
21935+
"Default": null
21936+
}
21937+
],
21938+
"Outputs": [
21939+
{
21940+
"Name": "OutputData",
21941+
"Type": "DataView",
21942+
"Desc": "Transformed dataset"
21943+
},
21944+
{
21945+
"Name": "Model",
21946+
"Type": "TransformModel",
21947+
"Desc": "Transform model"
21948+
}
21949+
],
21950+
"InputKind": [
21951+
"ITransformInput"
21952+
],
21953+
"OutputKind": [
21954+
"ITransformOutput"
21955+
]
21956+
},
2184321957
{
2184421958
"Name": "Transforms.WordTokenizer",
2184521959
"Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.",

0 commit comments

Comments
 (0)