Skip to content

Commit b727d10

Browse files
authored
word embedding transform (#545)
Introduce word embedding transform
1 parent 98318b6 commit b727d10

File tree

8 files changed

+916
-1
lines changed

8 files changed

+916
-1
lines changed

src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs

+20
Original file line numberDiff line numberDiff line change
@@ -137,5 +137,25 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTr
137137
OutputData = view
138138
};
139139
}
140+
141+
/// <summary>
/// Entry point for the word embeddings transform. Validates the arguments, creates a host
/// through <c>EntryPointUtils.CheckArgsAndCreateHost</c>, builds a
/// <c>WordEmbeddingsTransform</c> over <c>input.Data</c> and wraps it in a transform model.
/// </summary>
/// <param name="env">Host environment; checked with <c>Contracts.CheckValue</c>.</param>
/// <param name="input">Transform arguments, including the input data view; checked with <c>env.CheckValue</c>.</param>
/// <returns>A transform output carrying the transformed data view and its transform model.</returns>
[TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings",
    Desc = WordEmbeddingsTransform.Summary,
    UserName = WordEmbeddingsTransform.UserName,
    ShortName = WordEmbeddingsTransform.ShortName,
    XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordEmbeddings""]/*' />",
                         @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""WordEmbeddings""]/*' />" })]
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsTransform.Arguments input)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(input, nameof(input));

    var host = EntryPointUtils.CheckArgsAndCreateHost(env, "WordEmbeddings", input);
    var transform = new WordEmbeddingsTransform(host, input, input.Data);
    var model = new TransformModel(host, transform, input.Data);

    return new CommonOutputs.TransformOutput
    {
        Model = model,
        OutputData = transform
    };
}
140160
}
141161
}

src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs

+444
Large diffs are not rendered by default.

src/Microsoft.ML.Transforms/Text/doc.xml

+43-1
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,49 @@
179179
<example name="LightLDA">
180180
<example>
181181
<code language="csharp">
182-
pipeline.Add(new LightLda(("InTextCol" , "OutTextCol")));
182+
pipeline.Add(new LightLda((&quot;InTextCol&quot; , &quot;OutTextCol&quot;)));
183+
</code>
184+
</example>
185+
</example>
186+
187+
<member name="WordEmbeddings">
188+
<summary>
189+
Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
190+
</summary>
191+
<remarks>
192+
WordEmbeddings wraps different embedding models, such as GloVe. Users can specify which embedding to use.
193+
The available options are various versions of <a href="https://nlp.stanford.edu/projects/glove/">GloVe Models</a>, <a href="https://en.wikipedia.org/wiki/FastText">fastText</a>, and <a href="http://anthology.aclweb.org/P/P14/P14-1146.pdf">SSWE</a>.
194+
<para>
195+
Note: As WordEmbedding requires a column with a text vector, e.g. &lt;'this', 'is', 'good'&gt;, users need to create an input column by
196+
using output_tokens=True for TextTransform to convert a column with sentences like "This is good" into &lt;'this', 'is', 'good'&gt;.
197+
The suffix '_TransformedText' is added to the original column name to create the output token column. For instance, if the input column is 'body',
198+
the output tokens column is named 'body_TransformedText'.
199+
</para>
200+
<para>
201+
License attributes for pretrained models:
202+
<list type="bullet">
203+
<item>
204+
<description>
205+
&quot;fastText Wikipedia 300D&quot; by Facebook, Inc. is licensed under <a href="https://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA 3.0</a> based on:
206+
P. Bojanowski*, E. Grave*, A. Joulin, T. Mikolov, <a href="https://arxiv.org/abs/1607.04606">Enriching Word Vectors with Subword Information</a>
207+
@article{bojanowski2016enriching,
  title={Enriching Word Vectors with Subword Information},
  author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  journal={arXiv preprint arXiv:1607.04606},
  year={2016}
}
208+
More information can be found <a href="https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md">here</a>.
209+
</description>
210+
</item>
211+
<item>
212+
<description>
213+
GloVe models by Stanford University (Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. <a href="https://nlp.stanford.edu/pubs/glove.pdf">GloVe: Global Vectors for Word Representation</a>) are licensed under <a href="https://opendatacommons.org/licenses/pddl/1.0/">PDDL</a>.
214+
More information can be found <a href="https://nlp.stanford.edu/projects/glove/">here</a>. Repository can be found <a href="https://github.com/stanfordnlp/GloVe">here</a>.
215+
</description>
216+
</item>
217+
</list>
218+
</para>
219+
</remarks>
220+
</member>
221+
<example name="WordEmbeddings">
222+
<example>
223+
<code language="csharp">
224+
pipeline.Add(new WordEmbeddings((&quot;InVectorTextCol&quot; , &quot;OutTextCol&quot;)));
183225
</code>
184226
</example>
185227
</example>

src/Microsoft.ML/CSharpApi.cs

+154
Original file line numberDiff line numberDiff line change
@@ -1534,6 +1534,18 @@ public void Add(Microsoft.ML.Transforms.TwoHeterogeneousModelCombiner input, Mic
15341534
_jsonNodes.Add(Serialize("Transforms.TwoHeterogeneousModelCombiner", input, output));
15351535
}
15361536

1537+
/// <summary>
/// Adds a WordEmbeddings node, creating a fresh output object for it.
/// </summary>
/// <param name="input">The WordEmbeddings transform to add.</param>
/// <returns>The newly created output that was passed to the two-argument overload.</returns>
public Microsoft.ML.Transforms.WordEmbeddings.Output Add(Microsoft.ML.Transforms.WordEmbeddings input)
{
    var result = new Microsoft.ML.Transforms.WordEmbeddings.Output();
    Add(input, result);
    return result;
}
1543+
1544+
/// <summary>
/// Serializes the given input/output pair under the "Transforms.WordEmbeddings" entry point
/// name and appends the result to <c>_jsonNodes</c>.
/// </summary>
/// <param name="input">The WordEmbeddings transform to serialize.</param>
/// <param name="output">The output object to serialize alongside the input.</param>
public void Add(Microsoft.ML.Transforms.WordEmbeddings input, Microsoft.ML.Transforms.WordEmbeddings.Output output)
    => _jsonNodes.Add(Serialize("Transforms.WordEmbeddings", input, output));
1548+
15371549
public Microsoft.ML.Transforms.WordTokenizer.Output Add(Microsoft.ML.Transforms.WordTokenizer input)
15381550
{
15391551
var output = new Microsoft.ML.Transforms.WordTokenizer.Output();
@@ -15530,6 +15542,148 @@ public sealed class Output
1553015542
}
1553115543
}
1553215544

15545+
namespace Transforms
15546+
{
15547+
/// <summary>
/// Pre-trained models that can be selected through the <c>ModelKind</c> setting of the
/// WordEmbeddings transform. Numeric values are assigned explicitly so they stay stable.
/// </summary>
public enum WordEmbeddingsTransformPretrainedModelKind
{
    GloVe50D = 0,
    GloVe100D = 1,
    GloVe200D = 2,
    GloVe300D = 3,

    GloVeTwitter25D = 4,
    GloVeTwitter50D = 5,
    GloVeTwitter100D = 6,
    GloVeTwitter200D = 7,

    FastTextWikipedia300D = 8,

    Sswe = 9
}
15560+
15561+
15562+
/// <summary>
/// A single column mapping for the WordEmbeddings transform: the source column and the
/// name of the new column created from it.
/// </summary>
public sealed partial class WordEmbeddingsTransformColumn
    : OneToOneColumn<WordEmbeddingsTransformColumn>, IOneToOneColumn
{
    /// <summary>Name of the new column</summary>
    public string Name { get; set; }

    /// <summary>Name of the source column</summary>
    public string Source { get; set; }
}
15575+
15576+
/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="WordEmbeddings"]/*' />
public sealed partial class WordEmbeddings : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
{
    /// <summary>
    /// Creates the transform without any column mappings.
    /// </summary>
    public WordEmbeddings()
    {
    }

    /// <summary>
    /// Creates the transform and registers every given column via <see cref="AddColumn(string)"/>.
    /// </summary>
    /// <param name="inputColumns">Columns to register; a null array registers nothing.</param>
    public WordEmbeddings(params string[] inputColumns)
    {
        if (inputColumns == null)
        {
            return;
        }

        foreach (string columnName in inputColumns)
        {
            AddColumn(columnName);
        }
    }

    /// <summary>
    /// Creates the transform and registers every (input, output) pair via
    /// <see cref="AddColumn(string, string)"/>.
    /// </summary>
    /// <param name="inputOutputColumns">Column pairs to register; a null array registers nothing.</param>
    public WordEmbeddings(params (string inputColumn, string outputColumn)[] inputOutputColumns)
    {
        if (inputOutputColumns == null)
        {
            return;
        }

        foreach (var (source, target) in inputOutputColumns)
        {
            // AddColumn takes the output column first, then the input column.
            AddColumn(target, source);
        }
    }

    /// <summary>
    /// Appends a column definition created from a single column name to <see cref="Column"/>.
    /// </summary>
    /// <param name="inputColumn">Column name passed to <c>OneToOneColumn.Create</c>.</param>
    public void AddColumn(string inputColumn)
    {
        var columns = new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>();
        if (Column != null)
        {
            columns.AddRange(Column);
        }

        columns.Add(OneToOneColumn<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>.Create(inputColumn));
        Column = columns.ToArray();
    }

    /// <summary>
    /// Appends a column definition created from an output/input column pair to <see cref="Column"/>.
    /// </summary>
    /// <param name="outputColumn">First argument passed to <c>OneToOneColumn.Create</c>.</param>
    /// <param name="inputColumn">Second argument passed to <c>OneToOneColumn.Create</c>.</param>
    public void AddColumn(string outputColumn, string inputColumn)
    {
        var columns = new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>();
        if (Column != null)
        {
            columns.AddRange(Column);
        }

        columns.Add(OneToOneColumn<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>.Create(outputColumn, inputColumn));
        Column = columns.ToArray();
    }

    /// <summary>
    /// New column definition(s) (optional form: name:src)
    /// </summary>
    public WordEmbeddingsTransformColumn[] Column { get; set; }

    /// <summary>
    /// Pre-trained model used to create the vocabulary
    /// </summary>
    public WordEmbeddingsTransformPretrainedModelKind? ModelKind { get; set; } = WordEmbeddingsTransformPretrainedModelKind.Sswe;

    /// <summary>
    /// Filename for custom word embedding model
    /// </summary>
    public string CustomLookupTable { get; set; }

    /// <summary>
    /// Input dataset
    /// </summary>
    public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

    /// <summary>
    /// Output variables of the WordEmbeddings entry point.
    /// </summary>
    public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput
    {
        /// <summary>
        /// Transformed dataset
        /// </summary>
        public Var<Microsoft.ML.Runtime.Data.IDataView> OutputData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

        /// <summary>
        /// Transform model
        /// </summary>
        public Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel> Model { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel>();
    }

    /// <summary>
    /// Returns the input data variable of this transform.
    /// </summary>
    public Var<IDataView> GetInputData()
    {
        return Data;
    }

    /// <summary>
    /// Wires this transform after <paramref name="previousStep"/> (when one is given) and adds it
    /// to the experiment.
    /// </summary>
    /// <param name="previousStep">Preceding step, or null; when non-null it must be an <c>ILearningPipelineDataStep</c>.</param>
    /// <param name="experiment">Experiment this transform is added to.</param>
    /// <returns>A data step exposing the output data and model of the added node.</returns>
    /// <exception cref="InvalidOperationException">The previous step is not a data step.</exception>
    public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
    {
        if (previousStep != null)
        {
            var dataStep = previousStep as ILearningPipelineDataStep;
            if (dataStep == null)
            {
                throw new InvalidOperationException($"{ nameof(WordEmbeddings)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
            }

            // Consume the data produced by the preceding step.
            Data = dataStep.Data;
        }

        Output added = experiment.Add(this);
        return new WordEmbeddingsPipelineStep(added);
    }

    /// <summary>
    /// Pipeline step holding the data and model variables of a WordEmbeddings output.
    /// </summary>
    private class WordEmbeddingsPipelineStep : ILearningPipelineDataStep
    {
        public WordEmbeddingsPipelineStep(Output output)
        {
            Data = output.OutputData;
            Model = output.Model;
        }

        public Var<IDataView> Data { get; }

        public Var<ITransformModel> Model { get; }
    }
}
15685+
}
15686+
1553315687
namespace Transforms
1553415688
{
1553515689

test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv

+1
Original file line numberDiff line numberDiff line change
@@ -124,4 +124,5 @@ Transforms.TextToKeyConverter Converts input values (words, numbers, etc.) to in
124124
Transforms.TrainTestDatasetSplitter Split the dataset into train and test sets Microsoft.ML.Runtime.EntryPoints.TrainTestSplit Split Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Input Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Output
125125
Transforms.TreeLeafFeaturizer Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. Microsoft.ML.Runtime.Data.TreeFeaturize Featurizer Microsoft.ML.Runtime.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
126126
Transforms.TwoHeterogeneousModelCombiner Combines a TransformModel and a PredictorModel into a single PredictorModel. Microsoft.ML.Runtime.EntryPoints.ModelOperations CombineTwoModels Microsoft.ML.Runtime.EntryPoints.ModelOperations+SimplePredictorModelInput Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelOutput
127+
Transforms.WordEmbeddings Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model Microsoft.ML.Runtime.Transforms.TextAnalytics WordEmbeddings Microsoft.ML.Runtime.Data.WordEmbeddingsTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
127128
Transforms.WordTokenizer The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. Microsoft.ML.Runtime.Transforms.TextAnalytics DelimitedTokenizeTransform Microsoft.ML.Runtime.Data.DelimitedTokenizeTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput

test/BaselineOutput/Common/EntryPoints/core_manifest.json

+114
Original file line numberDiff line numberDiff line change
@@ -21593,6 +21593,120 @@
2159321593
}
2159421594
]
2159521595
},
21596+
{
21597+
"Name": "Transforms.WordEmbeddings",
21598+
"Desc": "Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model",
21599+
"FriendlyName": "Word Embeddings Transform",
21600+
"ShortName": "WordEmbeddings",
21601+
"Inputs": [
21602+
{
21603+
"Name": "Column",
21604+
"Type": {
21605+
"Kind": "Array",
21606+
"ItemType": {
21607+
"Kind": "Struct",
21608+
"Fields": [
21609+
{
21610+
"Name": "Name",
21611+
"Type": "String",
21612+
"Desc": "Name of the new column",
21613+
"Aliases": [
21614+
"name"
21615+
],
21616+
"Required": false,
21617+
"SortOrder": 150.0,
21618+
"IsNullable": false,
21619+
"Default": null
21620+
},
21621+
{
21622+
"Name": "Source",
21623+
"Type": "String",
21624+
"Desc": "Name of the source column",
21625+
"Aliases": [
21626+
"src"
21627+
],
21628+
"Required": false,
21629+
"SortOrder": 150.0,
21630+
"IsNullable": false,
21631+
"Default": null
21632+
}
21633+
]
21634+
}
21635+
},
21636+
"Desc": "New column definition(s) (optional form: name:src)",
21637+
"Aliases": [
21638+
"col"
21639+
],
21640+
"Required": true,
21641+
"SortOrder": 0.0,
21642+
"IsNullable": false
21643+
},
21644+
{
21645+
"Name": "ModelKind",
21646+
"Type": {
21647+
"Kind": "Enum",
21648+
"Values": [
21649+
"GloVe50D",
21650+
"GloVe100D",
21651+
"GloVe200D",
21652+
"GloVe300D",
21653+
"GloVeTwitter25D",
21654+
"GloVeTwitter50D",
21655+
"GloVeTwitter100D",
21656+
"GloVeTwitter200D",
21657+
"FastTextWikipedia300D",
21658+
"Sswe"
21659+
]
21660+
},
21661+
"Desc": "Pre-trained model used to create the vocabulary",
21662+
"Aliases": [
21663+
"model"
21664+
],
21665+
"Required": false,
21666+
"SortOrder": 1.0,
21667+
"IsNullable": true,
21668+
"Default": "Sswe"
21669+
},
21670+
{
21671+
"Name": "Data",
21672+
"Type": "DataView",
21673+
"Desc": "Input dataset",
21674+
"Required": true,
21675+
"SortOrder": 1.0,
21676+
"IsNullable": false
21677+
},
21678+
{
21679+
"Name": "CustomLookupTable",
21680+
"Type": "String",
21681+
"Desc": "Filename for custom word embedding model",
21682+
"Aliases": [
21683+
"dataFile"
21684+
],
21685+
"Required": false,
21686+
"SortOrder": 2.0,
21687+
"IsNullable": false,
21688+
"Default": null
21689+
}
21690+
],
21691+
"Outputs": [
21692+
{
21693+
"Name": "OutputData",
21694+
"Type": "DataView",
21695+
"Desc": "Transformed dataset"
21696+
},
21697+
{
21698+
"Name": "Model",
21699+
"Type": "TransformModel",
21700+
"Desc": "Transform model"
21701+
}
21702+
],
21703+
"InputKind": [
21704+
"ITransformInput"
21705+
],
21706+
"OutputKind": [
21707+
"ITransformOutput"
21708+
]
21709+
},
2159621710
{
2159721711
"Name": "Transforms.WordTokenizer",
2159821712
"Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.",

0 commit comments

Comments
 (0)