Skip to content

Commit 68696f4

Browse files
author
Ivan Matantsev
committed
update documentation
1 parent 547d221 commit 68696f4

File tree

6 files changed

+47
-14
lines changed

6 files changed

+47
-14
lines changed

src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,12 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTr
138138
};
139139
}
140140

141-
[TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings", Desc = WordEmbeddingsTransform.Summary,
142-
UserName = WordEmbeddingsTransform.UserName)]
141+
[TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings",
142+
Desc = WordEmbeddingsTransform.Summary,
143+
UserName = WordEmbeddingsTransform.UserName,
144+
ShortName = WordEmbeddingsTransform.ShortName,
145+
XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordEmbeddings""]/*' />",
146+
@"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""WordEmbeddings""]/*' />" })]
143147
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsTransform.Arguments input)
144148
{
145149
Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@
1818
using Microsoft.ML.Runtime.Model;
1919

2020
[assembly: LoadableClass(WordEmbeddingsTransform.Summary, typeof(IDataTransform), typeof(WordEmbeddingsTransform), typeof(WordEmbeddingsTransform.Arguments),
21-
typeof(SignatureDataTransform), WordEmbeddingsTransform.UserName, "WordEmbeddingsTransform", "WordEmbeddings", DocName = "transform/WordEmbeddingsTransform.md")]
21+
typeof(SignatureDataTransform), WordEmbeddingsTransform.UserName, "WordEmbeddingsTransform", WordEmbeddingsTransform.ShortName, DocName = "transform/WordEmbeddingsTransform.md")]
2222

2323
[assembly: LoadableClass(typeof(WordEmbeddingsTransform), null, typeof(SignatureLoadDataTransform),
2424
WordEmbeddingsTransform.UserName, WordEmbeddingsTransform.LoaderSignature)]
2525

2626
namespace Microsoft.ML.Runtime.Data
2727
{
28+
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
2829
public sealed class WordEmbeddingsTransform : OneToOneTransformBase
2930
{
3031
public sealed class Column : OneToOneColumn
@@ -62,6 +63,7 @@ public sealed class Arguments : TransformInputBase
6263
internal const string Summary = "Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence " +
6364
"vectors using a pre-trained model";
6465
internal const string UserName = "Word Embeddings Transform";
66+
internal const string ShortName = "WordEmbeddings";
6567
public const string LoaderSignature = "WordEmbeddingsTransform";
6668

6769
public static VersionInfo GetVersionInfo()

src/Microsoft.ML.Transforms/Text/doc.xml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,5 +184,37 @@
184184
</example>
185185
</example>
186186

187+
<member name="WordEmbeddings">
188+
<summary>
189+
Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
190+
</summary>
191+
<remarks>
192+
WordEmbeddings wraps different embedding models, such as GloVe. Users can specify which embedding to use.
193+
The available options are various versions of <a href="https://nlp.stanford.edu/projects/glove/">GloVe Models</a>, <a href="https://en.wikipedia.org/wiki/FastText">FastText</a>, and <a href="http://anthology.aclweb.org/P/P14/P14-1146.pdf">Sswe</a>.
194+
<para>
195+
Note: As WordEmbedding requires a column with a text vector, e.g. &lt;'This', 'is', 'good'&gt;, users need to create an input column by:
196+
<list type="bullet">
197+
<item><description>concatenating columns with TX type,</description></item>
198+
<item>
199+
<description>or using the output_tokens=True for NGramFeaturizer() to convert a column with sentences like "This is good" into &lt;'This', 'is', 'good'&gt;.
200+
The output token column is named with a suffix of '_TransformedText'.</description>
201+
</item>
202+
</list>
203+
In the following example, after the NGramFeaturizer, features named ngram.__ are generated. A new column named ngram_TransformedText is
204+
also created with the text vector, similar to running .split(' '). However, due to the variable length of this column, it cannot be properly
205+
converted to a pandas dataframe, so any pipeline or transform that outputs this text vector column will throw errors. However, if we use
206+
ngram_TransformedText as the input to WordEmbedding, the ngram_TransformedText column will be overwritten by the output from
207+
WordEmbedding. The output from WordEmbedding is named ngram_TransformedText.__
208+
</para>
209+
</remarks>
210+
</member>
211+
<example name="WordEmbeddings">
212+
<example>
213+
<code language="csharp">
214+
pipeline.Add(new WordEmbeddings(("InTextCol" , "OutTextCol")));
215+
</code>
216+
</example>
217+
</example>
218+
187219
</members>
188220
</doc>

src/Microsoft.ML/CSharpApi.cs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15474,9 +15474,8 @@ public sealed partial class WordEmbeddingsTransformColumn : OneToOneColumn<WordE
1547415474

1547515475
}
1547615476

15477-
/// <summary>
15478-
/// Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model
15479-
/// </summary>
15477+
/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
15478+
/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="WordEmbeddings"]/*' />
1548015479
public sealed partial class WordEmbeddings : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
1548115480
{
1548215481

test/BaselineOutput/Common/EntryPoints/core_manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21454,7 +21454,7 @@
2145421454
"Name": "Transforms.WordEmbeddings",
2145521455
"Desc": "Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model",
2145621456
"FriendlyName": "Word Embeddings Transform",
21457-
"ShortName": null,
21457+
"ShortName": "WordEmbeddings",
2145821458
"Inputs": [
2145921459
{
2146021460
"Name": "Column",

test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3719,8 +3719,6 @@ public void EntryPointWordEmbeddings()
37193719
"The quick brown fox jumps over the lazy dog.",
37203720
"The five boxing wizards jump quickly."
37213721
});
3722-
3723-
37243722
var inputFile = new SimpleFileHandle(Env, dataFile, false, false);
37253723
var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput()
37263724
{
@@ -3733,19 +3731,17 @@ public void EntryPointWordEmbeddings()
37333731
{
37343732
Name = "Text",
37353733
Source = new [] { new TextLoader.Range() { Min = 0, VariableEnd=true, ForceVector=true} },
3736-
37373734
Type = DataKind.Text
37383735
}
37393736
}
37403737
},
3741-
37423738
InputFile = inputFile,
37433739
}).Data;
37443740
var embedding = Transforms.TextAnalytics.WordEmbeddings(Env, new WordEmbeddingsTransform.Arguments()
37453741
{
3746-
Data= dataView,
3747-
Column = new[] {new WordEmbeddingsTransform.Column { Name = "Features", Source = "Text" } },
3748-
ModelKind= WordEmbeddingsTransform.PretrainedModelKind.Sswe
3742+
Data = dataView,
3743+
Column = new[] { new WordEmbeddingsTransform.Column { Name = "Features", Source = "Text" } },
3744+
ModelKind = WordEmbeddingsTransform.PretrainedModelKind.Sswe
37493745
});
37503746
var result = embedding.OutputData;
37513747
using (var cursor = result.GetRowCursor((x => true)))

0 commit comments

Comments
 (0)