
Commit 4797745

Lda snapping to template
1 parent ea5c095 commit 4797745

File tree

3 files changed: +55 −38 lines changed


src/Microsoft.ML.Transforms/Text/LdaTransform.cs

+53 −2

@@ -46,7 +46,9 @@ namespace Microsoft.ML.Transforms.Text
     //
     // See <a href="https://github.com/dotnet/machinelearning/blob/master/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs"/>
     // for an example on how to use LatentDirichletAllocationTransformer.
-    /// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
+    /// <summary>
+    /// <see cref="ITransformer"/> resulting from fitting a <see cref="LatentDirichletAllocationEstimator"/>.
+    /// </summary>
     public sealed class LatentDirichletAllocationTransformer : OneToOneTransformerBase
     {
         internal sealed class Options : TransformInputBase
@@ -936,7 +938,56 @@ private protected override IRowMapper MakeRowMapper(DataViewSchema schema)
             => new Mapper(this, schema);
     }

-    /// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
+    /// <summary>
+    /// The LDA transform implements <a href="https://arxiv.org/abs/1412.1576">LightLDA</a>, a state-of-the-art implementation of Latent Dirichlet Allocation.
+    /// </summary>
+    /// <remarks>
+    /// <format type="text/markdown"><![CDATA[
+    ///
+    /// ### Estimator Characteristics
+    /// |  |  |
+    /// | -- | -- |
+    /// | Does this estimator need to look at the data to train its parameters? | Yes |
+    /// | Input column data type | [key](xref:Microsoft.ML.Data.KeyDataViewType) data types |
+    /// | Output column data type | Vector of <xref:System.Single> |
+    ///
+    /// Latent Dirichlet Allocation is a well-known [topic modeling](https://en.wikipedia.org/wiki/Topic_model) algorithm that infers semantic structure from text data,
+    /// and ultimately helps answer the question "what is this document about?".
+    /// It can be used to featurize any text field as a low-dimensional topical vector.
+    /// LightLDA is an extremely efficient implementation of LDA, developed at MSR-Asia, that incorporates a number of
+    /// optimization techniques.
+    /// With the LDA transform, ML.NET users can train a topic model producing 1 million topics with a 1-million-word vocabulary
+    /// on a 1-billion-token document set on a single machine in a few hours (typically, LDA at this scale takes days and requires large clusters).
+    /// The most significant innovation is a super-efficient O(1) [Metropolis-Hastings sampling algorithm](https://en.wikipedia.org/wiki/Metropolis–Hastings_algorithm),
+    /// whose running cost is agnostic of model size, allowing it to converge nearly an order of magnitude faster than other [Gibbs samplers](https://en.wikipedia.org/wiki/Gibbs_sampling).
+    ///
+    /// In an ML.NET pipeline, this estimator requires the output of some preprocessing as its input.
+    /// A typical pipeline operating on text would perform text normalization, tokenization, and n-gram production, and then supply the n-grams to LDA.
+    /// See the example usage in the See Also section for usage suggestions.
+    ///
+    /// If we have the following three lines of text as data points:
+    /// * I like to eat bananas.
+    /// * I eat bananas everyday.
+    /// * LightLDA improves the sampling throughput and convergence speed via a novel O(1) Metropolis-Hastings sampler,
+    ///   and allows a small cluster of machines to tackle very large data and model sizes based on the model scheduling
+    ///   and data parallelism capabilities of the DMTK parameter server. (quoted from [LightLDA](http://www.dmtk.io/lightlda.html))
+    ///
+    /// To illustrate the effect of this estimator on text, notice the similarity in values of the first and second rows compared to the third,
+    /// and see how those values are indicative of semantic similarities between those lines.
+    ///
+    /// | Topic1 | Topic2 |
+    /// | ------ | ------ |
+    /// | 0.8571 | 0.1429 |
+    /// | 0.8571 | 0.1429 |
+    /// | 0.4909 | 0.5091 |
+    ///
+    /// For more technical details you can consult the following papers.
+    /// * [LightLDA: Big Topic Models on Modest Computer Clusters](https://arxiv.org/abs/1412.1576)
+    /// * [LightLDA](https://github.com/Microsoft/LightLDA)
+    ///
+    /// ]]></format>
+    /// </remarks>
+    /// <seealso cref="TextCatalog.LatentDirichletAllocation(TransformsCatalog.TextTransforms, string, string, int, float, float, int, int, int, int, int, int, int, bool)"/>
     public sealed class LatentDirichletAllocationEstimator : IEstimator<LatentDirichletAllocationTransformer>
     {
         [BestFriend]
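To make the preprocessing-then-LDA pipeline these new remarks describe concrete, here is a minimal sketch wiring it up through the public MLContext catalog APIs. It is not part of this diff; the column names, sample text, and numberOfTopics value are illustrative assumptions.

using System;
using System.Collections.Generic;
using Microsoft.ML;

public class TextData
{
    public string Text { get; set; }
}

public class TransformedTextData
{
    public float[] Topics { get; set; }
}

public static class LdaPipelineSketch
{
    public static void Main()
    {
        var mlContext = new MLContext();

        // Toy documents mirroring the first two data points in the remarks above.
        var samples = new List<TextData>
        {
            new TextData { Text = "I like to eat bananas." },
            new TextData { Text = "I eat bananas everyday." },
        };
        var dataView = mlContext.Data.LoadFromEnumerable(samples);

        // Normalize and tokenize the text, map the tokens to keys, produce n-grams,
        // and hand the n-gram vectors to LDA, as the remarks suggest.
        var pipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text")
            .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalizedText"))
            .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
            .Append(mlContext.Transforms.Text.ProduceNgrams("Ngrams", "Tokens"))
            .Append(mlContext.Transforms.Text.LatentDirichletAllocation(
                "Topics", "Ngrams", numberOfTopics: 2));

        // Fitting yields the LatentDirichletAllocationTransformer documented above.
        var transformer = pipeline.Fit(dataView);
        var transformed = transformer.Transform(dataView);

        // Each row becomes a topic-distribution vector, like the Topic1/Topic2 table above.
        foreach (var row in mlContext.Data.CreateEnumerable<TransformedTextData>(
            transformed, reuseRowObject: false))
        {
            Console.WriteLine(string.Join(", ", row.Topics));
        }
    }
}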

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+2 −2

@@ -496,8 +496,8 @@ internal static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog
     => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);

 /// <summary>
-/// Uses <a href="https://arxiv.org/abs/1412.1576">LightLDA</a> to transform a document (represented as a vector of floats)
-/// into a vector of floats over a set of topics.
+/// Create a <see cref="LatentDirichletAllocationEstimator"/>, which uses <a href="https://arxiv.org/abs/1412.1576">LightLDA</a> to transform text (represented as a vector of floats)
+/// into a vector of floats indicating the similarity of the text with each topic identified.
 /// </summary>
 /// <param name="catalog">The transform's catalog.</param>
 /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
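This reworded summary documents the catalog extension that the estimator's seealso tag points at. Given an MLContext as in the earlier sketch, a minimal call might look as follows; the column names and topic count are again illustrative assumptions, not taken from this diff.

// Hypothetical usage of the catalog extension documented above.
var lda = mlContext.Transforms.Text.LatentDirichletAllocation(
    outputColumnName: "Topics",   // vector of floats: per-topic similarity scores
    inputColumnName: "Ngrams",    // n-gram counts from upstream text featurization
    numberOfTopics: 2);           // illustrative value for a tiny example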

src/Microsoft.ML.Transforms/Text/doc.xml

−34

@@ -150,40 +150,6 @@
   </example>
 </member>

-<member name="LightLDA">
-  <summary>
-    The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.
-  </summary>
-  <remarks>
-    Latent Dirichlet Allocation is a well-known topic modeling algorithm that infers topical structure from text data,
-    and can be used to featurize any text fields as low-dimensional topical vectors.
-    <para>LightLDA is an extremely efficient implementation of LDA developed in MSR-Asia that incorporates a number of
-    optimization techniques. See <a href="https://arxiv.org/abs/1412.1576">LightLDA: Big Topic Models on Modest Compute Clusters</a>.
-    </para>
-    <para>
-    With the LDA transform, ML.NET users can train a topic model to produce 1 million topics with 1 million vocabulary
-    on a 1-billion-token document set one a single machine in a few hours (typically, LDA at this scale takes days and requires large clusters).
-    The most significant innovation is a super-efficient O(1) <a href="https://en.wikipedia.org/wiki/Metropolis–Hastings_algorithm">Metropolis-Hastings sampling algorithm</a>,
-    whose running cost is (surprisingly) agnostic of model size,
-    allowing it to converges nearly an order of magnitude faster than other <a href="https://en.wikipedia.org/wiki/Gibbs_sampling">Gibbs samplers.</a>
-    </para>
-    <para>
-    For more details please see original LightLDA paper, and its open source implementation.
-    <list type="bullet">
-      <item><description><a href="https://arxiv.org/abs/1412.1576"> LightLDA: Big Topic Models on Modest Computer Clusters</a></description></item>
-      <item><description><a href=" https://github.com/Microsoft/LightLDA">LightLDA </a></description></item>
-    </list>
-    </para>
-  </remarks>
-</member>
-<example name="LightLDA">
-  <example>
-    <code language="csharp">
-      pipeline.Add(new LightLda(("InTextCol", "OutTextCol")));
-    </code>
-  </example>
-</example>
-
 <member name="WordEmbeddings">
   <summary>
     Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
