
Commit 4797745

Lda snapping to template
1 parent ea5c095 commit 4797745

File tree

3 files changed: +55 −38 lines changed


src/Microsoft.ML.Transforms/Text/LdaTransform.cs

+53 −2

@@ -46,7 +46,9 @@ namespace Microsoft.ML.Transforms.Text
     //
     // See <a href="https://github.com/dotnet/machinelearning/blob/master/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs"/>
     // for an example on how to use LatentDirichletAllocationTransformer.
-    /// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
+    /// <summary>
+    /// <see cref="ITransformer"/> resulting from fitting a <see cref="LatentDirichletAllocationEstimator"/>.
+    /// </summary>
     public sealed class LatentDirichletAllocationTransformer : OneToOneTransformerBase
     {
         internal sealed class Options : TransformInputBase
@@ -936,7 +938,56 @@ private protected override IRowMapper MakeRowMapper(DataViewSchema schema)
             => new Mapper(this, schema);
     }

-    /// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
+    /// <summary>
+    /// The LDA transform implements <a href="https://arxiv.org/abs/1412.1576">LightLDA</a>, a state-of-the-art implementation of Latent Dirichlet Allocation.
+    /// </summary>
+    /// <remarks>
+    /// <format type="text/markdown"><![CDATA[
+    ///
+    /// ### Estimator Characteristics
+    /// |  |  |
+    /// | -- | -- |
+    /// | Does this estimator need to look at the data to train its parameters? | Yes |
+    /// | Input column data type | [key](xref:Microsoft.ML.Data.KeyDataViewType) data types |
+    /// | Output column data type | Vector of <xref:System.Single> |
+    ///
+    /// Latent Dirichlet Allocation is a well-known [topic modeling](https://en.wikipedia.org/wiki/Topic_model) algorithm that infers semantic structure from text data,
+    /// and ultimately helps answer the question "what is this document about?".
+    /// It can be used to featurize any text field as a low-dimensional topical vector.
+    /// LightLDA is an extremely efficient implementation of LDA, developed at MSR-Asia, that incorporates a number of
+    /// optimization techniques.
+    /// With the LDA transform, ML.NET users can train a topic model producing 1 million topics with a 1-million-word vocabulary
+    /// on a 1-billion-token document set on a single machine in a few hours (typically, LDA at this scale takes days and requires large clusters).
+    /// The most significant innovation is a super-efficient O(1) [Metropolis-Hastings sampling algorithm](https://en.wikipedia.org/wiki/Metropolis–Hastings_algorithm),
+    /// whose running cost is agnostic of model size, allowing it to converge nearly an order of magnitude faster than other [Gibbs samplers](https://en.wikipedia.org/wiki/Gibbs_sampling).
+    ///
+    /// In an ML.NET pipeline, this estimator requires the output of some preprocessing as its input.
+    /// A typical pipeline operating on text would perform text normalization, tokenization, and n-gram production, and then supply the n-grams to LDA.
+    /// See the example usage in the See Also section for usage suggestions.
+    ///
+    /// If we have the following three lines of text as data points:
+    /// * I like to eat bananas.
+    /// * I eat bananas everyday.
+    /// * LightLDA improves the sampling throughput and convergence speed via a novel O(1) Metropolis-Hastings sampler,
+    ///   and allows a small cluster of machines to tackle very large data and model sizes based on the model scheduling
+    ///   and data parallelism capabilities of the DMTK parameter server. (quoted from [LightLDA](http://www.dmtk.io/lightlda.html))
+    ///
+    /// To illustrate the effect of this estimator on text, notice the similarity in values of the first and second rows compared to the third,
+    /// and see how those values are indicative of semantic similarities between those lines.
+    ///
+    /// | Topic1 | Topic2 |
+    /// | ------ | ------ |
+    /// | 0.8571 | 0.1429 |
+    /// | 0.8571 | 0.1429 |
+    /// | 0.4909 | 0.5091 |
+    ///
+    /// For more technical details you can consult the following papers.
+    /// * [LightLDA: Big Topic Models on Modest Computer Clusters](https://arxiv.org/abs/1412.1576)
+    /// * [LightLDA](https://github.com/Microsoft/LightLDA)
+    ///
+    /// ]]></format>
+    /// </remarks>
+    /// <seealso cref="TextCatalog.LatentDirichletAllocation(TransformsCatalog.TextTransforms, string, string, int, float, float, int, int, int, int, int, int, int, bool)"/>
     public sealed class LatentDirichletAllocationEstimator : IEstimator<LatentDirichletAllocationTransformer>
     {
         [BestFriend]
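To make the preprocessing-then-LDA pipeline these new remarks describe concrete, here is a minimal sketch wiring it up through the public MLContext catalog APIs. It is not part of this diff; the column names, sample text, and numberOfTopics value are illustrative assumptions.

using System;
using System.Collections.Generic;
using Microsoft.ML;

public class TextData
{
    public string Text { get; set; }
}

public class TransformedTextData
{
    public float[] Topics { get; set; }
}

public static class LdaPipelineSketch
{
    public static void Main()
    {
        var mlContext = new MLContext();

        // Toy documents mirroring the first two data points in the remarks above.
        var samples = new List<TextData>
        {
            new TextData { Text = "I like to eat bananas." },
            new TextData { Text = "I eat bananas everyday." },
        };
        var dataView = mlContext.Data.LoadFromEnumerable(samples);

        // Normalize and tokenize the text, map the tokens to keys, produce n-grams,
        // and hand the n-gram vectors to LDA, as the remarks suggest.
        var pipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text")
            .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalizedText"))
            .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
            .Append(mlContext.Transforms.Text.ProduceNgrams("Ngrams", "Tokens"))
            .Append(mlContext.Transforms.Text.LatentDirichletAllocation(
                "Topics", "Ngrams", numberOfTopics: 2));

        // Fitting yields the LatentDirichletAllocationTransformer documented above.
        var transformer = pipeline.Fit(dataView);
        var transformed = transformer.Transform(dataView);

        // Each row becomes a topic-distribution vector, like the Topic1/Topic2 table above.
        foreach (var row in mlContext.Data.CreateEnumerable<TransformedTextData>(
            transformed, reuseRowObject: false))
        {
            Console.WriteLine(string.Join(", ", row.Topics));
        }
    }
}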

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+2 −2

@@ -496,8 +496,8 @@ internal static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog
     => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);

 /// <summary>
-/// Uses <a href="https://arxiv.org/abs/1412.1576">LightLDA</a> to transform a document (represented as a vector of floats)
-/// into a vector of floats over a set of topics.
+/// Create a <see cref="LatentDirichletAllocationEstimator"/>, which uses <a href="https://arxiv.org/abs/1412.1576">LightLDA</a> to transform text (represented as a vector of floats)
+/// into a vector of floats indicating the similarity of the text with each topic identified.
 /// </summary>
 /// <param name="catalog">The transform's catalog.</param>
 /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
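This reworded summary documents the catalog extension that the estimator's seealso tag points at. Given an MLContext as in the earlier sketch, a minimal call might look as follows; the column names and topic count are again illustrative assumptions, not taken from this diff.

// Hypothetical usage of the catalog extension documented above.
var lda = mlContext.Transforms.Text.LatentDirichletAllocation(
    outputColumnName: "Topics",   // vector of floats: per-topic similarity scores
    inputColumnName: "Ngrams",    // n-gram counts from upstream text featurization
    numberOfTopics: 2);           // illustrative value for a tiny example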

src/Microsoft.ML.Transforms/Text/doc.xml

−34

@@ -150,40 +150,6 @@
   </example>
 </member>

-<member name="LightLDA">
-  <summary>
-    The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.
-  </summary>
-  <remarks>
-    Latent Dirichlet Allocation is a well-known topic modeling algorithm that infers topical structure from text data,
-    and can be used to featurize any text fields as low-dimensional topical vectors.
-    <para>LightLDA is an extremely efficient implementation of LDA developed in MSR-Asia that incorporates a number of
-    optimization techniques. See <a href="https://arxiv.org/abs/1412.1576">LightLDA: Big Topic Models on Modest Compute Clusters</a>.
-    </para>
-    <para>
-    With the LDA transform, ML.NET users can train a topic model to produce 1 million topics with 1 million vocabulary
-    on a 1-billion-token document set one a single machine in a few hours (typically, LDA at this scale takes days and requires large clusters).
-    The most significant innovation is a super-efficient O(1) <a href="https://en.wikipedia.org/wiki/Metropolis–Hastings_algorithm">Metropolis-Hastings sampling algorithm</a>,
-    whose running cost is (surprisingly) agnostic of model size,
-    allowing it to converges nearly an order of magnitude faster than other <a href="https://en.wikipedia.org/wiki/Gibbs_sampling">Gibbs samplers.</a>
-    </para>
-    <para>
-    For more details please see original LightLDA paper, and its open source implementation.
-    <list type="bullet">
-      <item><description><a href="https://arxiv.org/abs/1412.1576"> LightLDA: Big Topic Models on Modest Computer Clusters</a></description></item>
-      <item><description><a href=" https://github.com/Microsoft/LightLDA">LightLDA </a></description></item>
-    </list>
-    </para>
-  </remarks>
-</member>
-<example name="LightLDA">
-  <example>
-    <code language="csharp">
-      pipeline.Add(new LightLda(("InTextCol", "OutTextCol")));
-    </code>
-  </example>
-</example>
-
 <member name="WordEmbeddings">
   <summary>
     Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
