
Commit b301aec

Lda snapping to template (#3442)
1 parent f3f54cf commit b301aec

5 files changed: +62 −42 lines changed


src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+1 −1

@@ -28,7 +28,7 @@ public static class ConversionsExtensionsCatalog
 /// are vectors or scalars.</param>
 /// <param name="inputColumnName">Name of the column whose data will be hashed.
 /// If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
-/// This estimator operates over text, numeric, boolean, key or <see cref="DataViewRowId"/> data types. </param>
+/// This estimator operates over vectors or scalars of text, numeric, boolean, key or <see cref="DataViewRowId"/> data types. </param>
 /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
 /// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
 /// Text representation of original values are stored in the slot names of the annotations for the new column. Hashing, as such, can map many initial values to one.
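For context, a minimal sketch of creating the hashing estimator this doc change describes; the column names and settings below are hypothetical, not part of the commit:

    using Microsoft.ML;

    var mlContext = new MLContext();

    // Hash "Category" values into a 2^16-slot space. A positive
    // maximumNumberOfInverts keeps text representations of up to that many
    // original values per hash slot in the output column's slot names.
    var hashEstimator = mlContext.Transforms.Conversion.Hash(
        outputColumnName: "CategoryHashed",
        inputColumnName: "Category",
        numberOfBits: 16,
        maximumNumberOfInverts: 2);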

src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs

+1 −1

@@ -21,7 +21,7 @@ namespace Microsoft.ML.Transforms
 /// | | |
 /// | -- | -- |
 /// | Does this estimator need to look at the data to train its parameters? | Yes |
-/// | Input column data type | Vector or primitive numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [System.DateTime](xref:System.DateTime) and [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
+/// | Input column data type | Scalar numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [System.DateTime](xref:System.DateTime) or [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
 /// | Output column data type | [key](xref:Microsoft.ML.Data.KeyDataViewType)|
 ///
 /// The ValueToKeyMappingEstimator builds up term vocabularies (dictionaries) mapping the input values to the keys in the dictionary.
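For reference, a minimal sketch of the estimator this table documents, assuming a hypothetical string column named "Label":

    using Microsoft.ML;

    var mlContext = new MLContext();

    // Scans the training data to build a dictionary of distinct "Label" values,
    // then maps each value to its key (a category index) in the output column.
    var valueToKeyEstimator = mlContext.Transforms.Conversion.MapValueToKey("Label");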

src/Microsoft.ML.Transforms/Text/LdaTransform.cs

+53 −2

@@ -46,7 +46,9 @@ namespace Microsoft.ML.Transforms.Text
 //
 // See <a href="https://github.com/dotnet/machinelearning/blob/master/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs"/>
 // for an example on how to use LatentDirichletAllocationTransformer.
-/// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
+/// <summary>
+/// <see cref="ITransformer"/> resulting from fitting a <see cref="LatentDirichletAllocationEstimator"/>.
+/// </summary>
 public sealed class LatentDirichletAllocationTransformer : OneToOneTransformerBase
 {
     internal sealed class Options : TransformInputBase

@@ -936,7 +938,56 @@ private protected override IRowMapper MakeRowMapper(DataViewSchema schema)
     => new Mapper(this, schema);
 }

-/// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
+/// <summary>
+/// The LDA transform implements <a href="https://arxiv.org/abs/1412.1576">LightLDA</a>, a state-of-the-art implementation of Latent Dirichlet Allocation.
+/// </summary>
+/// <remarks>
+/// <format type="text/markdown"><![CDATA[
+///
+/// ### Estimator Characteristics
+/// | | |
+/// | -- | -- |
+/// | Does this estimator need to look at the data to train its parameters? | Yes |
+/// | Input column data type | Vector of <xref:System.Single> |
+/// | Output column data type | Vector of <xref:System.Single> |
+///
+/// Latent Dirichlet Allocation is a well-known [topic modeling](https://en.wikipedia.org/wiki/Topic_model) algorithm that infers semantic structure from text data,
+/// and ultimately helps answer the question "what is this document about?".
+/// It can be used to featurize any text fields as low-dimensional topical vectors.
+/// LightLDA is an extremely efficient implementation of LDA that incorporates a number of
+/// optimization techniques.
+/// With the LDA transform, ML.NET users can train a topic model to produce 1 million topics with a vocabulary of 1 million words
+/// on a 1-billion-token document set on a single machine in a few hours (typically, LDA at this scale takes days and requires large clusters).
+/// The most significant innovation is a super-efficient $O(1)$ [Metropolis-Hastings sampling algorithm](https://en.wikipedia.org/wiki/Metropolis–Hastings_algorithm),
+/// whose running cost is agnostic of model size, allowing it to converge nearly an order of magnitude faster than other [Gibbs samplers](https://en.wikipedia.org/wiki/Gibbs_sampling).
+///
+/// In an ML.NET pipeline, this estimator requires the output of some preprocessing as its input.
+/// A typical pipeline operating on text would require text normalization, tokenization and producing n-grams to supply to the LDA estimator.
+/// See the example in the See Also section for usage suggestions.
+///
+/// If we have the following three examples of text, as data points, and use the LDA transform with the number of topics set to 3,
+/// we would get the results displayed in the table below. Example documents:
+/// * I like to eat bananas.
+/// * I eat bananas everyday.
+/// * First celebrated in 1970, Earth Day now includes events in more than 193 countries,
+/// which are now coordinated globally by the Earth Day Network.
+///
+/// Notice the similarity in values of the first and second row, compared to the third,
+/// and see how those values are indicative of similarities between those two (small) bodies of text.
+///
+/// | Topic 1 | Topic 2 | Topic 3 |
+/// | ------- | ------- | ------- |
+/// | 0.5714  | 0.0000  | 0.4286  |
+/// | 0.5714  | 0.0000  | 0.4286  |
+/// | 0.2400  | 0.3200  | 0.4400  |
+///
+/// For more technical details, you can consult the following papers.
+/// * [LightLDA: Big Topic Models on Modest Computer Clusters](https://arxiv.org/abs/1412.1576)
+/// * [LightLDA](https://github.com/Microsoft/LightLDA)
+///
+/// ]]></format>
+/// </remarks>
+/// <seealso cref="TextCatalog.LatentDirichletAllocation(TransformsCatalog.TextTransforms, string, string, int, float, float, int, int, int, int, int, int, int, bool)"/>
 public sealed class LatentDirichletAllocationEstimator : IEstimator<LatentDirichletAllocationTransformer>
 {
     [BestFriend]
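The remarks above call for text normalization, tokenization and n-gram production ahead of the LDA estimator. A minimal sketch of such a pipeline follows, mirroring ML.NET's public samples; the "Text" input column and the other column names are hypothetical:

    using Microsoft.ML;

    var mlContext = new MLContext();

    // Preprocess raw text into n-gram counts, the vector of Single that the
    // LDA estimator operates over, then fit a 3-topic model as in the
    // bananas / Earth Day example above.
    var pipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text")
        // Split the normalized text into word tokens.
        .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalizedText"))
        // Map each token to a key so n-gram counts can be produced.
        .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
        // Produce n-gram counts as a vector of Single.
        .Append(mlContext.Transforms.Text.ProduceNgrams("Ngrams", "Tokens"))
        .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Topics", "Ngrams", numberOfTopics: 3));

    // var model = pipeline.Fit(dataView);  // dataView: an IDataView of documents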

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+7 −4

@@ -556,12 +556,15 @@ internal static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog
     => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);

 /// <summary>
-/// Uses <a href="https://arxiv.org/abs/1412.1576">LightLDA</a> to transform a document (represented as a vector of floats)
-/// into a vector of floats over a set of topics.
+/// Create a <see cref="LatentDirichletAllocationEstimator"/>, which uses <a href="https://arxiv.org/abs/1412.1576">LightLDA</a> to transform text (represented as a vector of floats)
+/// into a vector of <see cref="System.Single"/> indicating the similarity of the text with each topic identified.
 /// </summary>
 /// <param name="catalog">The transform's catalog.</param>
-/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+/// This estimator outputs a vector of <see cref="System.Single"/>.</param>
+/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
+/// This estimator operates over a vector of <see cref="System.Single"/>.
+/// </param>
 /// <param name="numberOfTopics">The number of topics.</param>
 /// <param name="alphaSum">Dirichlet prior on document-topic vectors.</param>
 /// <param name="beta">Dirichlet prior on vocab-topic vectors.</param>

src/Microsoft.ML.Transforms/Text/doc.xml

−34

@@ -150,40 +150,6 @@
 </example>
 </member>

-<member name="LightLDA">
-<summary>
-The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.
-</summary>
-<remarks>
-Latent Dirichlet Allocation is a well-known topic modeling algorithm that infers topical structure from text data,
-and can be used to featurize any text fields as low-dimensional topical vectors.
-<para>LightLDA is an extremely efficient implementation of LDA developed in MSR-Asia that incorporates a number of
-optimization techniques. See <a href="https://arxiv.org/abs/1412.1576">LightLDA: Big Topic Models on Modest Compute Clusters</a>.
-</para>
-<para>
-With the LDA transform, ML.NET users can train a topic model to produce 1 million topics with 1 million vocabulary
-on a 1-billion-token document set one a single machine in a few hours (typically, LDA at this scale takes days and requires large clusters).
-The most significant innovation is a super-efficient O(1) <a href="https://en.wikipedia.org/wiki/Metropolis–Hastings_algorithm">Metropolis-Hastings sampling algorithm</a>,
-whose running cost is (surprisingly) agnostic of model size,
-allowing it to converges nearly an order of magnitude faster than other <a href="https://en.wikipedia.org/wiki/Gibbs_sampling">Gibbs samplers.</a>
-</para>
-<para>
-For more details please see original LightLDA paper, and its open source implementation.
-<list type="bullet">
-<item><description><a href="https://arxiv.org/abs/1412.1576"> LightLDA: Big Topic Models on Modest Computer Clusters</a></description></item>
-<item><description><a href=" https://github.com/Microsoft/LightLDA">LightLDA </a></description></item>
-</list>
-</para>
-</remarks>
-</member>
-<example name="LightLDA">
-<example>
-<code language="csharp">
-pipeline.Add(new LightLda(("InTextCol", "OutTextCol")));
-</code>
-</example>
-</example>
-
 <member name="WordEmbeddings">
 <summary>
 Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
