Update documentation for WordBag (#3440)

Ivanidzo4ka · web-flow · commit c0832b53264e · 2019-04-20T10:28:08.000-07:00
diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs
@@ -866,11 +866,11 @@ public VBuffer<ReadOnlyMemory<char>>[] SlotNamesMetadata(out VectorDataViewType[
     /// |  |  |
     /// | -- | -- |
     /// | Does this estimator need to look at the data to train its parameters? | Yes |
-    /// | Input column data type | Vector of [Key](<xref:Microsoft.ML.Data.KeyDataViewType>) |
+    /// | Input column data type | Vector of [Key](xref:Microsoft.ML.Data.KeyDataViewType) |
     /// | Output column data type | Vector of known size of <xref:System.Single> |
     ///
     /// The resulting <xref:Microsoft.ML.Transforms.Text.NgramHashingTransformer/> creates a new column, named as specified in the output column name parameters, and
-    /// produces a vector of counts of n-grams (sequences of consecutive words of length 1-n) from a given data.
+    /// produces a vector of n-gram counts (sequences of consecutive words of length 1-n) from the given data.
     /// It does so by hashing each n-gram and using the hash value as the index in the bag.
     ///
     /// <xref:Microsoft.ML.Transforms.Text.NgramHashingEstimator> is different from <xref:Microsoft.ML.Transforms.Text.WordHashBagEstimator>
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -323,12 +323,18 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa
             => new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords);
 
         /// <summary>
-        /// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
-        /// and outputs bag of word vector as <paramref name="outputColumnName"/>
+        /// Create a <see cref="WordBagEstimator"/>, which maps the column specified in <paramref name="inputColumnName"/>
+        /// to a vector of n-gram counts in a new column named <paramref name="outputColumnName"/>.
         /// </summary>
-        /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <remarks>
+        /// <see cref="WordBagEstimator"/> is different from <see cref="NgramExtractingEstimator"/> in that the former
+        /// tokenizes text internally and the latter takes tokenized text as input.
+        /// </remarks>
+        /// <param name="catalog">The transform's catalog.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
+        /// <param name="inputColumnName">Name of the column to take the data from.
+        /// This estimator operates over a vector of text.</param>
         /// <param name="ngramLength">Ngram length.</param>
         /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
         /// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
@@ -346,12 +352,18 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf
                 outputColumnName, inputColumnName, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
 
         /// <summary>
-        /// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnNames"/>
-        /// and outputs bag of word vector as <paramref name="outputColumnName"/>
+        /// Create a <see cref="WordBagEstimator"/>, which maps the multiple columns specified in <paramref name="inputColumnNames"/>
+        /// to a vector of n-gram counts in a new column named <paramref name="outputColumnName"/>.
         /// </summary>
-        /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.</param>
-        /// <param name="inputColumnNames">Name of the columns to transform.</param>
+        /// <remarks>
+        /// <see cref="WordBagEstimator"/> is different from <see cref="NgramExtractingEstimator"/> in that the former
+        /// tokenizes text internally and the latter takes tokenized text as input.
+        /// </remarks>
+        /// <param name="catalog">The transform's catalog.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.
+        /// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
+        /// <param name="inputColumnNames">Names of the multiple columns to take the data from.
+        /// This estimator operates over a vector of text.</param>
         /// <param name="ngramLength">Ngram length.</param>
         /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
         /// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
@@ -369,12 +381,18 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf
                 outputColumnName, inputColumnNames, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
 
         /// <summary>
-        /// Produces a bag of counts of hashed ngrams in <paramref name="inputColumnName"/>
-        /// and outputs bag of word vector as <paramref name="outputColumnName"/>
+        /// Create a <see cref="WordHashBagEstimator"/>, which maps the column specified in <paramref name="inputColumnName"/>
+        /// to a vector of counts of hashed n-grams in a new column named <paramref name="outputColumnName"/>.
         /// </summary>
-        /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <remarks>
+        /// <see cref="WordHashBagEstimator"/> is different from <see cref="NgramHashingEstimator"/> in that the former
+        /// tokenizes text internally and the latter takes tokenized text as input.
+        /// </remarks>
+        /// <param name="catalog">The transform's catalog.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
+        /// <param name="inputColumnName">Name of the column to take the data from.
+        /// This estimator operates over a vector of text.</param>
         /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
         /// <param name="ngramLength">Ngram length.</param>
         /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
@@ -401,12 +419,18 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog.
                 maximumNumberOfInverts: maximumNumberOfInverts);
 
         /// <summary>
-        /// Produces a bag of counts of hashed ngrams in <paramref name="inputColumnNames"/>
-        /// and outputs bag of word vector as <paramref name="outputColumnName"/>
+        /// Create a <see cref="WordHashBagEstimator"/>, which maps the multiple columns specified in <paramref name="inputColumnNames"/>
+        /// to a vector of counts of hashed n-grams in a new column named <paramref name="outputColumnName"/>.
         /// </summary>
-        /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.</param>
-        /// <param name="inputColumnNames">Name of the columns to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <remarks>
+        /// <see cref="WordHashBagEstimator"/> is different from <see cref="NgramHashingEstimator"/> in that the former
+        /// tokenizes text internally and the latter takes tokenized text as input.
+        /// </remarks>
+        /// <param name="catalog">The transform's catalog.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.
+        /// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
+        /// <param name="inputColumnNames">Names of the multiple columns to take the data from.
+        /// This estimator operates over a vector of text.</param>
         /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
         /// <param name="ngramLength">Ngram length.</param>
         /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs
@@ -10,11 +10,30 @@
 
 namespace Microsoft.ML.Transforms.Text
 {
-
     /// <summary>
-    /// Produces a bag of counts of ngrams (sequences of consecutive words) in a given text.
-    /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+    /// <see cref="IEstimator{TTransformer}"/> for the <see cref="ITransformer"/>.
     /// </summary>
+    /// <remarks>
+    /// <format type="text/markdown"><![CDATA[
+    /// ###  Estimator Characteristics
+    /// |  |  |
+    /// | -- | -- |
+    /// | Does this estimator need to look at the data to train its parameters? | Yes |
+    /// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
+    /// | Output column data type | Vector of known-size of <xref:System.Single> |
+    ///
+    /// The resulting <xref:Microsoft.ML.ITransformer> creates a new column, named as specified in the output column name parameters, and
+    /// produces a vector of n-gram counts (sequences of n consecutive words) from the given data.
+    /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+    ///
+    /// <xref:Microsoft.ML.Transforms.Text.WordBagEstimator> is different from <xref:Microsoft.ML.Transforms.Text.NgramExtractingEstimator>
+    /// in that the former tokenizes text internally while the latter takes tokenized text as input.
+    /// See the See Also section for links to examples of the usage.
+    /// ]]>
+    /// </format>
+    /// </remarks>
+    /// <seealso cref="TextCatalog.ProduceWordBags(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, NgramExtractingEstimator.WeightingCriteria)" />
+    /// <seealso cref="TextCatalog.ProduceWordBags(TransformsCatalog.TextTransforms, string, string[], int, int, bool, int, NgramExtractingEstimator.WeightingCriteria)" />
     public sealed class WordBagEstimator : IEstimator<ITransformer>
     {
         private readonly IHost _host;
@@ -182,9 +201,29 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
     }
 
     /// <summary>
-    /// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given text.
-    /// It does so by hashing each ngram and using the hash value as the index in the bag.
+    /// <see cref="IEstimator{TTransformer}"/> for the <see cref="ITransformer"/>.
     /// </summary>
+    /// <remarks>
+    /// <format type="text/markdown"><![CDATA[
+    /// ###  Estimator Characteristics
+    /// |  |  |
+    /// | -- | -- |
+    /// | Does this estimator need to look at the data to train its parameters? | Yes |
+    /// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
+    /// | Output column data type | Vector of known-size of <xref:System.Single> |
+    ///
+    /// The resulting <xref:Microsoft.ML.ITransformer> creates a new column, named as specified in the output column name parameters, and
+    /// produces a vector of n-gram counts (sequences of n consecutive words) from the given data.
+    /// It does so by hashing each ngram and using the hash value as the index in the bag.
+    ///
+    /// <xref:Microsoft.ML.Transforms.Text.WordHashBagEstimator> is different from <xref:Microsoft.ML.Transforms.Text.NgramHashingEstimator>
+    /// in that the former tokenizes text internally while the latter takes tokenized text as input.
+    /// See the See Also section for links to examples of the usage.
+    /// ]]>
+    /// </format>
+    /// </remarks>
+    /// <seealso cref="TextCatalog.ProduceHashedWordBags(TransformsCatalog.TextTransforms, string, string, int, int, int, bool, uint, bool, int)" />
+    /// <seealso cref="TextCatalog.ProduceHashedWordBags(TransformsCatalog.TextTransforms, string, string[], int, int, int, bool, uint, bool, int)" />
     public sealed class WordHashBagEstimator : IEstimator<ITransformer>
     {
         private readonly IHost _host;