dotnet · artidoro · Apr 19, 2019 · Apr 18, 2019 · Apr 19, 2019 · Apr 19, 2019
diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs
@@ -30,8 +30,7 @@
 namespace Microsoft.ML.Transforms.Text
 {
     /// <summary>
-    /// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
-    /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+    /// <see cref="ITransformer"/> resulting from fitting an <see cref="NgramExtractingEstimator"/>.
     /// </summary>
     public sealed class NgramExtractingTransformer : OneToOneTransformerBase
     {
@@ -668,9 +667,29 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
     }
 
     /// <summary>
-    /// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
-    /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+    /// Produces a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
     /// </summary>
+    /// <remarks>
+    /// <format type="text/markdown"><![CDATA[
+    ///
+    /// ###  Estimator Characteristics
+    /// |  |  |
+    /// | -- | -- |
+    /// | Does this estimator need to look at the data to train its parameters? | Yes |
+    /// | Input column data type | Vector of [Keys](xref:Microsoft.ML.Data.KeyDataViewType) |
+    /// | Output column data type | Known-sized vector of <xref:System.Single> |
+    ///
+    /// The resulting <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
+    /// creates a new column, named as specified in the output column name parameters, where each
+    /// input vector is mapped to a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
+    ///
+    /// The estimator builds a dictionary of ngrams and the <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
+    /// uses the id in the dictionary as the index in the count vector that it produces.
+    ///
+    /// See the See Also section for links to examples of the usage.
+    /// ]]></format>
+    /// </remarks>
+    /// <seealso cref="TextCatalog.ProduceNgrams(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, WeightingCriteria)"/>
     public sealed class NgramExtractingEstimator : IEstimator<NgramExtractingTransformer>
     {
         /// <summary>
@@ -679,12 +698,18 @@ public sealed class NgramExtractingEstimator : IEstimator<NgramExtractingTransfo
         /// </summary>
         public enum WeightingCriteria
         {
+            /// <summary>Term Frequency. Calculated based on the number of occurrences in the document.</summary>
             [EnumValueDisplay("TF (Term Frequency)")]
             Tf = 0,
 
+            /// <summary>
+            /// Inverse Document Frequency. A ratio (the logarithm of inverse relative frequency)
+            /// that measures the information a slot provides by determining how common or rare it is across the entire corpus.
+            /// </summary>
             [EnumValueDisplay("IDF (Inverse Document Frequency)")]
             Idf = 1,
 
+            /// <summary>The product of the term frequency and the inverse document frequency.</summary>
             [EnumValueDisplay("TF-IDF")]
             TfIdf = 2
         }
@@ -782,7 +807,7 @@ internal static bool IsSchemaColumnValid(SchemaShape.Column col)
                 return false;
             if (!col.IsKey)
                 return false;
-            // Can only accept key types that can be converted to U4.
+            // Can only accept key types that can be converted to U4 (uint); U8 (ulong) keys are rejected.
             if (!NgramUtils.IsValidNgramRawType(col.ItemType.RawType))
                 return false;
             return true;

diff --git a/src/Microsoft.ML.Transforms/Text/NgramUtils.cs b/src/Microsoft.ML.Transforms/Text/NgramUtils.cs
@@ -206,7 +206,7 @@ internal static class NgramUtils
     {
         public static bool IsValidNgramRawType(Type rawType)
         {
-            // Can only accept key types that can be converted to U4 (uint).
+            // Can only accept key types that can be converted to U4 (uint), so U8 (ulong) is rejected.
             return rawType != typeof(ulong);
         }
     }

diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -57,11 +57,15 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
                 outputColumnName, inputColumnNames, options);
 
         /// <summary>
-        /// Tokenize incoming text in <paramref name="inputColumnName"/> and output the tokens as <paramref name="outputColumnName"/>.
+        /// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes characters by splitting text into sequences of characters
+        /// using a sliding window.
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will be a variable-sized vector of keys.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the
+        /// <paramref name="outputColumnName"/> will be used as source.
+        /// This estimator operates over text data type.</param>
         /// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
         /// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
         /// <example>
@@ -85,7 +89,6 @@ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this
         /// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
         /// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
         /// <param name="columns">Pairs of columns to run the tokenization on.</param>
-
         [BestFriend]
         internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
             bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
@@ -97,12 +100,15 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi
         }
 
         /// <summary>
-        /// Normalizes incoming text in <paramref name="inputColumnName"/> by changing case, removing diacritical marks, punctuation marks and/or numbers
-        /// and outputs new text as <paramref name="outputColumnName"/>.
+        /// Creates a <see cref="TextNormalizingEstimator"/>, which normalizes incoming text in <paramref name="inputColumnName"/> by optionally
+        /// changing case and removing diacritical marks, punctuation marks, and/or numbers, and outputs the new text as <paramref name="outputColumnName"/>.
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will remain scalar of text or a vector of text depending on the input column data type.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
+        /// the value of the <paramref name="outputColumnName"/> will be used as source.
+        /// This estimator operates on text or vector of text data types.</param>
         /// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
         /// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
         /// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
@@ -124,10 +130,16 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
             => new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
                 outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
 
-        /// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
+        /// <summary>
+        /// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts a vector
+        /// of text into a numerical vector using pre-trained embeddings models.
+        /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will be a vector of <see cref="System.Single"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
+        /// the value of the <paramref name="outputColumnName"/> will be used as source.
+        /// This estimator operates over known-sized vector of text data type.</param>
         /// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
         /// <example>
         /// <format type="text/markdown">
@@ -142,11 +154,17 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
             WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
             => new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);
 
-        /// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
+        /// <summary>
+        /// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
+        /// of text into numerical vectors using pre-trained embeddings models.
+        /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="customModelFile">The path of the pre-trained embeedings model to use. </param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform.</param>
+        /// <param name="customModelFile">The path of the pre-trained embeddings model to use.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will be a vector of <see cref="System.Single"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
+        /// the value of the <paramref name="outputColumnName"/> will be used as source.
+        /// This estimator operates over known-sized vector of text data type.</param>
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
@@ -161,10 +179,13 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
             => new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
                 outputColumnName, customModelFile, inputColumnName ?? outputColumnName);
 
-        /// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
+        /// <summary>
+        /// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
+        /// of text into numerical vectors using pre-trained embeddings models.
+        /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
         /// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
-        /// <param name="columns">The array columns, and per-column configurations to extract embeedings from.</param>
+        /// <param name="columns">The array columns, and per-column configurations to extract embeddings from.</param>
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
@@ -179,12 +200,13 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog
             => new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);
 
         /// <summary>
-        /// Tokenizes incoming text in <paramref name="inputColumnName"/>, using <paramref name="separators"/> as separators,
-        /// and outputs the tokens as <paramref name="outputColumnName"/>.
+        /// Create a <see cref="WordTokenizingEstimator"/>, which tokenizes input text using <paramref name="separators"/> as separators.
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will be a variable-sized vector of text.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
+        /// This estimator operates on scalar of text and vector of text data type.</param>
         /// <param name="separators">The separators to use (uses space character by default).</param>
         /// <example>
         /// <format type="text/markdown">
@@ -210,17 +232,21 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
           => new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);
 
         /// <summary>
-        /// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
-        /// and outputs bag of word vector as <paramref name="outputColumnName"/>
+        /// Creates a <see cref="NgramExtractingEstimator"/> which produces a vector of counts of ngrams (sequences of consecutive words)
+        /// encountered in the input text.
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+        /// This column's data type will be a vector of <see cref="System.Single"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
+        /// This estimator operates over vectors of keys data type.</param>
         /// <param name="ngramLength">Ngram length.</param>
-        /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
+        /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram. By default no token is skipped.</param>
         /// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
         /// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
-        /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
+        /// <param name="weighting">Statistical measure used to evaluate how important a word or ngram is to a document in a corpus.
+        /// When <paramref name="maximumNgramsCount"/> is smaller than the total number of encountered ngrams this measure is used
+        /// to determine which ngrams to keep.</param>
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[