review comments

artidoro · artidoro · commit 3f4b9971adba · 2019-04-19T12:26:43.000-07:00
diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs
@@ -676,15 +676,17 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
     /// |  |  |
     /// | -- | -- |
     /// | Does this estimator need to look at the data to train its parameters? | Yes |
-    /// | Input column data type | Vector of [Keys](<xref:Microsoft.ML.Data.KeyDataViewType>) |
+    /// | Input column data type | Vector of [Keys](xref:Microsoft.ML.Data.KeyDataViewType) |
     /// | Output column data type | Known-sized vector of <xref:System.Single> |
     ///
-    /// The resulting [NgramExtractingTransformer]<xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
+    /// The resulting <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
     /// creates a new column, named as specified in the output column name parameters, where each
     /// input vector is mapped to a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
     ///
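-    /// The estimator builds a dictionary of ngrams and the [NgramExtractingTransformer]<xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
+    /// The estimator builds a dictionary of ngrams and the <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>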
     /// uses the id in the dictionary as the index in the count vector that it produces.
+    ///
+    /// See the See Also section for links to examples of the usage.
     /// ]]></format>
     /// </remarks>
     /// <seealso cref="TextCatalog.ProduceNgrams(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, WeightingCriteria)"/>
@@ -700,8 +702,10 @@ public enum WeightingCriteria
             [EnumValueDisplay("TF (Term Frequency)")]
             Tf = 0,
 
-            /// <summary>Inverse Document Frequency. A ratio (the logarithm of inverse relative frequency)
-            /// that measures the information a slot provides by determining how common or rare it is across the entire corpus.</summary>
+            /// <summary>
+            /// Inverse Document Frequency. A ratio (the logarithm of inverse relative frequency)
+            /// that measures the information a slot provides by determining how common or rare it is across the entire corpus.
+            /// </summary>
             [EnumValueDisplay("IDF (Inverse Document Frequency)")]
             Idf = 1,
 
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -62,7 +62,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
-        /// This column's data type will be a vector of keys.</param>
+        /// This column's data type will be a variable-sized vector of keys.</param>
         /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the
         /// <paramref name="outputColumnName"/> will be used as source.
         /// This estimator operates over text data type.</param>
@@ -100,15 +100,15 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi
         }
 
         /// <summary>
-        /// Creates a <see cref="TextNormalizingEstimator"/>, which normalizes incoming text in <paramref name="inputColumnName"/> by changing case,
-        /// removing diacritical marks, punctuation marks and/or numbers and outputs new text as <paramref name="outputColumnName"/>.
+        /// Creates a <see cref="TextNormalizingEstimator"/>, which normalizes incoming text in <paramref name="inputColumnName"/> by optionally
+        /// changing case and/or removing diacritical marks, punctuation marks, and numbers, and outputs the new text as <paramref name="outputColumnName"/>.
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
         /// This column's data type will remain scalar of text or a vector of text depending on the input column data type.</param>
         /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
         /// the value of the <paramref name="outputColumnName"/> will be used as source.
-        /// This estimator operates on text and vector of text data types.</param>
+        /// This estimator operates on text or vector of text data types.</param>
         /// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
         /// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
         /// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
@@ -131,8 +131,8 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
                 outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
 
         /// <summary>
-        /// Create an <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
-        /// of text into numerical vectors using pre-trained embeddings models.
+        /// Creates a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts a vector
+        /// of text into a numerical vector using pre-trained embeddings models.
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
@@ -204,7 +204,7 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog
         /// </summary>
         /// <param name="catalog">The text-related transform's catalog.</param>
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
-        /// This column's data type will be a vector of text.</param>
+        /// This column's data type will be a variable-sized vector of text.</param>
         /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
-        /// This estimator operates of text data type.</param>
+        /// This estimator operates over text data type.</param>
         /// <param name="separators">The separators to use (uses space character by default).</param>
@@ -241,7 +241,7 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
         /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
         /// This estimator operates over vectors of keys data type.</param>
         /// <param name="ngramLength">Ngram length.</param>
-        /// <param name="skipLength">Number of tokens to skip between each ngram. By defaults no token is skipped.</param>
+        /// <param name="skipLength">Number of tokens to skip between each ngram. By default no token is skipped.</param>
         /// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
         /// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
         /// <param name="weighting">Statistical measure used to evaluate how important a word or ngram is to a document in a corpus.
diff --git a/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs b/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs
@@ -440,11 +440,13 @@ private bool IsCombiningDiacritic(char ch)
     /// |  |  |
     /// | -- | -- |
     /// | Does this estimator need to look at the data to train its parameters? | No |
-    /// | Input column data type | <xref:System.ReadOnlyMemory{System.Char}> or vector of <xref:System.ReadOnlyMemory{System.Char}> |
+    /// | Input column data type | [Text](xref:Microsoft.ML.Data.TextDataViewType) or vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
     /// | Output column data type | The same as the data type in the input column |
     ///
-    /// The resulting transformer creates a new column, named as specified in the output column name parameters, and
-    /// normalizes the textual input data by changing case, removing diacritical marks, punctuation marks and/or numbers.
+    /// The resulting <xref:Microsoft.ML.Transforms.Text.TextNormalizingTransformer> creates a new column, named as specified
+    /// in the output column name parameters, and normalizes the textual input data by changing case, removing diacritical marks,
+    /// punctuation marks and/or numbers.
+    ///
     /// See the See Also section for links to examples of the usage.
     /// ]]>
     /// </format>
diff --git a/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs b/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs
@@ -556,14 +556,16 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(DataViewRow input, int iinfo)
     /// |  |  |
     /// | -- | -- |
     /// | Does this estimator need to look at the data to train its parameters? | Yes |
-    /// | Input column data type | <xref:System.ReadOnlyMemory{System.Char}> |
-    /// | Output column data type | Variable-sized vector of [Keys](<xref:Microsoft.ML.Data.KeyDataViewType>) |
+    /// | Input column data type | Scalar of [Text](xref:Microsoft.ML.Data.TextDataViewType)  |
+    /// | Output column data type | Variable-sized vector of [Keys](xref:Microsoft.ML.Data.KeyDataViewType) |
     ///
     /// The estimator tokenizes characters by splitting text into sequences of characters using a sliding window.
     /// During training, the estimator builds a key-value pair dictionary with the encountered sequences of characters.
     ///
-    /// The transformer resulting from fitting the estimator creates a new column, named as specified in the output
-    /// column name parameters, which contains the keys of the sequences of characters that were encountered in the input.
+    /// The <xref:Microsoft.ML.Transforms.Text.TokenizingByCharactersTransformer> resulting from fitting the estimator
+    /// creates a new column, named as specified in the output column name parameters, which contains the keys of the
+    /// sequences of characters that were encountered in the input.
+    ///
     /// See the See Also section for links to examples of the usage.
     /// ]]>
     /// </format>
diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs
@@ -734,10 +734,10 @@ private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironme
     /// |  |  |
     /// | -- | -- |
     /// | Does this estimator need to look at the data to train its parameters? | No |
-    /// | Input column data type | Vector of <xref:System.ReadOnlyMemory{System.Char}> |
+    /// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType)  |
     /// | Output column data type | Known-sized vector of <xref:System.Single> |
     ///
-    /// The [WordEmbeddingTransformer](<xref:Microsoft.ML.Transforms.Text.WordEmbeddingTransformer>) produces a new column,
+    /// The <xref:Microsoft.ML.Transforms.Text.WordEmbeddingTransformer> produces a new column,
     /// named as specified in the output column name parameters, where each input vector is mapped to a numerical vector
     /// with size of 3 * dimensionality of the embedding model used. Notice that this is independent of the size of the input vector.
     ///
@@ -750,6 +750,8 @@ private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironme
     /// The user can specify a custom pre-trained embeddings model or one of the available pre-trained models.
     /// The available options are various versions of [GloVe Models](https://nlp.stanford.edu/projects/glove/),
     /// [FastText](https://en.wikipedia.org/wiki/FastText), and [SSWE](https://anthology.aclweb.org/P/P14/P14-1146.pdf).
+    ///
+    /// See the See Also section for links to examples of the usage.
     /// ]]></format>
     /// </remarks>
     /// <seealso cref="TextCatalog.ApplyWordEmbedding(TransformsCatalog.TextTransforms, string, string, PretrainedModelKind)"/>
@@ -826,46 +828,67 @@ internal WordEmbeddingEstimator(IHostEnvironment env, string customModelFile, pa
         /// </summary>
         public enum PretrainedModelKind
         {
-            /// <summary>GloVe 50 dimensional word embeddings.</summary>
+            /// <summary>
+            /// GloVe 50 dimensional word embeddings.
+            /// </summary>
             [TGUI(Label = "GloVe 50D")]
             GloVe50D = 0,
 
-            /// <summary>GloVe 100 dimensional word embeddings.</summary>
+            /// <summary>
+            /// GloVe 100 dimensional word embeddings.
+            /// </summary>
             [TGUI(Label = "GloVe 100D")]
             GloVe100D = 1,
 
-            /// <summary>GloVe 200 dimensional word embeddings.</summary>
+            /// <summary>
+            /// GloVe 200 dimensional word embeddings.
+            /// </summary>
             [TGUI(Label = "GloVe 200D")]
             GloVe200D = 2,
 
-            /// <summary>GloVe 300 dimensional word embeddings.</summary>
+            /// <summary>
+            /// GloVe 300 dimensional word embeddings.
+            /// </summary>
             [TGUI(Label = "GloVe 300D")]
             GloVe300D = 3,
 
-            /// <summary>GloVe 25 dimensional word embeddings trained on Twitter data.</summary>
+            /// <summary>
+            /// GloVe 25 dimensional word embeddings trained on Twitter data.
+            /// </summary>
             [TGUI(Label = "GloVe Twitter 25D")]
             GloVeTwitter25D = 4,
 
-            /// <summary>GloVe 50 dimensional word embeddings trained on Twitter data.</summary>
+            /// <summary>
+            /// GloVe 50 dimensional word embeddings trained on Twitter data.
+            /// </summary>
             [TGUI(Label = "GloVe Twitter 50D")]
             GloVeTwitter50D = 5,
 
-            /// <summary>GloVe 100 dimensional word embeddings trained on Twitter data.</summary>
+            /// <summary>
+            /// GloVe 100 dimensional word embeddings trained on Twitter data.
+            /// </summary>
             [TGUI(Label = "GloVe Twitter 100D")]
             GloVeTwitter100D = 6,
 
-            /// <summary>GloVe 200 dimensional word embeddings trained on Twitter data.</summary>
+            /// <summary>
+            /// GloVe 200 dimensional word embeddings trained on Twitter data.
+            /// </summary>
             [TGUI(Label = "GloVe Twitter 200D")]
             GloVeTwitter200D = 7,
 
-            /// <summary>FastText 300 dimensional word embeddings trained on Wikipedia.</summary>
+            /// <summary>
+            /// FastText 300 dimensional word embeddings trained on Wikipedia.
+            /// </summary>
             [TGUI(Label = "fastText Wikipedia 300D")]
             FastTextWikipedia300D = 8,
 
-            /// <summary>Word embeddings trained on sentiment analysis tasks.</summary>
+            /// <summary>
+            /// Word embeddings trained on sentiment analysis tasks.
+            /// </summary>
             [TGUI(Label = "Sentiment-Specific Word Embedding")]
             SentimentSpecificWordEmbedding = 9
         }
+
         /// <summary>
         /// Information for each column pair.
         /// </summary>
diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs
@@ -405,14 +405,16 @@ private JToken SaveAsPfaCore(BoundPfaContext ctx, int iinfo, JToken srcToken)
     /// |  |  |
     /// | -- | -- |
     /// | Does this estimator need to look at the data to train its parameters? | No |
-    /// | Input column data type | <xref:System.ReadOnlyMemory{System.Char}> |
-    /// | Output column data type | Variable-size vector of <xref:System.ReadOnlyMemory{System.Char}> |
+    /// | Input column data type | Scalar of [Text](xref:Microsoft.ML.Data.TextDataViewType)  |
+    /// | Output column data type | Variable-sized vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
     ///
-    /// The resulting [WordTokenizingTransformer](Microsoft.ML.Transforms.Text.WordTokenizingTransformer) creates a new column,
+    /// The resulting <xref:Microsoft.ML.Transforms.Text.WordTokenizingTransformer> creates a new column,
     /// named as specified in the output column name parameters, where each input string is mapped to a vector of substrings obtained
     /// by splitting the input string according to the user defined delimiters. The space character is the default delimiter.
     ///
     /// Empty strings and strings containing only spaces are dropped.
+    ///
+    /// See the See Also section for links to examples of the usage.
     /// ]]></format>
     /// </remarks>
     /// <seealso cref="TextCatalog.TokenizeIntoWords(TransformsCatalog.TextTransforms, string, string, char[])"/>