diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index 3d6402ff1d..df024103ce 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -17,11 +17,14 @@ namespace Microsoft.ML
public static class TextCatalog
{
///
- /// Transform a text column into featurized float array that represents counts of ngrams and char-grams.
+ /// Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into a featurized vector of <see cref="System.Single"/> that represents normalized counts of n-grams and char-grams.
///
/// The text-related transform's catalog.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform. If set to , the value of the will be used as source.
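+ /// Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
+ /// This column's data type will be a vector of <see cref="System.Single"/>.
+ /// Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.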
+ /// This estimator operates over text data.
+ ///
///
///
///
- /// Transform several text columns into featurized float array that represents counts of ngrams and char-grams.
+ /// Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into a featurized float array that represents normalized counts of n-grams and char-grams.
///
+ /// This transform can operate over several columns.
/// The text-related transform's catalog.
- /// Name of the column resulting from the transformation of .
+ /// Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.
+ /// This column's data type will be a vector of <see cref="System.Single"/>.
+ ///
/// Advanced options to the algorithm.
- /// Name of the columns to transform. If set to , the value of the will be used as source.
+ /// Name of the columns to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
+ /// This estimator operates over text data, and it can transform several columns at once, yielding one vector of <see cref="System.Single"/>
+ /// as the resulting features for all columns.
///
///
///
public interface IStopWordsRemoverOptions { }
- // A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts
- // of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature
- // integer index mapping through hashing) as an option.
- ///
+ ///
+ /// An estimator that turns a collection of text documents into numerical feature vectors.
+ /// The feature vectors are normalized counts of word and/or character ngrams (based on the options supplied).
+ ///
+ ///
+ /// |
+ ///
+ /// This estimator gives the user a one-stop solution for doing:
+ /// * Language Detection
+ /// * [Tokenization](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization)
+ /// * [Text normalization](https://en.wikipedia.org/wiki/Text_normalization)
+ /// * [Predefined and custom stopwords removal](https://en.wikipedia.org/wiki/Stop_words)
+ /// * [Word-based or character-based Ngram extraction and SkipGram extraction](https://en.wikipedia.org/wiki/N-gram) (through the advanced [options](xref:Microsoft.ML.Transforms.TextFeaturizingEstimator.Options.WordFeatureExtractor))
+ /// * [TF, IDF or TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
+ /// * [L-p vector normalization](xref:Microsoft.ML.Transforms.LpNormNormalizingTransformer)
+ ///
+ /// By default, the features are made of (word/character) n-grams/skip-grams, and the number of features is equal to the vocabulary size found by analyzing the data.
+ /// To output an additional column with the tokens generated, use [OutputTokensColumnName](xref:Microsoft.ML.Transforms.TextFeaturizingEstimator.Options.OutputTokensColumnName).
+ /// The number of features can also be specified by selecting the maximum number of n-grams to keep in the <xref:Microsoft.ML.Transforms.TextFeaturizingEstimator.Options>, where the estimator can be further tuned.
+ /// ]]>
+ ///
+ ///
+ ///
public sealed class TextFeaturizingEstimator : IEstimator
{
///
diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
index bc590b9332..f23c20a71e 100644
--- a/src/Microsoft.ML.Transforms/Text/doc.xml
+++ b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -2,45 +2,6 @@
-
-
- A transform that turns a collection of text documents into numerical feature vectors.
- The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.
-
-
- The TextFeaturizer transform gives user one-stop solution for doing:
-
- - Language Detection
- - Tokenzation
- - Text normalization
- - Predefined and custom stopwords removal.
- - Word-based or character-based Ngram and SkipGram extraction.
- - TF, IDF or TF-IDF.
- - L-p vector normalization.
-
- The TextFeaturizer will show the transformed text, after being applied.
- It converts a collection of text columns to a matrix of token ngrams/skip-grams counts.
- Features are made of (word/character) n-grams/skip-grams and the number of features are equal to the vocabulary size found by analyzing the data.
-
-
-
-
-
- pipeline.Add(new TextFeaturizer("Features", "SentimentText")
- {
- KeepDiacritics = false,
- KeepPunctuations = false,
- TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
- OutputTokens = true,
- StopWordsRemover = new PredefinedStopWordsRemover(),
- VectorNormalizer = TextTransformTextNormKind.L2,
- CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
- WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
- });
-
-
-
-
This transform splits the text into words using the separator character(s).