diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 3d6402ff1d..df024103ce 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -17,11 +17,14 @@ namespace Microsoft.ML public static class TextCatalog { /// - /// Transform a text column into featurized float array that represents counts of ngrams and char-grams. + /// Create a , which transforms a text column into a featurized vector of that represents normalized counts of ngrams and char-grams. /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// This column's data type will be a vector of . + /// Name of the column to transform. If set to , the value of the will be used as source. + /// This estimator operates over text data. + /// /// /// /// - /// Transform several text columns into featurized float array that represents counts of ngrams and char-grams. + /// Create a , which transforms a text column into featurized float array that represents normalized counts of ngrams and char-grams. /// + /// This transform can operate over several columns. /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . + /// Name of the column resulting from the transformation of . + /// This column's data type will be a vector of . + /// /// Advanced options to the algorithm. - /// Name of the columns to transform. If set to , the value of the will be used as source. + /// Name of the columns to transform. If set to , the value of the will be used as source. + /// This estimator operates over text data, and it can transform several columns at once, yielding one vector of + /// as the resulting features for all columns. 
/// /// /// public interface IStopWordsRemoverOptions { } - // A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts - // of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature - // integer index mapping through hashing) as an option. - /// + /// + /// An estimator that turns a collection of text documents into numerical feature vectors. + /// The feature vectors are normalized counts of word and/or character ngrams (based on the options supplied). + /// + /// + /// | + /// + /// This estimator gives the user a one-stop solution for: + /// * Language Detection + /// * [Tokenization](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) + /// * [Text normalization](https://en.wikipedia.org/wiki/Text_normalization) + /// * [Predefined and custom stopwords removal](https://en.wikipedia.org/wiki/Stop_words) + /// * [Word-based or character-based Ngram extraction and SkipGram extraction](https://en.wikipedia.org/wiki/N-gram) (through the advanced [options](xref:Microsoft.ML.Transforms.TextFeaturizingEstimator.Options.WordFeatureExtractor)) + /// * [TF, IDF or TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) + /// * [L-p vector normalization](xref:Microsoft.ML.Transforms.LpNormNormalizingTransformer) + /// + /// By default the features are made of (word/character) n-grams/skip-grams and the number of features is equal to the vocabulary size found by analyzing the data. + /// To output an additional column with the tokens generated, use [OutputTokensColumnName](xref:Microsoft.ML.Transforms.TextFeaturizingEstimator.Options.OutputTokensColumnName). + /// The number of features can also be specified by selecting the maximum number of n-grams to keep in the [options](xref:Microsoft.ML.Transforms.TextFeaturizingEstimator.Options), where the estimator can be further tuned. 
+ /// ]]> + /// + /// + /// public sealed class TextFeaturizingEstimator : IEstimator { /// diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml index bc590b9332..f23c20a71e 100644 --- a/src/Microsoft.ML.Transforms/Text/doc.xml +++ b/src/Microsoft.ML.Transforms/Text/doc.xml @@ -2,45 +2,6 @@ - - - A transform that turns a collection of text documents into numerical feature vectors. - The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. - - - The TextFeaturizer transform gives user one-stop solution for doing: - - Language Detection - Tokenzation​ - Text normalization - Predefined and custom stopwords removal. - Word-based or character-based Ngram and SkipGram extraction.​ - TF, IDF or TF-IDF. - L-p vector normalization.​ - - The TextFeaturizer will show the transformed text, after being applied. - It converts a collection of text columns to a matrix of token ngrams/skip-grams counts. - Features are made of (word/character) n-grams/skip-grams​ and the number of features are equal to the vocabulary size found by analyzing the data. - - - - - - pipeline.Add(new TextFeaturizer("Features", "SentimentText") - { - KeepDiacritics = false, - KeepPunctuations = false, - TextCase = TextNormalizerTransformCaseNormalizationMode.Lower, - OutputTokens = true, - StopWordsRemover = new PredefinedStopWordsRemover(), - VectorNormalizer = TextTransformTextNormKind.L2, - CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false }, - WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true } - }); - - - - This transform splits the text into words using the separator character(s).