diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs index cc194c5590..3151fe0002 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs @@ -866,11 +866,11 @@ public VBuffer>[] SlotNamesMetadata(out VectorDataViewType[ /// | | | /// | -- | -- | /// | Does this estimator need to look at the data to train its parameters? | Yes | - /// | Input column data type | Vector of [Key]() | + /// | Input column data type | Vector of [Key](xref:Microsoft.ML.Data.KeyDataViewType) | /// | Output column data type | Vector of known size of | /// /// The resulting creates a new column, named as specified in the output column name parameters, and - /// produces a vector of counts of n-grams (sequences of consecutive words of length 1-n) from a given data. + /// produces a vector of n-gram counts (sequences of consecutive words of length 1-n) from the given data. /// It does so by hashing each n-gram and using the hash value as the index in the bag. /// /// is different from diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 46a0181c88..c7a60c45a8 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -293,12 +293,18 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa => new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords); /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in - /// and outputs bag of word vector as + /// Create a , which maps the column specified in + /// to a vector of n-gram counts in a new column named . /// - /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . 
- /// Name of the column to transform. If set to , the value of the will be used as source. + /// + /// is different from in that the former + /// tokenizes text internally and the latter takes tokenized text as input. + /// + /// The transform's catalog. + /// Name of the column resulting from the transformation of . + /// This column's data type will be known-size vector of . + /// Name of the column to take the data from. + /// This estimator operates over vector of text. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . @@ -316,12 +322,18 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf outputColumnName, inputColumnName, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in - /// and outputs bag of word vector as + /// Create a , which maps the multiple columns specified in + /// to a vector of n-gram counts in a new column named . /// - /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the columns to transform. + /// + /// is different from in that the former + /// tokenizes text internally and the latter takes tokenized text as input. + /// + /// The transform's catalog. + /// Name of the column resulting from the transformation of . + /// This column's data type will be known-size vector of . + /// Names of the multiple columns to take the data from. + /// This estimator operates over vector of text. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . 
@@ -339,12 +351,18 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf outputColumnName, inputColumnNames, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs bag of word vector as + /// Create a , which maps the column specified in + /// to a vector of counts of hashed n-grams in a new column named . /// - /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// + /// is different from in that the former + /// tokenizes text internally and the latter takes tokenized text as input. + /// + /// The transform's catalog. + /// Name of the column resulting from the transformation of . + /// This column's data type will be known-size vector of . + /// Name of the column to take the data from. + /// This estimator operates over vector of text. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. @@ -371,12 +389,18 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. maximumNumberOfInverts: maximumNumberOfInverts); /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs bag of word vector as + /// Create a , which maps the multiple columns specified in + /// to a vector of counts of hashed n-grams in a new column named . /// - /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the columns to transform. If set to , the value of the will be used as source. + /// + /// is different from in that the former + /// tokenizes text internally and the latter takes tokenized text as input. + /// + /// The transform's catalog. + /// Name of the column resulting from the transformation of . 
+ /// This column's data type will be known-size vector of . + /// Names of the multiple columns to take the data from. + /// This estimator operates over vector of text. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs index 7e3f557efb..f8e8fa13fd 100644 --- a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs +++ b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs @@ -10,11 +10,30 @@ namespace Microsoft.ML.Transforms.Text { - /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in a given text. - /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. + /// for the . /// + /// + /// | + /// + /// The resulting creates a new column, named as specified in the output column name parameters, and + /// produces a vector of n-gram counts (sequences of n consecutive words) from the given data. + /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. + /// + /// is different from + /// in that the former tokenizes text internally while the latter takes tokenized text as input. + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// + /// + /// public sealed class WordBagEstimator : IEstimator { private readonly IHost _host; @@ -182,9 +201,29 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) } /// - /// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given text. - /// It does so by hashing each ngram and using the hash value as the index in the bag. + /// for the . 
/// + /// + /// | + /// + /// The resulting creates a new column, named as specified in the output column name parameters, and + /// produces a vector of n-gram counts (sequences of n consecutive words) from the given data. + /// It does so by hashing each ngram and using the hash value as the index in the bag. + /// + /// is different from + /// in that the former tokenizes text internally while the latter takes tokenized text as input. + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// + /// + /// public sealed class WordHashBagEstimator : IEstimator { private readonly IHost _host;