diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index 8ea1b20772..dff3419520 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -30,8 +30,7 @@ namespace Microsoft.ML.Transforms.Text { /// - /// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys. - /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. + /// resulting from fitting an . /// public sealed class NgramExtractingTransformer : OneToOneTransformerBase { @@ -668,9 +667,29 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func - /// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys. - /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. + /// Produces a vector of counts of ngrams (sequences of consecutive words) encountered in the input text. /// + /// + /// | + /// + /// The resulting + /// creates a new column, named as specified in the output column name parameters, where each + /// input vector is mapped to a vector of counts of ngrams (sequences of consecutive words) encountered in the input text. + /// + /// The estimator builds a dictionary of ngrams and the + /// uses the id in the dictionary as the index in the count vector that it produces. + /// + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// public sealed class NgramExtractingEstimator : IEstimator { /// @@ -679,12 +698,18 @@ public sealed class NgramExtractingEstimator : IEstimator public enum WeightingCriteria { + /// Term Frequency. Calculated based on the number of occurrences in the document. [EnumValueDisplay("TF (Term Frequency)")] Tf = 0, + /// + /// Inverse Document Frequency. 
A ratio (the logarithm of inverse relative frequency) + /// that measures the information a slot provides by determining how common or rare it is across the entire corpus. + /// [EnumValueDisplay("IDF (Inverse Document Frequency)")] Idf = 1, + /// The product of the term frequency and the inverse document frequency. [EnumValueDisplay("TF-IDF")] TfIdf = 2 } @@ -782,7 +807,7 @@ internal static bool IsSchemaColumnValid(SchemaShape.Column col) return false; if (!col.IsKey) return false; - // Can only accept key types that can be converted to U4. + // Can only accept key types that can be converted to U4 (ulong keys are rejected). if (!NgramUtils.IsValidNgramRawType(col.ItemType.RawType)) return false; return true; diff --git a/src/Microsoft.ML.Transforms/Text/NgramUtils.cs b/src/Microsoft.ML.Transforms/Text/NgramUtils.cs index 2cd529552e..aff3fd9ea6 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramUtils.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramUtils.cs @@ -206,7 +206,7 @@ internal static class NgramUtils { public static bool IsValidNgramRawType(Type rawType) { - // Can only accept key types that can be converted to U4 (uint). + // Can only accept key types that can be converted to U4 (uint); ulong (U8) keys are rejected. return rawType != typeof(ulong); } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 3d6402ff1d..b767522697 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -57,11 +57,15 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text outputColumnName, inputColumnNames, options); /// - /// Tokenize incoming text in and output the tokens as . + /// Create a , which tokenizes characters by splitting text into sequences of characters + /// using a sliding window. /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. 
If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// This column's data type will be a variable-sized vector of keys. + /// Name of the column to transform. If set to , the value of the + /// will be used as source. + /// This estimator operates over text data type. /// Whether to prepend a marker character, , to the beginning, /// and append another marker character, , to the end of the output vector of characters. /// @@ -85,7 +89,6 @@ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this /// Whether to prepend a marker character, , to the beginning, /// and append another marker character, , to the end of the output vector of characters. /// Pairs of columns to run the tokenization on. - [BestFriend] internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters, @@ -97,12 +100,15 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi } /// - /// Normalizes incoming text in by changing case, removing diacritical marks, punctuation marks and/or numbers - /// and outputs new text as . + /// Creates a , which normalizes incoming text in by optionally + /// changing case, removing diacritical marks, punctuation marks, numbers, and outputs new text as . /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// This column's data type will remain scalar of text or a vector of text depending on the input column data type. + /// Name of the column to transform. If set to , + /// the value of the will be used as source. + /// This estimator operates on text or vector of text data types. 
/// Casing text using the rules of the invariant culture. /// Whether to keep diacritical marks or remove them. /// Whether to keep punctuation marks or remove them. @@ -124,10 +130,16 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text => new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers); - /// + /// + /// Create an , which is a text featurizer that converts a vector + /// of text into a numerical vector using pre-trained embeddings models. + /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// This column's data type will be a vector of . + /// Name of the column to transform. If set to , + /// the value of the will be used as source. + /// This estimator operates over known-sized vector of text data type. /// The embeddings to use. /// /// @@ -142,11 +154,17 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) => new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind); - /// + /// + /// Create an , which is a text featurizer that converts vectors + /// of text into numerical vectors using pre-trained embeddings models. + /// /// The text-related transform's catalog. - /// The path of the pre-trained embeedings model to use. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. + /// The path of the pre-trained embeddings model to use. + /// Name of the column resulting from the transformation of . 
+ /// This column's data type will be a vector of . + /// Name of the column to transform. If set to , + /// the value of the will be used as source. + /// This estimator operates over known-sized vector of text data type. /// /// /// new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, customModelFile, inputColumnName ?? outputColumnName); - /// + /// + /// Create an , which is a text featurizer that converts vectors + /// of text into numerical vectors using pre-trained embeddings models. + /// /// The text-related transform's catalog. /// The embeddings to use. - /// The array columns, and per-column configurations to extract embeedings from. + /// The array columns, and per-column configurations to extract embeddings from. /// /// /// new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns); /// - /// Tokenizes incoming text in , using as separators, - /// and outputs the tokens as . + /// Create a , which tokenizes input text using as separators. /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// This column's data type will be a variable-sized vector of text. + /// Name of the column to transform. If set to , the value of the will be used as source. + /// This estimator operates on scalar of text and vector of text data type. /// The separators to use (uses space character by default). 
/// /// @@ -210,17 +232,21 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog => new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in - /// and outputs bag of word vector as + /// Creates a which produces a vector of counts of ngrams (sequences of consecutive words) + /// encountered in the input text. /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// This column's data type will be a vector of . + /// Name of the column to transform. If set to , the value of the will be used as source. + /// This estimator operates over vectors of keys data type. /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. + /// Number of tokens to skip between each ngram. By default no token is skipped. /// Whether to include all ngram lengths up to or only . /// Maximum number of n-grams to store in the dictionary. - /// Statistical measure used to evaluate how important a word is to a document in a corpus. + /// Statistical measure used to evaluate how important a word or ngram is to a document in a corpus. + /// When is smaller than the total number of encountered ngrams this measure is used + /// to determine which ngrams to keep. /// /// /// - /// A text normalization transform that allows normalizing text case, removing diacritical marks, punctuation marks and/or numbers. - /// The transform operates on text input as well as vector of tokens/text (vector of ReadOnlyMemory). + /// resulting from fitting a . /// public sealed class TextNormalizingTransformer : OneToOneTransformerBase { @@ -431,6 +430,28 @@ private bool IsCombiningDiacritic(char ch) } } + /// + /// for the . 
+ /// + /// + /// creates a new column, named as specified + /// in the output column name parameters, and normalizes the textual input data by changing case, removing diacritical marks, + /// punctuation marks and/or numbers. + /// + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// + /// public sealed class TextNormalizingEstimator : TrivialEstimator { /// @@ -521,7 +542,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.inputColumnName); if (!IsColumnTypeValid(col.ItemType)) throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.inputColumnName, TextNormalizingEstimator.ExpectedColumnType, col.ItemType.ToString()); - result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName, col.Kind == SchemaShape.Column.VectorKind.Vector ? SchemaShape.Column.VectorKind.VariableVector : SchemaShape.Column.VectorKind.Scalar, col.ItemType, false); + result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName, col.Kind == SchemaShape.Column.VectorKind.Scalar ? SchemaShape.Column.VectorKind.Scalar : SchemaShape.Column.VectorKind.VariableVector, col.ItemType, false); } return new SchemaShape(result.Values); } diff --git a/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs b/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs index 14763c64c8..04bc501bef 100644 --- a/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs +++ b/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs @@ -29,7 +29,7 @@ namespace Microsoft.ML.Transforms.Text { /// - /// Character-oriented tokenizer where text is considered a sequence of characters. + /// resulting from fitting a . 
/// public sealed class TokenizingByCharactersTransformer : OneToOneTransformerBase { @@ -547,8 +547,30 @@ private ValueGetter> MakeGetterVec(DataViewRow input, int iinfo) } /// - /// Character tokenizer splits text into sequences of characters using a sliding window. + /// for the . /// + /// + /// resulting from fitting the estimator + /// creates a new column, named as specified in the output column name parameters, which contains the keys of the + /// sequences of characters that were encountered in the input. + /// + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// + /// public sealed class TokenizingByCharactersEstimator : TrivialEstimator { internal static class Defaults diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs index 293cb59aac..46006420de 100644 --- a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs +++ b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs @@ -32,7 +32,9 @@ namespace Microsoft.ML.Transforms.Text { - /// + /// + /// resulting from fitting an . + /// public sealed class WordEmbeddingTransformer : OneToOneTransformerBase { internal sealed class Column : OneToOneColumn @@ -722,7 +724,38 @@ private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironme => new ParallelOptions(); // we provide default options and let the Parallel decide } - /// + /// + /// Text featurizer which converts vectors of text tokens into a numerical vector using a pre-trained embeddings model. + /// + /// + /// | + /// + /// The produces a new column, + /// named as specified in the output column name parameters, where each input vector is mapped to a numerical vector + /// with size of 3 * dimensionality of the embedding model used. Notice that this is independent of the size of the input vector. 
+ /// + /// For example, when using GloVe50D, which itself is 50 dimensional, the output column is a vector of size 150. + /// The first third of slots contains the minimum values across the embeddings corresponding to each string in the input vector. + /// The second third contains the average of the embeddings. The last third of slots contains maximum values + /// of the encountered embeddings. The min/max provides a bounding hyper-rectangle for the words in the word embedding space. + /// This can assist for longer phrases where the average of many words drowns out the useful signal. + /// + /// The user can specify a custom pre-trained embeddings model or one of the available pre-trained models. + /// The available options are various versions of [GloVe Models](https://nlp.stanford.edu/projects/glove/), + /// [FastText](https://en.wikipedia.org/wiki/FastText), and [SSWE](https://anthology.aclweb.org/P/P14/P14-1146.pdf). + /// + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// + /// public sealed class WordEmbeddingEstimator : IEstimator { private readonly IHost _host; @@ -795,36 +828,67 @@ internal WordEmbeddingEstimator(IHostEnvironment env, string customModelFile, pa /// public enum PretrainedModelKind { + /// + /// GloVe 50 dimensional word embeddings. + /// [TGUI(Label = "GloVe 50D")] GloVe50D = 0, + /// + /// GloVe 100 dimensional word embeddings. + /// [TGUI(Label = "GloVe 100D")] GloVe100D = 1, + /// + /// GloVe 200 dimensional word embeddings. + /// [TGUI(Label = "GloVe 200D")] GloVe200D = 2, + /// + /// GloVe 300 dimensional word embeddings. + /// [TGUI(Label = "GloVe 300D")] GloVe300D = 3, + /// + /// GloVe 25 dimensional word embeddings trained on Twitter data. + /// [TGUI(Label = "GloVe Twitter 25D")] GloVeTwitter25D = 4, + /// + /// GloVe 50 dimensional word embeddings trained on Twitter data. 
+ /// [TGUI(Label = "GloVe Twitter 50D")] GloVeTwitter50D = 5, + /// + /// GloVe 100 dimensional word embeddings trained on Twitter data. + /// [TGUI(Label = "GloVe Twitter 100D")] GloVeTwitter100D = 6, + /// + /// GloVe 200 dimensional word embeddings trained on Twitter data. + /// [TGUI(Label = "GloVe Twitter 200D")] GloVeTwitter200D = 7, + /// + /// FastText 300 dimensional word embeddings trained on Wikipedia. + /// [TGUI(Label = "fastText Wikipedia 300D")] FastTextWikipedia300D = 8, + /// + /// Word embeddings trained on sentiment analysis tasks. + /// [TGUI(Label = "Sentiment-Specific Word Embedding")] SentimentSpecificWordEmbedding = 9 } + /// /// Information for each column pair. /// diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs index 8b04eb0b39..0ee02c60bf 100644 --- a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs +++ b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs @@ -31,10 +31,9 @@ namespace Microsoft.ML.Transforms.Text { - // The input for this transform is a ReadOnlyMemory or a vector of ReadOnlyMemory, and its output is a vector of ReadOnlyMemory, - // corresponding to the tokens in the input text, split using a set of user specified separator characters. - // Empty strings and strings containing only spaces are dropped. - /// + /// + /// resulting from fitting an . + /// public sealed class WordTokenizingTransformer : OneToOneTransformerBase { internal class Column : OneToOneColumn @@ -397,9 +396,28 @@ private JToken SaveAsPfaCore(BoundPfaContext ctx, int iinfo, JToken srcToken) } /// - /// Word tokenizer splits text into tokens using the delimiter. - /// For each text input, the output column is a variable vector of text. + /// Tokenizes input text using specified delimiters. 
/// + /// + /// creates a new column, + /// named as specified in the output column name parameters, where each input string is mapped to a vector of substrings obtained + /// by splitting the input string according to the user defined delimiters. The space character is the default delimiter. + /// + /// Empty strings and strings containing only spaces are dropped. + /// + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// public sealed class WordTokenizingEstimator : TrivialEstimator { internal static bool IsColumnTypeValid(DataViewType type) => type.GetItemType() is TextDataViewType; @@ -410,8 +428,10 @@ public sealed class WordTokenizingEstimator : TrivialEstimator and output the tokens as . /// /// The environment. - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// The output column is of type variable vector of string. + /// Name of the column to transform. If set to , the value of the will be used as source. + /// This column should be of type string or vector of string. /// The separators to use (uses space character by default). internal WordTokenizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, char[] separators = null) : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, separators)