diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs
index 8ea1b20772..dff3419520 100644
--- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs
+++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs
@@ -30,8 +30,7 @@
namespace Microsoft.ML.Transforms.Text
{
///
- /// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
- /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+ /// <see cref="ITransformer"/> resulting from fitting an <see cref="NgramExtractingEstimator"/>.
///
public sealed class NgramExtractingTransformer : OneToOneTransformerBase
{
@@ -668,9 +667,29 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func
- /// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
- /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
+ /// Produces a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
///
+ ///
+ /// |
+ ///
+ /// The resulting
+ /// creates a new column, named as specified in the output column name parameters, where each
+ /// input vector is mapped to a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
+ ///
+ /// The estimator builds a dictionary of ngrams and the
+ /// uses the id in the dictionary as the index in the count vector that it produces.
+ ///
+ /// See the See Also section for links to examples of the usage.
+ /// ]]>
+ ///
+ ///
public sealed class NgramExtractingEstimator : IEstimator
{
///
@@ -679,12 +698,18 @@ public sealed class NgramExtractingEstimator : IEstimator
public enum WeightingCriteria
{
+ /// Term Frequency. Calculated based on the number of occurrences in the document.
[EnumValueDisplay("TF (Term Frequency)")]
Tf = 0,
+ ///
+ /// Inverse Document Frequency. A ratio (the logarithm of inverse relative frequency)
+ /// that measures the information a slot provides by determining how common or rare it is across the entire corpus.
+ ///
[EnumValueDisplay("IDF (Inverse Document Frequency)")]
Idf = 1,
+ /// The product of the term frequency and the inverse document frequency.
[EnumValueDisplay("TF-IDF")]
TfIdf = 2
}
@@ -782,7 +807,7 @@ internal static bool IsSchemaColumnValid(SchemaShape.Column col)
return false;
if (!col.IsKey)
return false;
- // Can only accept key types that can be converted to U4.
+ // Can only accept key types that can be converted to U8.
if (!NgramUtils.IsValidNgramRawType(col.ItemType.RawType))
return false;
return true;
diff --git a/src/Microsoft.ML.Transforms/Text/NgramUtils.cs b/src/Microsoft.ML.Transforms/Text/NgramUtils.cs
index 2cd529552e..aff3fd9ea6 100644
--- a/src/Microsoft.ML.Transforms/Text/NgramUtils.cs
+++ b/src/Microsoft.ML.Transforms/Text/NgramUtils.cs
@@ -206,7 +206,7 @@ internal static class NgramUtils
{
public static bool IsValidNgramRawType(Type rawType)
{
- // Can only accept key types that can be converted to U4 (uint).
+ // Can only accept key types that can be converted to U8 (ulong).
return rawType != typeof(ulong);
}
}
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index 3d6402ff1d..b767522697 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -57,11 +57,15 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
outputColumnName, inputColumnNames, options);
///
- /// Tokenize incoming text in and output the tokens as .
+ /// Create a , which tokenizes characters by splitting text into sequences of characters
+ /// using a sliding window.
///
/// The text-related transform's catalog.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// Name of the column resulting from the transformation of .
+ /// This column's data type will be a variable-sized vector of keys.
+ /// Name of the column to transform. If set to , the value of the
+ /// will be used as source.
+ /// This estimator operates over text data type.
/// Whether to prepend a marker character, , to the beginning,
/// and append another marker character, , to the end of the output vector of characters.
///
@@ -85,7 +89,6 @@ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this
/// Whether to prepend a marker character, , to the beginning,
/// and append another marker character, , to the end of the output vector of characters.
/// Pairs of columns to run the tokenization on.
-
[BestFriend]
internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
@@ -97,12 +100,15 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi
}
///
- /// Normalizes incoming text in by changing case, removing diacritical marks, punctuation marks and/or numbers
- /// and outputs new text as .
+ /// Creates a , which normalizes incoming text in by optionally
+ /// changing case, removing diacritical marks, punctuation marks, numbers, and outputs new text as .
///
/// The text-related transform's catalog.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// Name of the column resulting from the transformation of .
+ /// This column's data type will remain scalar of text or a vector of text depending on the input column data type.
+ /// Name of the column to transform. If set to ,
+ /// the value of the will be used as source.
+ /// This estimator operates on text or vector of text data types.
/// Casing text using the rules of the invariant culture.
/// Whether to keep diacritical marks or remove them.
/// Whether to keep punctuation marks or remove them.
@@ -124,10 +130,16 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
=> new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
- ///
+ ///
+ /// Create an , which is a text featurizer that converts a vector
+ /// of text into a numerical vector using pre-trained embeddings models.
+ ///
/// The text-related transform's catalog.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// Name of the column resulting from the transformation of .
+ /// This column's data type will be a vector of .
+ /// Name of the column to transform. If set to ,
+ /// the value of the will be used as source.
+ /// This estimator operates over known-sized vector of text data type.
/// The embeddings to use.
///
///
@@ -142,11 +154,17 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);
- ///
+ ///
+ /// Create an , which is a text featurizer that converts vectors
+ /// of text into numerical vectors using pre-trained embeddings models.
+ ///
/// The text-related transform's catalog.
- /// The path of the pre-trained embeedings model to use.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform.
+ /// The path of the pre-trained embeddings model to use.
+ /// Name of the column resulting from the transformation of .
+ /// This column's data type will be a vector of .
+ /// Name of the column to transform. If set to ,
+ /// the value of the will be used as source.
+ /// This estimator operates over known-sized vector of text data type.
///
///
/// new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
outputColumnName, customModelFile, inputColumnName ?? outputColumnName);
- ///
+ ///
+ /// Create an , which is a text featurizer that converts vectors
+ /// of text into numerical vectors using pre-trained embeddings models.
+ ///
/// The text-related transform's catalog.
/// The embeddings to use.
- /// The array columns, and per-column configurations to extract embeedings from.
+ /// The array columns, and per-column configurations to extract embeddings from.
///
///
/// new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);
///
- /// Tokenizes incoming text in , using as separators,
- /// and outputs the tokens as .
+ /// Create a , which tokenizes input text using as separators.
///
/// The text-related transform's catalog.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// Name of the column resulting from the transformation of .
+ /// This column's data type will be a variable-sized vector of text.
+ /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// This estimator operates on scalars of text and vectors of text data types.
/// The separators to use (uses space character by default).
///
///
@@ -210,17 +232,21 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);
///
- /// Produces a bag of counts of ngrams (sequences of consecutive words) in
- /// and outputs bag of word vector as
+ /// Creates a which produces a vector of counts of ngrams (sequences of consecutive words)
+ /// encountered in the input text.
///
/// The text-related transform's catalog.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// Name of the column resulting from the transformation of .
+ /// This column's data type will be a vector of .
+ /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// This estimator operates over vectors of keys data type.
/// Ngram length.
- /// Maximum number of tokens to skip when constructing an ngram.
+ /// Number of tokens to skip between each ngram. By default no token is skipped.
/// Whether to include all ngram lengths up to or only .
/// Maximum number of n-grams to store in the dictionary.
- /// Statistical measure used to evaluate how important a word is to a document in a corpus.
+ /// Statistical measure used to evaluate how important a word or ngram is to a document in a corpus.
+ /// When is smaller than the total number of encountered ngrams this measure is used
+ /// to determine which ngrams to keep.
///
///
///
- /// A text normalization transform that allows normalizing text case, removing diacritical marks, punctuation marks and/or numbers.
- /// The transform operates on text input as well as vector of tokens/text (vector of ReadOnlyMemory).
+ /// <see cref="ITransformer"/> resulting from fitting a <see cref="TextNormalizingEstimator"/>.
///
public sealed class TextNormalizingTransformer : OneToOneTransformerBase
{
@@ -431,6 +430,28 @@ private bool IsCombiningDiacritic(char ch)
}
}
+ ///
+ /// <see cref="IEstimator{TTransformer}"/> for the <see cref="TextNormalizingTransformer"/>.
+ ///
+ ///
+ /// creates a new column, named as specified
+ /// in the output column name parameters, and normalizes the textual input data by changing case, removing diacritical marks,
+ /// punctuation marks and/or numbers.
+ ///
+ /// See the See Also section for links to examples of the usage.
+ /// ]]>
+ ///
+ ///
+ ///
public sealed class TextNormalizingEstimator : TrivialEstimator
{
///
@@ -521,7 +542,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.inputColumnName);
if (!IsColumnTypeValid(col.ItemType))
throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.inputColumnName, TextNormalizingEstimator.ExpectedColumnType, col.ItemType.ToString());
- result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName, col.Kind == SchemaShape.Column.VectorKind.Vector ? SchemaShape.Column.VectorKind.VariableVector : SchemaShape.Column.VectorKind.Scalar, col.ItemType, false);
+ result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName, col.Kind == SchemaShape.Column.VectorKind.Scalar ? SchemaShape.Column.VectorKind.Scalar : SchemaShape.Column.VectorKind.VariableVector, col.ItemType, false);
}
return new SchemaShape(result.Values);
}
diff --git a/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs b/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs
index 14763c64c8..04bc501bef 100644
--- a/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs
+++ b/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs
@@ -29,7 +29,7 @@
namespace Microsoft.ML.Transforms.Text
{
///
- /// Character-oriented tokenizer where text is considered a sequence of characters.
+ /// <see cref="ITransformer"/> resulting from fitting a <see cref="TokenizingByCharactersEstimator"/>.
///
public sealed class TokenizingByCharactersTransformer : OneToOneTransformerBase
{
@@ -547,8 +547,30 @@ private ValueGetter> MakeGetterVec(DataViewRow input, int iinfo)
}
///
- /// Character tokenizer splits text into sequences of characters using a sliding window.
+ /// <see cref="IEstimator{TTransformer}"/> for the <see cref="TokenizingByCharactersTransformer"/>.
///
+ ///
+ /// resulting from fitting the estimator
+ /// creates a new column, named as specified in the output column name parameters, which contains the keys of the
+ /// sequences of characters that were encountered in the input.
+ ///
+ /// See the See Also section for links to examples of the usage.
+ /// ]]>
+ ///
+ ///
+ ///
public sealed class TokenizingByCharactersEstimator : TrivialEstimator
{
internal static class Defaults
diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs
index 293cb59aac..46006420de 100644
--- a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs
+++ b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs
@@ -32,7 +32,9 @@
namespace Microsoft.ML.Transforms.Text
{
- ///
+ ///
+ /// <see cref="ITransformer"/> resulting from fitting an <see cref="WordEmbeddingEstimator"/>.
+ ///
public sealed class WordEmbeddingTransformer : OneToOneTransformerBase
{
internal sealed class Column : OneToOneColumn
@@ -722,7 +724,38 @@ private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironme
=> new ParallelOptions(); // we provide default options and let the Parallel decide
}
- ///
+ ///
+ /// Text featurizer which converts vectors of text tokens into a numerical vector using a pre-trained embeddings model.
+ ///
+ ///
+ /// |
+ ///
+ /// The produces a new column,
+ /// named as specified in the output column name parameters, where each input vector is mapped to a numerical vector
+ /// with size of 3 * dimensionality of the embedding model used. Notice that this is independent of the size of the input vector.
+ ///
+ /// For example, when using GloVe50D, which itself is 50 dimensional, the output column is a vector of size 150.
+ /// The first third of slots contains the minimum values across the embeddings corresponding to each string in the input vector.
+ /// The second third contains the average of the embeddings. The last third of slots contains maximum values
+ /// of the encountered embeddings. The min/max provides a bounding hyper-rectangle for the words in the word embedding space.
+ /// This can assist for longer phrases where the average of many words drowns out the useful signal.
+ ///
+ /// The user can specify a custom pre-trained embeddings model or one of the available pre-trained models.
+ /// The available options are various versions of [GloVe Models](https://nlp.stanford.edu/projects/glove/),
+ /// [FastText](https://en.wikipedia.org/wiki/FastText), and [SSWE](https://anthology.aclweb.org/P/P14/P14-1146.pdf).
+ ///
+ /// See the See Also section for links to examples of the usage.
+ /// ]]>
+ ///
+ ///
+ ///
public sealed class WordEmbeddingEstimator : IEstimator
{
private readonly IHost _host;
@@ -795,36 +828,67 @@ internal WordEmbeddingEstimator(IHostEnvironment env, string customModelFile, pa
///
public enum PretrainedModelKind
{
+ ///
+ /// GloVe 50 dimensional word embeddings.
+ ///
[TGUI(Label = "GloVe 50D")]
GloVe50D = 0,
+ ///
+ /// GloVe 100 dimensional word embeddings.
+ ///
[TGUI(Label = "GloVe 100D")]
GloVe100D = 1,
+ ///
+ /// GloVe 200 dimensional word embeddings.
+ ///
[TGUI(Label = "GloVe 200D")]
GloVe200D = 2,
+ ///
+ /// GloVe 300 dimensional word embeddings.
+ ///
[TGUI(Label = "GloVe 300D")]
GloVe300D = 3,
+ ///
+ /// GloVe 25 dimensional word embeddings trained on Twitter data.
+ ///
[TGUI(Label = "GloVe Twitter 25D")]
GloVeTwitter25D = 4,
+ ///
+ /// GloVe 50 dimensional word embeddings trained on Twitter data.
+ ///
[TGUI(Label = "GloVe Twitter 50D")]
GloVeTwitter50D = 5,
+ ///
+ /// GloVe 100 dimensional word embeddings trained on Twitter data.
+ ///
[TGUI(Label = "GloVe Twitter 100D")]
GloVeTwitter100D = 6,
+ ///
+ /// GloVe 200 dimensional word embeddings trained on Twitter data.
+ ///
[TGUI(Label = "GloVe Twitter 200D")]
GloVeTwitter200D = 7,
+ ///
+ /// FastText 300 dimensional word embeddings trained on Wikipedia.
+ ///
[TGUI(Label = "fastText Wikipedia 300D")]
FastTextWikipedia300D = 8,
+ ///
+ /// Word embeddings trained on sentiment analysis tasks.
+ ///
[TGUI(Label = "Sentiment-Specific Word Embedding")]
SentimentSpecificWordEmbedding = 9
}
+
///
/// Information for each column pair.
///
diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs
index 8b04eb0b39..0ee02c60bf 100644
--- a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs
+++ b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs
@@ -31,10 +31,9 @@
namespace Microsoft.ML.Transforms.Text
{
- // The input for this transform is a ReadOnlyMemory or a vector of ReadOnlyMemory, and its output is a vector of ReadOnlyMemory,
- // corresponding to the tokens in the input text, split using a set of user specified separator characters.
- // Empty strings and strings containing only spaces are dropped.
- ///
+ ///
+ /// <see cref="ITransformer"/> resulting from fitting a <see cref="WordTokenizingEstimator"/>.
+ ///
public sealed class WordTokenizingTransformer : OneToOneTransformerBase
{
internal class Column : OneToOneColumn
@@ -397,9 +396,28 @@ private JToken SaveAsPfaCore(BoundPfaContext ctx, int iinfo, JToken srcToken)
}
///
- /// Word tokenizer splits text into tokens using the delimiter.
- /// For each text input, the output column is a variable vector of text.
+ /// Tokenizes input text using specified delimiters.
///
+ ///
+ /// creates a new column,
+ /// named as specified in the output column name parameters, where each input string is mapped to a vector of substrings obtained
+ /// by splitting the input string according to the user defined delimiters. The space character is the default delimiter.
+ ///
+ /// Empty strings and strings containing only spaces are dropped.
+ ///
+ /// See the See Also section for links to examples of the usage.
+ /// ]]>
+ ///
+ ///
public sealed class WordTokenizingEstimator : TrivialEstimator
{
internal static bool IsColumnTypeValid(DataViewType type) => type.GetItemType() is TextDataViewType;
@@ -410,8 +428,10 @@ public sealed class WordTokenizingEstimator : TrivialEstimator and output the tokens as .
///
/// The environment.
- /// Name of the column resulting from the transformation of .
- /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// Name of the column resulting from the transformation of .
+ /// The output column is of type variable vector of string.
+ /// Name of the column to transform. If set to , the value of the will be used as source.
+ /// This column should be of type string.
/// The separators to use (uses space character by default).
internal WordTokenizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, char[] separators = null)
: this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, separators)