Skip to content

XML documentation for five text related transforms #3418

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 30 additions & 5 deletions src/Microsoft.ML.Transforms/Text/NgramTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
namespace Microsoft.ML.Transforms.Text
{
/// <summary>
/// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
/// <see cref="ITransformer"/> resulting from fitting an <see cref="NgramExtractingEstimator"/>.
/// </summary>
public sealed class NgramExtractingTransformer : OneToOneTransformerBase
{
Expand Down Expand Up @@ -668,9 +667,29 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
}

/// <summary>
/// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
/// Produces a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
/// </summary>
/// <remarks>
/// <format type="text/markdown"><![CDATA[
///
/// ### Estimator Characteristics
/// | | |
/// | -- | -- |
/// | Does this estimator need to look at the data to train its parameters? | Yes |
/// | Input column data type | Vector of [Keys](xref:Microsoft.ML.Data.KeyDataViewType) |
/// | Output column data type | Known-sized vector of <xref:System.Single> |
///
/// The resulting <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
/// creates a new column, named as specified in the output column name parameters, where each
/// input vector is mapped to a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
///
/// The estimator builds a dictionary of ngrams and the <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
/// uses the id in the dictionary as the index in the count vector that it produces.
///
/// See the See Also section for links to examples of the usage.
/// ]]></format>
/// </remarks>
/// <seealso cref="TextCatalog.ProduceNgrams(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, WeightingCriteria)"/>
public sealed class NgramExtractingEstimator : IEstimator<NgramExtractingTransformer>
{
/// <summary>
Expand All @@ -679,12 +698,18 @@ public sealed class NgramExtractingEstimator : IEstimator<NgramExtractingTransfo
/// </summary>
public enum WeightingCriteria
{
/// <summary>
/// Term frequency (TF): calculated based on the number of occurrences in the document.
/// </summary>
[EnumValueDisplay("TF (Term Frequency)")]
Tf = 0,

/// <summary>
/// Inverse document frequency (IDF): a ratio (the logarithm of the inverse relative frequency)
/// that measures the information a slot provides by determining how common or rare it is
/// across the entire corpus.
/// </summary>
[EnumValueDisplay("IDF (Inverse Document Frequency)")]
Idf = 1,

/// <summary>
/// TF-IDF: the product of the term frequency and the inverse document frequency.
/// </summary>
[EnumValueDisplay("TF-IDF")]
TfIdf = 2
}
Expand Down Expand Up @@ -782,7 +807,7 @@ internal static bool IsSchemaColumnValid(SchemaShape.Column col)
return false;
if (!col.IsKey)
return false;
// Can only accept key types that can be converted to U4.
// Can only accept key types that can be converted to U4: NgramUtils.IsValidNgramRawType rejects ulong (U8).
if (!NgramUtils.IsValidNgramRawType(col.ItemType.RawType))
return false;
return true;
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Transforms/Text/NgramUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ internal static class NgramUtils
{
/// <summary>
/// Determines whether <paramref name="rawType"/> is an acceptable raw type for the key values
/// that ngrams are built from.
/// </summary>
/// <param name="rawType">The raw type to check.</param>
/// <returns>
/// <see langword="true"/> for every type other than <see cref="ulong"/>; otherwise <see langword="false"/>.
/// </returns>
public static bool IsValidNgramRawType(Type rawType)
{
// Only ulong (U8) is rejected; every other raw type passes.
// NOTE(review): presumably this is because accepted key types must be convertible to U4 (uint).
// A comment stating "convertible to U8 (ulong)" would contradict this check - confirm the intent.
return rawType != typeof(ulong);
}
}
Expand Down
80 changes: 53 additions & 27 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,15 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
outputColumnName, inputColumnNames, options);

/// <summary>
/// Tokenize incoming text in <paramref name="inputColumnName"/> and output the tokens as <paramref name="outputColumnName"/>.
/// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes text by splitting it into sequences of characters
/// using a sliding window.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
/// This column's data type will be a variable-sized vector of keys.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the
/// <paramref name="outputColumnName"/> will be used as source.
/// This estimator operates over text data type.</param>
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <c>0x02</c>, to the beginning,
/// and append another marker character, <c>0x03</c>, to the end of the output vector of characters.</param>
/// <example>
Expand All @@ -85,7 +89,6 @@ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <c>0x02</c>, to the beginning,
/// and append another marker character, <c>0x03</c>, to the end of the output vector of characters.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>

[BestFriend]
internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
Expand All @@ -97,12 +100,15 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi
}

/// <summary>
/// Normalizes incoming text in <paramref name="inputColumnName"/> by changing case, removing diacritical marks, punctuation marks and/or numbers
/// and outputs new text as <paramref name="outputColumnName"/>.
/// Creates a <see cref="TextNormalizingEstimator"/>, which normalizes incoming text in <paramref name="inputColumnName"/> by optionally
/// changing case and removing diacritical marks, punctuation marks, and/or numbers, and outputs new text as <paramref name="outputColumnName"/>.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
/// This column's data type will remain scalar of text or a vector of text depending on the input column data type.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
/// the value of the <paramref name="outputColumnName"/> will be used as source.
/// This estimator operates on text or vector of text data types.</param>
/// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
Expand All @@ -124,10 +130,16 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
=> new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);

/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
Copy link

@shmoradims shmoradims Apr 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'doc.xml' [](start = 26, length = 9)

if doc.xml is no longer used, please remove the file #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The file is still used by the transforms that Ivan and Senja are working on. I would rather remove it at the end, and not modify it now, to avoid conflicts.


In reply to: 277032552 [](ancestors = 277032552)

/// <summary>
/// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts a vector
/// of text into a numerical vector using pre-trained embeddings models.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
/// This column's data type will be a known-sized vector of <see cref="System.Single"/>.</param>
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Apr 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vector [](start = 46, length = 6)

known-sized #Resolved

/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
/// the value of the <paramref name="outputColumnName"/> will be used as source.
/// This estimator operates over known-sized vector of text data type.</param>
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
/// <example>
/// <format type="text/markdown">
Expand All @@ -142,11 +154,17 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);

/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
/// <summary>
/// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
/// of text into numerical vectors using pre-trained embeddings models.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="customModelFile">The path of the pre-trained embeedings model to use. </param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform.</param>
/// <param name="customModelFile">The path of the pre-trained embeddings model to use.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
/// This column's data type will be a known-sized vector of <see cref="System.Single"/>.</param>
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Apr 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vector [](start = 46, length = 6)

known-sized #Resolved

/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
/// the value of the <paramref name="outputColumnName"/> will be used as source.
Copy link

@shmoradims shmoradims Apr 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

source [](start = 81, length = 6)

let's not use 'source'. just drop 'as source'. it reads fine without it. #Pending

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can modify it here, but it's everywhere, that's the pattern that a lot of our transforms follow.


In reply to: 277032960 [](ancestors = 277032960)

/// This estimator operates over known-sized vector of text data type.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
Expand All @@ -161,10 +179,13 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
outputColumnName, customModelFile, inputColumnName ?? outputColumnName);

/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
/// <summary>
/// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
Copy link

@shmoradims shmoradims Apr 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vectors [](start = 101, length = 7)

singular or plural? if it's plural, please say 'multiple vectors' or 'one or more vectors' #Resolved

/// of text into numerical vectors using pre-trained embeddings models.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
/// <param name="columns">The array columns, and per-column configurations to extract embeedings from.</param>
/// <param name="columns">The array columns, and per-column configurations to extract embeddings from.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
Expand All @@ -179,12 +200,13 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);

/// <summary>
/// Tokenizes incoming text in <paramref name="inputColumnName"/>, using <paramref name="separators"/> as separators,
/// and outputs the tokens as <paramref name="outputColumnName"/>.
/// Create a <see cref="WordTokenizingEstimator"/>, which tokenizes input text using <paramref name="separators"/> as separators.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
/// This column's data type will be a variable-sized vector of text.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
/// This estimator operates on scalar of text and vector of text data types.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
/// <example>
/// <format type="text/markdown">
Expand All @@ -210,17 +232,21 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);

/// <summary>
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
/// Creates a <see cref="NgramExtractingEstimator"/> which produces a vector of counts of ngrams (sequences of consecutive words)
/// encountered in the input text.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
/// This column's data type will be a known-sized vector of <see cref="System.Single"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
/// This estimator operates over vectors of keys data type.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="skipLength">Number of tokens to skip between each ngram. By default no token is skipped.</param>
/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word or ngram is to a document in a corpus.
/// When <paramref name="maximumNgramsCount"/> is smaller than the total number of encountered ngrams this measure is used
/// to determine which ngrams to keep.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
Expand Down
Loading