Skip to content

Commit e823567

Browse files
committed
text transforms xml doc
1 parent fa95c00 commit e823567

File tree

7 files changed

+192
-47
lines changed

7 files changed

+192
-47
lines changed

src/Microsoft.ML.Transforms/Text/NgramTransform.cs

+26-5
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@
3030
namespace Microsoft.ML.Transforms.Text
3131
{
3232
/// <summary>
33-
/// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
34-
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
33+
/// <see cref="ITransformer"/> resulting from fitting an <see cref="NgramExtractingEstimator"/>.
3534
/// </summary>
3635
public sealed class NgramExtractingTransformer : OneToOneTransformerBase
3736
{
@@ -668,9 +667,27 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
668667
}
669668

670669
/// <summary>
671-
/// Produces a bag of counts of ngrams(sequences of consecutive values of length 1-n) in a given vector of keys.
672-
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
670+
/// Produces a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
673671
/// </summary>
672+
/// <remarks>
673+
/// <format type="text/markdown"><![CDATA[
674+
///
675+
/// ### Estimator Characteristics
676+
/// | | |
677+
/// | -- | -- |
678+
/// | Does this estimator need to look at the data to train its parameters? | Yes |
679+
/// | Input column data type | Vector of [Keys](<xref:Microsoft.ML.Data.KeyDataViewType>) |
680+
/// | Output column data type | Known-sized vector of <xref:System.Single> |
681+
///
682+
/// The resulting [NgramExtractingTransformer](xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer)
683+
/// creates a new column, named as specified in the output column name parameters, where each
684+
/// input vector is mapped to a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
685+
///
686+
/// The estimator builds a dictionary of ngrams and the [NgramExtractingTransformer](xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer)
687+
/// uses the id in the dictionary as the index in the count vector that it produces.
688+
/// ]]></format>
689+
/// </remarks>
690+
/// <seealso cref="TextCatalog.ProduceNgrams(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, WeightingCriteria)"/>
674691
public sealed class NgramExtractingEstimator : IEstimator<NgramExtractingTransformer>
675692
{
676693
/// <summary>
@@ -679,12 +696,16 @@ public sealed class NgramExtractingEstimator : IEstimator<NgramExtractingTransfo
679696
/// </summary>
680697
public enum WeightingCriteria
681698
{
699+
/// <summary>Term Frequency. Calculated based on the number of occurrences in the document.</summary>
682700
[EnumValueDisplay("TF (Term Frequency)")]
683701
Tf = 0,
684702

703+
/// <summary>Inverse Document Frequency. A ratio (the logarithm of inverse relative frequency)
704+
/// that measures the information a slot provides by determining how common or rare it is across the entire corpus.</summary>
685705
[EnumValueDisplay("IDF (Inverse Document Frequency)")]
686706
Idf = 1,
687707

708+
/// <summary>The product of the term frequency and the inverse document frequency.</summary>
688709
[EnumValueDisplay("TF-IDF")]
689710
TfIdf = 2
690711
}
@@ -782,7 +803,7 @@ internal static bool IsSchemaColumnValid(SchemaShape.Column col)
782803
return false;
783804
if (!col.IsKey)
784805
return false;
785-
// Can only accept key types that can be converted to U4.
806+
// Can only accept key types that can be converted to U4 (uint); keys whose raw type is ulong are rejected.
786807
if (!NgramUtils.IsValidNgramRawType(col.ItemType.RawType))
787808
return false;
788809
return true;

src/Microsoft.ML.Transforms/Text/NgramUtils.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ internal static class NgramUtils
206206
{
207207
public static bool IsValidNgramRawType(Type rawType)
208208
{
209-
// Can only accept key types that can be converted to U4 (uint).
209+
// Can only accept key types that can be converted to U4 (uint); a raw type of ulong (U8) is rejected.
210210
return rawType != typeof(ulong);
211211
}
212212
}

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+53-27
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,15 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
5757
outputColumnName, inputColumnNames, options);
5858

5959
/// <summary>
60-
/// Tokenize incoming text in <paramref name="inputColumnName"/> and output the tokens as <paramref name="outputColumnName"/>.
60+
/// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes characters by splitting text into sequences of characters
61+
/// using a sliding window.
6162
/// </summary>
6263
/// <param name="catalog">The text-related transform's catalog.</param>
63-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
64-
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
64+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
65+
/// This column's data type will be a vector of keys.</param>
66+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the
67+
/// <paramref name="outputColumnName"/> will be used as source.
68+
/// This estimator operates over text data type.</param>
6569
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
6670
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
6771
/// <example>
@@ -85,7 +89,6 @@ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this
8589
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
8690
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
8791
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
88-
8992
[BestFriend]
9093
internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
9194
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
@@ -97,12 +100,15 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi
97100
}
98101

99102
/// <summary>
100-
/// Normalizes incoming text in <paramref name="inputColumnName"/> by changing case, removing diacritical marks, punctuation marks and/or numbers
101-
/// and outputs new text as <paramref name="outputColumnName"/>.
103+
/// Creates a <see cref="TextNormalizingEstimator"/>, which normalizes incoming text in <paramref name="inputColumnName"/> by changing case,
104+
/// removing diacritical marks, punctuation marks and/or numbers and outputs new text as <paramref name="outputColumnName"/>.
102105
/// </summary>
103106
/// <param name="catalog">The text-related transform's catalog.</param>
104-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
105-
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
107+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
108+
/// This column's data type will remain a scalar of text or a vector of text, depending on the input column data type.</param>
109+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
110+
/// the value of the <paramref name="outputColumnName"/> will be used as source.
111+
/// This estimator operates on text and vector of text data types.</param>
106112
/// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
107113
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
108114
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
@@ -124,10 +130,16 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
124130
=> new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
125131
outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
126132

127-
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
133+
/// <summary>
134+
/// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
135+
/// of text into numerical vectors using pre-trained embeddings models.
136+
/// </summary>
128137
/// <param name="catalog">The text-related transform's catalog.</param>
129-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
130-
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
138+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
139+
/// This column's data type will be a vector of <see cref="System.Single"/>.</param>
140+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
141+
/// the value of the <paramref name="outputColumnName"/> will be used as source.
142+
/// This estimator operates over vector of text data type.</param>
131143
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
132144
/// <example>
133145
/// <format type="text/markdown">
@@ -142,11 +154,17 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
142154
WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
143155
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);
144156

145-
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
157+
/// <summary>
158+
/// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
159+
/// of text into numerical vectors using pre-trained embeddings models.
160+
/// </summary>
146161
/// <param name="catalog">The text-related transform's catalog.</param>
147-
/// <param name="customModelFile">The path of the pre-trained embeedings model to use. </param>
148-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
149-
/// <param name="inputColumnName">Name of the column to transform.</param>
162+
/// <param name="customModelFile">The path of the pre-trained embeddings model to use.</param>
163+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
164+
/// This column's data type will be a vector of <see cref="System.Single"/>.</param>
165+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
166+
/// the value of the <paramref name="outputColumnName"/> will be used as source.
167+
/// This estimator operates over vector of text data type.</param>
150168
/// <example>
151169
/// <format type="text/markdown">
152170
/// <![CDATA[
@@ -161,10 +179,13 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
161179
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
162180
outputColumnName, customModelFile, inputColumnName ?? outputColumnName);
163181

164-
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
182+
/// <summary>
183+
/// Create a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
184+
/// of text into numerical vectors using pre-trained embeddings models.
185+
/// </summary>
165186
/// <param name="catalog">The text-related transform's catalog.</param>
166187
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
167-
/// <param name="columns">The array columns, and per-column configurations to extract embeedings from.</param>
188+
/// <param name="columns">The array columns, and per-column configurations to extract embeddings from.</param>
168189
/// <example>
169190
/// <format type="text/markdown">
170191
/// <![CDATA[
@@ -179,12 +200,13 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog
179200
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);
180201

181202
/// <summary>
182-
/// Tokenizes incoming text in <paramref name="inputColumnName"/>, using <paramref name="separators"/> as separators,
183-
/// and outputs the tokens as <paramref name="outputColumnName"/>.
203+
/// Create a <see cref="WordTokenizingEstimator"/>, which tokenizes input text using <paramref name="separators"/> as separators.
184204
/// </summary>
185205
/// <param name="catalog">The text-related transform's catalog.</param>
186-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
187-
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
206+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
207+
/// This column's data type will be a vector of text.</param>
208+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
209+
/// This estimator operates over text data type.</param>
188210
/// <param name="separators">The separators to use (uses space character by default).</param>
189211
/// <example>
190212
/// <format type="text/markdown">
@@ -210,17 +232,21 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
210232
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);
211233

212234
/// <summary>
213-
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
214-
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
235+
/// Creates a <see cref="NgramExtractingEstimator"/> which produces a vector of counts of ngrams (sequences of consecutive words)
236+
/// encountered in the input text.
215237
/// </summary>
216238
/// <param name="catalog">The text-related transform's catalog.</param>
217-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
218-
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
239+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
240+
/// This column's data type will be a vector of <see cref="System.Single"/>.</param>
241+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
242+
/// This estimator operates over vectors of keys data type.</param>
219243
/// <param name="ngramLength">Ngram length.</param>
220-
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
244+
/// <param name="skipLength">Number of tokens to skip between each ngram. By default, no token is skipped.</param>
221245
/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
222246
/// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
223-
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
247+
/// <param name="weighting">Statistical measure used to evaluate how important a word or ngram is to a document in a corpus.
248+
/// When <paramref name="maximumNgramsCount"/> is smaller than the total number of encountered ngrams this measure is used
249+
/// to determine which ngrams to keep.</param>
224250
/// <example>
225251
/// <format type="text/markdown">
226252
/// <![CDATA[

src/Microsoft.ML.Transforms/Text/TextNormalizing.cs

+21-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@
2929
namespace Microsoft.ML.Transforms.Text
3030
{
3131
/// <summary>
32-
/// A text normalization transform that allows normalizing text case, removing diacritical marks, punctuation marks and/or numbers.
33-
/// The transform operates on text input as well as vector of tokens/text (vector of ReadOnlyMemory).
32+
/// <see cref="ITransformer"/> resulting from fitting a <see cref="TextNormalizingEstimator"/>.
3433
/// </summary>
3534
public sealed class TextNormalizingTransformer : OneToOneTransformerBase
3635
{
@@ -431,6 +430,26 @@ private bool IsCombiningDiacritic(char ch)
431430
}
432431
}
433432

433+
/// <summary>
434+
/// <see cref="IEstimator{TTransformer}"/> for the <see cref="TextNormalizingTransformer"/>.
435+
/// </summary>
436+
/// <remarks>
437+
/// <format type="text/markdown"><![CDATA[
438+
///
439+
/// ### Estimator Characteristics
440+
/// | | |
441+
/// | -- | -- |
442+
/// | Does this estimator need to look at the data to train its parameters? | No |
443+
/// | Input column data type | <xref:System.ReadOnlyMemory{System.Char}> or vector of <xref:System.ReadOnlyMemory{System.Char}> |
444+
/// | Output column data type | The same as the data type in the input column |
445+
///
446+
/// The resulting transformer creates a new column, named as specified in the output column name parameters, and
447+
/// normalizes the textual input data by changing case, removing diacritical marks, punctuation marks and/or numbers.
448+
/// See the See Also section for links to examples of the usage.
449+
/// ]]>
450+
/// </format>
451+
/// </remarks>
452+
/// <seealso cref="TextCatalog.NormalizeText" />
434453
public sealed class TextNormalizingEstimator : TrivialEstimator<TextNormalizingTransformer>
435454
{
436455
/// <summary>

0 commit comments

Comments
 (0)