Skip to content

Commit c0832b5

Browse files
authored
Update documentation for WordBag (#3440)
1 parent cb4566d commit c0832b5

File tree

3 files changed

+90
-27
lines changed

3 files changed

+90
-27
lines changed

src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -866,11 +866,11 @@ public VBuffer<ReadOnlyMemory<char>>[] SlotNamesMetadata(out VectorDataViewType[
866866
/// | | |
867867
/// | -- | -- |
868868
/// | Does this estimator need to look at the data to train its parameters? | Yes |
869-
/// | Input column data type | Vector of [Key](<xref:Microsoft.ML.Data.KeyDataViewType>) |
869+
/// | Input column data type | Vector of [Key](xref:Microsoft.ML.Data.KeyDataViewType) |
870870
/// | Output column data type | Vector of known size of <xref:System.Single> |
871871
///
872872
/// The resulting <xref:Microsoft.ML.Transforms.Text.NgramHashingTransformer/> creates a new column, named as specified in the output column name parameters, and
873-
/// produces a vector of counts of n-grams (sequences of consecutive words of length 1-n) from a given data.
873+
/// produces a vector of n-gram counts (sequences of consecutive words of length 1-n) from the given data.
874874
/// It does so by hashing each n-gram and using the hash value as the index in the bag.
875875
///
876876
/// <xref:Microsoft.ML.Transforms.Text.NgramHashingEstimator> is different from <xref:Microsoft.ML.Transforms.Text.WordHashBagEstimator>

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+44-20
Original file line numberDiff line numberDiff line change
@@ -323,12 +323,18 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa
323323
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords);
324324

325325
/// <summary>
326-
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
327-
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
326+
/// Create a <see cref="WordBagEstimator"/>, which maps the column specified in <paramref name="inputColumnName"/>
327+
/// to a vector of n-gram counts in a new column named <paramref name="outputColumnName"/>.
328328
/// </summary>
329-
/// <param name="catalog">The text-related transform's catalog.</param>
330-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
331-
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
329+
/// <remarks>
330+
/// <see cref="WordBagEstimator"/> is different from <see cref="NgramExtractingEstimator"/> in that the former
331+
/// tokenizes text internally and the latter takes tokenized text as input.
332+
/// </remarks>
333+
/// <param name="catalog">The transform's catalog.</param>
334+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
335+
/// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
336+
/// <param name="inputColumnName">Name of the column to take the data from.
337+
/// This estimator operates over a vector of text.</param>
332338
/// <param name="ngramLength">Ngram length.</param>
333339
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
334340
/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
@@ -346,12 +352,18 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf
346352
outputColumnName, inputColumnName, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
347353

348354
/// <summary>
349-
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnNames"/>
350-
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
355+
/// Create a <see cref="WordBagEstimator"/>, which maps the multiple columns specified in <paramref name="inputColumnNames"/>
356+
/// to a vector of n-gram counts in a new column named <paramref name="outputColumnName"/>.
351357
/// </summary>
352-
/// <param name="catalog">The text-related transform's catalog.</param>
353-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.</param>
354-
/// <param name="inputColumnNames">Name of the columns to transform.</param>
358+
/// <remarks>
359+
/// <see cref="WordBagEstimator"/> is different from <see cref="NgramExtractingEstimator"/> in that the former
360+
/// tokenizes text internally and the latter takes tokenized text as input.
361+
/// </remarks>
362+
/// <param name="catalog">The transform's catalog.</param>
363+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.
364+
/// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
365+
/// <param name="inputColumnNames">Names of the multiple columns to take the data from.
366+
/// This estimator operates over a vector of text.</param>
355367
/// <param name="ngramLength">Ngram length.</param>
356368
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
357369
/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
@@ -369,12 +381,18 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf
369381
outputColumnName, inputColumnNames, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
370382

371383
/// <summary>
372-
/// Produces a bag of counts of hashed ngrams in <paramref name="inputColumnName"/>
373-
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
384+
/// Create a <see cref="WordHashBagEstimator"/>, which maps the column specified in <paramref name="inputColumnName"/>
385+
/// to a vector of counts of hashed n-grams in a new column named <paramref name="outputColumnName"/>.
374386
/// </summary>
375-
/// <param name="catalog">The text-related transform's catalog.</param>
376-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
377-
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
387+
/// <remarks>
388+
/// <see cref="WordHashBagEstimator"/> is different from <see cref="NgramHashingEstimator"/> in that the former
389+
/// tokenizes text internally and the latter takes tokenized text as input.
390+
/// </remarks>
391+
/// <param name="catalog">The transform's catalog.</param>
392+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
393+
/// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
394+
/// <param name="inputColumnName">Name of the column to take the data from.
395+
/// This estimator operates over a vector of text.</param>
378396
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
379397
/// <param name="ngramLength">Ngram length.</param>
380398
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
@@ -401,12 +419,18 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog.
401419
maximumNumberOfInverts: maximumNumberOfInverts);
402420

403421
/// <summary>
404-
/// Produces a bag of counts of hashed ngrams in <paramref name="inputColumnNames"/>
405-
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
422+
/// Create a <see cref="WordHashBagEstimator"/>, which maps the multiple columns specified in <paramref name="inputColumnNames"/>
423+
/// to a vector of counts of hashed n-grams in a new column named <paramref name="outputColumnName"/>.
406424
/// </summary>
407-
/// <param name="catalog">The text-related transform's catalog.</param>
408-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.</param>
409-
/// <param name="inputColumnNames">Name of the columns to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
425+
/// <remarks>
426+
/// <see cref="WordHashBagEstimator"/> is different from <see cref="NgramHashingEstimator"/> in that the former
427+
/// tokenizes text internally and the latter takes tokenized text as input.
428+
/// </remarks>
429+
/// <param name="catalog">The transform's catalog.</param>
430+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.
431+
/// This column's data type will be a known-size vector of <see cref="System.Single"/>.</param>
432+
/// <param name="inputColumnNames">Names of the multiple columns to take the data from.
433+
/// This estimator operates over a vector of text.</param>
410434
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
411435
/// <param name="ngramLength">Ngram length.</param>
412436
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>

src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs

+44-5
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,30 @@
1010

1111
namespace Microsoft.ML.Transforms.Text
1212
{
13-
1413
/// <summary>
15-
/// Produces a bag of counts of ngrams (sequences of consecutive words) in a given text.
16-
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
14+
/// <see cref="IEstimator{TTransformer}"/> for the <see cref="ITransformer"/>.
1715
/// </summary>
16+
/// <remarks>
17+
/// <format type="text/markdown"><![CDATA[
18+
/// ### Estimator Characteristics
19+
/// | | |
20+
/// | -- | -- |
21+
/// | Does this estimator need to look at the data to train its parameters? | Yes |
22+
/// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
23+
/// | Output column data type | Vector of known size of <xref:System.Single> |
24+
///
25+
/// The resulting <xref:Microsoft.ML.ITransformer> creates a new column, named as specified in the output column name parameters, and
26+
/// produces a vector of n-gram counts (sequences of n consecutive words) from the given data.
27+
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
28+
///
29+
/// <xref:Microsoft.ML.Transforms.Text.WordBagEstimator> is different from <xref:Microsoft.ML.Transforms.Text.NgramExtractingEstimator>
30+
/// in that the former tokenizes text internally while the latter takes tokenized text as input.
31+
/// See the See Also section for links to examples of the usage.
32+
/// ]]>
33+
/// </format>
34+
/// </remarks>
35+
/// <seealso cref="TextCatalog.ProduceWordBags(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, NgramExtractingEstimator.WeightingCriteria)" />
36+
/// <seealso cref="TextCatalog.ProduceWordBags(TransformsCatalog.TextTransforms, string, string[], int, int, bool, int, NgramExtractingEstimator.WeightingCriteria)" />
1837
public sealed class WordBagEstimator : IEstimator<ITransformer>
1938
{
2039
private readonly IHost _host;
@@ -182,9 +201,29 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
182201
}
183202

184203
/// <summary>
185-
/// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given text.
186-
/// It does so by hashing each ngram and using the hash value as the index in the bag.
204+
/// <see cref="IEstimator{TTransformer}"/> for the <see cref="ITransformer"/>.
187205
/// </summary>
206+
/// <remarks>
207+
/// <format type="text/markdown"><![CDATA[
208+
/// ### Estimator Characteristics
209+
/// | | |
210+
/// | -- | -- |
211+
/// | Does this estimator need to look at the data to train its parameters? | Yes |
212+
/// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
213+
/// | Output column data type | Vector of known size of <xref:System.Single> |
214+
///
215+
/// The resulting <xref:Microsoft.ML.ITransformer> creates a new column, named as specified in the output column name parameters, and
216+
/// produces a vector of n-gram counts (sequences of n consecutive words) from the given data.
217+
/// It does so by hashing each ngram and using the hash value as the index in the bag.
218+
///
219+
/// <xref:Microsoft.ML.Transforms.Text.WordHashBagEstimator> is different from <xref:Microsoft.ML.Transforms.Text.NgramHashingEstimator>
220+
/// in that the former tokenizes text internally while the latter takes tokenized text as input.
221+
/// See the See Also section for links to examples of the usage.
222+
/// ]]>
223+
/// </format>
224+
/// </remarks>
225+
/// <seealso cref="TextCatalog.ProduceHashedWordBags(TransformsCatalog.TextTransforms, string, string, int, int, int, bool, uint, bool, int)" />
226+
/// <seealso cref="TextCatalog.ProduceHashedWordBags(TransformsCatalog.TextTransforms, string, string[], int, int, int, bool, uint, bool, int)" />
188227
public sealed class WordHashBagEstimator : IEstimator<ITransformer>
189228
{
190229
private readonly IHost _host;

0 commit comments

Comments
 (0)