Skip to content

Commit 3f4b997

Browse files
committed
review comments
1 parent e823567 commit 3f4b997

File tree

6 files changed

+67
-34
lines changed

6 files changed

+67
-34
lines changed

src/Microsoft.ML.Transforms/Text/NgramTransform.cs

+8-4
Original file line numberDiff line numberDiff line change
@@ -676,15 +676,17 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
676676
/// | | |
677677
/// | -- | -- |
678678
/// | Does this estimator need to look at the data to train its parameters? | Yes |
679-
/// | Input column data type | Vector of [Keys](<xref:Microsoft.ML.Data.KeyDataViewType>) |
679+
/// | Input column data type | Vector of [Keys](xref:Microsoft.ML.Data.KeyDataViewType) |
680680
/// | Output column data type | Known-sized vector of <xref:System.Single> |
681681
///
682-
/// The resulting [NgramExtractingTransformer]<xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
682+
/// The resulting <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
683683
/// creates a new column, named as specified in the output column name parameters, where each
684684
/// input vector is mapped to a vector of counts of ngrams (sequences of consecutive words) encountered in the input text.
685685
///
686686
/// The estimator builds a dictionary of ngrams and the <xref:Microsoft.ML.Transforms.Text.NgramExtractingTransformer>
687687
/// uses the id in the dictionary as the index in the count vector that it produces.
688+
///
689+
/// See the See Also section for links to examples of the usage.
688690
/// ]]></format>
689691
/// </remarks>
690692
/// <seealso cref="TextCatalog.ProduceNgrams(TransformsCatalog.TextTransforms, string, string, int, int, bool, int, WeightingCriteria)"/>
@@ -700,8 +702,10 @@ public enum WeightingCriteria
700702
[EnumValueDisplay("TF (Term Frequency)")]
701703
Tf = 0,
702704

703-
/// <summary>Inverse Document Frequency. A ratio (the logarithm of inverse relative frequency)
704-
/// that measures the information a slot provides by determining how common or rare it is across the entire corpus.</summary>
705+
/// <summary>
706+
/// Inverse Document Frequency. A ratio (the logarithm of inverse relative frequency)
707+
/// that measures the information a slot provides by determining how common or rare it is across the entire corpus.
708+
/// </summary>
705709
[EnumValueDisplay("IDF (Inverse Document Frequency)")]
706710
Idf = 1,
707711

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+8-8
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
6262
/// </summary>
6363
/// <param name="catalog">The text-related transform's catalog.</param>
6464
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
65-
/// This column's data type will be a vector of keys.</param>
65+
/// This column's data type will be a variable-sized vector of keys.</param>
6666
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the
6767
/// <paramref name="outputColumnName"/> will be used as source.
6868
/// This estimator operates over text data type.</param>
@@ -100,15 +100,15 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi
100100
}
101101

102102
/// <summary>
103-
/// Creates a <see cref="TextNormalizingEstimator"/>, which normalizes incoming text in <paramref name="inputColumnName"/> by changing case,
104-
/// removing diacritical marks, punctuation marks and/or numbers and outputs new text as <paramref name="outputColumnName"/>.
103+
/// Creates a <see cref="TextNormalizingEstimator"/>, which normalizes incoming text in <paramref name="inputColumnName"/> by optionally
104+
/// changing case and removing diacritical marks, punctuation marks, and numbers, and outputs the new text as <paramref name="outputColumnName"/>.
105105
/// </summary>
106106
/// <param name="catalog">The text-related transform's catalog.</param>
107107
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
108108
/// This column's data type will remain scalar of text or a vector of text depending on the input column data type.</param>
109109
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>,
110110
/// the value of the <paramref name="outputColumnName"/> will be used as source.
111-
/// This estimator operates on text and vector of text data types.</param>
111+
/// This estimator operates on text or vector of text data types.</param>
112112
/// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
113113
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
114114
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
@@ -131,8 +131,8 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
131131
outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
132132

133133
/// <summary>
134-
/// Create an <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts vectors
135-
/// of text into numerical vectors using pre-trained embeddings models.
134+
/// Creates a <see cref="WordEmbeddingEstimator"/>, which is a text featurizer that converts a vector
135+
/// of text into a numerical vector using pre-trained embeddings models.
136136
/// </summary>
137137
/// <param name="catalog">The text-related transform's catalog.</param>
138138
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
@@ -204,7 +204,7 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog
204204
/// </summary>
205205
/// <param name="catalog">The text-related transform's catalog.</param>
206206
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
207-
/// This column's data type will be a vector of text.</param>
207+
/// This column's data type will be a variable-sized vector of text.</param>
208208
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
209209
/// This estimator operates over text data type.</param>
210210
/// <param name="separators">The separators to use (uses space character by default).</param>
@@ -241,7 +241,7 @@ internal static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog
241241
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
242242
/// This estimator operates over vectors of keys data type.</param>
243243
/// <param name="ngramLength">Ngram length.</param>
244-
/// <param name="skipLength">Number of tokens to skip between each ngram. By defaults no token is skipped.</param>
244+
/// <param name="skipLength">Number of tokens to skip between each ngram. By default, no token is skipped.</param>
245245
/// <param name="useAllLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
246246
/// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
247247
/// <param name="weighting">Statistical measure used to evaluate how important a word or ngram is to a document in a corpus.

src/Microsoft.ML.Transforms/Text/TextNormalizing.cs

+5-3
Original file line numberDiff line numberDiff line change
@@ -440,11 +440,13 @@ private bool IsCombiningDiacritic(char ch)
440440
/// | | |
441441
/// | -- | -- |
442442
/// | Does this estimator need to look at the data to train its parameters? | No |
443-
/// | Input column data type | <xref:System.ReadOnlyMemory{System.Char}> or vector of <xref:System.ReadOnlyMemory{System.Char}> |
443+
/// | Input column data type | [Text](xref:Microsoft.ML.Data.TextDataViewType) or vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
444444
/// | Output column data type | The same as the data type in the input column |
445445
///
446-
/// The resulting transformer creates a new column, named as specified in the output column name parameters, and
447-
/// normalizes the textual input data by changing case, removing diacritical marks, punctuation marks and/or numbers.
446+
/// The resulting <xref:Microsoft.ML.Transforms.Text.TextNormalizingTransformer> creates a new column, named as specified
447+
/// in the output column name parameters, and normalizes the textual input data by changing case, removing diacritical marks,
448+
/// punctuation marks and/or numbers.
449+
///
448450
/// See the See Also section for links to examples of the usage.
449451
/// ]]>
450452
/// </format>

src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs

+6-4
Original file line numberDiff line numberDiff line change
@@ -556,14 +556,16 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(DataViewRow input, int iinfo)
556556
/// | | |
557557
/// | -- | -- |
558558
/// | Does this estimator need to look at the data to train its parameters? | Yes |
559-
/// | Input column data type | <xref:System.ReadOnlyMemory{System.Char}> |
560-
/// | Output column data type | Variable-sized vector of [Keys](<xref:Microsoft.ML.Data.KeyDataViewType>) |
559+
/// | Input column data type | Scalar of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
560+
/// | Output column data type | Variable-sized vector of [Keys](xref:Microsoft.ML.Data.KeyDataViewType) |
561561
///
562562
/// The estimator tokenizes characters by splitting text into sequences of characters using a sliding window.
563563
/// During training, the estimator builds a key-value pair dictionary with the encountered sequences of characters.
564564
///
565-
/// The transformer resulting from fitting the estimator creates a new column, named as specified in the output
566-
/// column name parameters, which contains the keys of the sequences of characters that were encountered in the input.
565+
/// The <xref:Microsoft.ML.Transforms.Text.TokenizingByCharactersTransformer> resulting from fitting the estimator
566+
/// creates a new column, named as specified in the output column name parameters, which contains the keys of the
567+
/// sequences of characters that were encountered in the input.
568+
///
567569
/// See the See Also section for links to examples of the usage.
568570
/// ]]>
569571
/// </format>

src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs

+35-12
Original file line numberDiff line numberDiff line change
@@ -734,10 +734,10 @@ private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironme
734734
/// | | |
735735
/// | -- | -- |
736736
/// | Does this estimator need to look at the data to train its parameters? | No |
737-
/// | Input column data type | Vector of <xref:System.ReadOnlyMemory{System.Char}> |
737+
/// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
738738
/// | Output column data type | Known-sized vector of <xref:System.Single> |
739739
///
740-
/// The [WordEmbeddingTransformer](<xref:Microsoft.ML.Transforms.Text.WordEmbeddingTransformer>) produces a new column,
740+
/// The <xref:Microsoft.ML.Transforms.Text.WordEmbeddingTransformer> produces a new column,
741741
/// named as specified in the output column name parameters, where each input vector is mapped to a numerical vector
742742
/// with size of 3 * dimensionality of the embedding model used. Notice that this is independent of the size of the input vector.
743743
///
@@ -750,6 +750,8 @@ private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironme
750750
/// The user can specify a custom pre-trained embeddings model or one of the available pre-trained models.
751751
/// The available options are various versions of [GloVe Models](https://nlp.stanford.edu/projects/glove/),
752752
/// [FastText](https://en.wikipedia.org/wiki/FastText), and [SSWE](https://anthology.aclweb.org/P/P14/P14-1146.pdf).
753+
///
754+
/// See the See Also section for links to examples of the usage.
753755
/// ]]></format>
754756
/// </remarks>
755757
/// <seealso cref="TextCatalog.ApplyWordEmbedding(TransformsCatalog.TextTransforms, string, string, PretrainedModelKind)"/>
@@ -826,46 +828,67 @@ internal WordEmbeddingEstimator(IHostEnvironment env, string customModelFile, pa
826828
/// </summary>
827829
public enum PretrainedModelKind
828830
{
829-
/// <summary>GloVe 50 dimensional word embeddings.</summary>
831+
/// <summary>
832+
/// GloVe 50 dimensional word embeddings.
833+
/// </summary>
830834
[TGUI(Label = "GloVe 50D")]
831835
GloVe50D = 0,
832836

833-
/// <summary>GloVe 100 dimensional word embeddings.</summary>
837+
/// <summary>
838+
/// GloVe 100 dimensional word embeddings.
839+
/// </summary>
834840
[TGUI(Label = "GloVe 100D")]
835841
GloVe100D = 1,
836842

837-
/// <summary>GloVe 200 dimensional word embeddings.</summary>
843+
/// <summary>
844+
/// GloVe 200 dimensional word embeddings.
845+
/// </summary>
838846
[TGUI(Label = "GloVe 200D")]
839847
GloVe200D = 2,
840848

841-
/// <summary>GloVe 300 dimensional word embeddings.</summary>
849+
/// <summary>
850+
/// GloVe 300 dimensional word embeddings.
851+
/// </summary>
842852
[TGUI(Label = "GloVe 300D")]
843853
GloVe300D = 3,
844854

845-
/// <summary>GloVe 25 dimensional word embeddings trained on Twitter data.</summary>
855+
/// <summary>
856+
/// GloVe 25 dimensional word embeddings trained on Twitter data.
857+
/// </summary>
846858
[TGUI(Label = "GloVe Twitter 25D")]
847859
GloVeTwitter25D = 4,
848860

849-
/// <summary>GloVe 50 dimensional word embeddings trained on Twitter data.</summary>
861+
/// <summary>
862+
/// GloVe 50 dimensional word embeddings trained on Twitter data.
863+
/// </summary>
850864
[TGUI(Label = "GloVe Twitter 50D")]
851865
GloVeTwitter50D = 5,
852866

853-
/// <summary>GloVe 100 dimensional word embeddings trained on Twitter data.</summary>
867+
/// <summary>
868+
/// GloVe 100 dimensional word embeddings trained on Twitter data.
869+
/// </summary>
854870
[TGUI(Label = "GloVe Twitter 100D")]
855871
GloVeTwitter100D = 6,
856872

857-
/// <summary>GloVe 200 dimensional word embeddings trained on Twitter data.</summary>
873+
/// <summary>
874+
/// GloVe 200 dimensional word embeddings trained on Twitter data.
875+
/// </summary>
858876
[TGUI(Label = "GloVe Twitter 200D")]
859877
GloVeTwitter200D = 7,
860878

861-
/// <summary>FastText 300 dimensional word embeddings trained on Wikipedia.</summary>
879+
/// <summary>
880+
/// FastText 300 dimensional word embeddings trained on Wikipedia.
881+
/// </summary>
862882
[TGUI(Label = "fastText Wikipedia 300D")]
863883
FastTextWikipedia300D = 8,
864884

865-
/// <summary>Word embeddings trained on sentiment analysis tasks.</summary>
885+
/// <summary>
886+
/// Word embeddings trained on sentiment analysis tasks.
887+
/// </summary>
866888
[TGUI(Label = "Sentiment-Specific Word Embedding")]
867889
SentimentSpecificWordEmbedding = 9
868890
}
891+
869892
/// <summary>
870893
/// Information for each column pair.
871894
/// </summary>

src/Microsoft.ML.Transforms/Text/WordTokenizing.cs

+5-3
Original file line numberDiff line numberDiff line change
@@ -405,14 +405,16 @@ private JToken SaveAsPfaCore(BoundPfaContext ctx, int iinfo, JToken srcToken)
405405
/// | | |
406406
/// | -- | -- |
407407
/// | Does this estimator need to look at the data to train its parameters? | No |
408-
/// | Input column data type | <xref:System.ReadOnlyMemory{System.Char}> |
409-
/// | Output column data type | Variable-size vector of <xref:System.ReadOnlyMemory{System.Char}> |
408+
/// | Input column data type | Scalar of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
409+
/// | Output column data type | Variable-sized vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
410410
///
411-
/// The resulting [WordTokenizingTransformer](Microsoft.ML.Transforms.Text.WordTokenizingTransformer) creates a new column,
411+
/// The resulting <xref:Microsoft.ML.Transforms.Text.WordTokenizingTransformer> creates a new column,
412412
/// named as specified in the output column name parameters, where each input string is mapped to a vector of substrings obtained
413413
/// by splitting the input string according to the user defined delimiters. The space character is the default delimiter.
414414
///
415415
/// Empty strings and strings containing only spaces are dropped.
416+
///
417+
/// See the See Also section for links to examples of the usage.
416418
/// ]]></format>
417419
/// </remarks>
418420
/// <seealso cref="TextCatalog.TokenizeIntoWords(TransformsCatalog.TextTransforms, string, string, char[])"/>

0 commit comments

Comments
 (0)