Skip to content

Commit ee835d1

Browse files
committed
Addressed reviewers' comments.
1 parent 6b28ca6 commit ee835d1

File tree

2 files changed

+16
-21
lines changed

2 files changed

+16
-21
lines changed

src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
234234
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
235235
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
236236
/// <param name="keepNumbers">Whether to keep numbers or remove them.</param>
237-
/// <returns></returns>
238-
public static Scalar<string> Normalize(this Scalar<string> input,
237+
public static Scalar<string> NormalizeText(this Scalar<string> input,
239238
CaseNormalizationMode textCase = CaseNormalizationMode.Lower,
240239
bool keepDiacritics = false,
241240
bool keepPunctuations = true,
@@ -311,8 +310,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
311310
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
312311
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
313312
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
314-
/// <returns></returns>
315-
public static Vector<float> BagofWords(this Scalar<string> input,
313+
public static Vector<float> ToBagofWords(this Scalar<string> input,
316314
int ngramLength = 1,
317315
int skipLength = 0,
318316
bool allLengths = true,
@@ -396,8 +394,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
396394
/// <param name="seed">Hashing seed.</param>
397395
/// <param name="ordered">Whether the position of each source column should be included in the hash (when there are multiple source columns).</param>
398396
/// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
399-
/// <returns></returns>
400-
public static Vector<float> BagofHashedWords(this Scalar<string> input,
397+
public static Vector<float> ToBagofHashedWords(this Scalar<string> input,
401398
int hashBits = 16,
402399
int ngramLength = 1,
403400
int skipLength = 0,
@@ -467,20 +464,19 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
467464
}
468465

469466
/// <summary>
470-
/// Produces a bag of counts of ngrams (sequences of consecutive words ) in a given text.
467+
/// Produces a bag of counts of ngrams (sequences of consecutive words) in a given tokenized text.
471468
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
472469
///
473-
/// /// <see cref="Ngrams"/> is different from <see cref="WordBagEstimatorExtensions.BagofWords"/>
474-
/// in a way that <see cref="Ngrams"/> takes tokenized text as input while <see cref="WordBagEstimatorExtensions.BagofWords"/> tokenizes text internally.
470+
/// <see cref="ToNgrams"/> is different from <see cref="WordBagEstimatorExtensions.ToBagofWords"/>
471+
/// in that <see cref="ToNgrams"/> takes tokenized text as input while <see cref="WordBagEstimatorExtensions.ToBagofWords"/> tokenizes text internally.
475472
/// </summary>
476473
/// <param name="input">The column to apply to.</param>
477474
/// <param name="ngramLength">Ngram length.</param>
478475
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
479476
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
480477
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
481478
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
482-
/// <returns></returns>
483-
public static Vector<float> Ngrams(this VarVector<Key<uint,string>> input,
479+
public static Vector<float> ToNgrams(this VarVector<Key<uint,string>> input,
484480
int ngramLength = 1,
485481
int skipLength = 0,
486482
bool allLengths = true,
@@ -553,11 +549,11 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
553549
}
554550

555551
/// <summary>
556-
/// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given text.
552+
/// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given tokenized text.
557553
/// It does so by hashing each ngram and using the hash value as the index in the bag.
558554
///
559-
/// <see cref="NgramsHash"/> is different from <see cref="WordHashBagEstimatorExtensions.BagofHashedWords"/>
560-
/// in a way that <see cref="NgramsHash"/> takes tokenized text as input while <see cref="WordHashBagEstimatorExtensions.BagofHashedWords"/> tokenizes text internally.
555+
/// <see cref="ToNgramsHash"/> is different from <see cref="WordHashBagEstimatorExtensions.ToBagofHashedWords"/>
556+
/// in that <see cref="ToNgramsHash"/> takes tokenized text as input while <see cref="WordHashBagEstimatorExtensions.ToBagofHashedWords"/> tokenizes text internally.
561557
/// </summary>
562558
/// <param name="input">The column to apply to.</param>
563559
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
@@ -567,8 +563,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
567563
/// <param name="seed">Hashing seed.</param>
568564
/// <param name="ordered">Whether the position of each source column should be included in the hash (when there are multiple source columns).</param>
569565
/// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
570-
/// <returns></returns>
571-
public static Vector<float> NgramsHash(this VarVector<Key<uint, string>> input,
566+
public static Vector<float> ToNgramsHash(this VarVector<Key<uint, string>> input,
572567
int hashBits = 16,
573568
int ngramLength = 2,
574569
int skipLength = 0,

test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ public void NormalizeTextAndRemoveStopWords()
446446
var est = data.MakeNewEstimator()
447447
.Append(r => (
448448
r.label,
449-
normalized_text: r.text.Normalize(),
449+
normalized_text: r.text.NormalizeText(),
450450
words_without_stopwords: r.text.TokenizeText().RemoveStopwords()));
451451

452452
var tdata = est.Fit(data).Transform(data);
@@ -475,8 +475,8 @@ public void ConvertToWordBag()
475475
var est = data.MakeNewEstimator()
476476
.Append(r => (
477477
r.label,
478-
bagofword: r.text.BagofWords(),
479-
bagofhashedword: r.text.BagofHashedWords()));
478+
bagofword: r.text.ToBagofWords(),
479+
bagofhashedword: r.text.ToBagofHashedWords()));
480480

481481
var tdata = est.Fit(data).Transform(data);
482482
var schema = tdata.AsDynamic.Schema;
@@ -504,8 +504,8 @@ public void Ngrams()
504504
var est = data.MakeNewEstimator()
505505
.Append(r => (
506506
r.label,
507-
ngrams: r.text.TokenizeText().ToKey().Ngrams(),
508-
ngramshash: r.text.TokenizeText().ToKey().NgramsHash()));
507+
ngrams: r.text.TokenizeText().ToKey().ToNgrams(),
508+
ngramshash: r.text.TokenizeText().ToKey().ToNgramsHash()));
509509

510510
var tdata = est.Fit(data).Transform(data);
511511
var schema = tdata.AsDynamic.Schema;

0 commit comments

Comments
 (0)