Skip to content

Commit 4ffa12b

Browse files
committed
Address comments and handle WordBags and HashedWordBags
1 parent cabdb9a commit 4ffa12b

File tree

8 files changed

+54
-97
lines changed

8 files changed

+54
-97
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public static void NgramTransform()
6161
// 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1
6262
// 'B' - 0 'e' - 6 's' - 3 't' - 6 '<?>' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ...
6363
// Preview of the CharsTwoGrams column obtained after processing the input.
64-
var charsTwoGramColumn = transformedData_twochars.GetColumn<VBuffer<float>>(transformedData_onechars.Schema["CharsUnigrams"]);
64+
var charsTwoGramColumn = transformedData_twochars.GetColumn<VBuffer<float>>(transformedData_twochars.Schema["CharsTwograms"]);
6565
transformedData_twochars.Schema["CharsTwograms"].GetSlotNames(ref slotNames);
6666
printHelper("CharsTwograms", charsTwoGramColumn, slotNames);
6767

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs

+12-12
Original file line numberDiff line numberDiff line change
@@ -310,15 +310,15 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
310310
/// <param name="ngramLength">Ngram length.</param>
311311
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
312312
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
313-
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
313+
/// <param name="maximumNgramsCount">Maximum number of ngrams to store in the dictionary.</param>
314314
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
315-
public static Vector<float> ToBagofWords(this Scalar<string> input,
315+
public static Vector<float> ProduceWordBags(this Scalar<string> input,
316316
int ngramLength = 1,
317317
int skipLength = 0,
318318
bool allLengths = true,
319-
int maxNumTerms = 10000000,
319+
int maximumNgramsCount = 10000000,
320320
NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
321-
=> new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
321+
=> new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting);
322322
}
323323

324324
/// <summary>
@@ -397,7 +397,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
397397
/// It does so by hashing each ngram and using the hash value as the index in the bag.
398398
/// </summary>
399399
/// <param name="input">The column to apply to.</param>
400-
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
400+
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
401401
/// <param name="ngramLength">Ngram length.</param>
402402
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
403403
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
@@ -407,14 +407,14 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
407407
/// The text representation of the original values is stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
408408
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
409409
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
410-
public static Vector<float> ToBagofHashedWords(this Scalar<string> input,
411-
int hashBits = 16,
410+
public static Vector<float> ProduceHashedWordBags(this Scalar<string> input,
411+
int numberOfBits = 16,
412412
int ngramLength = 1,
413413
int skipLength = 0,
414414
bool allLengths = true,
415415
uint seed = 314489979,
416416
bool ordered = true,
417-
int invertHash = 0) => new OutPipelineColumn(input, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash);
417+
int invertHash = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash);
418418
}
419419

420420
/// <summary>
@@ -485,8 +485,8 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
485485
/// Produces a bag of counts of ngrams (sequences of consecutive words) in a given tokenized text.
486486
/// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
487487
///
488-
/// /// <see cref="ProduceNgrams"/> is different from <see cref="WordBagEstimatorStaticExtensions.ToBagofWords"/>
489-
/// in a way that <see cref="ProduceNgrams"/> takes tokenized text as input while <see cref="WordBagEstimatorStaticExtensions.ToBagofWords"/> tokenizes text internally.
488+
/// <see cref="ProduceNgrams"/> is different from <see cref="WordBagEstimatorStaticExtensions.ProduceWordBags"/>
489+
/// in a way that <see cref="ProduceNgrams"/> takes tokenized text as input while <see cref="WordBagEstimatorStaticExtensions.ProduceWordBags"/> tokenizes text internally.
490490
/// </summary>
491491
/// <param name="input">The column to apply to.</param>
492492
/// <param name="ngramLength">Ngram length.</param>
@@ -571,8 +571,8 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
571571
/// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given tokenized text.
572572
/// It does so by hashing each ngram and using the hash value as the index in the bag.
573573
///
574-
/// <see cref="ProduceHashedNgrams"/> is different from <see cref="WordHashBagEstimatorStaticExtensions.ToBagofHashedWords"/>
575-
/// in a way that <see cref="ProduceHashedNgrams"/> takes tokenized text as input while <see cref="WordHashBagEstimatorStaticExtensions.ToBagofHashedWords"/> tokenizes text internally.
574+
/// <see cref="ProduceHashedNgrams"/> is different from <see cref="WordHashBagEstimatorStaticExtensions.ProduceHashedWordBags"/>
575+
/// in a way that <see cref="ProduceHashedNgrams"/> takes tokenized text as input while <see cref="WordHashBagEstimatorStaticExtensions.ProduceHashedWordBags"/> tokenizes text internally.
576576
/// </summary>
577577
/// <param name="input">The column to apply to.</param>
578578
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>

src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs

+4-1
Original file line numberDiff line numberDiff line change
@@ -945,11 +945,14 @@ public ColumnOptions(string name,
945945
if (invertHash != 0 && numberOfBits >= 31)
946946
throw Contracts.ExceptParam(nameof(numberOfBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", numberOfBits);
947947

948-
if (NgramLength + SkipLength > NgramBufferBuilder.MaxSkipNgramLength)
948+
if (ngramLength == 1 && skipLength != 0)
949+
throw Contracts.ExceptUserArg(nameof(skipLength), $"Number of skips can only be zero when the maximum n-gram's length is one.");
950+
if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength)
949951
{
950952
throw Contracts.ExceptUserArg(nameof(skipLength),
951953
$"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}");
952954
}
955+
953956
FriendlyNames = null;
954957
Name = name;
955958
InputColumnNamesArray = inputColumnNames;

src/Microsoft.ML.Transforms/Text/NgramTransform.cs

+13-11
Original file line numberDiff line numberDiff line change
@@ -846,20 +846,15 @@ internal ColumnOptions(string name,
846846
int[] maximumNgramsCounts,
847847
string inputColumnName = null)
848848
{
849-
Name = name;
850-
InputColumnName = inputColumnName ?? name;
851-
NgramLength = ngramLength;
852-
Contracts.CheckUserArg(0 < NgramLength && NgramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength));
853-
SkipLength = skipLength;
854-
if (NgramLength + SkipLength > NgramBufferBuilder.MaxSkipNgramLength)
855-
{
849+
if (ngramLength == 1 && skipLength != 0)
850+
throw Contracts.ExceptUserArg(nameof(skipLength), $"Number of skips can only be zero when the maximum n-gram's length is one.");
851+
if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength)
856852
throw Contracts.ExceptUserArg(nameof(skipLength),
857853
$"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}");
858-
}
859-
AllLengths = allLengths;
860-
Weighting = weighting;
854+
Contracts.CheckUserArg(0 < ngramLength && ngramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength));
855+
861856
var limits = new int[ngramLength];
862-
if (!AllLengths)
857+
if (!allLengths)
863858
{
864859
Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) == 0 ||
865860
Utils.Size(maximumNgramsCounts) == 1 && maximumNgramsCounts[0] > 0, nameof(maximumNgramsCounts));
@@ -873,6 +868,13 @@ internal ColumnOptions(string name,
873868
limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maximumNgramsCounts) ? maximumNgramsCounts[i] : extend);
874869
}
875870
_maximumNgramsCounts = ImmutableArray.Create(limits);
871+
872+
Name = name;
873+
InputColumnName = inputColumnName ?? name;
874+
NgramLength = ngramLength;
875+
SkipLength = skipLength;
876+
AllLengths = allLengths;
877+
Weighting = weighting;
876878
}
877879
}
878880

0 commit comments

Comments
 (0)