@@ -93,7 +93,7 @@ internal sealed class Options : TransformInputBase
93
93
public int SkipLength = NgramExtractingEstimator . Defaults . SkipLength ;
94
94
95
95
[ Argument ( ArgumentType . Multiple , HelpText = "Maximum number of ngrams to store in the dictionary" , ShortName = "max" ) ]
96
- public int [ ] MaxNumTerms = new int [ ] { NgramExtractingEstimator . Defaults . MaxNumTerms } ;
96
+ public int [ ] MaxNumTerms = new int [ ] { NgramExtractingEstimator . Defaults . MaximumTermCount } ;
97
97
98
98
[ Argument ( ArgumentType . AtMostOnce , HelpText = "The weighting criteria" ) ]
99
99
public NgramExtractingEstimator . WeightingCriteria Weighting = NgramExtractingEstimator . Defaults . Weighting ;
@@ -253,7 +253,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat
253
253
// Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will
254
254
// be added (using lims[iinfo]), therefore we set slotLim to the maximum
255
255
helpers [ iinfo ] = new NgramBufferBuilder ( ngramLength , skipLength , Utils . ArrayMaxSize ,
256
- GetNgramIdFinderAdd ( env , counts [ iinfo ] , columns [ iinfo ] . Limits , ngramMaps [ iinfo ] , transformInfos [ iinfo ] . RequireIdf ) ) ;
256
+ GetNgramIdFinderAdd ( env , counts [ iinfo ] , columns [ iinfo ] . MaximumTermCounts , ngramMaps [ iinfo ] , transformInfos [ iinfo ] . RequireIdf ) ) ;
257
257
}
258
258
259
259
int cInfoFull = 0 ;
@@ -293,7 +293,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat
293
293
}
294
294
}
295
295
}
296
- AssertValid ( env , counts [ iinfo ] , columns [ iinfo ] . Limits , ngramMaps [ iinfo ] ) ;
296
+ AssertValid ( env , counts [ iinfo ] , columns [ iinfo ] . MaximumTermCounts , ngramMaps [ iinfo ] ) ;
297
297
}
298
298
}
299
299
@@ -307,7 +307,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat
307
307
308
308
for ( int iinfo = 0 ; iinfo < columns . Length ; iinfo ++ )
309
309
{
310
- AssertValid ( env , counts [ iinfo ] , columns [ iinfo ] . Limits , ngramMaps [ iinfo ] ) ;
310
+ AssertValid ( env , counts [ iinfo ] , columns [ iinfo ] . MaximumTermCounts , ngramMaps [ iinfo ] ) ;
311
311
312
312
int ngramLength = transformInfos [ iinfo ] . NgramLength ;
313
313
for ( int i = 0 ; i < ngramLength ; i ++ )
@@ -319,11 +319,11 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat
319
319
}
320
320
321
321
[ Conditional ( "DEBUG" ) ]
322
- private static void AssertValid ( IHostEnvironment env , int [ ] counts , ImmutableArray < int > lims , SequencePool pool )
322
+ private static void AssertValid ( IHostEnvironment env , int [ ] counts , IReadOnlyList < int > lims , SequencePool pool )
323
323
{
324
324
int count = 0 ;
325
325
int countFull = 0 ;
326
- for ( int i = 0 ; i < lims . Length ; i ++ )
326
+ for ( int i = 0 ; i < lims . Count ; i ++ )
327
327
{
328
328
env . Assert ( counts [ i ] >= 0 ) ;
329
329
env . Assert ( counts [ i ] <= lims [ i ] ) ;
@@ -334,20 +334,20 @@ private static void AssertValid(IHostEnvironment env, int[] counts, ImmutableArr
334
334
env . Assert ( count == pool . Count ) ;
335
335
}
336
336
337
- private static NgramIdFinder GetNgramIdFinderAdd ( IHostEnvironment env , int [ ] counts , ImmutableArray < int > lims , SequencePool pool , bool requireIdf )
337
+ private static NgramIdFinder GetNgramIdFinderAdd ( IHostEnvironment env , int [ ] counts , IReadOnlyList < int > lims , SequencePool pool , bool requireIdf )
338
338
{
339
339
Contracts . AssertValue ( env ) ;
340
- env . Assert ( lims . Length > 0 ) ;
341
- env . Assert ( lims . Length == Utils . Size ( counts ) ) ;
340
+ env . Assert ( lims . Count > 0 ) ;
341
+ env . Assert ( lims . Count == Utils . Size ( counts ) ) ;
342
342
343
343
int numFull = lims . Count ( l => l <= 0 ) ;
344
- int ngramLength = lims . Length ;
344
+ int ngramLength = lims . Count ;
345
345
return
346
346
( uint [ ] ngram , int lim , int icol , ref bool more ) =>
347
347
{
348
348
env . Assert ( 0 < lim && lim <= Utils . Size ( ngram ) ) ;
349
349
env . Assert ( lim <= Utils . Size ( counts ) ) ;
350
- env . Assert ( lim <= lims . Length ) ;
350
+ env . Assert ( lim <= lims . Count ) ;
351
351
env . Assert ( icol == 0 ) ;
352
352
353
353
var max = lim - 1 ;
@@ -695,7 +695,7 @@ internal static class Defaults
695
695
public const int NgramLength = 2 ;
696
696
public const bool AllLengths = true ;
697
697
public const int SkipLength = 0 ;
698
- public const int MaxNumTerms = 10000000 ;
698
+ public const int MaximumTermCount = 10000000 ;
699
699
public const WeightingCriteria Weighting = WeightingCriteria . Tf ;
700
700
}
701
701
@@ -712,16 +712,16 @@ internal static class Defaults
712
712
/// <param name="ngramLength">Ngram length.</param>
713
713
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
714
714
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
715
- /// <param name="maxNumTerms ">Maximum number of ngrams to store in the dictionary.</param>
715
+ /// <param name="maximumTermCount">Maximum number of ngrams to store in the dictionary.</param>
716
716
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
717
717
internal NgramExtractingEstimator ( IHostEnvironment env ,
718
718
string outputColumnName , string inputColumnName = null ,
719
719
int ngramLength = Defaults . NgramLength ,
720
720
int skipLength = Defaults . SkipLength ,
721
721
bool allLengths = Defaults . AllLengths ,
722
- int maxNumTerms = Defaults . MaxNumTerms ,
722
+ int maximumTermCount = Defaults . MaximumTermCount ,
723
723
WeightingCriteria weighting = Defaults . Weighting )
724
- : this ( env , new [ ] { ( outputColumnName , inputColumnName ?? outputColumnName ) } , ngramLength , skipLength , allLengths , maxNumTerms , weighting )
724
+ : this ( env , new [ ] { ( outputColumnName , inputColumnName ?? outputColumnName ) } , ngramLength , skipLength , allLengths , maximumTermCount , weighting )
725
725
{
// Intentionally empty: this single-column overload only wraps the (output, input) pair in a
// one-element array and forwards every setting to the overload that takes an array of pairs.
// A null inputColumnName falls back to outputColumnName.
726
726
}
727
727
@@ -734,16 +734,16 @@ internal NgramExtractingEstimator(IHostEnvironment env,
734
734
/// <param name="ngramLength">Ngram length.</param>
735
735
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
736
736
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
737
- /// <param name="maxNumTerms ">Maximum number of ngrams to store in the dictionary.</param>
737
+ /// <param name="maximumTermCount">Maximum number of ngrams to store in the dictionary.</param>
738
738
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
739
739
internal NgramExtractingEstimator ( IHostEnvironment env ,
740
740
( string outputColumnName , string inputColumnName ) [ ] columns ,
741
741
int ngramLength = Defaults . NgramLength ,
742
742
int skipLength = Defaults . SkipLength ,
743
743
bool allLengths = Defaults . AllLengths ,
744
- int maxNumTerms = Defaults . MaxNumTerms ,
744
+ int maximumTermCount = Defaults . MaximumTermCount ,
745
745
WeightingCriteria weighting = Defaults . Weighting )
746
- : this ( env , columns . Select ( x => new ColumnOptions ( x . outputColumnName , x . inputColumnName , ngramLength , skipLength , allLengths , weighting , maxNumTerms ) ) . ToArray ( ) )
746
+ : this ( env , columns . Select ( x => new ColumnOptions ( x . outputColumnName , x . inputColumnName , ngramLength , skipLength , allLengths , weighting , maximumTermCount ) ) . ToArray ( ) )
747
747
{
// Intentionally empty: one ColumnOptions is built per (output, input) pair, all sharing the same
// settings, and the resulting array is forwarded to another constructor overload.
// Note the argument order: ColumnOptions is given weighting before the term-count limit,
// which is the reverse of the order in this constructor's own parameter list.
748
748
}
749
749
@@ -809,10 +809,14 @@ public sealed class ColumnOptions
809
809
/// <summary>The weighting criteria.</summary>
810
810
public readonly WeightingCriteria Weighting ;
811
811
/// <summary>
812
+ /// Underlying state of <see cref="MaximumTermCounts"/>.
813
+ /// </summary>
814
+ private readonly ImmutableArray < int > _maximumTermCounts ;
815
+ /// <summary>
812
816
/// Contains the maximum number of grams to store in the dictionary, for each level of ngrams,
813
817
/// from 1 (in position 0) up to ngramLength (in position ngramLength-1)
814
818
/// </summary>
815
- public readonly ImmutableArray < int > Limits ;
819
+ public IReadOnlyList < int > MaximumTermCounts => _maximumTermCounts ;
816
820
817
821
/// <summary>
818
822
/// Describes how the transformer handles one column pair.
@@ -823,14 +827,14 @@ public sealed class ColumnOptions
823
827
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
824
828
/// <param name="allLengths">Whether to store all ngram lengths up to ngramLength, or only ngramLength.</param>
825
829
/// <param name="weighting">The weighting criteria.</param>
826
- /// <param name="maxNumTerms ">Maximum number of ngrams to store in the dictionary.</param>
830
+ /// <param name="maximumTermCount">Maximum number of ngrams to store in the dictionary.</param>
827
831
public ColumnOptions ( string name , string inputColumnName = null ,
828
832
int ngramLength = Defaults . NgramLength ,
829
833
int skipLength = Defaults . SkipLength ,
830
834
bool allLengths = Defaults . AllLengths ,
831
835
WeightingCriteria weighting = Defaults . Weighting ,
832
- int maxNumTerms = Defaults . MaxNumTerms )
833
- : this ( name , ngramLength , skipLength , allLengths , weighting , new int [ ] { maxNumTerms } , inputColumnName ?? name )
836
+ int maximumTermCount = Defaults . MaximumTermCount )
837
+ : this ( name , ngramLength , skipLength , allLengths , weighting , new int [ ] { maximumTermCount } , inputColumnName ?? name )
834
838
{
// Intentionally empty: the single term-count limit is wrapped in a one-element array and
// forwarded to the overload that accepts an array of limits. A null inputColumnName falls
// back to name.
835
839
}
836
840
@@ -839,7 +843,7 @@ internal ColumnOptions(string name,
839
843
int skipLength ,
840
844
bool allLengths ,
841
845
WeightingCriteria weighting ,
842
- int [ ] maxNumTerms ,
846
+ int [ ] maximumTermCounts ,
843
847
string inputColumnName = null )
844
848
{
845
849
Name = name ;
@@ -857,18 +861,18 @@ internal ColumnOptions(string name,
857
861
var limits = new int [ ngramLength ] ;
858
862
if ( ! AllLengths )
859
863
{
860
- Contracts . CheckUserArg ( Utils . Size ( maxNumTerms ) == 0 ||
861
- Utils . Size ( maxNumTerms ) == 1 && maxNumTerms [ 0 ] > 0 , nameof ( maxNumTerms ) ) ;
862
- limits [ ngramLength - 1 ] = Utils . Size ( maxNumTerms ) == 0 ? Defaults . MaxNumTerms : maxNumTerms [ 0 ] ;
864
+ Contracts . CheckUserArg ( Utils . Size ( maximumTermCounts ) == 0 ||
865
+ Utils . Size ( maximumTermCounts ) == 1 && maximumTermCounts [ 0 ] > 0 , nameof ( maximumTermCounts ) ) ;
866
+ limits [ ngramLength - 1 ] = Utils . Size ( maximumTermCounts ) == 0 ? Defaults . MaximumTermCount : maximumTermCounts [ 0 ] ;
863
867
}
864
868
else
865
869
{
866
- Contracts . CheckUserArg ( Utils . Size ( maxNumTerms ) <= ngramLength , nameof ( maxNumTerms ) ) ;
867
- Contracts . CheckUserArg ( Utils . Size ( maxNumTerms ) == 0 || maxNumTerms . All ( i => i >= 0 ) && maxNumTerms [ maxNumTerms . Length - 1 ] > 0 , nameof ( maxNumTerms ) ) ;
868
- var extend = Utils . Size ( maxNumTerms ) == 0 ? Defaults . MaxNumTerms : maxNumTerms [ maxNumTerms . Length - 1 ] ;
869
- limits = Utils . BuildArray ( ngramLength , i => i < Utils . Size ( maxNumTerms ) ? maxNumTerms [ i ] : extend ) ;
870
+ Contracts . CheckUserArg ( Utils . Size ( maximumTermCounts ) <= ngramLength , nameof ( maximumTermCounts ) ) ;
871
+ Contracts . CheckUserArg ( Utils . Size ( maximumTermCounts ) == 0 || maximumTermCounts . All ( i => i >= 0 ) && maximumTermCounts [ maximumTermCounts . Length - 1 ] > 0 , nameof ( maximumTermCounts ) ) ;
872
+ var extend = Utils . Size ( maximumTermCounts ) == 0 ? Defaults . MaximumTermCount : maximumTermCounts [ maximumTermCounts . Length - 1 ] ;
873
+ limits = Utils . BuildArray ( ngramLength , i => i < Utils . Size ( maximumTermCounts ) ? maximumTermCounts [ i ] : extend ) ;
870
874
}
871
- Limits = ImmutableArray . Create ( limits ) ;
875
+ _maximumTermCounts = ImmutableArray . Create ( limits ) ;
872
876
}
873
877
}
874
878
0 commit comments