24
24
namespace Microsoft . ML . Transforms . Text
25
25
{
26
26
using CaseMode = TextNormalizingEstimator . CaseMode ;
27
+ using StopWordsCol = StopWordsRemovingTransformer . Column ;
28
+
29
+ /// <summary>
30
+ /// Defines the different types of stop words removers supported.
31
+ /// </summary>
32
+ public interface IStopWordsRemoverOptions { }
33
+
27
34
// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts
28
35
// of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature
29
36
// integer index mapping through hashing) as an option.
@@ -93,10 +100,56 @@ public sealed class Options : TransformInputBase
93
100
internal Column Columns ;
94
101
95
102
[ Argument ( ArgumentType . AtMostOnce , HelpText = "Dataset language or 'AutoDetect' to detect language per row." , ShortName = "lang" , SortOrder = 3 ) ]
96
- public Language Language = DefaultLanguage ;
103
+ internal Language Language = DefaultLanguage ;
104
+
105
+ [ Argument ( ArgumentType . Multiple , Name = "StopWordsRemover" , HelpText = "Stopwords remover." , ShortName = "remover" , NullName = "<None>" , SortOrder = 4 ) ]
106
+ internal IStopWordsRemoverFactory StopWordsRemover ;
97
107
98
- [ Argument ( ArgumentType . Multiple , HelpText = "Use stop remover or not." , ShortName = "remover" , SortOrder = 4 ) ]
99
- public bool UsePredefinedStopWordRemover = false ;
108
+ /// <summary>
109
+ /// The underlying state of <see cref="StopWordsRemover"/> and <see cref="StopWordsRemoverOptions"/>.
110
+ /// </summary>
111
+ private IStopWordsRemoverOptions _stopWordsRemoverOptions ;
112
+
113
/// <summary>
/// Option to set the type of stop words remover to use.
/// The following options are available:
/// <list type="bullet">
///     <item>
///         <description>The <see cref="StopWordsRemovingEstimator.Options"/> removes the language specific list of stop words from the input.</description>
///     </item>
///     <item>
///         <description>The <see cref="CustomStopWordsRemovingEstimator.Options"/> uses a user provided list of stop words.</description>
///     </item>
/// </list>
/// Setting this to 'null' does not remove stop words from the input.
/// </summary>
/// <remarks>
/// Assigning a value also rebuilds <see cref="StopWordsRemover"/>. Assigning a
/// <see cref="StopWordsRemovingEstimator.Options"/> additionally overwrites the <c>Language</c>
/// field with the language carried by that instance. Any other implementation of
/// <see cref="IStopWordsRemoverOptions"/> leaves <see cref="StopWordsRemover"/> set to 'null'.
/// </remarks>
public IStopWordsRemoverOptions StopWordsRemoverOptions
{
    get { return _stopWordsRemoverOptions; }
    set
    {
        _stopWordsRemoverOptions = value;

        // Stays null for a null value or for an options type this setter does not recognize.
        IStopWordsRemoverFactory factory = null;

        // Type patterns never match null, so no separate null check is required,
        // and each branch gets a typed local instead of an unchecked 'as' cast.
        if (value is StopWordsRemovingEstimator.Options predefined)
        {
            factory = new PredefinedStopWordsRemoverFactory();
            Language = predefined.Language;
        }
        else if (value is CustomStopWordsRemovingEstimator.Options custom)
        {
            var stopwords = custom.StopWords;
            factory = new CustomStopWordsRemovingTransformer.LoaderArguments()
            {
                // The loader arguments receive the list both as-is and as a comma-joined string.
                Stopwords = stopwords,
                Stopword = string.Join(",", stopwords)
            };
        }

        StopWordsRemover = factory;
    }
}
100
153
101
154
[ Argument ( ArgumentType . AtMostOnce , HelpText = "Casing text using the rules of the invariant culture." , Name = "TextCase" , ShortName = "case" , SortOrder = 5 ) ]
102
155
public CaseMode CaseMode = TextNormalizingEstimator . Defaults . Mode ;
@@ -202,6 +255,7 @@ public Options()
202
255
203
256
// These parameters are hardcoded for now.
204
257
// REVIEW: expose them once sub-transforms are estimators.
258
+ private IStopWordsRemoverFactory _stopWordsRemover ;
205
259
private TermLoaderArguments _dictionary ;
206
260
private INgramExtractorFactoryFactory _wordFeatureExtractor ;
207
261
private INgramExtractorFactoryFactory _charFeatureExtractor ;
@@ -219,7 +273,7 @@ private sealed class TransformApplierParams
219
273
220
274
public readonly NormFunction Norm ;
221
275
public readonly Language Language ;
222
- public readonly bool UsePredefinedStopWordRemover ;
276
+ public readonly IStopWordsRemoverFactory StopWordsRemover ;
223
277
public readonly CaseMode TextCase ;
224
278
public readonly bool KeepDiacritics ;
225
279
public readonly bool KeepPunctuations ;
@@ -251,7 +305,9 @@ internal LpNormNormalizingEstimatorBase.NormFunction LpNorm
251
305
252
306
// These properties encode the logic needed to determine which transforms to apply.
253
307
#region NeededTransforms
254
- public bool NeedsWordTokenizationTransform { get { return WordExtractorFactory != null || UsePredefinedStopWordRemover || OutputTextTokens ; } }
308
/// <summary>
/// Whether the word tokenization step is required: true when <c>WordExtractorFactory</c> is set,
/// when <c>NeedsRemoveStopwordsTransform</c> is true, or when <c>OutputTextTokens</c> is true.
/// </summary>
public bool NeedsWordTokenizationTransform =>
    WordExtractorFactory != null || NeedsRemoveStopwordsTransform || OutputTextTokens;
309
+
310
/// <summary>
/// Whether the stop words removal step is required; true exactly when <c>StopWordsRemover</c> is not null.
/// </summary>
public bool NeedsRemoveStopwordsTransform => StopWordsRemover != null;
255
311
256
312
public bool NeedsNormalizeTransform
257
313
{
@@ -297,7 +353,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
297
353
CharExtractorFactory = parent . _charFeatureExtractor ? . CreateComponent ( host , parent . _dictionary ) ;
298
354
Norm = parent . OptionalSettings . Norm ;
299
355
Language = parent . OptionalSettings . Language ;
300
- UsePredefinedStopWordRemover = parent . OptionalSettings . UsePredefinedStopWordRemover ;
356
+ StopWordsRemover = parent . _stopWordsRemover ;
301
357
TextCase = parent . OptionalSettings . CaseMode ;
302
358
KeepDiacritics = parent . OptionalSettings . KeepDiacritics ;
303
359
KeepPunctuations = parent . OptionalSettings . KeepPunctuations ;
@@ -339,6 +395,7 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable
339
395
if ( options != null )
340
396
OptionalSettings = options ;
341
397
398
+ _stopWordsRemover = null ;
342
399
_dictionary = null ;
343
400
_wordFeatureExtractor = OptionalSettings . WordFeatureExtractorFactory ;
344
401
_charFeatureExtractor = OptionalSettings . CharFeatureExtractorFactory ;
@@ -401,21 +458,23 @@ public ITransformer Fit(IDataView input)
401
458
view = new WordTokenizingEstimator ( h , xfCols ) . Fit ( view ) . Transform ( view ) ;
402
459
}
403
460
404
- if ( tparams . UsePredefinedStopWordRemover )
461
+ if ( tparams . NeedsRemoveStopwordsTransform )
405
462
{
406
463
Contracts . Assert ( wordTokCols != null , "StopWords transform requires that word tokenization has been applied to the input text." ) ;
407
- var xfCols = new StopWordsRemovingEstimator . ColumnOptions [ wordTokCols . Length ] ;
464
+ var xfCols = new StopWordsCol [ wordTokCols . Length ] ;
408
465
var dstCols = new string [ wordTokCols . Length ] ;
409
466
for ( int i = 0 ; i < wordTokCols . Length ; i ++ )
410
467
{
411
- var tempName = GenerateColumnName ( view . Schema , wordTokCols [ i ] , "StopWordsRemoverTransform" ) ;
412
- var col = new StopWordsRemovingEstimator . ColumnOptions ( tempName , wordTokCols [ i ] , tparams . StopwordsLanguage ) ;
413
- dstCols [ i ] = tempName ;
414
- tempCols . Add ( tempName ) ;
468
+ var col = new StopWordsCol ( ) ;
469
+ col . Source = wordTokCols [ i ] ;
470
+ col . Name = GenerateColumnName ( view . Schema , wordTokCols [ i ] , "StopWordsRemoverTransform" ) ;
471
+ dstCols [ i ] = col . Name ;
472
+ tempCols . Add ( col . Name ) ;
473
+ col . Language = tparams . StopwordsLanguage ;
415
474
416
475
xfCols [ i ] = col ;
417
476
}
418
- view = new StopWordsRemovingEstimator ( h , xfCols ) . Fit ( view ) . Transform ( view ) ;
477
+ view = tparams . StopWordsRemover . CreateComponent ( h , view , xfCols ) ;
419
478
wordTokCols = dstCols ;
420
479
}
421
480
@@ -442,7 +501,7 @@ public ITransformer Fit(IDataView input)
442
501
if ( tparams . CharExtractorFactory != null )
443
502
{
444
503
{
445
- var srcCols = tparams . UsePredefinedStopWordRemover ? wordTokCols : textCols ;
504
+ var srcCols = tparams . NeedsRemoveStopwordsTransform ? wordTokCols : textCols ;
446
505
charTokCols = new string [ srcCols . Length ] ;
447
506
var xfCols = new ( string outputColumnName , string inputColumnName ) [ srcCols . Length ] ;
448
507
for ( int i = 0 ; i < srcCols . Length ; i ++ )
@@ -567,6 +626,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
567
626
internal static IDataTransform Create ( IHostEnvironment env , Options args , IDataView data )
568
627
{
569
628
var estimator = new TextFeaturizingEstimator ( env , args . Columns . Name , args . Columns . Source ?? new [ ] { args . Columns . Name } , args ) ;
629
+ estimator . _stopWordsRemover = args . StopWordsRemover ;
570
630
estimator . _dictionary = args . Dictionary ;
571
631
// Review: I don't think the following two lines are needed.
572
632
estimator . _wordFeatureExtractor = args . WordFeatureExtractorFactory ;
0 commit comments