Skip to content

Commit 2f0c475

Browse files
authored
No loader on APIs for ValueToKey/OneHotEncoding (#2245)
* ValueToKey and OneHotEncoding use IDataView directly for side data. * CustomStopWordsRemovingTransform becomes transformer. * Rename termData to keyData per Ivan.
1 parent fbe65f1 commit 2f0c475

File tree

8 files changed

+217
-103
lines changed

8 files changed

+217
-103
lines changed

src/Microsoft.ML.Data/Transforms/ColumnSelecting.cs

+5-5
Original file line numberDiff line numberDiff line change
@@ -199,13 +199,13 @@ public ColumnSelectingTransformer(IHostEnvironment env, string[] keepColumns, st
199199
_host.CheckValueOrNull(keepColumns);
200200
_host.CheckValueOrNull(dropColumns);
201201

202-
bool keepValid = keepColumns != null && keepColumns.Count() > 0;
203-
bool dropValid = dropColumns != null && dropColumns.Count() > 0;
202+
bool keepValid = Utils.Size(keepColumns) > 0;
203+
bool dropValid = Utils.Size(dropColumns) > 0;
204204

205205
// Check that they are not both valid
206-
_host.Check(!(keepValid && dropValid), "Both keepColumns and dropColumns are set, only one can be specified.");
206+
_host.Check(!(keepValid && dropValid), "Both " + nameof(keepColumns) + " and " + nameof(dropColumns) + " are set. Exactly one can be specified.");
207207
// Check that they are not both invalid
208-
_host.Check(!(!keepValid && !dropValid), "Neither keepColumns or dropColumns is set, one must be specified.");
208+
_host.Check(!(!keepValid && !dropValid), "Neither " + nameof(keepColumns) + " and " + nameof(dropColumns) + " is set. Exactly one must be specified.");
209209

210210
_selectedColumns = (keepValid) ? keepColumns : dropColumns;
211211
KeepColumns = keepValid;
@@ -558,7 +558,7 @@ private static int[] BuildOutputToInputMap(IEnumerable<string> selectedColumns,
558558
// given an input of ABC and dropping column B will result in AC.
559559
// In drop mode, we drop all columns with the specified names and keep all the rest,
560560
// ignoring the keepHidden argument.
561-
for(int colIdx = 0; colIdx < inputSchema.Count; colIdx++)
561+
for (int colIdx = 0; colIdx < inputSchema.Count; colIdx++)
562562
{
563563
if (selectedColumns.Contains(inputSchema[colIdx].Name))
564564
continue;

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+6-9
Original file line numberDiff line numberDiff line change
@@ -112,19 +112,16 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co
112112
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn, outputColumn, maxNumTerms, sort);
113113

114114
/// <summary>
115-
/// Converts value types into <see cref="KeyType"/> loading the keys to use from <paramref name="file"/>.
115+
/// Converts value types into <see cref="KeyType"/>, optionally loading the keys to use from <paramref name="keyData"/>.
116116
/// </summary>
117117
/// <param name="catalog">The categorical transform's catalog.</param>
118118
/// <param name="columns">The data columns to map to keys.</param>
119-
/// <param name="file">The path of the file containing the terms.</param>
120-
/// <param name="termsColumn"></param>
121-
/// <param name="loaderFactory"></param>
119+
/// <param name="keyData">The data view containing the terms. If specified, this should be a single column data
120+
/// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined
121+
/// from the input data upon fitting.</param>
122122
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
123-
ValueToKeyMappingTransformer.ColumnInfo[] columns,
124-
string file = null,
125-
string termsColumn = null,
126-
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
127-
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, file, termsColumn, loaderFactory);
123+
ValueToKeyMappingTransformer.ColumnInfo[] columns, IDataView keyData = null)
124+
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, keyData);
128125

129126
/// <summary>
130127
/// Maps specified keys to specified values

src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs

+15-12
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
namespace Microsoft.ML.Transforms.Conversions
1010
{
1111
/// <include file='doc.xml' path='doc/members/member[@name="ValueToKeyMappingEstimator"]/*' />
12-
public sealed class ValueToKeyMappingEstimator: IEstimator<ValueToKeyMappingTransformer>
12+
public sealed class ValueToKeyMappingEstimator : IEstimator<ValueToKeyMappingTransformer>
1313
{
1414
public static class Defaults
1515
{
@@ -19,9 +19,7 @@ public static class Defaults
1919

2020
private readonly IHost _host;
2121
private readonly ValueToKeyMappingTransformer.ColumnInfo[] _columns;
22-
private readonly string _file;
23-
private readonly string _termsColumn;
24-
private readonly IComponentFactory<IMultiStreamSource, IDataLoader> _loaderFactory;
22+
private readonly IDataView _keyData;
2523

2624
/// <summary>
2725
/// Initializes a new instance of <see cref="ValueToKeyMappingEstimator"/>.
@@ -33,23 +31,28 @@ public static class Defaults
3331
/// <param name="sort">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingTransformer.SortOrder.Occurrence"/> is chosen, they will be in the order encountered.
3432
/// If <see cref="ValueToKeyMappingTransformer.SortOrder.Value"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
3533
public ValueToKeyMappingEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) :
36-
this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort) })
34+
this(env, new[] { new ValueToKeyMappingTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort) })
3735
{
3836
}
3937

40-
public ValueToKeyMappingEstimator(IHostEnvironment env, ValueToKeyMappingTransformer.ColumnInfo[] columns,
41-
string file = null, string termsColumn = null,
42-
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
38+
public ValueToKeyMappingEstimator(IHostEnvironment env, ValueToKeyMappingTransformer.ColumnInfo[] columns, IDataView keyData = null)
4339
{
4440
Contracts.CheckValue(env, nameof(env));
4541
_host = env.Register(nameof(ValueToKeyMappingEstimator));
42+
_host.CheckNonEmpty(columns, nameof(columns));
43+
_host.CheckValueOrNull(keyData);
44+
if (keyData != null && keyData.Schema.Count != 1)
45+
{
46+
throw _host.ExceptParam(nameof(keyData), "If specified, this data view should contain only a single column " +
47+
$"containing the terms to map, but this had {keyData.Schema.Count} columns.");
48+
49+
}
50+
4651
_columns = columns;
47-
_file = file;
48-
_termsColumn = termsColumn;
49-
_loaderFactory = loaderFactory;
52+
_keyData = keyData;
5053
}
5154

52-
public ValueToKeyMappingTransformer Fit(IDataView input) => new ValueToKeyMappingTransformer(_host, input, _columns, _file, _termsColumn, _loaderFactory);
55+
public ValueToKeyMappingTransformer Fit(IDataView input) => new ValueToKeyMappingTransformer(_host, input, _columns, _keyData, false);
5356

5457
public SchemaShape GetOutputSchema(SchemaShape inputSchema)
5558
{

0 commit comments

Comments
 (0)