Skip to content

Commit a3419f4

Browse files
committed
multicolumn mapping for some estimators
1 parent 49403ab commit a3419f4

File tree

7 files changed

+207
-38
lines changed

7 files changed

+207
-38
lines changed

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+60
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,20 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers
6565
DataKind outputKind = ConvertDefaults.DefaultOutputKind)
6666
=> new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new TypeConvertingEstimator.ColumnOptions(outputColumnName, outputKind, inputColumnName) });
6767

68+
/// <summary>
69+
/// Changes column type of the input columns.
70+
/// </summary>
71+
/// <param name="catalog">The conversion transform's catalog.</param>
72+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
73+
/// <param name="outputKind">The expected kind of the output column.</param>
74+
public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog,
75+
InputOutputColumnPair[] columns,
76+
DataKind outputKind = ConvertDefaults.DefaultOutputKind)
77+
{
78+
var columnOptions = columns.Select(x => new TypeConvertingEstimator.ColumnOptions(x.OutputColumnName, outputKind, x.InputColumnName)).ToArray();
79+
return new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
80+
}
81+
6882
/// <summary>
6983
/// Changes column type of the input column.
7084
/// </summary>
@@ -89,6 +103,14 @@ internal static TypeConvertingEstimator ConvertType(this TransformsCatalog.Conve
89103
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = null)
90104
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName);
91105

106+
/// <summary>
107+
/// Convert the key types back to their original values.
108+
/// </summary>
109+
/// <param name="catalog">The conversion transform's catalog.</param>
110+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
111+
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns)
112+
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray());
113+
92114
/// <summary>
93115
/// Convert the key types (name of the column specified in the first item of the tuple) back to their original values
94116
/// (named as specified in the second item of the tuple).
@@ -127,6 +149,21 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.
127149
string outputColumnName, string inputColumnName = null, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
128150
=> new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, outputCountVector);
129151

152+
/// <summary>
153+
/// Maps columns of key types or key values into columns of floating point vectors.
154+
/// </summary>
155+
/// <param name="catalog">The conversion transform's catalog.</param>
156+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
157+
/// <param name="outputCountVector">Whether to combine multiple indicator vectors into a single vector of counts instead of concatenating them.
158+
/// This is only relevant when the input column is a vector of keys.</param>
159+
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog,
160+
InputOutputColumnPair[] columns, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
161+
{
162+
var columnOptions = columns.Select(x => new KeyToVectorMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputCountVector)).ToArray();
163+
return new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
164+
165+
}
166+
130167
/// <summary>
131168
/// Converts value types into <see cref="KeyType"/>.
132169
/// </summary>
@@ -157,6 +194,29 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co
157194
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog),
158195
new[] { new ValueToKeyMappingEstimator.ColumnOptions(outputColumnName, inputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText) }, keyData);
159196

197+
/// <summary>
198+
/// Converts value types into <see cref="KeyType"/>.
199+
/// </summary>
200+
/// <param name="catalog">The conversion transform's catalog.</param>
201+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
202+
/// <param name="maximumNumberOfKeys">Maximum number of keys to keep per column when auto-training.</param>
203+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> is chosen, they will be in the order encountered.
204+
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
205+
/// <param name="addKeyValueAnnotationsAsText">Whether key value annotations should be text, regardless of the actual input type.</param>
206+
/// <param name="keyData">The data view containing the terms. If specified, this should be a single column data
207+
/// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined
208+
/// from the input data upon fitting.</param>
209+
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
210+
InputOutputColumnPair[] columns,
211+
int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
212+
ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
213+
bool addKeyValueAnnotationsAsText = ValueToKeyMappingEstimator.Defaults.AddKeyValueAnnotationsAsText,
214+
IDataView keyData = null)
215+
{
216+
var columnOptions = columns.Select(x => new ValueToKeyMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText)).ToArray();
217+
return new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData);
218+
}
219+
160220
/// <summary>
161221
/// Converts value types into <see cref="KeyType"/>, optionally loading the keys to use from <paramref name="keyData"/>.
162222
/// </summary>

src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs

+26
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,32 @@
88

99
namespace Microsoft.ML
1010
{
11+
/// <summary>
12+
/// Specifies input and output column names for a transformation.
13+
/// </summary>
14+
public sealed class InputOutputColumnPair
15+
{
16+
/// <summary>
17+
/// Name of the column to transform. If set to <see langword="null"/>, the value of the <see cref="OutputColumnName"/> will be used as source.
18+
/// </summary>
19+
public readonly string InputColumnName;
20+
/// <summary>
21+
/// Name of the column resulting from the transformation of <see cref="InputColumnName"/>.
22+
/// </summary>
23+
public readonly string OutputColumnName;
24+
25+
/// <summary>
26+
/// Specifies input and output column names for a transformation.
27+
/// </summary>
28+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
29+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
30+
public InputOutputColumnPair(string outputColumnName, string inputColumnName = null)
31+
{
32+
InputColumnName = inputColumnName;
33+
OutputColumnName = outputColumnName;
34+
}
35+
}
36+
1137
/// <summary>
1238
/// Specifies input and output column names for a transformation.
1339
/// </summary>

src/Microsoft.ML.Transforms/CategoricalCatalog.cs

+48
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5+
using System.Linq;
56
using Microsoft.ML.Data;
67
using Microsoft.ML.Transforms;
78

@@ -40,6 +41,28 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate
4041
=> new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog),
4142
new[] { new OneHotEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality) }, keyData);
4243

44+
/// <summary>
45+
/// Convert text columns into one-hot encoded vectors.
46+
/// </summary>
47+
/// <param name="catalog">The transform catalog</param>
48+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
49+
/// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector.</param>
50+
/// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
51+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> is chosen, they will be in the order encountered.
52+
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
53+
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
54+
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
55+
public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
56+
InputOutputColumnPair[] columns,
57+
OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.Defaults.OutKind,
58+
int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
59+
ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
60+
IDataView keyData = null)
61+
{
62+
var columnOptions = columns.Select(x => new OneHotEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality)).ToArray();
63+
return new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData);
64+
}
65+
4366
/// <summary>
4467
/// Convert several text columns into one-hot encoded vectors.
4568
/// </summary>
@@ -88,6 +111,31 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata
88111
=> new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog),
89112
new[] { new OneHotHashEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts) });
90113

114+
/// <summary>
115+
/// Convert text columns into hash-based one-hot encoded vector columns.
116+
/// </summary>
117+
/// <param name="catalog">The transform catalog</param>
118+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
119+
/// <param name="outputKind">The conversion mode.</param>
120+
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
121+
/// <param name="seed">Hashing seed.</param>
122+
/// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
123+
/// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
124+
/// Text representation of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
125+
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
126+
/// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
127+
public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
128+
InputOutputColumnPair[] columns,
129+
OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator,
130+
int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits,
131+
uint seed = OneHotHashEncodingEstimator.Defaults.Seed,
132+
bool useOrderedHashing = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing,
133+
int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts)
134+
{
135+
var columnOptions = columns.Select(x => new OneHotHashEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts)).ToArray();
136+
return new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
137+
}
138+
91139
/// <summary>
92140
/// Convert several text columns into hash-based one-hot encoded vectors.
93141
/// </summary>

src/Microsoft.ML.Transforms/ExtensionsCatalog.cs

+29-11
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,14 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5+
using System.Linq;
56
using Microsoft.ML.Data;
67
using Microsoft.ML.Transforms;
78

89
namespace Microsoft.ML
910
{
1011
public static class ExtensionsCatalog
1112
{
12-
/// <summary>
13-
/// Creates a new output column, of boolean type, with the same number of slots as the input column. The value in the output column
14-
/// is true if the value in the input column is missing.
15-
/// </summary>
16-
/// <param name="catalog">The transform extensions' catalog.</param>
17-
/// <param name="columns">The names of the input columns of the transformation and the corresponding names for the output columns.</param>
18-
[BestFriend]
19-
internal static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog,
20-
params ColumnOptions[] columns)
21-
=> new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns));
22-
2313
/// <summary>
2414
/// Creates a new output column, or replaces the source with a new column
2515
/// (depending on whether the <paramref name="inputColumnName"/> is given a value, or left to null)
@@ -41,6 +31,15 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
4131
string inputColumnName = null)
4232
=> new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName);
4333

34+
/// <summary>
35+
/// Creates a new output column, of boolean type, with the same number of slots as the input column. The value in the output column
36+
/// is true if the value in the input column is missing.
37+
/// </summary>
38+
/// <param name="catalog">The transform extensions' catalog.</param>
39+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
40+
public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns)
41+
=> new MissingValueIndicatorEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray());
42+
4443
/// <summary>
4544
/// Creates a new output column, or replaces the source with a new column
4645
/// (depending on whether the <paramref name="outputColumnName"/> is given a value, or left to null)
@@ -69,6 +68,25 @@ public static MissingValueReplacingEstimator ReplaceMissingValues(this Transform
6968
bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot)
7069
=> new MissingValueReplacingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new MissingValueReplacingEstimator.ColumnOptions(outputColumnName, inputColumnName, replacementMode, imputeBySlot) });
7170

71+
/// <summary>
72+
/// Creates a new output column, identical to the input column for everything but the missing values.
73+
/// The missing values of the input column are replaced in this new column according to <paramref name="replacementMode"/>.
74+
/// </summary>
75+
/// <param name="catalog">The transform extensions' catalog.</param>
76+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
77+
/// <param name="replacementMode">The type of replacement to use as specified in <see cref="MissingValueReplacingEstimator.ReplacementMode"/></param>
78+
/// <param name="imputeBySlot">If true, per-slot imputation of replacement is performed.
79+
/// Otherwise, replacement value is imputed for the entire vector column. This setting is ignored for scalars and variable vectors,
80+
/// where imputation is always for the entire column.</param>
81+
public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog,
82+
InputOutputColumnPair[] columns,
83+
MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode,
84+
bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot)
85+
{
86+
var columnOptions = columns.Select(x => new MissingValueReplacingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, replacementMode, imputeBySlot)).ToArray();
87+
return new MissingValueReplacingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
88+
}
89+
7290
/// <summary>
7391
/// Creates a new output column, identical to the input column for everything but the missing values.
7492
/// The missing values of the input column, in this new column are replaced with <see cref="MissingValueReplacingEstimator.ReplacementMode.DefaultValue"/>.

0 commit comments

Comments
 (0)