Skip to content

Commit d08a2f9

Browse files
committed
multicolumn mapping for some estimators
1 parent 49403ab commit d08a2f9

File tree

7 files changed

+246
-32
lines changed

7 files changed

+246
-32
lines changed

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+74-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ internal static HashingEstimator Hash(this TransformsCatalog.ConversionTransform
4949
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog), columns);
5050

5151
/// <summary>
52-
/// Changes column type of the input column.
52+
/// Changes column type of the input columns.
5353
/// </summary>
5454
/// <param name="catalog">The conversion transform's catalog.</param>
5555
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
@@ -65,6 +65,20 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers
6565
DataKind outputKind = ConvertDefaults.DefaultOutputKind)
6666
=> new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new TypeConvertingEstimator.ColumnOptions(outputColumnName, outputKind, inputColumnName) });
6767

68+
/// <summary>
69+
/// Changes column type of the input columns.
70+
/// </summary>
71+
/// <param name="catalog">The conversion transform's catalog.</param>
72+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
73+
/// <param name="outputKind">The expected kind of the output column.</param>
74+
public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog,
75+
InputOutputColumnPair[] columns,
76+
DataKind outputKind = ConvertDefaults.DefaultOutputKind)
77+
{
78+
var columnOptions = columns.Select(x => new TypeConvertingEstimator.ColumnOptions(x.OutputColumnName, outputKind, x.InputColumnName)).ToArray();
79+
return new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
80+
}
81+
6882
/// <summary>
6983
/// Changes column type of the input column.
7084
/// </summary>
@@ -89,6 +103,20 @@ internal static TypeConvertingEstimator ConvertType(this TransformsCatalog.Conve
89103
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = null)
90104
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName);
91105

106+
/// <summary>
107+
/// Convert the key types back to their original values.
108+
/// </summary>
109+
/// <param name="catalog">The conversion transform's catalog.</param>
110+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
111+
/// <example>
112+
/// <format type="text/markdown">
113+
/// <![CDATA[
114+
/// [!code-csharp[KeyToValueMappingEstimator](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs)]
115+
/// ]]></format>
116+
/// </example>
117+
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns)
118+
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray());
119+
92120
/// <summary>
93121
/// Convert the key types (name of the column specified in the first item of the tuple) back to their original values
94122
/// (named as specified in the second item of the tuple).
@@ -127,6 +155,21 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.
127155
string outputColumnName, string inputColumnName = null, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
128156
=> new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, outputCountVector);
129157

158+
/// <summary>
159+
/// Maps columns of key types or key values into columns of floating point vectors.
160+
/// </summary>
161+
/// <param name="catalog">The conversion transform's catalog.</param>
162+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
163+
/// <param name="outputCountVector">Whether to combine multiple indicator vectors into a single vector of counts instead of concatenating them.
164+
/// This is only relevant when the input column is a vector of keys.</param>
165+
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog,
166+
InputOutputColumnPair[] columns, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
167+
{
168+
var columnOptions = columns.Select(x => new KeyToVectorMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputCountVector)).ToArray();
169+
return new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
170+
171+
}
172+
130173
/// <summary>
131174
/// Converts value types into <see cref="KeyType"/>.
132175
/// </summary>
@@ -157,6 +200,36 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co
157200
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog),
158201
new[] { new ValueToKeyMappingEstimator.ColumnOptions(outputColumnName, inputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText) }, keyData);
159202

203+
/// <summary>
204+
/// Converts value types into <see cref="KeyType"/>.
205+
/// </summary>
206+
/// <param name="catalog">The conversion transform's catalog.</param>
207+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
208+
/// <param name="maximumNumberOfKeys">Maximum number of keys to keep per column when auto-training.</param>
209+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> is chosen, they will be in the order encountered.
210+
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
211+
/// <param name="addKeyValueAnnotationsAsText">Whether key value annotations should be text, regardless of the actual input type.</param>
212+
/// <param name="keyData">The data view containing the terms. If specified, this should be a single column data
213+
/// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined
214+
/// from the input data upon fitting.</param>
215+
/// <example>
216+
/// <format type="text/markdown">
217+
/// <![CDATA[
218+
/// [!code-csharp[ValueToKey](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs)]
219+
/// ]]>
220+
/// </format>
221+
/// </example>
222+
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
223+
InputOutputColumnPair[] columns,
224+
int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
225+
ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
226+
bool addKeyValueAnnotationsAsText = ValueToKeyMappingEstimator.Defaults.AddKeyValueAnnotationsAsText,
227+
IDataView keyData = null)
228+
{
229+
var columnOptions = columns.Select(x => new ValueToKeyMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText)).ToArray();
230+
return new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData);
231+
}
232+
160233
/// <summary>
161234
/// Converts value types into <see cref="KeyType"/>, optionally loading the keys to use from <paramref name="keyData"/>.
162235
/// </summary>

src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs

+26
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,32 @@
88

99
namespace Microsoft.ML
1010
{
11+
/// <summary>
12+
/// Specifies input and output column names for a transformation.
13+
/// </summary>
14+
public sealed class InputOutputColumnPair
15+
{
16+
/// <summary>
17+
/// Name of the column to transform. If set to <see langword="null"/>, the value of the <see cref="OutputColumnName"/> will be used as source.
18+
/// </summary>
19+
public readonly string InputColumnName;
20+
/// <summary>
21+
/// Name of the column resulting from the transformation of <see cref="InputColumnName"/>.
22+
/// </summary>
23+
public readonly string OutputColumnName;
24+
25+
/// <summary>
26+
/// Specifies input and output column names for a transformation.
27+
/// </summary>
28+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
29+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
30+
public InputOutputColumnPair(string outputColumnName, string inputColumnName = null)
31+
{
32+
InputColumnName = inputColumnName;
33+
OutputColumnName = outputColumnName;
34+
}
35+
}
36+
1137
/// <summary>
1238
/// Specifies input and output column names for a transformation.
1339
/// </summary>

src/Microsoft.ML.Transforms/CategoricalCatalog.cs

+54
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5+
using System.Linq;
56
using Microsoft.ML.Data;
67
using Microsoft.ML.Transforms;
78

@@ -40,6 +41,34 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate
4041
=> new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog),
4142
new[] { new OneHotEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality) }, keyData);
4243

44+
/// <summary>
45+
/// Convert text columns into one-hot encoded vectors.
46+
/// </summary>
47+
/// <param name="catalog">The transform catalog.</param>
48+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
49+
/// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector.</param>
50+
/// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
51+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> is chosen, they will be in the order encountered.
52+
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
53+
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
54+
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
55+
/// <example>
56+
/// <format type="text/markdown">
57+
/// <![CDATA[
58+
/// [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
59+
/// ]]></format>
60+
/// </example>
61+
public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
62+
InputOutputColumnPair[] columns,
63+
OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.Defaults.OutKind,
64+
int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
65+
ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
66+
IDataView keyData = null)
67+
{
68+
var columnOptions = columns.Select(x => new OneHotEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality)).ToArray();
69+
return new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions, keyData);
70+
}
71+
4372
/// <summary>
4473
/// Convert several text columns into one-hot encoded vectors.
4574
/// </summary>
@@ -88,6 +117,31 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata
88117
=> new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog),
89118
new[] { new OneHotHashEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts) });
90119

120+
/// <summary>
121+
/// Convert text columns into hash-based one-hot encoded vector columns.
122+
/// </summary>
123+
/// <param name="catalog">The transform catalog.</param>
124+
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
125+
/// <param name="outputKind">The conversion mode.</param>
126+
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
127+
/// <param name="seed">Hashing seed.</param>
128+
/// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
129+
/// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
130+
/// Text representation of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
131+
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
132+
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
133+
public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
134+
InputOutputColumnPair[] columns,
135+
OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator,
136+
int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits,
137+
uint seed = OneHotHashEncodingEstimator.Defaults.Seed,
138+
bool useOrderedHashing = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing,
139+
int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts)
140+
{
141+
var columnOptions = columns.Select(x => new OneHotHashEncodingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts)).ToArray();
142+
return new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
143+
}
144+
91145
/// <summary>
92146
/// Convert several text columns into hash-based one-hot encoded vectors.
93147
/// </summary>

0 commit comments

Comments
 (0)