Skip to content

Commit 7ef45f1

Browse files
authored
ValueMapperTransformer: Added support for loading map from file through dataview. (#2232)
1 parent 4627647 commit 7ef45f1

File tree

2 files changed

+103
-88
lines changed

2 files changed

+103
-88
lines changed

src/Microsoft.ML.Data/Transforms/ValueMappingTransformer.cs renamed to src/Microsoft.ML.Data/Transforms/ValueMapping.cs

Lines changed: 63 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,68 @@
3333

3434
namespace Microsoft.ML.Transforms.Conversions
3535
{
36+
/// <summary>
37+
/// The ValueMappingEstimator is a 1-1 mapping from a key to value. This particular class loads the mapping from an <see cref="IDataView"/>.
38+
/// This gives the user the flexibility to load the mapping from a file instead of supplying IEnumerable collections as in <see cref="ValueMappingEstimator{TKey, TValue}"/>.
39+
/// </summary>
40+
public class ValueMappingEstimator : TrivialEstimator<ValueMappingTransformer>
41+
{
42+
private readonly (string input, string output)[] _columns;
43+
44+
/// <summary>
45+
/// Constructs the ValueMappingEstimator from a lookup map that supplies the key column and the value column of the mapping.
46+
/// </summary>
47+
/// <param name="env">The environment to use.</param>
48+
/// <param name="lookupMap">An instance of <see cref="IDataView"/> that contains the key and value columns.</param>
49+
/// <param name="keyColumn">Name of the key column in <paramref name="lookupMap"/>.</param>
50+
/// <param name="valueColumn">Name of the value column in <paramref name="lookupMap"/>.</param>
51+
/// <param name="columns">The pairs of column names: the input column to apply the transformation to, and the name of the resulting output column.</param>
52+
public ValueMappingEstimator(IHostEnvironment env, IDataView lookupMap, string keyColumn, string valueColumn, params (string input, string output)[] columns)
53+
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingEstimator)),
54+
new ValueMappingTransformer(env, lookupMap, keyColumn, valueColumn, columns))
55+
{
56+
_columns = columns;
57+
}
58+
59+
/// <summary>
60+
/// Retrieves the output schema given the input schema
61+
/// </summary>
62+
/// <param name="inputSchema">Input schema</param>
63+
/// <returns>Returns the generated output schema</returns>
64+
public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
65+
{
66+
Host.CheckValue(inputSchema, nameof(inputSchema));
67+
68+
var resultDic = inputSchema.ToDictionary(x => x.Name);
69+
var vectorKind = Transformer.ValueColumnType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar;
70+
var isKey = Transformer.ValueColumnType is KeyType;
71+
var columnType = (isKey) ? NumberType.U4 :
72+
Transformer.ValueColumnType;
73+
var metadataShape = SchemaShape.Create(Transformer.ValueColumnMetadata.Schema);
74+
foreach (var (Input, Output) in _columns)
75+
{
76+
if (!inputSchema.TryFindColumn(Input, out var originalColumn))
77+
throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", Input);
78+
79+
if ((originalColumn.Kind == SchemaShape.Column.VectorKind.VariableVector ||
80+
originalColumn.Kind == SchemaShape.Column.VectorKind.Vector) && Transformer.ValueColumnType is VectorType)
81+
throw Host.ExceptNotSupp("Column '{0}' cannot be mapped to values when the column and the map values are both vector type.", Input);
82+
// Create the Value column
83+
var col = new SchemaShape.Column(Output, vectorKind, columnType, isKey, metadataShape);
84+
resultDic[Output] = col;
85+
}
86+
return new SchemaShape(resultDic.Values);
87+
}
88+
}
89+
3690
/// <summary>
3791
/// The ValueMappingEstimator is a 1-1 mapping from a key to value. The key type and value type are specified
3892
/// through TKey and TValue. TKey is always a scalar. TValue can be either a scalar or an array (array is only possible when input is scalar).
3993
/// The mapping is specified, not trained, by providing a list of keys and a list of values.
4094
/// </summary>
4195
/// <typeparam name="TKey">Specifies the key type.</typeparam>
4296
/// <typeparam name="TValue">Specifies the value type.</typeparam>
43-
public sealed class ValueMappingEstimator<TKey, TValue> : TrivialEstimator<ValueMappingTransformer<TKey, TValue>>
97+
public sealed class ValueMappingEstimator<TKey, TValue> : ValueMappingEstimator
4498
{
4599
private (string input, string output)[] _columns;
46100

@@ -52,8 +106,7 @@ public sealed class ValueMappingEstimator<TKey, TValue> : TrivialEstimator<Value
52106
/// <param name="values">The list of values of TValue.</param>
53107
/// <param name="columns">The list of columns to apply.</param>
54108
public ValueMappingEstimator(IHostEnvironment env, IEnumerable<TKey> keys, IEnumerable<TValue> values, params (string input, string output)[] columns)
55-
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingEstimator<TKey, TValue>)),
56-
new ValueMappingTransformer<TKey, TValue>(env, keys, values, false, columns))
109+
: base(env, DataViewHelper.CreateDataView(env, keys, values, ValueMappingTransformer.KeyColumnName, ValueMappingTransformer.ValueColumnName, false), ValueMappingTransformer.KeyColumnName, ValueMappingTransformer.ValueColumnName, columns)
57110
{
58111
_columns = columns;
59112
}
@@ -67,8 +120,7 @@ public ValueMappingEstimator(IHostEnvironment env, IEnumerable<TKey> keys, IEnum
67120
/// <param name="treatValuesAsKeyType">Specifies to treat the values as a <see cref="KeyType"/>.</param>
68121
/// <param name="columns">The list of columns to apply.</param>
69122
public ValueMappingEstimator(IHostEnvironment env, IEnumerable<TKey> keys, IEnumerable<TValue> values, bool treatValuesAsKeyType, params (string input, string output)[] columns)
70-
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingEstimator<TKey, TValue>)),
71-
new ValueMappingTransformer<TKey, TValue>(env, keys, values, treatValuesAsKeyType, columns))
123+
: base(env, DataViewHelper.CreateDataView(env, keys, values, ValueMappingTransformer.KeyColumnName, ValueMappingTransformer.ValueColumnName, treatValuesAsKeyType), ValueMappingTransformer.KeyColumnName, ValueMappingTransformer.ValueColumnName, columns)
72124
{
73125
_columns = columns;
74126
}
@@ -81,41 +133,10 @@ public ValueMappingEstimator(IHostEnvironment env, IEnumerable<TKey> keys, IEnum
81133
/// <param name="values">The list of values of TValue[].</param>
82134
/// <param name="columns">The list of columns to apply.</param>
83135
public ValueMappingEstimator(IHostEnvironment env, IEnumerable<TKey> keys, IEnumerable<TValue[]> values, params (string input, string output)[] columns)
84-
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingEstimator<TKey, TValue>)),
85-
new ValueMappingTransformer<TKey, TValue>(env, keys, values, columns))
136+
: base(env, DataViewHelper.CreateDataView(env, keys, values, ValueMappingTransformer.KeyColumnName, ValueMappingTransformer.ValueColumnName), ValueMappingTransformer.KeyColumnName, ValueMappingTransformer.ValueColumnName, columns)
86137
{
87138
_columns = columns;
88139
}
89-
90-
/// <summary>
91-
/// Retrieves the output schema given the input schema
92-
/// </summary>
93-
/// <param name="inputSchema">Input schema</param>
94-
/// <returns>Returns the generated output schema</returns>
95-
public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
96-
{
97-
Host.CheckValue(inputSchema, nameof(inputSchema));
98-
99-
var resultDic = inputSchema.ToDictionary(x => x.Name);
100-
var vectorKind = Transformer.ValueColumnType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar;
101-
var isKey = Transformer.ValueColumnType is KeyType;
102-
var columnType = (isKey) ? ColumnTypeExtensions.PrimitiveTypeFromKind(DataKind.U4) :
103-
Transformer.ValueColumnType;
104-
var metadataShape = SchemaShape.Create(Transformer.ValueColumnMetadata.Schema);
105-
foreach (var (Input, Output) in _columns)
106-
{
107-
if (!inputSchema.TryFindColumn(Input, out var originalColumn))
108-
throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", Input);
109-
110-
if ((originalColumn.Kind == SchemaShape.Column.VectorKind.VariableVector ||
111-
originalColumn.Kind == SchemaShape.Column.VectorKind.Vector) && Transformer.ValueColumnType is VectorType)
112-
throw Host.ExceptNotSupp("Column '{0}' cannot be mapped to values when the column and the map values are both vector type.", Input);
113-
// Create the Value column
114-
var col = new SchemaShape.Column(Output, vectorKind, columnType, isKey, metadataShape);
115-
resultDic[Output] = col;
116-
}
117-
return new SchemaShape(resultDic.Values);
118-
}
119140
}
120141

121142
/// <summary>
@@ -281,53 +302,6 @@ internal static IDataView CreateDataView<TKey, TValue>(IHostEnvironment env,
281302
}
282303
}
283304

284-
/// <summary>
285-
/// The ValueMappingTransformer is a 1-1 mapping from a key to value. The key type and value type are specified
286-
/// through TKey and TValue. Arrays are supported for vector types which can be used as either a key or a value
287-
/// or both. The mapping is specified, not trained by providiing a list of keys and a list of values.
288-
/// </summary>
289-
/// <typeparam name="TKey">Specifies the key type</typeparam>
290-
/// <typeparam name="TValue">Specifies the value type</typeparam>
291-
public sealed class ValueMappingTransformer<TKey, TValue> : ValueMappingTransformer
292-
{
293-
/// <summary>
294-
/// Constructs a ValueMappingTransformer with a key type to value type.
295-
/// </summary>
296-
/// <param name="env">The environment to use.</param>
297-
/// <param name="keys">The list of keys that are TKey.</param>
298-
/// <param name="values">The list of values that are TValue.</param>
299-
/// <param name="treatValuesAsKeyTypes">Specifies to treat the values as a <see cref="KeyType"/>.</param>
300-
/// <param name="columns">The specified columns to apply</param>
301-
public ValueMappingTransformer(IHostEnvironment env, IEnumerable<TKey> keys, IEnumerable<TValue> values, bool treatValuesAsKeyTypes, (string input, string output)[] columns)
302-
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingTransformer<TKey, TValue>)),
303-
ConvertToDataView(env, keys, values, treatValuesAsKeyTypes), KeyColumnName, ValueColumnName, columns)
304-
{ }
305-
306-
/// <summary>
307-
/// Constructs a ValueMappingTransformer with a key type to value array type.
308-
/// </summary>
309-
/// <param name="env">The environment to use.</param>
310-
/// <param name="keys">The list of keys that are TKey.</param>
311-
/// <param name="values">The list of values that are TValue[].</param>
312-
/// <param name="columns">The specified columns to apply.</param>
313-
public ValueMappingTransformer(IHostEnvironment env, IEnumerable<TKey> keys, IEnumerable<TValue[]> values, (string input, string output)[] columns)
314-
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingTransformer<TKey, TValue>)),
315-
ConvertToDataView(env, keys, values), KeyColumnName, ValueColumnName, columns)
316-
{ }
317-
318-
private static IDataView ConvertToDataView(IHostEnvironment env, IEnumerable<TKey> keys, IEnumerable<TValue> values, bool treatValuesAsKeyValue)
319-
=> DataViewHelper.CreateDataView(env,
320-
keys,
321-
values,
322-
ValueMappingTransformer.KeyColumnName,
323-
ValueMappingTransformer.ValueColumnName,
324-
treatValuesAsKeyValue);
325-
326-
// Handler for vector value types
327-
private static IDataView ConvertToDataView(IHostEnvironment env, IEnumerable<TKey> keys, IEnumerable<TValue[]> values)
328-
=> DataViewHelper.CreateDataView(env, keys, values, ValueMappingTransformer.KeyColumnName, ValueMappingTransformer.ValueColumnName);
329-
}
330-
331305
public class ValueMappingTransformer : OneToOneTransformerBase
332306
{
333307
internal const string Summary = "Maps text values columns to new columns using a map dataset.";
@@ -339,8 +313,8 @@ public class ValueMappingTransformer : OneToOneTransformerBase
339313

340314
// Stream names for the binary idv streams.
341315
private const string DefaultMapName = "DefaultMap.idv";
342-
protected static string KeyColumnName = "Key";
343-
protected static string ValueColumnName = "Value";
316+
internal static string KeyColumnName = "Key";
317+
internal static string ValueColumnName = "Value";
344318
private ValueMap _valueMap;
345319
private Schema.Metadata _valueMetadata;
346320
private byte[] _dataView;
@@ -411,7 +385,7 @@ public sealed class Arguments
411385
public bool ValuesAsKeyType = true;
412386
}
413387

414-
protected ValueMappingTransformer(IHostEnvironment env, IDataView lookupMap,
388+
internal ValueMappingTransformer(IHostEnvironment env, IDataView lookupMap,
415389
string keyColumn, string valueColumn, (string input, string output)[] columns)
416390
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingTransformer)), columns)
417391
{
@@ -569,7 +543,8 @@ private static ValueMappingTransformer CreateTransformInvoke<TKey, TValue>(IHost
569543
}
570544
}
571545

572-
return new ValueMappingTransformer<TKey, TValue>(env, keys, values, treatValuesAsKeyTypes, columns);
546+
var lookupMap = DataViewHelper.CreateDataView(env, keys, values, keyColumnName, valueColumnName, treatValuesAsKeyTypes);
547+
return new ValueMappingTransformer(env, lookupMap, keyColumnName, valueColumnName, columns);
573548
}
574549

575550
private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)

test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,46 @@ public void ValueMapVectorValueTest()
177177
Assert.Equal(values[0].Length, fValue.Length);
178178
}
179179

180+
class Map
181+
{
182+
public string Key;
183+
public int Value;
184+
}
185+
186+
[Fact]
187+
public void ValueMapDataViewAsMapTest()
188+
{
189+
var data = new[] { new TestClass() { A = "bar", B = "test", C = "foo" } };
190+
var dataView = ML.Data.ReadFromEnumerable(data);
191+
192+
var map = new[] { new Map() { Key = "foo", Value = 1 },
193+
new Map() { Key = "bar", Value = 2 },
194+
new Map() { Key = "test", Value = 3 },
195+
new Map() { Key = "wahoo", Value = 4 }
196+
};
197+
var mapView = ML.Data.ReadFromEnumerable(map);
198+
199+
var estimator = new ValueMappingEstimator(Env, mapView, "Key", "Value", new[] { ("A", "D"), ("B", "E"), ("C", "F") });
200+
var t = estimator.Fit(dataView);
201+
202+
var result = t.Transform(dataView);
203+
var cursor = result.GetRowCursorForAllColumns();
204+
var getterD = cursor.GetGetter<int>(result.Schema["D"].Index);
205+
var getterE = cursor.GetGetter<int>(result.Schema["E"].Index);
206+
var getterF = cursor.GetGetter<int>(result.Schema["F"].Index);
207+
cursor.MoveNext();
208+
209+
int dValue = 0;
210+
getterD(ref dValue);
211+
Assert.Equal(2, dValue);
212+
int eValue = 0;
213+
getterE(ref eValue);
214+
Assert.Equal(3, eValue);
215+
int fValue = 0;
216+
getterF(ref fValue);
217+
Assert.Equal(1, fValue);
218+
}
219+
180220
[Fact]
181221
public void ValueMapVectorStringValueTest()
182222
{

0 commit comments

Comments
 (0)