Skip to content

Commit 06ab3d0

Browse files
singlissfilipi
authored and committed
Addition of the ValueMappingEstimator and ValueMappingTransform. (#1710)
* Addition of the ValueMappingEstimator and ValueMappingTransform. This will replace the TermLookupTransform and provide a way to specify the mapping between two values (note that the mapping is specified, not trained). A user can specify the mapping by providing a keys list and a values list that must be equal in size. The Estimator will then generate a 1-1 mapping based on the two lists. The PR references #754, which covers the conversion of Transformer to use the new Estimator API.
1 parent c4236e3 commit 06ab3d0

14 files changed

+1520
-754
lines changed

src/Microsoft.ML.Data/DataView/ArrayDataViewBuilder.cs

+8-2
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,18 @@ public void AddColumn<T>(string name, PrimitiveType type, params T[] values)
7575
/// Constructs a new key column from an array where values are copied to output simply
7676
/// by being assigned.
7777
/// </summary>
78-
public void AddColumn(string name, ValueGetter<VBuffer<ReadOnlyMemory<char>>> getKeyValues, ulong keyMin, int keyCount, params uint[] values)
78+
/// <param name="name">The name of the column.</param>
79+
/// <param name="getKeyValues">The delegate that does a reverse lookup based upon the given key. This is used for metadata creation.</param>
80+
/// <param name="keyMin">The <see cref="KeyType"/> minimum to use.</param>
81+
/// <param name="keyCount">The count of unique keys specified in values</param>
82+
/// <param name="values">The values to add to the column. Note that since this is creating a <see cref="KeyType"/> column, the values will be offset by 1.</param>
83+
public void AddColumn<T1>(string name, ValueGetter<VBuffer<ReadOnlyMemory<char>>> getKeyValues, ulong keyMin, int keyCount, params T1[] values)
7984
{
8085
_host.CheckValue(getKeyValues, nameof(getKeyValues));
8186
_host.CheckParam(keyCount > 0, nameof(keyCount));
8287
CheckLength(name, values);
83-
_columns.Add(new AssignmentColumn<uint>(new KeyType(DataKind.U4, keyMin, keyCount), values));
88+
values.GetType().GetElementType().TryGetDataKind(out DataKind kind);
89+
_columns.Add(new AssignmentColumn<T1>(new KeyType(kind, keyMin, keyCount), values));
8490
_getKeyValues.Add(name, getKeyValues);
8591
_names.Add(name);
8692
}

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+20
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44

55
using Microsoft.ML.Runtime;
66
using Microsoft.ML.Runtime.Data;
7+
using Microsoft.ML.Transforms;
78
using Microsoft.ML.Transforms.Conversions;
9+
using System.Collections.Generic;
810

911
namespace Microsoft.ML
1012
{
@@ -125,5 +127,23 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co
125127
string termsColumn = null,
126128
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
127129
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, file, termsColumn, loaderFactory);
130+
131+
/// <summary>
132+
/// Maps specified keys to specified values
133+
/// </summary>
134+
/// <typeparam name="TInputType">The key type.</typeparam>
135+
/// <typeparam name="TOutputType">The value type.</typeparam>
136+
/// <param name="catalog">The conversion transform's catalog.</param>
137+
/// <param name="keys">The list of keys to use for the mapping. The mapping is 1-1 with values. This list must be the same length as values and
138+
/// cannot contain duplicate keys.</param>
139+
/// <param name="values">The list of values to pair with the keys for the mapping. This list must be the same length as keys.</param>
140+
/// <param name="columns">The columns to apply this transform on.</param>
141+
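/// <returns>A <see cref="ValueMappingEstimator{TInputType, TOutputType}"/> that maps the specified keys to the specified values.</returns>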
142+
public static ValueMappingEstimator<TInputType, TOutputType> ValueMap<TInputType, TOutputType>(
143+
this TransformsCatalog.ConversionTransforms catalog,
144+
IEnumerable<TInputType> keys,
145+
IEnumerable<TOutputType> values,
146+
params (string source, string name)[] columns)
147+
=> new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values, columns);
128148
}
129149
}

src/Microsoft.ML.Data/Transforms/ValueMappingTransformer.cs

+975
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)