-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Multicolumn mapping for some estimators #3066
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
using System.Collections.Generic; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Runtime; | ||
using Microsoft.ML.Transforms; | ||
|
||
namespace Microsoft.ML | ||
|
@@ -65,6 +66,22 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers | |
DataKind outputKind = ConvertDefaults.DefaultOutputKind) | ||
=> new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new TypeConvertingEstimator.ColumnOptions(outputColumnName, outputKind, inputColumnName) }); | ||
|
||
/// <summary>
/// Changes the column type of each of the given input columns to <paramref name="outputKind"/>.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">The input and output column pairs on which to apply the transformation.</param>
/// <param name="outputKind">The expected kind of the output columns.</param>
public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog,
    InputOutputColumnPair[] columns,
    DataKind outputKind = ConvertDefaults.DefaultOutputKind)
{
    var host = CatalogUtils.GetEnvironment(catalog);
    host.CheckValue(columns, nameof(columns));

    // Build one ColumnOptions per pair; every pair shares the same output kind.
    var options = new TypeConvertingEstimator.ColumnOptions[columns.Length];
    for (int i = 0; i < columns.Length; i++)
    {
        var pair = columns[i];
        options[i] = new TypeConvertingEstimator.ColumnOptions(pair.OutputColumnName, outputKind, pair.InputColumnName);
    }

    return new TypeConvertingEstimator(host, options);
}
|
||
/// <summary> | ||
/// Changes column type of the input column. | ||
/// </summary> | ||
|
@@ -90,20 +107,16 @@ public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.Co | |
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName); | ||
|
||
/// <summary> | ||
/// Convert the key types (name of the column specified in the first item of the tuple) back to their original values | ||
/// (named as specified in the second item of the tuple). | ||
/// Convert the key types back to their original values. | ||
/// </summary> | ||
/// <param name="catalog">The conversion transform's catalog</param> | ||
/// <param name="columns">The pairs of input and output columns.</param> | ||
/// <example> | ||
/// <format type="text/markdown"> | ||
/// <] | ||
/// ]]></format> | ||
/// </example> | ||
[BestFriend] | ||
internal static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, params ColumnOptions[] columns) | ||
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns)); | ||
/// <param name="catalog">The conversion transform's catalog.</param> | ||
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param> | ||
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
also check for null #Resolved |
||
{ | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
env.CheckValue(columns, nameof(columns)); | ||
return new KeyToValueMappingEstimator(env, columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray()); | ||
} | ||
|
||
/// <summary> | ||
/// Maps key types or key values into a floating point vector. | ||
|
@@ -127,6 +140,23 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. | |
string outputColumnName, string inputColumnName = null, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector) | ||
=> new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, outputCountVector); | ||
|
||
/// <summary>
/// Maps each of the given columns of key types or key values into a column of floating point vectors.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">The input and output column pairs on which to apply the transformation.</param>
/// <param name="outputCountVector">Whether to combine multiple indicator vectors into a single vector of counts instead of concatenating them.
/// This is only relevant when the input column is a vector of keys.</param>
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog,
    InputOutputColumnPair[] columns, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
{
    var host = CatalogUtils.GetEnvironment(catalog);
    host.CheckValue(columns, nameof(columns));

    // Build one ColumnOptions per pair; the same outputCountVector setting applies to all of them.
    var options = new KeyToVectorMappingEstimator.ColumnOptions[columns.Length];
    for (int i = 0; i < columns.Length; i++)
    {
        var pair = columns[i];
        options[i] = new KeyToVectorMappingEstimator.ColumnOptions(pair.OutputColumnName, pair.InputColumnName, outputCountVector);
    }

    return new KeyToVectorMappingEstimator(host, options);
}
|
||
/// <summary> | ||
/// Converts value types into <see cref="KeyDataViewType"/>. | ||
/// </summary> | ||
|
@@ -157,6 +187,31 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co | |
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), | ||
new[] { new ValueToKeyMappingEstimator.ColumnOptions(outputColumnName, inputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText) }, keyData); | ||
|
||
/// <summary>
/// Converts the value types of each of the given columns into <see cref="KeyDataViewType"/>.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">The input and output column pairs on which to apply the transformation.</param>
/// <param name="maximumNumberOfKeys">Maximum number of keys to keep per column when auto-training.</param>
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> is chosen, they will be in the order encountered.
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
/// <param name="addKeyValueAnnotationsAsText">Whether key value annotations should be text, regardless of the actual input type.</param>
/// <param name="keyData">The data view containing the terms. If specified, this should be a single column data
/// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined
/// from the input data upon fitting.</param>
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
    InputOutputColumnPair[] columns,
    int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
    ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
    bool addKeyValueAnnotationsAsText = ValueToKeyMappingEstimator.Defaults.AddKeyValueAnnotationsAsText,
    IDataView keyData = null)
{
    var host = CatalogUtils.GetEnvironment(catalog);
    host.CheckValue(columns, nameof(columns));

    // Build one ColumnOptions per pair; the key-mapping settings are shared by all of them.
    var options = new ValueToKeyMappingEstimator.ColumnOptions[columns.Length];
    for (int i = 0; i < columns.Length; i++)
    {
        var pair = columns[i];
        options[i] = new ValueToKeyMappingEstimator.ColumnOptions(
            pair.OutputColumnName, pair.InputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText);
    }

    return new ValueToKeyMappingEstimator(host, options, keyData);
}
|
||
/// <summary> | ||
/// Converts value types into <see cref="KeyDataViewType"/>, optionally loading the keys to use from <paramref name="keyData"/>. | ||
/// </summary> | ||
|
@@ -232,11 +287,13 @@ public static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType | |
internal static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType, TOutputType>( | ||
this TransformsCatalog.ConversionTransforms catalog, | ||
IEnumerable<KeyValuePair<TInputType, TOutputType>> keyValuePairs, | ||
params ColumnOptions[] columns) | ||
params InputOutputColumnPair[] columns) | ||
{ | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
env.CheckValue(columns, nameof(columns)); | ||
var keys = keyValuePairs.Select(pair => pair.Key); | ||
var values = keyValuePairs.Select(pair => pair.Value); | ||
return new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values, ColumnOptions.ConvertToValueTuples(columns)); | ||
return new ValueMappingEstimator<TInputType, TOutputType>(env, keys, values, InputOutputColumnPair.ConvertToValueTuples(columns)); | ||
} | ||
|
||
/// <summary> | ||
|
@@ -260,12 +317,14 @@ internal static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputTy | |
this TransformsCatalog.ConversionTransforms catalog, | ||
IEnumerable<KeyValuePair<TInputType, TOutputType>> keyValuePairs, | ||
bool treatValuesAsKeyType, | ||
params ColumnOptions[] columns) | ||
params InputOutputColumnPair[] columns) | ||
{ | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
env.CheckValue(columns, nameof(columns)); | ||
var keys = keyValuePairs.Select(pair => pair.Key); | ||
var values = keyValuePairs.Select(pair => pair.Value); | ||
return new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values, treatValuesAsKeyType, | ||
ColumnOptions.ConvertToValueTuples(columns)); | ||
return new ValueMappingEstimator<TInputType, TOutputType>(env, keys, values, treatValuesAsKeyType, | ||
InputOutputColumnPair.ConvertToValueTuples(columns)); | ||
} | ||
|
||
/// <summary> | ||
|
@@ -321,12 +380,14 @@ public static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType | |
internal static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType, TOutputType>( | ||
this TransformsCatalog.ConversionTransforms catalog, | ||
IEnumerable<KeyValuePair<TInputType, TOutputType[]>> keyValuePairs, | ||
params ColumnOptions[] columns) | ||
params InputOutputColumnPair[] columns) | ||
{ | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
env.CheckValue(columns, nameof(columns)); | ||
var keys = keyValuePairs.Select(pair => pair.Key); | ||
var values = keyValuePairs.Select(pair => pair.Value); | ||
return new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values, | ||
ColumnOptions.ConvertToValueTuples(columns)); | ||
return new ValueMappingEstimator<TInputType, TOutputType>(env, keys, values, | ||
InputOutputColumnPair.ConvertToValueTuples(columns)); | ||
} | ||
|
||
/// <summary> | ||
|
@@ -377,8 +438,12 @@ public static ValueMappingEstimator MapValue( | |
[BestFriend] | ||
internal static ValueMappingEstimator MapValue( | ||
this TransformsCatalog.ConversionTransforms catalog, | ||
IDataView lookupMap, DataViewSchema.Column keyColumn, DataViewSchema.Column valueColumn, params ColumnOptions[] columns) | ||
=> new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), lookupMap, keyColumn.Name, valueColumn.Name, | ||
ColumnOptions.ConvertToValueTuples(columns)); | ||
IDataView lookupMap, DataViewSchema.Column keyColumn, DataViewSchema.Column valueColumn, params InputOutputColumnPair[] columns) | ||
{ | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
env.CheckValue(columns, nameof(columns)); | ||
return new ValueMappingEstimator(env, lookupMap, keyColumn.Name, valueColumn.Name, | ||
InputOutputColumnPair.ConvertToValueTuples(columns)); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,42 +4,41 @@ | |
|
||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Runtime; | ||
using Microsoft.ML.Transforms; | ||
|
||
namespace Microsoft.ML | ||
{ | ||
/// <summary> | ||
/// Specifies input and output column names for a transformation. | ||
/// </summary> | ||
[BestFriend] | ||
internal sealed class ColumnOptions | ||
public sealed class InputOutputColumnPair | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
what is the difference between this one and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not much, I would have used the one below if it were me, but Tom asked it to be different, for some reason! In reply to: 268334077 [](ancestors = 268334077) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, a very good reason. In reply to: 268334389 [](ancestors = 268334389,268334077) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just in case, I'm not talking about Can we delete In reply to: 268337035 [](ancestors = 268337035,268334389,268334077) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's fine. What Artidoro and I had discussed was actually somewhat different. (Or we were talking about two separate things without realizing it.) In reply to: 268339442 [](ancestors = 268339442,268337035,268334389,268334077) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess we were talking about something different, glad we are on the same page. In reply to: 268760195 [](ancestors = 268760195,268339442,268337035,268334389,268334077) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems good to me. The We should probably rename them back to In reply to: 268771510 [](ancestors = 268771510,268760195,268339442,268337035,268334389,268334077) |
||
{ | ||
private readonly string _outputColumnName; | ||
private readonly string _inputColumnName; | ||
/// <summary> | ||
/// Name of the column to transform. If set to <see langword="null"/>, the value of the <see cref="OutputColumnName"/> will be used as source. | ||
/// </summary> | ||
public string InputColumnName { get; } | ||
/// <summary> | ||
/// Name of the column resulting from the transformation of <see cref="InputColumnName"/>. | ||
/// </summary> | ||
public string OutputColumnName { get; } | ||
|
||
/// <summary> | ||
/// Specifies input and output column names for a transformation. | ||
/// </summary> | ||
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> | ||
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param> | ||
public ColumnOptions(string outputColumnName, string inputColumnName = null) | ||
public InputOutputColumnPair(string outputColumnName, string inputColumnName = null) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Add another overload for the case when input name = output name. That would make it a lot clearer vs setting one to null. #ByDesign There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately I believe that everywhere in the codebase we have the same pattern In reply to: 268726871 [](ancestors = 268726871) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Check non-empty on |
||
{ | ||
_outputColumnName = outputColumnName; | ||
_inputColumnName = inputColumnName ?? outputColumnName; | ||
} | ||
|
||
/// <summary> | ||
/// Instantiates a <see cref="ColumnOptions"/> from a tuple of input and output column names. | ||
/// </summary> | ||
public static implicit operator ColumnOptions((string outputColumnName, string inputColumnName) value) | ||
{ | ||
return new ColumnOptions(value.outputColumnName, value.inputColumnName); | ||
Contracts.CheckNonEmpty(outputColumnName, nameof(outputColumnName)); | ||
InputColumnName = inputColumnName ?? outputColumnName; | ||
OutputColumnName = outputColumnName; | ||
} | ||
|
||
[BestFriend] | ||
internal static (string outputColumnName, string inputColumnName)[] ConvertToValueTuples(ColumnOptions[] infos) | ||
internal static (string outputColumnName, string inputColumnName)[] ConvertToValueTuples(InputOutputColumnPair[] infos) | ||
{ | ||
return infos.Select(info => (info._outputColumnName, info._inputColumnName)).ToArray(); | ||
return infos.Select(info => (info.OutputColumnName, info.InputColumnName)).ToArray(); | ||
} | ||
} | ||
|
||
|
@@ -78,8 +77,12 @@ public static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, | |
/// </format> | ||
/// </example> | ||
[BestFriend] | ||
internal static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, params ColumnOptions[] columns) | ||
=> new ColumnCopyingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns)); | ||
internal static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, params InputOutputColumnPair[] columns) | ||
{ | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
env.CheckValue(columns, nameof(columns)); | ||
return new ColumnCopyingEstimator(env, InputOutputColumnPair.ConvertToValueTuples(columns)); | ||
} | ||
|
||
/// <summary> | ||
/// Concatenates columns together. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't you need to null-check the columns arg to avoid a null reference exception? #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added the checks, thanks for pointing that out. I only fixed the extensions that are public, not those that are internal.
In reply to: 268724517 [](ancestors = 268724517)