Skip to content

Multicolumn mapping for some estimators #3066

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/code/MlNetCookBook.md
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ var pipeline =
// Use the multi-class SDCA model to predict the label using features.
.Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated())
// Apply the inverse conversion from 'PredictedLabel' column back to string value.
.Append(mlContext.Transforms.Conversion.MapKeyToValue(("PredictedLabel", "Data")));
.Append(mlContext.Transforms.Conversion.MapKeyToValue("Data", "PredictedLabel"));

// Train the model.
var model = pipeline.Fit(trainData);
Expand Down
113 changes: 89 additions & 24 deletions src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;

namespace Microsoft.ML
Expand Down Expand Up @@ -65,6 +66,22 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers
DataKind outputKind = ConvertDefaults.DefaultOutputKind)
=> new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), new[] { new TypeConvertingEstimator.ColumnOptions(outputColumnName, outputKind, inputColumnName) });

/// <summary>
/// Changes column type of the input columns.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
/// <param name="outputKind">The expected kind of the output column.</param>
public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog,
InputOutputColumnPair[] columns,
Copy link
Contributor

@glebuk glebuk Mar 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

columns [](start = 36, length = 7)

Don't you need to check for null for the columns arg to avoid null reference exception? #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the checks thanks for pointing out. I only fixed the extensions that are public. Not those that are internal.


In reply to: 268724517 [](ancestors = 268724517)

DataKind outputKind = ConvertDefaults.DefaultOutputKind)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
var columnOptions = columns.Select(x => new TypeConvertingEstimator.ColumnOptions(x.OutputColumnName, outputKind, x.InputColumnName)).ToArray();
return new TypeConvertingEstimator(env, columnOptions);
}

/// <summary>
/// Changes column type of the input column.
/// </summary>
Expand All @@ -90,20 +107,16 @@ public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.Co
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName);

/// <summary>
/// Convert the key types (name of the column specified in the first item of the tuple) back to their original values
/// (named as specified in the second item of the tuple).
/// Convert the key types back to their original values.
/// </summary>
/// <param name="catalog">The conversion transform's catalog</param>
/// <param name="columns">The pairs of input and output columns.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[KeyToValueMappingEstimator](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs)]
/// ]]></format>
/// </example>
[BestFriend]
internal static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, params ColumnOptions[] columns)
=> new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns));
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, InputOutputColumnPair[] columns)
Copy link
Contributor

@glebuk glebuk Mar 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

columns [](start = 140, length = 7)

also check for null #Resolved

{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
return new KeyToValueMappingEstimator(env, columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray());
}

/// <summary>
/// Maps key types or key values into a floating point vector.
Expand All @@ -127,6 +140,23 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.
string outputColumnName, string inputColumnName = null, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
=> new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, outputCountVector);

/// <summary>
/// Maps columns of key types or key values into columns of floating point vectors.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
/// <param name="outputCountVector">Whether to combine multiple indicator vectors into a single vector of counts instead of concatenating them.
/// This is only relevant when the input column is a vector of keys.</param>
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog,
InputOutputColumnPair[] columns, bool outputCountVector = KeyToVectorMappingEstimator.Defaults.OutputCountVector)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
var columnOptions = columns.Select(x => new KeyToVectorMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, outputCountVector)).ToArray();
return new KeyToVectorMappingEstimator(env, columnOptions);

}

/// <summary>
/// Converts value types into <see cref="KeyDataViewType"/>.
/// </summary>
Expand Down Expand Up @@ -157,6 +187,31 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog),
new[] { new ValueToKeyMappingEstimator.ColumnOptions(outputColumnName, inputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText) }, keyData);

/// <summary>
/// Converts value types into <see cref="KeyDataViewType"/>.
/// </summary>
/// <param name="catalog">The conversion transform's catalog.</param>
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
/// <param name="maximumNumberOfKeys">Maximum number of keys to keep per column when auto-training.</param>
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> choosen they will be in the order encountered.
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
/// <param name="addKeyValueAnnotationsAsText">Whether key value annotations should be text, regardless of the actual input type.</param>
/// <param name="keyData">The data view containing the terms. If specified, this should be a single column data
/// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined
/// from the input data upon fitting.</param>
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
InputOutputColumnPair[] columns,
int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
bool addKeyValueAnnotationsAsText = ValueToKeyMappingEstimator.Defaults.AddKeyValueAnnotationsAsText,
IDataView keyData = null)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
var columnOptions = columns.Select(x => new ValueToKeyMappingEstimator.ColumnOptions(x.OutputColumnName, x.InputColumnName, maximumNumberOfKeys, keyOrdinality, addKeyValueAnnotationsAsText)).ToArray();
return new ValueToKeyMappingEstimator(env, columnOptions, keyData);
}

/// <summary>
/// Converts value types into <see cref="KeyDataViewType"/>, optionally loading the keys to use from <paramref name="keyData"/>.
/// </summary>
Expand Down Expand Up @@ -232,11 +287,13 @@ public static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType
internal static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType, TOutputType>(
this TransformsCatalog.ConversionTransforms catalog,
IEnumerable<KeyValuePair<TInputType, TOutputType>> keyValuePairs,
params ColumnOptions[] columns)
params InputOutputColumnPair[] columns)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
var keys = keyValuePairs.Select(pair => pair.Key);
var values = keyValuePairs.Select(pair => pair.Value);
return new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values, ColumnOptions.ConvertToValueTuples(columns));
return new ValueMappingEstimator<TInputType, TOutputType>(env, keys, values, InputOutputColumnPair.ConvertToValueTuples(columns));
}

/// <summary>
Expand All @@ -260,12 +317,14 @@ internal static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputTy
this TransformsCatalog.ConversionTransforms catalog,
IEnumerable<KeyValuePair<TInputType, TOutputType>> keyValuePairs,
bool treatValuesAsKeyType,
params ColumnOptions[] columns)
params InputOutputColumnPair[] columns)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
var keys = keyValuePairs.Select(pair => pair.Key);
var values = keyValuePairs.Select(pair => pair.Value);
return new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values, treatValuesAsKeyType,
ColumnOptions.ConvertToValueTuples(columns));
return new ValueMappingEstimator<TInputType, TOutputType>(env, keys, values, treatValuesAsKeyType,
InputOutputColumnPair.ConvertToValueTuples(columns));
}

/// <summary>
Expand Down Expand Up @@ -321,12 +380,14 @@ public static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType
internal static ValueMappingEstimator<TInputType, TOutputType> MapValue<TInputType, TOutputType>(
this TransformsCatalog.ConversionTransforms catalog,
IEnumerable<KeyValuePair<TInputType, TOutputType[]>> keyValuePairs,
params ColumnOptions[] columns)
params InputOutputColumnPair[] columns)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
var keys = keyValuePairs.Select(pair => pair.Key);
var values = keyValuePairs.Select(pair => pair.Value);
return new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values,
ColumnOptions.ConvertToValueTuples(columns));
return new ValueMappingEstimator<TInputType, TOutputType>(env, keys, values,
InputOutputColumnPair.ConvertToValueTuples(columns));
}

/// <summary>
Expand Down Expand Up @@ -377,8 +438,12 @@ public static ValueMappingEstimator MapValue(
[BestFriend]
internal static ValueMappingEstimator MapValue(
this TransformsCatalog.ConversionTransforms catalog,
IDataView lookupMap, DataViewSchema.Column keyColumn, DataViewSchema.Column valueColumn, params ColumnOptions[] columns)
=> new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), lookupMap, keyColumn.Name, valueColumn.Name,
ColumnOptions.ConvertToValueTuples(columns));
IDataView lookupMap, DataViewSchema.Column keyColumn, DataViewSchema.Column valueColumn, params InputOutputColumnPair[] columns)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
return new ValueMappingEstimator(env, lookupMap, keyColumn.Name, valueColumn.Name,
InputOutputColumnPair.ConvertToValueTuples(columns));
}
}
}
41 changes: 22 additions & 19 deletions src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,42 +4,41 @@

using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;

namespace Microsoft.ML
{
/// <summary>
/// Specifies input and output column names for a transformation.
/// </summary>
[BestFriend]
internal sealed class ColumnOptions
public sealed class InputOutputColumnPair
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Mar 22, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

InputOutputColumnPair [](start = 24, length = 21)

what is difference between this one and ColumnOptions? #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not much, I would have used the bellow if it were me, but Tom asked it to be different, for some reason!


In reply to: 268334077 [](ancestors = 268334077)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, a very good reason. ColumnOptions is a struct meant to serve a specific transformer base, and that is involved in the type heirarchy, and in particular something that captured all of the individual settings and state for each mapping. Our goals here were comparatively more modest: we just needed to . Well designed code does what it is designed to do, and in the simplest possible way. There is no need for this to be part of an elaborate type hierarchy -- this was in fact the mistake that led to the issue #2884 being filed, that we'd conflated two distinct techniques for the extremely bad reasoning that they both had to do with "column." (And, of course, if something deals with the same sort of object, obviously they belong in the same type heirarchy, right?)


In reply to: 268334389 [](ancestors = 268334389,268334077)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just in case, I'm not talking about ColumnOptions inside Estimator, (for example HashingEstimator.ColumnOptions.
I'm talking about ColumnOptions in this exact file few lines below (line 41)
Only difference I see is Input/output column names is public in this one instead of private in other, and name of class.
Ok, two difference, one below has implicit converter from tuple.

Can we delete ColumnOptions in this file?


In reply to: 268337035 [](ancestors = 268337035,268334389,268334077)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fine. What Artidoro and I had discussed was actually somewhat different. (Or we were talking about two separate things without realizing it.)


In reply to: 268339442 [](ancestors = 268339442,268337035,268334389,268334077)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we were talking about something different, glad we are on the same page.


In reply to: 268760195 [](ancestors = 268760195,268339442,268337035,268334389,268334077)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems good to me. The ColumnOptions I had been talking about (which is to say, the vast majority of things with that name) are as I said meant to serve a different purpose.

We should probably rename them back to ColumnInfo at some point but this can be delayed as they are internal... ummm except one. Whoops. Opened #3078. :) Aside from that, yeah.


In reply to: 268771510 [](ancestors = 268771510,268760195,268339442,268337035,268334389,268334077)

{
private readonly string _outputColumnName;
private readonly string _inputColumnName;
/// <summary>
/// Name of the column to transform. If set to <see langword="null"/>, the value of the <see cref="OutputColumnName"/> will be used as source.
/// </summary>
public string InputColumnName { get; }
/// <summary>
/// Name of the column resulting from the transformation of <see cref="InputColumnName"/>.
/// </summary>
public string OutputColumnName { get; }

/// <summary>
/// Specifies input and output column names for a transformation.
/// </summary>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
public ColumnOptions(string outputColumnName, string inputColumnName = null)
public InputOutputColumnPair(string outputColumnName, string inputColumnName = null)
Copy link
Contributor

@glebuk glebuk Mar 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

public InputOutputColumnPair(string outputColumnName, string inputColumnName = null) [](start = 8, length = 84)

Add another overload for the case when input name = output name. That would make it a lot clearer vs setting one to null. #ByDesign

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately I believe that everywhere in the codebase we have the same pattern string outputColumnName, string inputColumnName = null. This is found in all the mlContext extensions for transforms. I can add it here, but I think it would make more sense to stick to the general pattern.


In reply to: 268726871 [](ancestors = 268726871)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We must be consistent. Sorry @glebuk!


In reply to: 268779384 [](ancestors = 268779384,268726871)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok then.


In reply to: 268819893 [](ancestors = 268819893,268779384,268726871)

Copy link
Contributor

@TomFinley TomFinley Mar 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

outputColumnName [](start = 44, length = 16)

Check non-empty on outputColumName probably. #Resolved

{
_outputColumnName = outputColumnName;
_inputColumnName = inputColumnName ?? outputColumnName;
}

/// <summary>
/// Instantiates a <see cref="ColumnOptions"/> from a tuple of input and output column names.
/// </summary>
public static implicit operator ColumnOptions((string outputColumnName, string inputColumnName) value)
{
return new ColumnOptions(value.outputColumnName, value.inputColumnName);
Contracts.CheckNonEmpty(outputColumnName, nameof(outputColumnName));
InputColumnName = inputColumnName ?? outputColumnName;
OutputColumnName = outputColumnName;
}

[BestFriend]
internal static (string outputColumnName, string inputColumnName)[] ConvertToValueTuples(ColumnOptions[] infos)
internal static (string outputColumnName, string inputColumnName)[] ConvertToValueTuples(InputOutputColumnPair[] infos)
{
return infos.Select(info => (info._outputColumnName, info._inputColumnName)).ToArray();
return infos.Select(info => (info.OutputColumnName, info.InputColumnName)).ToArray();
}
}

Expand Down Expand Up @@ -78,8 +77,12 @@ public static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog,
/// </format>
/// </example>
[BestFriend]
internal static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, params ColumnOptions[] columns)
=> new ColumnCopyingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns));
internal static ColumnCopyingEstimator CopyColumns(this TransformsCatalog catalog, params InputOutputColumnPair[] columns)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
return new ColumnCopyingEstimator(env, InputOutputColumnPair.ConvertToValueTuples(columns));
}

/// <summary>
/// Concatenates columns together.
Expand Down
17 changes: 13 additions & 4 deletions src/Microsoft.ML.ImageAnalytics/ExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Data;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms.Image;

namespace Microsoft.ML
Expand Down Expand Up @@ -32,8 +33,12 @@ public static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalo
/// ]]></format>
/// </example>
[BestFriend]
internal static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalog catalog, params ColumnOptions[] columns)
=> new ImageGrayscalingEstimator(CatalogUtils.GetEnvironment(catalog), ColumnOptions.ConvertToValueTuples(columns));
internal static ImageGrayscalingEstimator ConvertToGrayscale(this TransformsCatalog catalog, params InputOutputColumnPair[] columns)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
return new ImageGrayscalingEstimator(env, InputOutputColumnPair.ConvertToValueTuples(columns));
}

/// <summary>
/// Loads the images from the <see cref="ImageLoadingTransformer.ImageFolder" /> into memory.
Expand Down Expand Up @@ -80,8 +85,12 @@ public static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, s
/// ]]></format>
/// </example>
[BestFriend]
internal static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, string imageFolder, params ColumnOptions[] columns)
=> new ImageLoadingEstimator(CatalogUtils.GetEnvironment(catalog), imageFolder, ColumnOptions.ConvertToValueTuples(columns));
internal static ImageLoadingEstimator LoadImages(this TransformsCatalog catalog, string imageFolder, params InputOutputColumnPair[] columns)
{
var env = CatalogUtils.GetEnvironment(catalog);
env.CheckValue(columns, nameof(columns));
return new ImageLoadingEstimator(env, imageFolder, InputOutputColumnPair.ConvertToValueTuples(columns));
}

/// <include file='doc.xml' path='doc/members/member[@name="ImagePixelExtractingEstimator"]/*' />
/// <param name="catalog">The transform's catalog.</param>
Expand Down
Loading