Skip to content

Restore OVA ability to preserve key names on predicted label #3101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 1, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/Microsoft.ML.Core/Data/AnnotationUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -441,8 +441,15 @@ public static bool TryGetCategoricalFeatureIndices(DataViewSchema schema, int co
public static IEnumerable<SchemaShape.Column> AnnotationsForMulticlassScoreColumn(SchemaShape.Column? labelColumn = null)
{
var cols = new List<SchemaShape.Column>();
if (labelColumn != null && labelColumn.Value.IsKey && NeedsSlotNames(labelColumn.Value))
cols.Add(new SchemaShape.Column(Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextDataViewType.Instance, false));
if (labelColumn != null && labelColumn.Value.IsKey)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would probably be good to wrap this in curly braces, now that it is more than one line (and that you have this blank line in between the if and the body).

if (labelColumn.Value.Annotations.TryFindColumn(Kinds.KeyValues, out var metaCol) &&
metaCol.Kind == SchemaShape.Column.VectorKind.Vector)
{
if (metaCol.ItemType is TextDataViewType)
cols.Add(new SchemaShape.Column(Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextDataViewType.Instance, false));
cols.Add(new SchemaShape.Column(Kinds.TrainingLabelValues, SchemaShape.Column.VectorKind.Vector, metaCol.ItemType, false));
}
cols.AddRange(GetTrainerOutputAnnotation());
return cols;
}
Expand Down
36 changes: 27 additions & 9 deletions src/Microsoft.ML.Data/Scorers/MulticlassClassificationScorer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ private static ISchemaBoundMapper WrapIfNeeded(IHostEnvironment env, ISchemaBoun
if (trainSchema?.Label == null)
return mapper; // We don't even have a label identified in a training schema.
var keyType = trainSchema.Label.Value.Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.KeyValues)?.Type as VectorDataViewType;
if (keyType == null || !CanWrap(mapper, keyType))
if (keyType == null)
return mapper;

// Great!! All checks pass.
Expand All @@ -409,11 +409,19 @@ private static ISchemaBoundMapper WrapIfNeeded(IHostEnvironment env, ISchemaBoun
/// from the model of a bindable mapper)</param>
/// <returns>Whether we can call <see cref="LabelNameBindableMapper.CreateBound{T}"/> with
/// this mapper and expect it to succeed</returns>
internal static bool CanWrap(ISchemaBoundMapper mapper, DataViewType labelNameType)
internal static bool CanWrapTrainingLabels(ISchemaBoundMapper mapper, DataViewType labelNameType)
{
    // Bail out when the shared pre-checks reject this mapper; otherwise they hand back the score column type.
    if (!GetTypesForWrapping(mapper, labelNameType, AnnotationUtils.Kinds.TrainingLabelValues, out var scoreType))
    {
        return false;
    }

    // The label values must be published as a vector...
    if (!(labelNameType is VectorDataViewType labelVector))
    {
        return false;
    }

    // ...and that vector must have the same size as the score output.
    return labelVector.Size == scoreType.GetVectorSize();
}

internal static bool GetTypesForWrapping(ISchemaBoundMapper mapper, DataViewType labelNameType, string metaKind, out DataViewType scoreType)
{
Contracts.AssertValue(mapper);
Contracts.AssertValue(labelNameType);

scoreType = null;
ISchemaBoundRowMapper rowMapper = mapper as ISchemaBoundRowMapper;
if (rowMapper == null)
return false; // We could cover this case, but it is of no practical worth as far as I see, so I decline to do so.
Expand All @@ -423,12 +431,18 @@ internal static bool CanWrap(ISchemaBoundMapper mapper, DataViewType labelNameTy
var scoreCol = outSchema.GetColumnOrNull(AnnotationUtils.Const.ScoreValueKind.Score);
if (!outSchema.TryGetColumnIndex(AnnotationUtils.Const.ScoreValueKind.Score, out scoreIdx))
return false; // The mapper doesn't even publish a score column to attach the metadata to.
if (outSchema[scoreIdx].Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.SlotNames)?.Type != null)
if (outSchema[scoreIdx].Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.TrainingLabelValues)?.Type != null)
return false; // The mapper publishes a score column, and already produces its own slot names.
var scoreType = outSchema[scoreIdx].Type;
scoreType = outSchema[scoreIdx].Type;
return true;
}

// Check that the type is vector, and is of compatible size with the score output.
return labelNameType is VectorDataViewType vectorType && vectorType.Size == scoreType.GetVectorSize() && vectorType.ItemType == TextDataViewType.Instance;
/// <summary>
/// Checks whether slot names can be attached to the score column of <paramref name="mapper"/>.
/// </summary>
/// <param name="mapper">The mapper whose score column would receive the slot names.</param>
/// <param name="labelNameType">The type of the label names that would be attached.</param>
/// <returns><c>true</c> when the pre-checks in <see cref="GetTypesForWrapping"/> pass and
/// <paramref name="labelNameType"/> is a text vector of the same size as the score output.</returns>
internal static bool CanWrapSlotNames(ISchemaBoundMapper mapper, DataViewType labelNameType)
{
    // Bail out when the shared pre-checks reject this mapper; otherwise they hand back the score column type.
    if (!GetTypesForWrapping(mapper, labelNameType, AnnotationUtils.Kinds.SlotNames, out var scoreType))
    {
        return false;
    }

    // Slot names must be published as a vector...
    if (!(labelNameType is VectorDataViewType labelVector))
    {
        return false;
    }

    // ...whose size matches the score output...
    if (labelVector.Size != scoreType.GetVectorSize())
    {
        return false;
    }

    // ...and whose items are text.
    return labelVector.ItemType == TextDataViewType.Instance;
}

internal static ISchemaBoundMapper WrapCore<T>(IHostEnvironment env, ISchemaBoundMapper mapper, RoleMappedSchema trainSchema)
Expand All @@ -449,8 +463,12 @@ internal static ISchemaBoundMapper WrapCore<T>(IHostEnvironment env, ISchemaBoun
{
trainSchema.Label.Value.GetKeyValues(ref value);
};

return LabelNameBindableMapper.CreateBound<T>(env, (ISchemaBoundRowMapper)mapper, type as VectorDataViewType, getter, AnnotationUtils.Kinds.SlotNames, CanWrap);
var resultMapper = mapper;
if (CanWrapTrainingLabels(resultMapper, type))
resultMapper = LabelNameBindableMapper.CreateBound<T>(env, (ISchemaBoundRowMapper)resultMapper, type as VectorDataViewType, getter, AnnotationUtils.Kinds.TrainingLabelValues, CanWrapTrainingLabels);
if (CanWrapSlotNames(resultMapper, type))
resultMapper = LabelNameBindableMapper.CreateBound<T>(env, (ISchemaBoundRowMapper)resultMapper, type as VectorDataViewType, getter, AnnotationUtils.Kinds.SlotNames, CanWrapSlotNames);
return resultMapper;
}

[BestFriend]
Expand Down
20 changes: 5 additions & 15 deletions src/Microsoft.ML.Data/Scorers/PredictedLabelScorerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,22 +62,12 @@ private BindingsImpl(DataViewSchema input, ISchemaBoundRowMapper mapper, string
{
var scoreColMetadata = mapper.OutputSchema[scoreColIndex].Annotations;

var slotColumn = scoreColMetadata.Schema.GetColumnOrNull(AnnotationUtils.Kinds.SlotNames);
if (slotColumn?.Type is VectorDataViewType slotColVecType && (ulong)slotColVecType.Size == predColKeyType.Count)
var trainLabelColumn = scoreColMetadata.Schema.GetColumnOrNull(AnnotationUtils.Kinds.TrainingLabelValues);
if (trainLabelColumn?.Type is VectorDataViewType trainLabelColVecType && (ulong)trainLabelColVecType.Size == predColKeyType.Count)
{
Contracts.Assert(slotColVecType.Size > 0);
_predColMetadata = Utils.MarshalInvoke(KeyValueMetadataFromMetadata<int>, slotColVecType.RawType,
scoreColMetadata, slotColumn.Value);
}
else
{
var trainLabelColumn = scoreColMetadata.Schema.GetColumnOrNull(AnnotationUtils.Kinds.TrainingLabelValues);
if (trainLabelColumn?.Type is VectorDataViewType trainLabelColVecType && (ulong)trainLabelColVecType.Size == predColKeyType.Count)
{
Contracts.Assert(trainLabelColVecType.Size > 0);
_predColMetadata = Utils.MarshalInvoke(KeyValueMetadataFromMetadata<int>, trainLabelColVecType.RawType,
scoreColMetadata, trainLabelColumn.Value);
}
Contracts.Assert(trainLabelColVecType.Size > 0);
_predColMetadata = Utils.MarshalInvoke(KeyValueMetadataFromMetadata<int>, trainLabelColVecType.RawType,
scoreColMetadata, trainLabelColumn.Value);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,22 @@ void PredictAndMetadata()

var testLoader = ml.Data.LoadFromTextFile(dataPath, TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',', hasHeader: true);
var testData = ml.Data.CreateEnumerable<IrisData>(testLoader, false);

// During prediction we will get Score column with 3 float values.
// We need to find way to map each score to original label.
// In order to do what we need to get SlotNames from Score column.
// Slot names on top of Score column represent original labels for i-th value in Score array.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
engine.OutputSchema[nameof(IrisPrediction.Score)].GetSlotNames(ref slotNames);
// In order to do that, we need to get TrainingLabelValues from the Score column.
// TrainingLabelValues on top of the Score column represent the original label for the i-th value in the Score array.
VBuffer<ReadOnlyMemory<char>> originalLabels = default;
Copy link
Contributor

@TomFinley TomFinley Apr 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReadOnlyMemory [](start = 20, length = 14)

In this particular case we should be propagating both slot names and label names, right? Since they're strings in both cases? While I see the point in augmenting the test to cover this new metadata type, is there any particular reason to remove the test that the vector has the appropriate slot names? #WontFix

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an old scenario test. Its purpose is to show the user how to work with metadata.
In this particular test the label is a string, so it has slot names, but it won't in the case of a non-string label.
If I add slot names here, wouldn't it be confusing? What would I be getting slot names for?

All tests that go through TestEstimatorCore check for the presence of both slot names and TrainingLabelValues, and we have plenty of them. Why should we do anything with slot names here?


In reply to: 271046772 [](ancestors = 271046772)

Copy link
Contributor

@TomFinley TomFinley Apr 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that's fine. I had the idea that these "showing the user" things were more the point of functional tests, but as you like.


In reply to: 271050961 [](ancestors = 271050961,271046772)

engine.OutputSchema[nameof(IrisPrediction.Score)].Annotations.GetValue(AnnotationUtils.Kinds.TrainingLabelValues, ref originalLabels);
Copy link
Contributor

@TomFinley TomFinley Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TrainingLabelValues [](start = 105, length = 19)

Is the distinction with slot names that slots names must be text, while these might be any type? That might excuse not using them. But in such a case I'd argue that we should still have the slot names for descriptive user-facing purposes. so I'd like to confirm we're still doing that. #Resolved

Copy link
Contributor Author

@Ivanidzo4ka Ivanidzo4ka Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, we don't, but I can change that. Is there any reason why we want to continue propagating slot names? #Resolved

Copy link
Contributor

@TomFinley TomFinley Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, let's imagine I write out a text file, and I have this scores column. With slot names, I get a descriptive header. Without it I don't. Does that make sense? #Resolved

Copy link
Contributor Author

@Ivanidzo4ka Ivanidzo4ka Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only if the original labels were strings, but OK.
One question before I make a crucial mistake:
Do you prefer to have two wrappers on top of the multiclass scorer, one for TrainingLabelValues and one for SlotNames, or would you prefer to extend LabelNameBindableMapper to support multiple getters/metadata kinds? #Resolved

Copy link
Contributor

@TomFinley TomFinley Apr 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiple metadata kinds is good, thanks Ivan. I believe this is what you did; that is, from the code I read, you are always propagating the labels, and propagating slot names if they're text, and that seems fine to me.


In reply to: 269742478 [](ancestors = 269742478)

// Since we apply the MapValueToKey estimator with default parameters, key values
// depend on the order of occurrence in the data file, which is "Iris-setosa", "Iris-versicolor", "Iris-virginica".
// So if we have a Score column equal to [0.2, 0.3, 0.5], that means the score for
// Iris-setosa is 0.2
// Iris-versicolor is 0.3
// Iris-virginica is 0.5.
Assert.True(slotNames.GetItemOrDefault(0).ToString() == "Iris-setosa");
Assert.True(slotNames.GetItemOrDefault(1).ToString() == "Iris-versicolor");
Assert.True(slotNames.GetItemOrDefault(2).ToString() == "Iris-virginica");
Assert.Equal("Iris-setosa", originalLabels.GetItemOrDefault(0).ToString());
Assert.Equal("Iris-versicolor", originalLabels.GetItemOrDefault(1).ToString());
Assert.Equal("Iris-virginica", originalLabels.GetItemOrDefault(2).ToString());

// Let's look how we can convert key value for PredictedLabel to original labels.
// We need to read KeyValues for "PredictedLabel" column.
Expand Down
36 changes: 34 additions & 2 deletions test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,36 @@ public void OVAUncalibrated()
Done();
}

/// <summary>
/// Test that OVA preserves key values for label.
/// </summary>
[Fact]
public void OvaKeyNames()
Copy link
Contributor

@TomFinley TomFinley Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OvaKeyNames [](start = 20, length = 11)

Just curious, I heard something fairly troubling from @eerhardt, could we test this sort of scenario still works for things other than OVA? #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any tests which use TestEstimatorCore will compare metadata from the expected and the resulting output schema.
I've changed AnnotationsForMulticlassScoreColumn to always have TrainingLabelValues, and SlotNames if the key was of text type, so technically we check that everywhere we use TestEstimatorCore.


In reply to: 269746627 [](ancestors = 269746627)

Copy link
Member

@eerhardt eerhardt Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add this to FunctionalTests instead? Let's not keep adding tests that the product allows InternalsVisibleTo. That way we can test like a customer uses the product. #Resolved

{
var textLoaderOptions = new TextLoader.Options()
{
Columns = new[]
{ new TextLoader.Column("Label", DataKind.Single, 0),
new TextLoader.Column("Row", DataKind.Single, 1),
new TextLoader.Column("Column", DataKind.Single, 2),
},
HasHeader = true,
Separators = new[] { '\t' }
};
var textLoader = ML.Data.CreateTextLoader(textLoaderOptions);
var data = textLoader.Load(GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename));

var ap = ML.BinaryClassification.Trainers.AveragedPerceptron();
var ova = ML.MulticlassClassification.Trainers.OneVersusAll(ap);

var pipeline = ML.Transforms.Conversion.MapValueToKey("Label")
.Append(ML.Transforms.Concatenate("Features", "Row", "Column"))
.Append(ova)
.Append(ML.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

var model = pipeline.Fit(data);
Copy link
Member

@eerhardt eerhardt Mar 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should do something with the model to ensure it was created correctly. #Resolved

}

/// <summary>
/// Pairwise Coupling trainer
/// </summary>
Expand Down Expand Up @@ -83,12 +113,14 @@ public void MetacomponentsFeaturesRenamed()
var data = loader.Load(GetDataPath(TestDatasets.irisData.trainFilename));

var sdcaTrainer = ML.BinaryClassification.Trainers.SdcaNonCalibrated(
new SdcaNonCalibratedBinaryTrainer.Options {
new SdcaNonCalibratedBinaryTrainer.Options
{
LabelColumnName = "Label",
FeatureColumnName = "Vars",
MaximumNumberOfIterations = 100,
Shuffle = true,
NumberOfThreads = 1, });
NumberOfThreads = 1,
});

var pipeline = new ColumnConcatenatingEstimator(Env, "Vars", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
.Append(new ValueToKeyMappingEstimator(Env, "Label"), TransformerScope.TrainTest)
Expand Down