Skip to content

Fix missing ExampleWeightColumnName in the advanced Options for some trainers #3104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public static void Example()
var mlContext = new MLContext();

// Get a small dataset as an IEnumerable and then read it as ML.NET's data type.
IEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample> enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(5);
IEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample> enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(5);
Copy link
Member

@wschin wschin Mar 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you examine the effect of having weight? Should it be somehow checked in a test? I feel we need two trainers w/wo weight column and make sure the two trained models are different. #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's what I did in the test SdcaLogisticRegressionWithWeight. I added two trainers w/wo weights and verified they produced different metrics. Is that sufficient?


In reply to: 270204851 [](ancestors = 270204851)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Their scores are similar. I'd like to have a more strict criterion. As you heard from Zeeshan S, tiny changes induced large SDCA regression this morning.


In reply to: 270205745 [](ancestors = 270205745,270204851)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added checks in the other tests where we test thoroughly for this


In reply to: 270207991 [](ancestors = 270207991,270205745,270204851)

var data = mlContext.Data.LoadFromEnumerable(enumerableOfData);

// Look at the original dataset
Expand Down Expand Up @@ -43,7 +43,7 @@ public static void Example()
{
var resample = mlContext.Data.BootstrapSample(data, seed: i);

var enumerable = mlContext.Data.CreateEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample>(resample, reuseRowObject: false);
var enumerable = mlContext.Data.CreateEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample>(resample, reuseRowObject: false);
Console.WriteLine($"Label\tFeatures[0]");
foreach (var row in enumerable)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public static class StochasticDualCoordinateAscentNonCalibrated
public static void Example()
{
// Generate IEnumerable<BinaryLabelFloatFeatureVectorFloatWeightSample> as training examples.
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100);
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100);

// Information in first example.
// Label: true
Expand Down
21 changes: 14 additions & 7 deletions src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -508,18 +508,20 @@ public static IEnumerable<SampleVectorOfNumbersData> GetVectorOfNumbersData()
private const int _simpleBinaryClassSampleFeatureLength = 10;

/// <summary>
/// Example with one binary label and 10 feature values.
/// Example with one binary label, 10 feature values and a weight (float).
/// </summary>
public class BinaryLabelFloatFeatureVectorSample
public class BinaryLabelFloatFeatureVectorFloatWeightSample
{
// Binary class label for the example.
public bool Label;

// Feature vector of fixed length; the VectorType attribute pins the vector
// size to _simpleBinaryClassSampleFeatureLength (10) in the IDataView schema.
[VectorType(_simpleBinaryClassSampleFeatureLength)]
public float[] Features;

// Per-example weight; intended to be bound to a trainer's
// ExampleWeightColumnName option (see the SDCA tests in this PR).
public float Weight;
}

/// <summary>
/// Class used to capture prediction of <see cref="BinaryLabelFloatFeatureVectorSample"/> when
/// Class used to capture prediction of <see cref="BinaryLabelFloatFeatureVectorFloatWeightSample"/> when
/// calling <see cref="DataOperationsCatalog.CreateEnumerable{TRow}(IDataView, bool, bool, SchemaDefinition)"/> on <see cref="MLContext"/>.
/// </summary>
public class CalibratedBinaryClassifierOutput
Expand All @@ -530,7 +532,7 @@ public class CalibratedBinaryClassifierOutput
}

/// <summary>
/// Class used to capture prediction of <see cref="BinaryLabelFloatFeatureVectorSample"/> when
/// Class used to capture prediction of <see cref="BinaryLabelFloatFeatureVectorFloatWeightSample"/> when
/// calling <see cref="DataOperationsCatalog.CreateEnumerable{TRow}(IDataView, bool, bool, SchemaDefinition)"/> on <see cref="MLContext"/>.
/// </summary>
public class NonCalibratedBinaryClassifierOutput
Expand All @@ -539,14 +541,19 @@ public class NonCalibratedBinaryClassifierOutput
public float Score;
}

public static IEnumerable<BinaryLabelFloatFeatureVectorSample> GenerateBinaryLabelFloatFeatureVectorSamples(int exampleCount)
public static IEnumerable<BinaryLabelFloatFeatureVectorFloatWeightSample> GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(int exampleCount)
{
var rnd = new Random(0);
var data = new List<BinaryLabelFloatFeatureVectorSample>();
var data = new List<BinaryLabelFloatFeatureVectorFloatWeightSample>();
for (int i = 0; i < exampleCount; ++i)
{
// Initialize an example with a random label and an empty feature vector.
var sample = new BinaryLabelFloatFeatureVectorSample() { Label = rnd.Next() % 2 == 0, Features = new float[_simpleBinaryClassSampleFeatureLength] };
var sample = new BinaryLabelFloatFeatureVectorFloatWeightSample() {
Label = rnd.Next() % 2 == 0,
Features = new float[_simpleBinaryClassSampleFeatureLength],
Weight = (float)rnd.NextDouble()
};

// Fill feature vector according to the assigned label.
for (int j = 0; j < _simpleBinaryClassSampleFeatureLength; ++j)
{
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ public abstract class SdcaTrainerBase<TOptions, TTransformer, TModel> : Stochast
/// <summary>
/// Options for the SDCA-based trainers.
/// </summary>
public abstract class OptionsBase : TrainerInputBaseWithLabel
public abstract class OptionsBase : TrainerInputBaseWithWeight
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TrainerInputBaseWithWeight [](start = 44, length = 26)

I would assume you check all mlContext.*Catalog extensions for SDCA trainers to have exampleWeightColumnName in it? #Resolved

Copy link
Member Author

@abgoswam abgoswam Mar 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeap. i did verify all of the "simple" SDCA trainer extensions have the exampleWeightColumnName parameter


In reply to: 269666016 [](ancestors = 269666016)

{
/// <summary>
/// The L2 <a href='tmpurl_regularization'>regularization</a> hyperparameter.
Expand Down Expand Up @@ -1505,7 +1505,7 @@ private protected SdcaBinaryTrainerBase(IHostEnvironment env,
}

private protected SdcaBinaryTrainerBase(IHostEnvironment env, BinaryOptionsBase options, ISupportSdcaClassificationLoss loss = null, bool doCalibration = false)
: base(env, options, TrainerUtils.MakeBoolScalarLabel(options.LabelColumnName))
: base(env, options, TrainerUtils.MakeBoolScalarLabel(options.LabelColumnName), TrainerUtils.MakeR4ScalarWeightColumn(options.ExampleWeightColumnName))
{
_loss = loss ?? new LogLossFactory().CreateComponent(env);
Loss = _loss;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ internal SdcaMulticlassTrainerBase(IHostEnvironment env, MulticlassOptions optio
}

internal SdcaMulticlassTrainerBase(IHostEnvironment env, MulticlassOptions options)
: this(env, options, options.FeatureColumnName, options.LabelColumnName)
: this(env, options, options.FeatureColumnName, options.LabelColumnName, options.ExampleWeightColumnName)
{
}

Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,14 @@ internal SdcaRegressionTrainer(IHostEnvironment env, Options options, string fea
{
Host.CheckValue(labelColumn, nameof(labelColumn));
Host.CheckValue(featureColumn, nameof(featureColumn));
Host.CheckValueOrNull(weightColumn);

_loss = options.LossFunction ?? options.LossFunctionFactory.CreateComponent(env);
Loss = _loss;
}

internal SdcaRegressionTrainer(IHostEnvironment env, Options options)
: this(env, options, options.FeatureColumnName, options.LabelColumnName)
: this(env, options, options.FeatureColumnName, options.LabelColumnName, options.ExampleWeightColumnName)
{
}

Expand Down
5 changes: 5 additions & 0 deletions src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ public static Scalar<float> Sdca(this RegressionCatalog.RegressionTrainers catal
{
options.LabelColumnName = labelName;
options.FeatureColumnName = featuresName;
options.ExampleWeightColumnName = weightsName;

var trainer = new SdcaRegressionTrainer(env, options);
if (onFit != null)
Expand Down Expand Up @@ -206,6 +207,7 @@ public static (Scalar<float> score, Scalar<float> probability, Scalar<bool> pred
{
options.LabelColumnName = labelName;
options.FeatureColumnName = featuresName;
options.ExampleWeightColumnName = weightsName;

var trainer = new SdcaLogisticRegressionBinaryTrainer(env, options);
if (onFit != null)
Expand Down Expand Up @@ -313,6 +315,7 @@ public static (Scalar<float> score, Scalar<bool> predictedLabel) SdcaNonCalibrat
{
options.FeatureColumnName = featuresName;
options.LabelColumnName = labelName;
options.ExampleWeightColumnName = weightsName;

var trainer = new SdcaNonCalibratedBinaryTrainer(env, options);
if (onFit != null)
Expand Down Expand Up @@ -407,6 +410,7 @@ public static (Vector<float> score, Key<uint, TVal> predictedLabel) Sdca<TVal>(
{
options.LabelColumnName = labelName;
options.FeatureColumnName = featuresName;
options.ExampleWeightColumnName = weightsName;

var trainer = new SdcaMaximumEntropyMulticlassTrainer(env, options);
if (onFit != null)
Expand Down Expand Up @@ -499,6 +503,7 @@ public static (Vector<float> score, Key<uint, TVal> predictedLabel) SdcaNonCalib
{
options.LabelColumnName = labelName;
options.FeatureColumnName = featuresName;
options.ExampleWeightColumnName = weightsName;

var trainer = new SdcaNonCalibratedMulticlassTrainer(env, options);
if (onFit != null)
Expand Down
39 changes: 39 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -15006,6 +15006,18 @@
"IsNullable": false,
"Default": "Label"
},
{
"Name": "ExampleWeightColumnName",
"Type": "String",
"Desc": "Column to use for example weight",
"Aliases": [
"weight"
],
"Required": false,
"SortOrder": 4.0,
"IsNullable": false,
"Default": null
},
{
"Name": "NormalizeFeatures",
"Type": {
Expand Down Expand Up @@ -15218,6 +15230,7 @@
}
],
"InputKind": [
"ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
Expand Down Expand Up @@ -15315,6 +15328,18 @@
"IsNullable": false,
"Default": "Label"
},
{
"Name": "ExampleWeightColumnName",
"Type": "String",
"Desc": "Column to use for example weight",
"Aliases": [
"weight"
],
"Required": false,
"SortOrder": 4.0,
"IsNullable": false,
"Default": null
},
{
"Name": "NormalizeFeatures",
"Type": {
Expand Down Expand Up @@ -15492,6 +15517,7 @@
}
],
"InputKind": [
"ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
Expand Down Expand Up @@ -15589,6 +15615,18 @@
"IsNullable": false,
"Default": "Label"
},
{
"Name": "ExampleWeightColumnName",
"Type": "String",
"Desc": "Column to use for example weight",
"Aliases": [
"weight"
],
"Required": false,
"SortOrder": 4.0,
"IsNullable": false,
"Default": null
},
{
"Name": "NormalizeFeatures",
"Type": {
Expand Down Expand Up @@ -15766,6 +15804,7 @@
}
],
"InputKind": [
"ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
Expand Down
115 changes: 113 additions & 2 deletions test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public void SdcaWorkout()
public void SdcaLogisticRegression()
{
// Generate C# objects as training examples.
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100);
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100);

// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
Expand Down Expand Up @@ -88,11 +88,122 @@ public void SdcaLogisticRegression()
Assert.InRange(first.Probability, 0.8, 1);
}

[Fact]
public void SdcaLogisticRegressionWithWeight()
Copy link
Member

@wschin wschin Mar 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LogisticRegression [](start = 24, length = 18)

This is called LogisticRegression but contains MaximumEntropy trainers. #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WIll fix by separating into 2 tests one each for binary and multiclass


In reply to: 270208235 [](ancestors = 270208235)

{
// Generate C# objects as training examples.
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100);

// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
var mlContext = new MLContext(0);

// Read the data as an IDataView.
var data = mlContext.Data.LoadFromEnumerable(rawData);

// ML.NET doesn't cache the data set by default. Caching is very helpful when working with iterative
// algorithms which need many data passes. Since SDCA is such an algorithm, we cache.
data = mlContext.Data.Cache(data);

// SdcaLogisticRegression with and without weights.
var sdcaWithoutWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
new SdcaLogisticRegressionBinaryTrainer.Options { NumberOfThreads = 1 });
var sdcaWithWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
new SdcaLogisticRegressionBinaryTrainer.Options { ExampleWeightColumnName = "Weight", NumberOfThreads = 1 });

var modelWithoutWeights = sdcaWithoutWeightBinary.Fit(data);
var modelWithWeights = sdcaWithWeightBinary.Fit(data);

var prediction1 = modelWithoutWeights.Transform(data);
var prediction2 = modelWithWeights.Transform(data);

// Verify the metrics produced are different.
var metrics1 = mlContext.BinaryClassification.Evaluate(prediction1);
var metrics2 = mlContext.BinaryClassification.Evaluate(prediction2);
Assert.Equal(0.9658, metrics1.AreaUnderRocCurve, 4);
Assert.Equal(0.3488, metrics1.LogLoss, 4);
Assert.Equal(0.9596, metrics2.AreaUnderRocCurve, 4);
Assert.Equal(0.3591, metrics2.LogLoss, 4);

// Verify the raw scores are different.
var scores1 = prediction1.GetColumn<float>(prediction1.Schema["Score"]).ToArray();
var scores2 = prediction2.GetColumn<float>(prediction2.Schema["Score"]).ToArray();
Assert.True(scores1.Length == scores2.Length);

bool sameScores = true;
for (int i = 0; i < scores1.Length; i++)
{
if(!CompareNumbersWithTolerance(scores1[i], scores2[i]))
{
sameScores = false;
break;
}
}
Assert.False(sameScores);
}

[Fact]
public void SdcaMaximumEntropyWithWeight()
{
// Generate C# objects as training examples.
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100);

// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
var mlContext = new MLContext(0);

// Read the data as an IDataView.
var data = mlContext.Data.LoadFromEnumerable(rawData);

// ML.NET doesn't cache the data set by default. Caching is very helpful when working with iterative
// algorithms which need many data passes. Since SDCA is such an algorithm, we cache.
data = mlContext.Data.Cache(data);

// SdcaMaximumEntropy with and without weights.
var sdcaWithoutWeightMulticlass = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label").
Copy link
Member

@wschin wschin Mar 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May we put multi-class test to another independent test (to make test small)? #Resolved

Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(
new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", NumberOfThreads = 1 }));

var sdcaWithWeightMulticlass = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label").
Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(
new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", ExampleWeightColumnName = "Weight", NumberOfThreads = 1 }));

var modelWithoutWeights = sdcaWithoutWeightMulticlass.Fit(data);
var modelWithWeights = sdcaWithWeightMulticlass.Fit(data);

var prediction1 = modelWithoutWeights.Transform(data);
var prediction2 = modelWithWeights.Transform(data);

// Verify the metrics produced are different.
var metrics1 = mlContext.MulticlassClassification.Evaluate(prediction1, labelColumnName: "LabelIndex", topKPredictionCount: 1);
var metrics2 = mlContext.MulticlassClassification.Evaluate(prediction2, labelColumnName: "LabelIndex", topKPredictionCount: 1);
Assert.Equal(0.9100, metrics1.TopKAccuracy, 4);
Assert.Equal(0.2411, metrics1.LogLoss, 4);
Assert.Equal(0.8800, metrics2.TopKAccuracy, 4);
Assert.Equal(0.2464, metrics2.LogLoss, 4);

// Verify the raw scores are different.
var scores1 = prediction1.GetColumn<float[]>(prediction1.Schema["Score"]).ToArray();
var scores2 = prediction2.GetColumn<float[]>(prediction2.Schema["Score"]).ToArray();
Assert.True(scores1.Length == scores2.Length);

bool sameScores = true;
for (int i = 0; i < scores1.Length; i++)
{
if (!CompareNumbersWithTolerance(scores1[i][0], scores2[i][0]))
{
sameScores = false;
break;
}
}
Assert.False(sameScores);
}

[Fact]
public void SdcaSupportVectorMachine()
{
// Generate C# objects as training examples.
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100);
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100);

// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public partial class TrainerEstimators
public void TreeEnsembleFeaturizerOutputSchemaTest()
{
// Create data set
var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(1000).ToList();
var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(1000).ToList();
var dataView = ML.Data.LoadFromEnumerable(data);

// Define a tree model whose trees will be extracted to construct a tree featurizer.
Expand All @@ -36,8 +36,8 @@ public void TreeEnsembleFeaturizerOutputSchemaTest()

// To get output schema, we need to create RoleMappedSchema for calling Bind(...).
var roleMappedSchema = new RoleMappedSchema(dataView.Schema,
label: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample.Label),
feature: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample.Features));
label: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Label),
feature: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Features));

// Retrieve output schema.
var boundMapper = (treeFeaturizer as ISchemaBindableMapper).Bind(Env, roleMappedSchema);
Expand Down