-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Fix missing ExampleWeightColumnName in the advanced Options for some trainers #3104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -154,7 +154,7 @@ public abstract class SdcaTrainerBase<TOptions, TTransformer, TModel> : Stochast | |
/// <summary> | ||
/// Options for the SDCA-based trainers. | ||
/// </summary> | ||
public abstract class OptionsBase : TrainerInputBaseWithLabel | ||
public abstract class OptionsBase : TrainerInputBaseWithWeight | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I would assume you checked all mlContext.*Catalog extensions for SDCA trainers to verify they have exampleWeightColumnName in them? #Resolved There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, I did verify that all of the "simple" SDCA trainer extensions have the exampleWeightColumnName parameter. In reply to: 269666016 [](ancestors = 269666016) |
||
{ | ||
/// <summary> | ||
/// The L2 <a href='tmpurl_regularization'>regularization</a> hyperparameter. | ||
|
@@ -1505,7 +1505,7 @@ private protected SdcaBinaryTrainerBase(IHostEnvironment env, | |
} | ||
|
||
private protected SdcaBinaryTrainerBase(IHostEnvironment env, BinaryOptionsBase options, ISupportSdcaClassificationLoss loss = null, bool doCalibration = false) | ||
: base(env, options, TrainerUtils.MakeBoolScalarLabel(options.LabelColumnName)) | ||
: base(env, options, TrainerUtils.MakeBoolScalarLabel(options.LabelColumnName), TrainerUtils.MakeR4ScalarWeightColumn(options.ExampleWeightColumnName)) | ||
{ | ||
_loss = loss ?? new LogLossFactory().CreateComponent(env); | ||
Loss = _loss; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,7 +48,7 @@ public void SdcaWorkout() | |
public void SdcaLogisticRegression() | ||
{ | ||
// Generate C# objects as training examples. | ||
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100); | ||
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); | ||
|
||
// Create a new context for ML.NET operations. It can be used for exception tracking and logging, | ||
// as a catalog of available operations and as the source of randomness. | ||
|
@@ -88,11 +88,122 @@ public void SdcaLogisticRegression() | |
Assert.InRange(first.Probability, 0.8, 1); | ||
} | ||
|
||
[Fact] | ||
public void SdcaLogisticRegressionWithWeight() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This is called. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will fix by separating into 2 tests, one each for binary and multiclass. In reply to: 270208235 [](ancestors = 270208235) |
||
{ | ||
// Generate C# objects as training examples. | ||
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); | ||
|
||
// Create a new context for ML.NET operations. It can be used for exception tracking and logging, | ||
// as a catalog of available operations and as the source of randomness. | ||
var mlContext = new MLContext(0); | ||
|
||
// Read the data as an IDataView. | ||
var data = mlContext.Data.LoadFromEnumerable(rawData); | ||
|
||
// ML.NET doesn't cache data set by default. Caching is very helpful when working with iterative | ||
// algorithms which needs many data passes. Since SDCA is the case, we cache. | ||
data = mlContext.Data.Cache(data); | ||
|
||
// SdcaLogisticRegression with and without weights. | ||
var sdcaWithoutWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression( | ||
new SdcaLogisticRegressionBinaryTrainer.Options { NumberOfThreads = 1 }); | ||
var sdcaWithWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression( | ||
new SdcaLogisticRegressionBinaryTrainer.Options { ExampleWeightColumnName = "Weight", NumberOfThreads = 1 }); | ||
|
||
var modelWithoutWeights = sdcaWithoutWeightBinary.Fit(data); | ||
var modelWithWeights = sdcaWithWeightBinary.Fit(data); | ||
|
||
var prediction1 = modelWithoutWeights.Transform(data); | ||
var prediction2 = modelWithWeights.Transform(data); | ||
|
||
// Verify the metrics produced are different. | ||
var metrics1 = mlContext.BinaryClassification.Evaluate(prediction1); | ||
var metrics2 = mlContext.BinaryClassification.Evaluate(prediction2); | ||
Assert.Equal(0.9658, metrics1.AreaUnderRocCurve, 4); | ||
Assert.Equal(0.3488, metrics1.LogLoss, 4); | ||
Assert.Equal(0.9596, metrics2.AreaUnderRocCurve, 4); | ||
Assert.Equal(0.3591, metrics2.LogLoss, 4); | ||
|
||
// Verify the raw scores are different. | ||
var scores1 = prediction1.GetColumn<float>(prediction1.Schema["Score"]).ToArray(); | ||
var scores2 = prediction2.GetColumn<float>(prediction2.Schema["Score"]).ToArray(); | ||
Assert.True(scores1.Length == scores2.Length); | ||
|
||
bool sameScores = true; | ||
for (int i = 0; i < scores1.Length; i++) | ||
{ | ||
if(!CompareNumbersWithTolerance(scores1[i], scores2[i])) | ||
{ | ||
sameScores = false; | ||
break; | ||
} | ||
} | ||
Assert.False(sameScores); | ||
} | ||
|
||
[Fact] | ||
public void SdcaMaximumEntropyWithWeight() | ||
{ | ||
// Generate C# objects as training examples. | ||
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); | ||
|
||
// Create a new context for ML.NET operations. It can be used for exception tracking and logging, | ||
// as a catalog of available operations and as the source of randomness. | ||
var mlContext = new MLContext(0); | ||
|
||
// Read the data as an IDataView. | ||
var data = mlContext.Data.LoadFromEnumerable(rawData); | ||
|
||
// ML.NET doesn't cache data set by default. Caching is very helpful when working with iterative | ||
// algorithms which needs many data passes. Since SDCA is the case, we cache. | ||
data = mlContext.Data.Cache(data); | ||
|
||
// SdcaMaximumEntropy with and without weights. | ||
var sdcaWithoutWeightMulticlass = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label"). | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. May we move the multi-class test into a separate, independent test (to keep tests small)? #Resolved |
||
Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy( | ||
new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", NumberOfThreads = 1 })); | ||
|
||
var sdcaWithWeightMulticlass = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label"). | ||
Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy( | ||
new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", ExampleWeightColumnName = "Weight", NumberOfThreads = 1 })); | ||
|
||
var modelWithoutWeights = sdcaWithoutWeightMulticlass.Fit(data); | ||
var modelWithWeights = sdcaWithWeightMulticlass.Fit(data); | ||
|
||
var prediction1 = modelWithoutWeights.Transform(data); | ||
var prediction2 = modelWithWeights.Transform(data); | ||
|
||
// Verify the metrics produced are different. | ||
var metrics1 = mlContext.MulticlassClassification.Evaluate(prediction1, labelColumnName: "LabelIndex", topKPredictionCount: 1); | ||
var metrics2 = mlContext.MulticlassClassification.Evaluate(prediction2, labelColumnName: "LabelIndex", topKPredictionCount: 1); | ||
Assert.Equal(0.9100, metrics1.TopKAccuracy, 4); | ||
Assert.Equal(0.2411, metrics1.LogLoss, 4); | ||
Assert.Equal(0.8800, metrics2.TopKAccuracy, 4); | ||
Assert.Equal(0.2464, metrics2.LogLoss, 4); | ||
|
||
// Verify the raw scores are different. | ||
var scores1 = prediction1.GetColumn<float[]>(prediction1.Schema["Score"]).ToArray(); | ||
var scores2 = prediction2.GetColumn<float[]>(prediction2.Schema["Score"]).ToArray(); | ||
Assert.True(scores1.Length == scores2.Length); | ||
|
||
bool sameScores = true; | ||
for (int i = 0; i < scores1.Length; i++) | ||
{ | ||
if (!CompareNumbersWithTolerance(scores1[i][0], scores2[i][0])) | ||
{ | ||
sameScores = false; | ||
break; | ||
} | ||
} | ||
Assert.False(sameScores); | ||
} | ||
|
||
[Fact] | ||
public void SdcaSupportVectorMachine() | ||
{ | ||
// Generate C# objects as training examples. | ||
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100); | ||
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); | ||
|
||
// Create a new context for ML.NET operations. It can be used for exception tracking and logging, | ||
// as a catalog of available operations and as the source of randomness. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How do you examine the effect of having weight? Should it be somehow checked in a test? I feel we need two trainers w/wo weight column and make sure the two trained models are different. #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's what I did in the test
SdcaLogisticRegressionWithWeight
. Added two trainers w/wo weights and verified that they produced different metrics. Is that sufficient? In reply to: 270204851 [](ancestors = 270204851)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Their scores are similar. I'd like to have a stricter criterion. As you heard from Zeeshan S., tiny changes induced a large SDCA regression this morning.
In reply to: 270205745 [](ancestors = 270205745,270204851)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added checks in the other tests, where we test thoroughly for this.
In reply to: 270207991 [](ancestors = 270207991,270205745,270204851)