From c51a39e1a996f539443d6462995edb69e54fa769 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Wed, 27 Mar 2019 16:28:00 +0000 Subject: [PATCH 1/3] fixed issue, added tests --- .../Dynamic/DataOperations/BootstrapSample.cs | 4 +- ...hasticDualCoordinateAscentNonCalibrated.cs | 2 +- .../SamplesDatasetUtils.cs | 21 ++++--- .../Standard/SdcaBinary.cs | 4 +- .../Standard/SdcaMulticlass.cs | 2 +- .../Standard/SdcaRegression.cs | 3 +- .../Common/EntryPoints/core_manifest.json | 39 ++++++++++++ .../TrainerEstimators/SdcaTests.cs | 59 ++++++++++++++++++- .../TreeEnsembleFeaturizerTest.cs | 6 +- 9 files changed, 121 insertions(+), 19 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs index bf56a41dbf..c49a0f35dc 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs @@ -12,7 +12,7 @@ public static void Example() var mlContext = new MLContext(); // Get a small dataset as an IEnumerable and them read it as ML.NET's data type. - IEnumerable enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(5); + IEnumerable enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(5); var data = mlContext.Data.LoadFromEnumerable(enumerableOfData); // Look at the original dataset @@ -43,7 +43,7 @@ public static void Example() { var resample = mlContext.Data.BootstrapSample(data, seed: i); - var enumerable = mlContext.Data.CreateEnumerable(resample, reuseRowObject: false); + var enumerable = mlContext.Data.CreateEnumerable(resample, reuseRowObject: false); Console.WriteLine($"Label\tFeatures[0]"); foreach (var row in enumerable) { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentNonCalibrated.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentNonCalibrated.cs index 964add1503..b73f9e6867 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentNonCalibrated.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentNonCalibrated.cs @@ -9,7 +9,7 @@ public static class StochasticDualCoordinateAscentNonCalibrated public static void Example() { // Generate IEnumerable as training examples. - var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100); + var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); // Information in first example. // Label: true diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 94f8e99d7e..3d2aa09791 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -508,18 +508,20 @@ public static IEnumerable GetVectorOfNumbersData() private const int _simpleBinaryClassSampleFeatureLength = 10; /// - /// Example with one binary label and 10 feature values. + /// Example with one binary label, 10 feature values and a weight (float). 
/// - public class BinaryLabelFloatFeatureVectorSample + public class BinaryLabelFloatFeatureVectorFloatWeightSample { public bool Label; [VectorType(_simpleBinaryClassSampleFeatureLength)] public float[] Features; + + public float Weight; } /// - /// Class used to capture prediction of when + /// Class used to capture prediction of when /// calling via on . /// public class CalibratedBinaryClassifierOutput @@ -530,7 +532,7 @@ public class CalibratedBinaryClassifierOutput } /// - /// Class used to capture prediction of when + /// Class used to capture prediction of when /// calling via on . /// public class NonCalibratedBinaryClassifierOutput @@ -539,14 +541,19 @@ public class NonCalibratedBinaryClassifierOutput public float Score; } - public static IEnumerable GenerateBinaryLabelFloatFeatureVectorSamples(int exampleCount) + public static IEnumerable GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(int exampleCount) { var rnd = new Random(0); - var data = new List(); + var data = new List(); for (int i = 0; i < exampleCount; ++i) { // Initialize an example with a random label and an empty feature vector. - var sample = new BinaryLabelFloatFeatureVectorSample() { Label = rnd.Next() % 2 == 0, Features = new float[_simpleBinaryClassSampleFeatureLength] }; + var sample = new BinaryLabelFloatFeatureVectorFloatWeightSample() { + Label = rnd.Next() % 2 == 0, + Features = new float[_simpleBinaryClassSampleFeatureLength], + Weight = (float)rnd.NextDouble() + }; + // Fill feature vector according the assigned label. for (int j = 0; j < _simpleBinaryClassSampleFeatureLength; ++j) { diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs index e0c9a1124c..5c763998c2 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs @@ -154,7 +154,7 @@ public abstract class SdcaTrainerBase : Stochast /// /// Options for the SDCA-based trainers. /// - public abstract class OptionsBase : TrainerInputBaseWithLabel + public abstract class OptionsBase : TrainerInputBaseWithWeight { /// /// The L2 regularization hyperparameter. @@ -1505,7 +1505,7 @@ private protected SdcaBinaryTrainerBase(IHostEnvironment env, } private protected SdcaBinaryTrainerBase(IHostEnvironment env, BinaryOptionsBase options, ISupportSdcaClassificationLoss loss = null, bool doCalibration = false) - : base(env, options, TrainerUtils.MakeBoolScalarLabel(options.LabelColumnName)) + : base(env, options, TrainerUtils.MakeBoolScalarLabel(options.LabelColumnName), TrainerUtils.MakeR4ScalarWeightColumn(options.ExampleWeightColumnName)) { _loss = loss ?? 
new LogLossFactory().CreateComponent(env); Loss = _loss; diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaMulticlass.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaMulticlass.cs index b36fae1a8f..2344651d59 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaMulticlass.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaMulticlass.cs @@ -103,7 +103,7 @@ internal SdcaMulticlassTrainerBase(IHostEnvironment env, MulticlassOptions optio } internal SdcaMulticlassTrainerBase(IHostEnvironment env, MulticlassOptions options) - : this(env, options, options.FeatureColumnName, options.LabelColumnName) + : this(env, options, options.FeatureColumnName, options.LabelColumnName, options.ExampleWeightColumnName) { } diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs index f87535bd09..3b242f907c 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs @@ -104,13 +104,14 @@ internal SdcaRegressionTrainer(IHostEnvironment env, Options options, string fea { Host.CheckValue(labelColumn, nameof(labelColumn)); Host.CheckValue(featureColumn, nameof(featureColumn)); + Host.CheckValueOrNull(weightColumn); _loss = options.LossFunction ?? options.LossFunctionFactory.CreateComponent(env); Loss = _loss; } internal SdcaRegressionTrainer(IHostEnvironment env, Options options) - : this(env, options, options.FeatureColumnName, options.LabelColumnName) + : this(env, options, options.FeatureColumnName, options.LabelColumnName, options.ExampleWeightColumnName) { } diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index d874dce5b6..94ea2341ac 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -15006,6 +15006,18 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "NormalizeFeatures", "Type": { @@ -15218,6 +15230,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -15315,6 +15328,18 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "NormalizeFeatures", "Type": { @@ -15492,6 +15517,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -15589,6 +15615,18 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "NormalizeFeatures", "Type": { @@ -15766,6 +15804,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs index d418e8ad98..430db959e3 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs +++ 
b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs @@ -48,7 +48,7 @@ public void SdcaWorkout() public void SdcaLogisticRegression() { // Generate C# objects as training examples. - var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100); + var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. @@ -88,11 +88,66 @@ public void SdcaLogisticRegression() Assert.InRange(first.Probability, 0.8, 1); } + [Fact] + public void SdcaLogisticRegressionWithWeight() + { + // Generate C# objects as training examples. + var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); + + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(0); + + // Read the data as an IDataView. + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // ML.NET doesn't cache data set by default. Caching is very helpful when working with iterative + // algorithms which needs many data passes. Since SDCA is the case, we cache. + data = mlContext.Data.Cache(data); + + // Verify SdcaLogisticRegression with and without weights. + var sdcaWithoutWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression( + new SdcaLogisticRegressionBinaryTrainer.Options { NumberOfThreads = 1 }); + var sdcaWithWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression( + new SdcaLogisticRegressionBinaryTrainer.Options { ExampleWeightColumnName = "Weight", NumberOfThreads = 1 }); + + var prediction1 = sdcaWithoutWeightBinary.Fit(data).Transform(data); + var prediction2 = sdcaWithWeightBinary.Fit(data).Transform(data); + + var metrics1 = mlContext.BinaryClassification.Evaluate(prediction1); + var metrics2 = mlContext.BinaryClassification.Evaluate(prediction2); + + Assert.Equal(0.9658, metrics1.AreaUnderRocCurve, 4); + Assert.Equal(0.3488, metrics1.LogLoss, 4); + Assert.Equal(0.9596, metrics2.AreaUnderRocCurve, 4); + Assert.Equal(0.3591, metrics2.LogLoss, 4); + + // Verify SdcaMaximumEntropy with and without weights. + var sdcaWithoutWeightMulticlass = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label"). + Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy( + new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", NumberOfThreads = 1 })); + + var sdcaWithWeightMulticlass = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label"). 
+ Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy( + new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", ExampleWeightColumnName = "Weight", NumberOfThreads = 1 })); + + var prediction3 = sdcaWithoutWeightMulticlass.Fit(data).Transform(data); + var prediction4 = sdcaWithWeightMulticlass.Fit(data).Transform(data); + + var metrics3 = mlContext.MulticlassClassification.Evaluate(prediction3, labelColumnName: "LabelIndex", topKPredictionCount: 1); + var metrics4 = mlContext.MulticlassClassification.Evaluate(prediction4, labelColumnName: "LabelIndex", topKPredictionCount: 1); + + Assert.Equal(0.9000, metrics3.TopKAccuracy, 4); + Assert.Equal(0.2411, metrics3.LogLoss, 4); + Assert.Equal(0.8800, metrics4.TopKAccuracy, 4); + Assert.Equal(0.2469, metrics4.LogLoss, 4); + } + [Fact] public void SdcaSupportVectorMachine() { // Generate C# objects as training examples. - var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100); + var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index ddb92ecb02..188a9ced46 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -16,7 +16,7 @@ public partial class TrainerEstimators public void TreeEnsembleFeaturizerOutputSchemaTest() { // Create data set - var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(1000).ToList(); + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(1000).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); // Define a tree model whose trees will be extracted to construct a tree featurizer. @@ -36,8 +36,8 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() // To get output schema, we need to create RoleMappedSchema for calling Bind(...). var roleMappedSchema = new RoleMappedSchema(dataView.Schema, - label: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample.Label), - feature: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample.Features)); + label: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Label), + feature: nameof(SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample.Features)); // Retrieve output schema. 
var boundMapper = (treeFeaturizer as ISchemaBindableMapper).Bind(Env, roleMappedSchema); From c1000c0a66af6cb53a866e2137774611f6bd59b1 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Fri, 29 Mar 2019 21:27:25 +0000 Subject: [PATCH 2/3] fix review comments --- .../SdcaStaticExtensions.cs | 5 ++ .../TrainerEstimators/SdcaTests.cs | 81 +++++++++++++++---- 2 files changed, 71 insertions(+), 15 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs index 58ebcc798b..595a8f3e5c 100644 --- a/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs @@ -102,6 +102,7 @@ public static Scalar Sdca(this RegressionCatalog.RegressionTrainers catal { options.LabelColumnName = labelName; options.FeatureColumnName = featuresName; + options.ExampleWeightColumnName = weightsName; var trainer = new SdcaRegressionTrainer(env, options); if (onFit != null) @@ -206,6 +207,7 @@ public static (Scalar score, Scalar probability, Scalar pred { options.LabelColumnName = labelName; options.FeatureColumnName = featuresName; + options.ExampleWeightColumnName = weightsName; var trainer = new SdcaLogisticRegressionBinaryTrainer(env, options); if (onFit != null) @@ -313,6 +315,7 @@ public static (Scalar score, Scalar predictedLabel) SdcaNonCalibrat { options.FeatureColumnName = featuresName; options.LabelColumnName = labelName; + options.ExampleWeightColumnName = weightsName; var trainer = new SdcaNonCalibratedBinaryTrainer(env, options); if (onFit != null) @@ -407,6 +410,7 @@ public static (Vector score, Key predictedLabel) Sdca( { options.LabelColumnName = labelName; options.FeatureColumnName = featuresName; + options.ExampleWeightColumnName = weightsName; var trainer = new SdcaMaximumEntropyMulticlassTrainer(env, options); if (onFit != null) @@ -499,6 +503,7 @@ public static (Vector score, Key predictedLabel) SdcaNonCalib { options.LabelColumnName = labelName; options.FeatureColumnName = featuresName; + options.ExampleWeightColumnName = weightsName; var trainer = new SdcaNonCalibratedMulticlassTrainer(env, options); if (onFit != null) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs index 430db959e3..3a7b32742f 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs @@ -105,24 +105,54 @@ public void SdcaLogisticRegressionWithWeight() // algorithms which needs many data passes. Since SDCA is the case, we cache. data = mlContext.Data.Cache(data); - // Verify SdcaLogisticRegression with and without weights. + // SdcaLogisticRegression with and without weights. var sdcaWithoutWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression( new SdcaLogisticRegressionBinaryTrainer.Options { NumberOfThreads = 1 }); var sdcaWithWeightBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression( new SdcaLogisticRegressionBinaryTrainer.Options { ExampleWeightColumnName = "Weight", NumberOfThreads = 1 }); - var prediction1 = sdcaWithoutWeightBinary.Fit(data).Transform(data); - var prediction2 = sdcaWithWeightBinary.Fit(data).Transform(data); + var modelWithoutWeights = sdcaWithoutWeightBinary.Fit(data); + var modelWithWeights = sdcaWithWeightBinary.Fit(data); + // Verify the metrics produced are different. 
+ var prediction1 = modelWithoutWeights.Transform(data); + var prediction2 = modelWithWeights.Transform(data); var metrics1 = mlContext.BinaryClassification.Evaluate(prediction1); var metrics2 = mlContext.BinaryClassification.Evaluate(prediction2); - Assert.Equal(0.9658, metrics1.AreaUnderRocCurve, 4); Assert.Equal(0.3488, metrics1.LogLoss, 4); Assert.Equal(0.9596, metrics2.AreaUnderRocCurve, 4); Assert.Equal(0.3591, metrics2.LogLoss, 4); - // Verify SdcaMaximumEntropy with and without weights. + // Verify the SDCA model parameters are different. + // (bias). + var bias1 = modelWithoutWeights.Model.SubModel.Bias; + var bias2 = modelWithWeights.Model.SubModel.Bias; + Assert.NotEqual(bias1, bias2); + // (model weights). + var weights1 = modelWithoutWeights.Model.SubModel.Weights; + var weights2 = modelWithWeights.Model.SubModel.Weights; + Assert.False(weights1.SequenceEqual(weights2)); + } + + [Fact] + public void SdcaMaximumEntropyWithWeight() + { + // Generate C# objects as training examples. + var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); + + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(0); + + // Read the data as an IDataView. + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // ML.NET doesn't cache data set by default. Caching is very helpful when working with iterative + // algorithms which needs many data passes. Since SDCA is the case, we cache. + data = mlContext.Data.Cache(data); + + // SdcaMaximumEntropy with and without weights. var sdcaWithoutWeightMulticlass = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label"). Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy( new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", NumberOfThreads = 1 })); @@ -131,16 +161,37 @@ public void SdcaLogisticRegressionWithWeight() Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy( new SdcaMaximumEntropyMulticlassTrainer.Options { LabelColumnName = "LabelIndex", ExampleWeightColumnName = "Weight", NumberOfThreads = 1 })); - var prediction3 = sdcaWithoutWeightMulticlass.Fit(data).Transform(data); - var prediction4 = sdcaWithWeightMulticlass.Fit(data).Transform(data); - - var metrics3 = mlContext.MulticlassClassification.Evaluate(prediction3, labelColumnName: "LabelIndex", topKPredictionCount: 1); - var metrics4 = mlContext.MulticlassClassification.Evaluate(prediction4, labelColumnName: "LabelIndex", topKPredictionCount: 1); - - Assert.Equal(0.9000, metrics3.TopKAccuracy, 4); - Assert.Equal(0.2411, metrics3.LogLoss, 4); - Assert.Equal(0.8800, metrics4.TopKAccuracy, 4); - Assert.Equal(0.2469, metrics4.LogLoss, 4); + var modelWithoutWeights = sdcaWithoutWeightMulticlass.Fit(data); + var modelWithWeights = sdcaWithWeightMulticlass.Fit(data); + + // Verify the metrics produced are different. 
+ var prediction1 = modelWithoutWeights.Transform(data); + var prediction2 = modelWithWeights.Transform(data); + var metrics1 = mlContext.MulticlassClassification.Evaluate(prediction1, labelColumnName: "LabelIndex", topKPredictionCount: 1); + var metrics2 = mlContext.MulticlassClassification.Evaluate(prediction2, labelColumnName: "LabelIndex", topKPredictionCount: 1); + Assert.Equal(0.9100, metrics1.TopKAccuracy, 4); + Assert.Equal(0.2411, metrics1.LogLoss, 4); + Assert.Equal(0.8800, metrics2.TopKAccuracy, 4); + Assert.Equal(0.2464, metrics2.LogLoss, 4); + + // Extract the linear model from the pipeline. + var sdcaModelWithoutWeights = modelWithoutWeights.LastTransformer.Model; + var sdcaModelWithWeights = modelWithWeights.LastTransformer.Model; + + // Verify the SDCA model parameters are different. + // (bias). + var bias1 = sdcaModelWithoutWeights.GetBiases(); + var bias2 = sdcaModelWithWeights.GetBiases(); + Assert.False(bias1.SequenceEqual(bias2)); + + // (model weights). + VBuffer[] modelWeights1 = null; + VBuffer[] modelWeights2 = null; + sdcaModelWithoutWeights.GetWeights(ref modelWeights1, out int c1); + sdcaModelWithWeights.GetWeights(ref modelWeights2, out int c2); + Assert.Equal(c1, c2); + Assert.False(modelWeights1[0].DenseValues().SequenceEqual(modelWeights2[0].DenseValues())); + Assert.False(modelWeights1[1].DenseValues().SequenceEqual(modelWeights2[1].DenseValues())); } [Fact] From d6b84e0957d63026c4016b977891b0a1016add3b Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Fri, 29 Mar 2019 22:28:21 +0000 Subject: [PATCH 3/3] updating equality checks for floats --- .../TrainerEstimators/SdcaTests.cs | 63 ++++++++++--------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs index 3a7b32742f..0f171368bf 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs @@ -114,9 +114,10 @@ public void SdcaLogisticRegressionWithWeight() var modelWithoutWeights = sdcaWithoutWeightBinary.Fit(data); var modelWithWeights = sdcaWithWeightBinary.Fit(data); - // Verify the metrics produced are different. var prediction1 = modelWithoutWeights.Transform(data); var prediction2 = modelWithWeights.Transform(data); + + // Verify the metrics produced are different. var metrics1 = mlContext.BinaryClassification.Evaluate(prediction1); var metrics2 = mlContext.BinaryClassification.Evaluate(prediction2); Assert.Equal(0.9658, metrics1.AreaUnderRocCurve, 4); @@ -124,15 +125,21 @@ public void SdcaLogisticRegressionWithWeight() Assert.Equal(0.9596, metrics2.AreaUnderRocCurve, 4); Assert.Equal(0.3591, metrics2.LogLoss, 4); - // Verify the SDCA model parameters are different. - // (bias). - var bias1 = modelWithoutWeights.Model.SubModel.Bias; - var bias2 = modelWithWeights.Model.SubModel.Bias; - Assert.NotEqual(bias1, bias2); - // (model weights). - var weights1 = modelWithoutWeights.Model.SubModel.Weights; - var weights2 = modelWithWeights.Model.SubModel.Weights; - Assert.False(weights1.SequenceEqual(weights2)); + // Verify the raw scores are different. 
+ var scores1 = prediction1.GetColumn(prediction1.Schema["Score"]).ToArray(); + var scores2 = prediction2.GetColumn(prediction2.Schema["Score"]).ToArray(); + Assert.True(scores1.Length == scores2.Length); + + bool sameScores = true; + for (int i = 0; i < scores1.Length; i++) + { + if(!CompareNumbersWithTolerance(scores1[i], scores2[i])) + { + sameScores = false; + break; + } + } + Assert.False(sameScores); } [Fact] @@ -164,9 +171,10 @@ public void SdcaMaximumEntropyWithWeight() var modelWithoutWeights = sdcaWithoutWeightMulticlass.Fit(data); var modelWithWeights = sdcaWithWeightMulticlass.Fit(data); - // Verify the metrics produced are different. var prediction1 = modelWithoutWeights.Transform(data); var prediction2 = modelWithWeights.Transform(data); + + // Verify the metrics produced are different. var metrics1 = mlContext.MulticlassClassification.Evaluate(prediction1, labelColumnName: "LabelIndex", topKPredictionCount: 1); var metrics2 = mlContext.MulticlassClassification.Evaluate(prediction2, labelColumnName: "LabelIndex", topKPredictionCount: 1); Assert.Equal(0.9100, metrics1.TopKAccuracy, 4); @@ -174,24 +182,21 @@ public void SdcaMaximumEntropyWithWeight() Assert.Equal(0.8800, metrics2.TopKAccuracy, 4); Assert.Equal(0.2464, metrics2.LogLoss, 4); - // Extract the linear model from the pipeline. - var sdcaModelWithoutWeights = modelWithoutWeights.LastTransformer.Model; - var sdcaModelWithWeights = modelWithWeights.LastTransformer.Model; - - // Verify the SDCA model parameters are different. - // (bias). - var bias1 = sdcaModelWithoutWeights.GetBiases(); - var bias2 = sdcaModelWithWeights.GetBiases(); - Assert.False(bias1.SequenceEqual(bias2)); - - // (model weights). - VBuffer[] modelWeights1 = null; - VBuffer[] modelWeights2 = null; - sdcaModelWithoutWeights.GetWeights(ref modelWeights1, out int c1); - sdcaModelWithWeights.GetWeights(ref modelWeights2, out int c2); - Assert.Equal(c1, c2); - Assert.False(modelWeights1[0].DenseValues().SequenceEqual(modelWeights2[0].DenseValues())); - Assert.False(modelWeights1[1].DenseValues().SequenceEqual(modelWeights2[1].DenseValues())); + // Verify the raw scores are different. + var scores1 = prediction1.GetColumn(prediction1.Schema["Score"]).ToArray(); + var scores2 = prediction2.GetColumn(prediction2.Schema["Score"]).ToArray(); + Assert.True(scores1.Length == scores2.Length); + + bool sameScores = true; + for (int i = 0; i < scores1.Length; i++) + { + if (!CompareNumbersWithTolerance(scores1[i][0], scores2[i][0])) + { + sameScores = false; + break; + } + } + Assert.False(sameScores); } [Fact]
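
For reference, a minimal usage sketch of the example-weight support these patches add to the SDCA trainers. It follows the same pattern as the new tests above; the WeightedSample class, the Run entry point, and the console output are illustrative assumptions and are not code from the patch.

// Minimal sketch (not part of the patches above): training SdcaLogisticRegression with the
// ExampleWeightColumnName option introduced by these commits. The WeightedSample class and
// the "Weight" column name are illustrative assumptions mirroring the new test data.
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;

public class WeightedSample
{
    public bool Label;

    [VectorType(10)]
    public float[] Features;

    // Per-example weight consumed through ExampleWeightColumnName.
    public float Weight;
}

public static class SdcaWeightExample
{
    public static void Run(IEnumerable<WeightedSample> samples)
    {
        var mlContext = new MLContext(seed: 0);

        // Load and cache the data; SDCA makes several passes over it.
        var data = mlContext.Data.Cache(mlContext.Data.LoadFromEnumerable(samples));

        // Point the trainer at the "Weight" column so heavier examples
        // contribute more to the objective being minimized.
        var trainer = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
            new SdcaLogisticRegressionBinaryTrainer.Options
            {
                LabelColumnName = "Label",
                FeatureColumnName = "Features",
                ExampleWeightColumnName = "Weight",
                NumberOfThreads = 1
            });

        var model = trainer.Fit(data);
        var metrics = mlContext.BinaryClassification.Evaluate(model.Transform(data));
        Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve:F4}, LogLoss: {metrics.LogLoss:F4}");
    }
}

The same ExampleWeightColumnName option lives on the shared SDCA OptionsBase, so the maximum-entropy multiclass and regression trainers pick it up as well, as these patches wire up in SdcaMulticlass.cs, SdcaRegression.cs, and the static-pipe extensions.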