Skip to content

Commit 213ef9e

Browse files
authored
Add Permutation Feature Importance for Binary Classification (#1735)
Adding support for binary classification in Permutation Feature Importance
1 parent 08761e3 commit 213ef9e

File tree

3 files changed

+244
-63
lines changed

3 files changed

+244
-63
lines changed

src/Microsoft.ML.Data/Evaluators/BinaryClassifierEvaluator.cs

+14
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,20 @@ internal Result(IExceptionContext ectx, IRow overallResult)
878878
F1Score = Fetch(BinaryClassifierEvaluator.F1);
879879
Auprc = Fetch(BinaryClassifierEvaluator.AuPrc);
880880
}
881+
882+
/// <summary>
/// Constructs a <see cref="Result"/> directly from pre-computed metric values,
/// bypassing the row-based constructor. Used by Permutation Feature Importance
/// to materialize per-feature metric deltas as a <see cref="Result"/>.
/// </summary>
[BestFriend]
internal Result(double auc, double accuracy, double positivePrecision, double positiveRecall,
    double negativePrecision, double negativeRecall, double f1Score, double auprc)
{
    Auc = auc;
    Accuracy = accuracy;
    PositivePrecision = positivePrecision;
    PositiveRecall = positiveRecall;
    NegativePrecision = negativePrecision;
    NegativeRecall = negativeRecall;
    F1Score = f1Score;
    Auprc = auprc;
}
881895
}
882896

883897
/// <summary>

src/Microsoft.ML.Transforms/PermutationFeatureImportanceExtensions.cs

+48
Original file line numberDiff line numberDiff line change
@@ -55,5 +55,53 @@ private static RegressionEvaluator.Result RegressionDelta(
5555
lossFunction: a.LossFn - b.LossFn,
5656
rSquared: a.RSquared - b.RSquared);
5757
}
58+
59+
/// <summary>
/// Permutation Feature Importance is a technique that calculates how much each feature 'matters'
/// to the predictions: the values of a single feature are randomly permuted across the evaluation
/// set and the resulting change in the model's quality metrics is measured. If quality barely
/// changes, the feature is not very important; if quality drops drastically, it was a really
/// important feature.
/// </summary>
/// <param name="ctx">The binary classification context.</param>
/// <param name="model">The model to evaluate.</param>
/// <param name="data">The evaluation data set.</param>
/// <param name="label">Label column name.</param>
/// <param name="features">Feature column names.</param>
/// <param name="useFeatureWeightFilter">Use features weight to pre-filter features.</param>
/// <param name="topExamples">Limit the number of examples to evaluate on. null means examples (up to ~ 2 bln) from input will be used.</param>
/// <returns>Array of per-feature 'contributions' (metric deltas) to the score.</returns>
public static ImmutableArray<BinaryClassifierEvaluator.Result>
    PermutationFeatureImportance(
        this BinaryClassificationContext ctx,
        IPredictionTransformer<IPredictor> model,
        IDataView data,
        string label = DefaultColumnNames.Label,
        string features = DefaultColumnNames.Features,
        bool useFeatureWeightFilter = false,
        int? topExamples = null)
{
    // Each candidate data view (original and per-feature permuted) is scored with the
    // binary classification evaluator; BinaryClassifierDelta turns the baseline/permuted
    // metric pairs into per-feature deltas.
    return PermutationFeatureImportance<BinaryClassifierEvaluator.Result>.GetImportanceMetricsMatrix(
        CatalogUtils.GetEnvironment(ctx),
        model,
        data,
        idv => ctx.Evaluate(idv, label),
        BinaryClassifierDelta,
        features,
        useFeatureWeightFilter,
        topExamples);
}
92+
93+
/// <summary>
/// Computes the per-metric difference (a - b) between two binary classification
/// evaluation results. PFI uses this to report how far each metric moved after
/// permuting a feature.
/// </summary>
private static BinaryClassifierEvaluator.Result BinaryClassifierDelta(
    BinaryClassifierEvaluator.Result a, BinaryClassifierEvaluator.Result b)
{
    // Each metric delta is simply the metric of 'a' minus the metric of 'b'.
    var deltaAuc = a.Auc - b.Auc;
    var deltaAccuracy = a.Accuracy - b.Accuracy;
    var deltaPositivePrecision = a.PositivePrecision - b.PositivePrecision;
    var deltaPositiveRecall = a.PositiveRecall - b.PositiveRecall;
    var deltaNegativePrecision = a.NegativePrecision - b.NegativePrecision;
    var deltaNegativeRecall = a.NegativeRecall - b.NegativeRecall;
    var deltaF1Score = a.F1Score - b.F1Score;
    var deltaAuprc = a.Auprc - b.Auprc;

    return new BinaryClassifierEvaluator.Result(
        auc: deltaAuc,
        accuracy: deltaAccuracy,
        positivePrecision: deltaPositivePrecision,
        positiveRecall: deltaPositiveRecall,
        negativePrecision: deltaNegativePrecision,
        negativeRecall: deltaNegativeRecall,
        f1Score: deltaF1Score,
        auprc: deltaAuprc);
}
58106
}
59107
}

test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs

+182-63
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using Microsoft.ML.Runtime.Data;
6+
using Microsoft.ML.Runtime.Internal.Utilities;
67
using Microsoft.ML.Runtime.RunTests;
78
using System;
89
using System.Collections.Immutable;
@@ -24,7 +25,151 @@ public PermutationFeatureImportanceTests(ITestOutputHelper output) : base(output
2425
/// Also test checks that x2 has the biggest importance.
2526
/// </summary>
2627
[Fact]
public void TestPfiRegressionOnDenseFeatures()
{
    var data = GetDenseDataset();
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
    var pfi = ML.Regression.PermutationFeatureImportance(model, data);

    // Feature indices in the PFI result:
    //   0: X1, 1: X2Important, 2: X3, 3: X4Rand
    const int mostImportant = 1;  // X2Important
    const int leastImportant = 3; // X4Rand

    // L1, L2 and RMS: lower is better, so the maximum delta marks the most
    // important feature, the minimum delta the least important one.
    Assert.True(MinDeltaIndex(pfi, m => m.L1) == leastImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.L1) == mostImportant);

    Assert.True(MinDeltaIndex(pfi, m => m.L2) == leastImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.L2) == mostImportant);

    Assert.True(MinDeltaIndex(pfi, m => m.Rms) == leastImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.Rms) == mostImportant);

    // R-squared: higher is better, so the roles of min and max delta are reversed.
    Assert.True(MaxDeltaIndex(pfi, m => m.RSquared) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.RSquared) == mostImportant);

    Done();
}
56+
57+
/// <summary>
/// Features: x1, x2vBuff (sparse vector), x3.
/// y = 10x1 + 10x2vBuff + 30x3 + e.
/// Within x2vBuff the 2nd slot is sparse most of the time.
/// Test verifies that the 2nd slot of x2vBuff has the least importance: L1, L2, RMS and R-squared
/// do not change much when this slot is permuted. Also checks that X3Important has the biggest importance.
/// </summary>
[Fact]
public void TestPfiRegressionOnSparseFeatures()
{
    var data = GetSparseDataset();
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
    var results = ML.Regression.PermutationFeatureImportance(model, data);

    // Pfi Indices:
    // X1: 0
    // X2VBuffer-Slot-0: 1
    // X2VBuffer-Slot-1: 2
    // X2VBuffer-Slot-2: 3
    // X2VBuffer-Slot-3: 4
    // X3Important: 5

    // Permuted X2VBuffer-Slot-1 (index 2) should have min impact on SGD metrics, X3Important -- max impact.
    // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
    Assert.True(MinDeltaIndex(results, m => m.L1) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.L1) == 5);

    Assert.True(MinDeltaIndex(results, m => m.L2) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.L2) == 5);

    Assert.True(MinDeltaIndex(results, m => m.Rms) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.Rms) == 5);

    // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
    Assert.True(MaxDeltaIndex(results, m => m.RSquared) == 2);
    Assert.True(MinDeltaIndex(results, m => m.RSquared) == 5);

    // Every sibling PFI test ends with Done(); this one was missing it.
    Done();
}
94+
95+
/// <summary>
/// Checks PFI for binary classification on dense features: the random feature (X4Rand)
/// should move every metric the least, and the dominant feature (X2Important) the most.
/// </summary>
[Fact]
public void TestPfiBinaryClassificationOnDenseFeatures()
{
    var data = GetDenseDataset(TaskType.BinaryClassification);
    var model = ML.BinaryClassification.Trainers.LogisticRegression().Fit(data);
    var pfi = ML.BinaryClassification.PermutationFeatureImportance(model, data);

    // Feature indices in the PFI result:
    //   0: X1, 1: X2Important, 2: X3, 3: X4Rand
    const int mostImportant = 1;  // X2Important
    const int leastImportant = 3; // X4Rand

    // All metrics asserted here are higher-is-better, so the minimum delta marks the
    // most important feature and the maximum delta the least important one.
    Assert.True(MaxDeltaIndex(pfi, m => m.Auc) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.Auc) == mostImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.Accuracy) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.Accuracy) == mostImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.PositivePrecision) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.PositivePrecision) == mostImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.PositiveRecall) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.PositiveRecall) == mostImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.NegativePrecision) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.NegativePrecision) == mostImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.NegativeRecall) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.NegativeRecall) == mostImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.F1Score) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.F1Score) == mostImportant);
    Assert.True(MaxDeltaIndex(pfi, m => m.Auprc) == leastImportant);
    Assert.True(MinDeltaIndex(pfi, m => m.Auprc) == mostImportant);

    Done();
}
128+
129+
/// <summary>
/// Features: x1, x2vBuff (sparse vector), x3; binary labels are derived from
/// y = 10x1 + 10x2vBuff + 30x3 + e.
/// Within x2vBuff the 2nd slot is sparse most of the time.
/// Test verifies that the 2nd slot of x2vBuff has the least importance: the binary
/// classification metrics (AUC, accuracy, precision/recall, F1, AUPRC) do not change much
/// when this slot is permuted. Also checks that X3Important has the biggest importance.
/// (The previous comment referenced L1/L2/RMS, which are regression metrics.)
/// </summary>
[Fact]
public void TestPfiBinaryClassificationOnSparseFeatures()
{
    var data = GetSparseDataset(TaskType.BinaryClassification);
    var model = ML.BinaryClassification.Trainers.LogisticRegression().Fit(data);
    var pfi = ML.BinaryClassification.PermutationFeatureImportance(model, data);

    // Pfi Indices:
    // X1: 0
    // X2VBuffer-Slot-0: 1
    // X2VBuffer-Slot-1: 2
    // X2VBuffer-Slot-2: 3
    // X2VBuffer-Slot-3: 4
    // X3Important: 5

    // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
    Assert.True(MaxDeltaIndex(pfi, m => m.Auc) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.Auc) == 5);
    Assert.True(MaxDeltaIndex(pfi, m => m.Accuracy) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.Accuracy) == 5);
    Assert.True(MaxDeltaIndex(pfi, m => m.PositivePrecision) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.PositivePrecision) == 5);
    Assert.True(MaxDeltaIndex(pfi, m => m.PositiveRecall) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.PositiveRecall) == 5);
    Assert.True(MaxDeltaIndex(pfi, m => m.NegativePrecision) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.NegativePrecision) == 5);
    Assert.True(MaxDeltaIndex(pfi, m => m.NegativeRecall) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.NegativeRecall) == 5);
    Assert.True(MaxDeltaIndex(pfi, m => m.F1Score) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.F1Score) == 5);
    Assert.True(MaxDeltaIndex(pfi, m => m.Auprc) == 2);
    Assert.True(MinDeltaIndex(pfi, m => m.Auprc) == 5);

    Done();
}
171+
172+
private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
28173
{
29174
// Setup synthetic dataset.
30175
const int numberOfInstances = 1000;
@@ -50,6 +195,10 @@ public void TestDenseSGD()
50195
yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
51196
}
52197

198+
// If binary classification, modify the labels
199+
if (task == TaskType.BinaryClassification)
200+
GetBinaryClassificationScores(yArray);
201+
53202
// Create data view.
54203
var bldr = new ArrayDataViewBuilder(Env);
55204
bldr.AddColumn("X1", NumberType.Float, x1Array);
@@ -62,41 +211,11 @@ public void TestDenseSGD()
62211
var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
63212
.Append(ML.Transforms.Normalize("Features"));
64213
var data = pipeline.Fit(srcDV).Transform(srcDV);
65-
var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
66-
var pfi = ML.Regression.PermutationFeatureImportance(model, data);
67-
68-
// Pfi Indices:
69-
// X1: 0
70-
// X2Important: 1
71-
// X3: 2
72-
// X4Rand: 3
73-
74-
// For the following metrics lower is better, so maximum delta means more important feature, and vice versa
75-
Assert.True(MinDeltaIndex(pfi, m => m.L1) == 3);
76-
Assert.True(MaxDeltaIndex(pfi, m => m.L1) == 1);
77-
78-
Assert.True(MinDeltaIndex(pfi, m => m.L2) == 3);
79-
Assert.True(MaxDeltaIndex(pfi, m => m.L2) == 1);
80-
81-
Assert.True(MinDeltaIndex(pfi, m => m.Rms) == 3);
82-
Assert.True(MaxDeltaIndex(pfi, m => m.Rms) == 1);
83-
84-
// For the following metrics higher is better, so minimum delta means more important feature, and vice versa
85-
Assert.True(MaxDeltaIndex(pfi, m => m.RSquared) == 3);
86-
Assert.True(MinDeltaIndex(pfi, m => m.RSquared) == 1);
87214

88-
Done();
215+
return data;
89216
}
90217

91-
/// <summary>
92-
/// Features: x1, x2vBuff(sparce vector), x3.
93-
/// y = 10x1 + 10x2vBuff + 30x3 + e.
94-
/// Within xBuff feature 2nd slot will be sparse most of the time.
95-
/// Test verifies that 2nd slot of xBuff has the least importance: L1, L2, RMS and Loss-Fn do not change a lot when this slot is permuted.
96-
/// Also test checks that x2 has the biggest importance.
97-
/// </summary>
98-
[Fact]
99-
public void TestSparseSGD()
218+
private IDataView GetSparseDataset(TaskType task = TaskType.Regression)
100219
{
101220
// Setup synthetic dataset.
102221
const int numberOfInstances = 10000;
@@ -137,6 +256,10 @@ public void TestSparseSGD()
137256
yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
138257
}
139258

259+
// If binary classification, modify the labels
260+
if (task == TaskType.BinaryClassification)
261+
GetBinaryClassificationScores(yArray);
262+
140263
// Create data view.
141264
var bldr = new ArrayDataViewBuilder(Env);
142265
bldr.AddColumn("X1", NumberType.Float, x1Array);
@@ -148,47 +271,43 @@ public void TestSparseSGD()
148271
var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
149272
.Append(ML.Transforms.Normalize("Features"));
150273
var data = pipeline.Fit(srcDV).Transform(srcDV);
151-
var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
152-
var results = ML.Regression.PermutationFeatureImportance(model, data);
153-
154-
// Pfi Indices:
155-
// X1: 0
156-
// X2VBuffer-Slot-0: 1
157-
// X2VBuffer-Slot-1: 2
158-
// X2VBuffer-Slot-2: 3
159-
// X2VBuffer-Slot-3: 4
160-
// X3Important: 5
161274

162-
// Permuted X2VBuffer-Slot-1 lot (f2) should have min impact on SGD metrics, X3Important -- max impact.
163-
// For the following metrics lower is better, so maximum delta means more important feature, and vice versa
164-
Assert.True(MinDeltaIndex(results, m => m.L1) == 2);
165-
Assert.True(MaxDeltaIndex(results, m => m.L1) == 5);
166-
167-
Assert.True(MinDeltaIndex(results, m => m.L2) == 2);
168-
Assert.True(MaxDeltaIndex(results, m => m.L2) == 5);
169-
170-
Assert.True(MinDeltaIndex(results, m => m.Rms) == 2);
171-
Assert.True(MaxDeltaIndex(results, m => m.Rms) == 5);
172-
173-
// For the following metrics higher is better, so minimum delta means more important feature, and vice versa
174-
Assert.True(MaxDeltaIndex(results, m => m.RSquared) == 2);
175-
Assert.True(MinDeltaIndex(results, m => m.RSquared) == 5);
275+
return data;
176276
}
177277

178-
private int MinDeltaIndex(
179-
ImmutableArray<RegressionEvaluator.Result> metricsDelta,
180-
Func<RegressionEvaluator.Result, double> metricSelector)
278+
/// <summary>
/// Index of the result whose selected metric delta is smallest
/// (first occurrence wins on ties, since OrderBy is a stable sort).
/// </summary>
private int MinDeltaIndex<T>(
    ImmutableArray<T> metricsDelta,
    Func<T, double> metricSelector)
    => metricsDelta.IndexOf(metricsDelta.OrderBy(metricSelector).First());
185285

186-
private int MaxDeltaIndex(
187-
ImmutableArray<RegressionEvaluator.Result> metricsDelta,
188-
Func<RegressionEvaluator.Result, double> metricSelector)
286+
/// <summary>
/// Index of the result whose selected metric delta is largest
/// (first occurrence wins on ties, since OrderByDescending is a stable sort).
/// </summary>
private int MaxDeltaIndex<T>(
    ImmutableArray<T> metricsDelta,
    Func<T, double> metricSelector)
    => metricsDelta.IndexOf(metricsDelta.OrderByDescending(metricSelector).First());
293+
294+
/// <summary>
/// Converts regression responses to binary class labels, in place: each score is centered
/// by the mean and mapped to 1 when above the mean, 0 otherwise, yielding roughly balanced
/// classes from the synthetic regression targets.
/// </summary>
/// <param name="rawScores">Regression responses; overwritten with 0/1 class labels.</param>
private void GetBinaryClassificationScores(float[] rawScores)
{
    // Nothing to relabel, and avoids dividing by a zero length below.
    if (rawScores == null || rawScores.Length == 0)
        return;

    // Compute the average so we can center the response.
    float averageScore = 0.0f;
    for (int i = 0; i < rawScores.Length; i++)
        averageScore += rawScores[i];
    averageScore /= rawScores.Length;

    // Sigmoid(x) > 0.5 exactly when x > 0, so compare the centered response to zero
    // directly rather than routing through MathUtils.Sigmoid.
    for (int i = 0; i < rawScores.Length; i++)
        rawScores[i] = rawScores[i] - averageScore > 0 ? 1 : 0;
}
306+
307+
/// <summary>
/// Learning task for which the synthetic datasets are generated: regression targets
/// are used as-is; binary classification thresholds them into 0/1 labels.
/// </summary>
private enum TaskType
{
    Regression,
    BinaryClassification
}
193312
}
194313
}

0 commit comments

Comments
 (0)