
Commit 8655730

Rogan Carr authored and committed
Adding a binary classification PFI Example
1 parent 41d3196 commit 8655730

File tree

1 file changed: +152 -46 lines changed


docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs

Lines changed: 152 additions & 46 deletions
@@ -6,55 +6,22 @@
 
 namespace Microsoft.ML.Samples.Dynamic
 {
-    public class PFI_RegressionExample
+    public class PermutationFeatureImportance_Examples
     {
         public static void PFI_Regression()
         {
-            // Download the dataset from github.com/dotnet/machinelearning.
-            // This will create a housing.txt file in the filesystem.
-            // You can open this file to see the data.
-            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();
-
             // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
             // as a catalog of available operations and as the source of randomness.
             var mlContext = new MLContext();
 
-            // Step 1: Read the data as an IDataView.
-            // First, we define the reader: specify the data columns and where to find them in the text file.
-            // The data file is composed of rows of data, with each row having 11 numerical columns
-            // separated by whitespace.
-            var reader = mlContext.Data.CreateTextReader(
-                columns: new[]
-                    {
-                        // Read the first column (indexed by 0) in the data file as an R4 (float)
-                        new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
-                        new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
-                        new TextLoader.Column("PercentResidental", DataKind.R4, 2),
-                        new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
-                        new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
-                        new TextLoader.Column("NitricOxides", DataKind.R4, 5),
-                        new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
-                        new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
-                        new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
-                        new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
-                        new TextLoader.Column("TaxRate", DataKind.R4, 10),
-                        new TextLoader.Column("TeacherRatio", DataKind.R4, 11)
-                    },
-                hasHeader: true
-            );
-
-            // Read the data
-            var data = reader.Read(dataFile);
+            // Step 1: Read the data
+            var data = GetHousingRegressionIDataView(mlContext, out string labelName, out string[] featureNames);
 
             // Step 2: Pipeline
             // Concatenate the features to create a Feature vector.
             // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
-            // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset,
-            // the "Features" column produced by concatenation as the features of the dataset.
-            var labelName = "MedianHomeValue";
-            var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
-                "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
-                "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio")
+            // Then append a linear regression trainer.
+            var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
                 .Append(mlContext.Transforms.Normalize("Features"))
                 .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares(
                     labelColumn: labelName, featureColumn: "Features"));
@@ -64,19 +31,12 @@ public static void PFI_Regression()
             var linearPredictor = model.LastTransformer;
             var weights = GetLinearModelWeights(linearPredictor.Model);
 
-            // Compute the permutation metrics using the properly-featurized data.
+            // Compute the permutation metrics using the properly normalized data.
             var transformedData = model.Transform(data);
             var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
                 linearPredictor, transformedData, label: labelName, features: "Features", permutationCount: 3);
 
             // Now let's look at which features are most important to the model overall
-            // First, we have to prepare the data:
-            // Get the feature names as an IEnumerable
-            var featureNames = data.Schema
-                .Select(column => column.Name) // Get the column names
-                .Where(name => name != labelName) // Drop the Label
-                .ToArray();
-
             // Get the feature indices sorted by their impact on R-Squared
             var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared })
                 .OrderByDescending(feature => Math.Abs(feature.RSquared.Mean))
@@ -116,10 +76,156 @@ public static void PFI_Regression()
                 Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i].Mean:G4}\t{1.96 * rSquared[i].StandardError:G4}");
             }
         }
+        public static void PFI_BinaryClassification()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Step 1: Read the data
+            var data = GetHousingRegressionIDataView(mlContext, out string labelName, out string[] featureNames, binaryPrediction: true);
+
+            // Step 2: Pipeline
+            // Concatenate the features to create a Feature vector.
+            // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
+            // Then append a logistic regression trainer.
+            var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
+                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.BinaryClassification.Trainers.LogisticRegression(
+                    labelColumn: labelName, featureColumn: "Features"));
+            var model = pipeline.Fit(data);
+
+            // Extract the model from the pipeline.
+            var linearPredictor = model.LastTransformer;
+            // Linear models for binary classification are wrapped by a calibrator as a generic predictor.
+            // To access it directly, we must extract it out and cast it to the proper class.
+            var weights = GetLinearModelWeights(linearPredictor.Model.SubPredictor as LinearBinaryModelParameters);
+
+            // Compute the permutation metrics using the properly normalized data.
+            var transformedData = model.Transform(data);
+            var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance(
+                linearPredictor, transformedData, label: labelName, features: "Features");
+
+            // Now let's look at which features are most important to the model overall.
+            // Get the feature indices sorted by their impact on AUC.
+            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.Auc })
+                .OrderByDescending(feature => Math.Abs(feature.Auc.Mean))
+                .Select(feature => feature.index);
+
+            // Print out the permutation results, with the model weights, in order of their impact:
+            // Expected console output:
+            //    Feature              Model Weight    Change in AUC
+            //    PercentPre40s            -1.96          -0.04582
+            //    RoomsPerDwelling          3.71          -0.04516
+            //    EmploymentDistance       -1.31          -0.02375
+            //    TeacherRatio             -2.46          -0.01476
+            //    CharlesRiver              0.66          -0.008683
+            //    PercentNonRetail         -1.58          -0.007314
+            //    PercentResidental         0.60           0.003979
+            //    TaxRate                  -0.95           0.002739
+            //    NitricOxides             -0.32           0.001917
+            //    CrimesPerCapita          -0.04          -3.222E-05
+            //    HighwayDistance           0.00           0
+            //
+            // Let's look at these results.
+            // First, if you look at the weights of the model, they generally correlate with the results of PFI,
+            // but there are some significant misorderings. See the discussion in the Regression example for an
+            // explanation of why this happens and how to interpret it.
+            // Second, the logistic regression learner uses L1 regularization by default. Here, it causes the
+            // "HighwayDistance" feature to be zeroed out of the model. PFI assigns zero importance to this variable, as expected.
+            // Third, some features showed an *increase* in AUC. This means that the model actually improved
+            // when these features were shuffled. This is expected when the effects are small (here on the order of 10^-3)
+            // and is due to the random nature of permutations. To reduce computational costs, PFI performs a single
+            // permutation per feature, which means that the change in AUC comes from just one sample of the data.
+            // If each feature were permuted many times and the average computed, the resulting average change in AUC
+            // would be small and negative for these features, or zero if the features truly were meaningless.
+            // To observe this behavior yourself, try adding a second call to PFI and compare the results, or
+            // rerun the script with a different seed set in the MLContext(), like so:
+            // `var mlContext = new MLContext(seed: 12345);`
+            Console.WriteLine("Feature\tModel Weight\tChange in AUC\t95% Confidence in the Mean Change in AUC");
+            var auc = permutationMetrics.Select(x => x.Auc).ToArray(); // Fetch AUC as an array
+            foreach (int i in sortedIndices)
+            {
+                Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{auc[i].Mean:G4}\t{1.96 * auc[i].StandardError:G4}");
+            }
+            // DON'T CHECK IN UNTIL TEXT IS COMPLETE
+        }
+
+        private static IDataView GetHousingRegressionIDataView(MLContext mlContext, out string labelName, out string[] featureNames, bool binaryPrediction = false)
+        {
+            // Download the dataset from github.com/dotnet/machinelearning.
+            // This will create a housing.txt file in the filesystem.
+            // You can open this file to see the data.
+            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();
+
+            // Read the data as an IDataView.
+            // First, we define the reader: specify the data columns and where to find them in the text file.
+            // The data file is composed of rows of data, with each row having 11 numerical columns
+            // separated by whitespace.
+            var reader = mlContext.Data.CreateTextReader(
+                columns: new[]
+                    {
+                        // Read the first column (indexed by 0) in the data file as an R4 (float)
+                        new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
+                        new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
+                        new TextLoader.Column("PercentResidental", DataKind.R4, 2),
+                        new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
+                        new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
+                        new TextLoader.Column("NitricOxides", DataKind.R4, 5),
+                        new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
+                        new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
+                        new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
+                        new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
+                        new TextLoader.Column("TaxRate", DataKind.R4, 10),
+                        new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
+                    },
+                hasHeader: true
+            );
+
+            // Read the data
+            var data = reader.Read(dataFile);
+            var labelColumn = "MedianHomeValue";
+
+            if (binaryPrediction)
+            {
+                labelColumn = nameof(BinaryOutputRow.AboveAverage);
+                data = mlContext.Transforms.CustomMappingTransformer(GreaterThanAverage, null).Transform(data);
+                data = mlContext.Transforms.DropColumns("MedianHomeValue").Fit(data).Transform(data);
+            }
+
+            labelName = labelColumn;
+            featureNames = data.Schema.AsEnumerable()
+                .Select(column => column.Name) // Get the column names
+                .Where(name => name != labelColumn) // Drop the Label
+                .ToArray();
+
+            return data;
+        }
+
+        // Define a class for all the input columns that we intend to consume.
+        private class ContinuousInputRow
+        {
+            public float MedianHomeValue { get; set; }
+        }
+
+        // Define a class for all output columns that we intend to produce.
+        private class BinaryOutputRow
+        {
+            public bool AboveAverage { get; set; }
+        }
+
+        // Define an Action to apply a custom mapping from one object to the other
+        private readonly static Action<ContinuousInputRow, BinaryOutputRow> GreaterThanAverage = (input, output)
+            => output.AboveAverage = input.MedianHomeValue > 22.6;
 
         private static float[] GetLinearModelWeights(OlsLinearRegressionModelParameters linearModel)
         {
             return linearModel.Weights.ToArray();
         }
+
+        private static float[] GetLinearModelWeights(LinearBinaryModelParameters linearModel)
+        {
+            return linearModel.Weights.ToArray();
+        }
     }
 }
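
For context, a minimal driver like the one below could invoke both samples added or modified by this commit. The PfiSampleRunner class and its Main entry point are illustrative assumptions and are not part of this commit; only the two PermutationFeatureImportance_Examples methods come from the diff above.

namespace Microsoft.ML.Samples.Dynamic
{
    // Hypothetical driver, not part of this commit: runs both PFI examples in turn.
    internal static class PfiSampleRunner
    {
        public static void Main()
        {
            // Regression PFI: prints the change in R-Squared per permuted feature.
            PermutationFeatureImportance_Examples.PFI_Regression();

            // Binary classification PFI: prints the change in AUC per permuted feature.
            PermutationFeatureImportance_Examples.PFI_BinaryClassification();
        }
    }
}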

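The comments in PFI_BinaryClassification suggest that the occasional positive change in AUC should wash out if the permutation is repeated under different seeds. Below is a rough sketch of that experiment, assuming it were added to the same PermutationFeatureImportance_Examples class; the method name and seed values are made up for illustration, and only API calls already shown in the diff are used.

        // Illustrative sketch, not part of this commit: average the per-feature change in AUC
        // over several differently seeded runs to smooth out single-permutation noise.
        private static void PFI_BinaryClassification_AveragedOverSeeds()
        {
            int[] seeds = { 1, 2, 3 };   // arbitrary seed values, for illustration only
            double[] totals = null;      // per-feature sum of the change in AUC
            string[] names = null;

            foreach (var seed in seeds)
            {
                var mlContext = new MLContext(seed: seed);
                var data = GetHousingRegressionIDataView(mlContext, out string labelName, out string[] featureNames, binaryPrediction: true);

                // Same pipeline as PFI_BinaryClassification above.
                var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
                    .Append(mlContext.Transforms.Normalize("Features"))
                    .Append(mlContext.BinaryClassification.Trainers.LogisticRegression(
                        labelColumn: labelName, featureColumn: "Features"));
                var model = pipeline.Fit(data);

                var transformedData = model.Transform(data);
                var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance(
                    model.LastTransformer, transformedData, label: labelName, features: "Features");

                // Accumulate this run's per-feature change in AUC.
                var aucChanges = permutationMetrics.Select(m => m.Auc.Mean).ToArray();
                names = featureNames;
                totals = totals ?? new double[aucChanges.Length];
                for (int i = 0; i < aucChanges.Length; i++)
                    totals[i] += aucChanges[i];
            }

            // Features whose AUC only increased by chance should now sit near zero on average.
            for (int i = 0; i < names.Length; i++)
                Console.WriteLine($"{names[i]}\taverage change in AUC: {totals[i] / seeds.Length:G4}");
        }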