
Commit f25bd4b

Adding a binary classification PFI Example (#1793)
* Adding a binary classification PFI Example, breaking the PFI examples into different files in a subfolder, and correcting XMLDocs links.
1 parent c54086b commit f25bd4b

4 files changed: +177 additions, −75 deletions

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Learners;
using Microsoft.ML.Trainers.HalLearners;
using System;
using System.Linq;

namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance
{
    public class PfiHelper
    {
        public static IDataView GetHousingRegressionIDataView(MLContext mlContext, out string labelName, out string[] featureNames, bool binaryPrediction = false)
        {
            // Download the dataset from github.com/dotnet/machinelearning.
            // This will create a housing.txt file in the filesystem.
            // You can open this file to see the data.
            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

            // Read the data as an IDataView.
            // First, we define the reader: specify the data columns and where to find them in the text file.
            // The data file is composed of rows of data, with each row having 11 numerical columns
            // separated by whitespace.
            var reader = mlContext.Data.CreateTextReader(
                columns: new[]
                {
                    // Read the first column (indexed by 0) in the data file as an R4 (float)
                    new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
                    new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
                    new TextLoader.Column("PercentResidental", DataKind.R4, 2),
                    new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
                    new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
                    new TextLoader.Column("NitricOxides", DataKind.R4, 5),
                    new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
                    new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
                    new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
                    new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
                    new TextLoader.Column("TaxRate", DataKind.R4, 10),
                    new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
                },
                hasHeader: true
            );

            // Read the data
            var data = reader.Read(dataFile);
            var labelColumn = "MedianHomeValue";

            if (binaryPrediction)
            {
                labelColumn = nameof(BinaryOutputRow.AboveAverage);
                data = mlContext.Transforms.CustomMappingTransformer(GreaterThanAverage, null).Transform(data);
                data = mlContext.Transforms.DropColumns("MedianHomeValue").Fit(data).Transform(data);
            }

            labelName = labelColumn;
            featureNames = data.Schema.AsEnumerable()
                .Select(column => column.Name) // Get the column names
                .Where(name => name != labelColumn) // Drop the Label
                .ToArray();

            return data;
        }

        // Define a class for all the input columns that we intend to consume.
        private class ContinuousInputRow
        {
            public float MedianHomeValue { get; set; }
        }

        // Define a class for all output columns that we intend to produce.
        private class BinaryOutputRow
        {
            public bool AboveAverage { get; set; }
        }

        // Define an Action to apply a custom mapping from one object to the other
        private readonly static Action<ContinuousInputRow, BinaryOutputRow> GreaterThanAverage = (input, output)
            => output.AboveAverage = input.MedianHomeValue > 22.6;

        public static float[] GetLinearModelWeights(OlsLinearRegressionModelParameters linearModel)
        {
            return linearModel.Weights.ToArray();
        }

        public static float[] GetLinearModelWeights(LinearBinaryModelParameters linearModel)
        {
            return linearModel.Weights.ToArray();
        }
    }
}
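
For orientation, here is a minimal sketch of how the two example files below consume this helper. It is not part of the commit; the wrapper class name is hypothetical, and it assumes the same ML.NET sample APIs shown above.

namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance
{
    // Hypothetical caller, for illustration only; mirrors how PfiRegressionExample
    // and PfiBinaryClassificationExample use PfiHelper in the files below.
    internal static class PfiHelperUsageSketch
    {
        public static void Run()
        {
            var mlContext = new MLContext();

            // Regression form: the label remains the continuous "MedianHomeValue" column.
            var regressionData = PfiHelper.GetHousingRegressionIDataView(
                mlContext, out string regressionLabel, out string[] regressionFeatures);

            // Binary form: "MedianHomeValue" is mapped to the boolean "AboveAverage" label
            // by the custom mapping and then dropped from the feature columns.
            var binaryData = PfiHelper.GetHousingRegressionIDataView(
                mlContext, out string binaryLabel, out string[] binaryFeatures, binaryPrediction: true);
        }
    }
}
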
@@ -1,82 +1,39 @@
-using Microsoft.ML.Runtime.Data;
-using Microsoft.ML.Runtime.Learners;
-using Microsoft.ML.Trainers.HalLearners;
-using System;
+using System;
 using System.Linq;
 
-namespace Microsoft.ML.Samples.Dynamic
+namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance
 {
-    public class PFI_RegressionExample
+    public class PfiRegressionExample
     {
-        public static void PFI_Regression()
+        public static void RunExample()
         {
-            // Download the dataset from github.com/dotnet/machinelearning.
-            // This will create a housing.txt file in the filesystem.
-            // You can open this file to see the data.
-            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();
-
             // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
             // as a catalog of available operations and as the source of randomness.
             var mlContext = new MLContext();
 
-            // Step 1: Read the data as an IDataView.
-            // First, we define the reader: specify the data columns and where to find them in the text file.
-            // The data file is composed of rows of data, with each row having 11 numerical columns
-            // separated by whitespace.
-            var reader = mlContext.Data.CreateTextReader(
-                columns: new[]
-                {
-                    // Read the first column (indexed by 0) in the data file as an R4 (float)
-                    new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
-                    new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
-                    new TextLoader.Column("PercentResidental", DataKind.R4, 2),
-                    new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
-                    new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
-                    new TextLoader.Column("NitricOxides", DataKind.R4, 5),
-                    new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
-                    new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
-                    new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
-                    new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
-                    new TextLoader.Column("TaxRate", DataKind.R4, 10),
-                    new TextLoader.Column("TeacherRatio", DataKind.R4, 11)
-                },
-                hasHeader: true
-            );
-
-            // Read the data
-            var data = reader.Read(dataFile);
+            // Step 1: Read the data
+            var data = PfiHelper.GetHousingRegressionIDataView(mlContext, out string labelName, out string[] featureNames);
 
             // Step 2: Pipeline
             // Concatenate the features to create a Feature vector.
             // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
-            // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset,
-            // the "Features" column produced by concatenation as the features of the dataset.
-            var labelName = "MedianHomeValue";
-            var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
-                "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
-                "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio")
+            // Then append a linear regression trainer.
+            var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
                 .Append(mlContext.Transforms.Normalize("Features"))
                 .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares(
                     labelColumn: labelName, featureColumn: "Features"));
-
             var model = pipeline.Fit(data);
+
             // Extract the model from the pipeline
             var linearPredictor = model.LastTransformer;
-            var weights = GetLinearModelWeights(linearPredictor.Model);
+            var weights = PfiHelper.GetLinearModelWeights(linearPredictor.Model);
 
-            // Compute the permutation metrics using the properly-featurized data.
+            // Compute the permutation metrics using the properly normalized data.
             var transformedData = model.Transform(data);
             var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
                 linearPredictor, transformedData, label: labelName, features: "Features", permutationCount: 3);
 
             // Now let's look at which features are most important to the model overall
-            // First, we have to prepare the data:
-            // Get the feature names as an IEnumerable
-            var featureNames = data.Schema
-                .Select(column => column.Name) // Get the column names
-                .Where(name => name != labelName) // Drop the Label
-                .ToArray();
-
             // Get the feature indices sorted by their impact on R-Squared
             var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared })
                 .OrderByDescending(feature => Math.Abs(feature.RSquared.Mean))
@@ -116,10 +73,5 @@ public static void PFI_Regression()
                 Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i].Mean:G4}\t{1.96 * rSquared[i].StandardError:G4}");
             }
         }
-
-        private static float[] GetLinearModelWeights(OlsLinearRegressionModelParameters linearModel)
-        {
-            return linearModel.Weights.ToArray();
-        }
     }
 }
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
using Microsoft.ML.Runtime.Learners;
using System;
using System.Linq;

namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance
{
    public class PfiBinaryClassificationExample
    {
        public static void RunExample()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext(seed: 999123);

            // Step 1: Read the data
            var data = PfiHelper.GetHousingRegressionIDataView(mlContext,
                out string labelName, out string[] featureNames, binaryPrediction: true);

            // Step 2: Pipeline
            // Concatenate the features to create a Feature vector.
            // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
            // Then append a logistic regression trainer.
            var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
                .Append(mlContext.Transforms.Normalize("Features"))
                .Append(mlContext.BinaryClassification.Trainers.LogisticRegression(
                    labelColumn: labelName, featureColumn: "Features"));
            var model = pipeline.Fit(data);

            // Extract the model from the pipeline.
            var linearPredictor = model.LastTransformer;
            // Linear models for binary classification are wrapped by a calibrator as a generic predictor.
            // To access it directly, we must extract it out and cast it to the proper class.
            var weights = PfiHelper.GetLinearModelWeights(linearPredictor.Model.SubPredictor as LinearBinaryModelParameters);

            // Compute the permutation metrics using the properly normalized data.
            var transformedData = model.Transform(data);
            var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance(
                linearPredictor, transformedData, label: labelName, features: "Features", permutationCount: 3);

            // Now let's look at which features are most important to the model overall.
            // Get the feature indices sorted by their impact on AUC.
            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.Auc })
                .OrderByDescending(feature => Math.Abs(feature.Auc.Mean))
                .Select(feature => feature.index);

            // Print out the permutation results, with the model weights, in order of their impact:
            // Expected console output (for 100 permutations):
            //    Feature             Model Weight    Change in AUC    95% Confidence in the Mean Change in AUC
            //    PercentPre40s       -1.96           -0.06316         0.002377
            //    RoomsPerDwelling     3.71           -0.04385         0.001245
            //    EmploymentDistance  -1.31           -0.02139         0.0006867
            //    TeacherRatio        -2.46           -0.0203          0.0009566
            //    PercentNonRetail    -1.58           -0.01846         0.001586
            //    CharlesRiver         0.66           -0.008605        0.0005136
            //    PercentResidental    0.60            0.002483        0.0004818
            //    TaxRate             -0.95           -0.00221         0.0007394
            //    NitricOxides        -0.32            0.00101         0.0001428
            //    CrimesPerCapita     -0.04           -3.029E-05       1.678E-05
            //    HighwayDistance      0.00            0                0
            // Let's look at these results.
            // First, if you look at the weights of the model, they generally correlate with the results of PFI,
            // but there are some significant misorderings. See the discussion in the Regression example for an
            // explanation of why this happens and how to interpret it.
            // Second, the logistic regression learner uses L1 regularization by default. Here, it causes the
            // "HighwayDistance" feature to be zeroed out of the model. PFI assigns zero importance to this variable, as expected.
            // Third, some features show an *increase* in AUC. This means that the model actually improved
            // when these features were shuffled. This is a sign to investigate these features further.
            Console.WriteLine("Feature\tModel Weight\tChange in AUC\t95% Confidence in the Mean Change in AUC");
            var auc = permutationMetrics.Select(x => x.Auc).ToArray(); // Fetch AUC as an array
            foreach (int i in sortedIndices)
            {
                Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{auc[i].Mean:G4}\t{1.96 * auc[i].StandardError:G4}");
            }
        }
    }
}
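
As a usage note, both new samples expose parameterless RunExample entry points, so a samples driver can invoke them in one pass. A minimal sketch follows; the runner class below is hypothetical and not part of this commit.

namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance
{
    // Hypothetical driver, for illustration only.
    internal static class PfiExamplesRunner
    {
        public static void Main()
        {
            // Regression PFI: ranks features by their impact on R-Squared.
            PfiRegressionExample.RunExample();

            // Binary classification PFI: ranks features by their impact on AUC.
            PfiBinaryClassificationExample.RunExample();
        }
    }
}
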

src/Microsoft.ML.Transforms/PermutationFeatureImportanceExtensions.cs

Lines changed: 2 additions & 16 deletions
@@ -43,7 +43,7 @@ public static class PermutationFeatureImportanceExtensions
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[PFI](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs)]
+        /// [!code-csharp[PFI](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs)]
         /// ]]>
         /// </format>
         /// </example>
@@ -120,7 +120,7 @@ private static RegressionMetrics RegressionDelta(
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[PFI](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs)]
+        /// [!code-csharp[PFI](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs)]
         /// ]]>
         /// </format>
         /// </example>
@@ -198,13 +198,6 @@ private static BinaryClassificationMetrics BinaryClassifierDelta(
         /// example of working with these results to analyze the feature importance of a model.
         /// </para>
         /// </remarks>
-        /// <example>
-        /// <format type="text/markdown">
-        /// <![CDATA[
-        /// [!code-csharp[PFI](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs)]
-        /// ]]>
-        /// </format>
-        /// </example>
         /// <param name="ctx">The clustering context.</param>
         /// <param name="model">The model to evaluate.</param>
         /// <param name="data">The evaluation data set.</param>
@@ -284,13 +277,6 @@ private static MultiClassClassifierMetrics MulticlassClassificationDelta(
         /// example of working with these results to analyze the feature importance of a model.
         /// </para>
         /// </remarks>
-        /// <example>
-        /// <format type="text/markdown">
-        /// <![CDATA[
-        /// [!code-csharp[PFI](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs)]
-        /// ]]>
-        /// </format>
-        /// </example>
         /// <param name="ctx">The clustering context.</param>
         /// <param name="model">The model to evaluate.</param>
         /// <param name="data">The evaluation data set.</param>
