From e1a5d3a95231e09ac7d0eba6da399beb64eb90f6 Mon Sep 17 00:00:00 2001
From: Rogan Carr <rocarr@microsoft.com>
Date: Mon, 26 Nov 2018 16:11:33 -0800
Subject: [PATCH 1/8] Adding an example for using PFI

---
 .../Dynamic/PermutationFeatureImportance.cs   | 97 +++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
new file mode 100644
index 0000000000..9f8c59cc6b
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -0,0 +1,97 @@
+﻿using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using System;
+using System.Linq;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public class PFI_RegressionExample
+    {
+        public static void PFI_Regression()
+        {
+            // Download the dataset from github.com/dotnet/machinelearning.
+            // This will create a housing.txt file in the filesystem.
+            // You can open this file to see the data. 
+            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();
+
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging, 
+            // as a catalog of available operations and as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Step 1: Read the data as an IDataView.
+            // First, we define the reader: specify the data columns and where to find them in the text file.
+            var reader = mlContext.Data.TextReader(new TextLoader.Arguments()
+                {
+                    Separator = "tab",
+                    HasHeader = true,
+                    Column = new[]
+                    {
+                        new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
+                        new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
+                        new TextLoader.Column("PercentResidental", DataKind.R4, 2),
+                        new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
+                        new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
+                        new TextLoader.Column("NitricOxides", DataKind.R4, 5),
+                        new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
+                        new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
+                        new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
+                        new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
+                        new TextLoader.Column("TaxRate", DataKind.R4, 10),
+                        new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
+                    }
+                });
+            
+            // Read the data
+            var data = reader.Read(dataFile);
+
+            // Step 2: Pipeline
+            // Concatenate the features to create a Feature vector.
+            // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset,
+            // the "Features" column produced by concatenation as the features column.
+            var labelName = "MedianHomeValue";
+            var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
+                    "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
+                    "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio")
+                    .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(
+                        labelColumn: labelName, featureColumn: "Features"));
+            var fitPipeline = pipeline.Fit(data);
+
+            // Extract the model from the pipeline
+            var linearPredictor = fitPipeline.LastTransformer;
+            var weights = GetLinearModelWeights(linearPredictor.Model);
+
+            // Compute the permutation metrics using the properly-featurized data.
+            var transformedData = fitPipeline.Transform(data);
+            var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
+                linearPredictor, transformedData, label: labelName, features: "Features");
+
+            // Now let's look at which features are most important to the model overall
+            // First, we have to prepare the data:
+            // Get the feature names as an IEnumerable
+            var featureNames = data.Schema.GetColumns()
+                .Select(tuple => tuple.column.Name) // Get the column names
+                .Where(name => name != labelName) // Drop the Label
+                .ToArray();
+
+            // Get the feature indices sorted by their impact on R-Squared
+            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared })
+                .OrderByDescending(feature => Math.Abs(feature.RSquared))
+                .Select(feature => feature.index);
+
+            // Print out the permutation results, with the model weights, in order of their impact
+            Console.WriteLine("Feature\tModel Weight\tChange in R-Squared");
+            var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array
+            foreach (int i in sortedIndices)
+            {
+                Console.WriteLine("{0}\t{1:0.00}\t{2:G4}", featureNames[i], weights[i], rSquared[i]);
+            }
+        }
+
+        private static float[] GetLinearModelWeights(LinearRegressionPredictor linearModel)
+        {
+            var weights = new VBuffer<float>();
+            linearModel.GetFeatureWeights(ref weights);
+            return weights.GetValues().ToArray();
+        }
+    }
+}

From b81965a7f3547e8f90e64b51ea1d68e73e7f9427 Mon Sep 17 00:00:00 2001
From: Rogan Carr <rocarr@microsoft.com>
Date: Mon, 26 Nov 2018 17:00:02 -0800
Subject: [PATCH 2/8] Adding in the expected console output.

---
 .../Dynamic/PermutationFeatureImportance.cs   | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
index 9f8c59cc6b..396d654149 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -46,12 +46,14 @@ public static void PFI_Regression()
 
             // Step 2: Pipeline
             // Concatenate the features to create a Feature vector.
+            // Normalize the values between 0 and 1
             // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset,
             // the "Features" column produced by concatenation as the features column.
             var labelName = "MedianHomeValue";
             var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
                     "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
                     "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio")
+                    .Append(mlContext.Transforms.Normalize("Features"))
                     .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(
                         labelColumn: labelName, featureColumn: "Features"));
             var fitPipeline = pipeline.Fit(data);
@@ -65,6 +67,8 @@ public static void PFI_Regression()
             var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
                 linearPredictor, transformedData, label: labelName, features: "Features");
 
+            var booooo = transformedData.Preview();
+
             // Now let's look at which features are most important to the model overall
             // First, we have to prepare the data:
             // Get the feature names as an IEnumerable
@@ -78,7 +82,34 @@ public static void PFI_Regression()
                 .OrderByDescending(feature => Math.Abs(feature.RSquared))
                 .Select(feature => feature.index);
 
-            // Print out the permutation results, with the model weights, in order of their impact
+
+            // Print out the permutation results, with the model weights, in order of their impact:
+            // Expected console output:
+            //    Feature            Model Weight    Change in R - Squared
+            //    RoomsPerDwelling      50.80 -0.3695
+            //    EmploymentDistance   -17.79 -0.2238
+            //    TeacherRatio         -19.83 -0.1228
+            //    TaxRate              -8.60  -0.1042
+            //    NitricOxides         -15.95 -0.1025
+            //    HighwayDistance        5.37 -0.09345
+            //    CrimesPerCapita      -15.05 -0.05797
+            //    PercentPre40s         -4.64 -0.0385
+            //    PercentResidental      3.98 -0.02184
+            //    CharlesRiver           3.38 -0.01487
+            //    PercentNonRetail      -1.94 -0.007231
+            //
+            // Let's dig into these results a little bit. First, if you look at the weights of the model, they generally correlate
+            // with the results of PFI, but there are some significant misorderings. For example, "Tax Rate" is weighted lower than
+            // "Nitric Oxides" and "Crimes Per Capita", but the permutation analysis shows this feature to have a larger effect
+            // on the accuracy of the model even though it has a relatively small weight. To understand why the weights don't 
+            // reflect the same feature importance as PFI, we need to go back to the basics of linear models: one of the 
+            // assumptions of a linear model is that the features are uncorrelated. Now, the features in this dataset are clearly 
+            // correlated: the tax rate for a house and the student-to-teacher ratio at the nearest school, for example, are often 
+            // coupled through school levies. The tax rate, presence of pollution (e.g. nitric oxides), and the crime rate would also
+            // seem to be correlated with each other through social dynamics. We could draw out similar relationships for all the 
+            // variables in this dataset. The reason why the linear model weights don't reflect the same feature importance as PFI
+            // is that the solution to the linear model redistributes weights between correlated variables in unpredictable ways, so
+            // that the weights themselves are no longer a good measure of feature importance.
             Console.WriteLine("Feature\tModel Weight\tChange in R-Squared");
             var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array
             foreach (int i in sortedIndices)

From 89b6ffc2c56daa9daadbb53df9c0e064ea2ff8a5 Mon Sep 17 00:00:00 2001
From: Rogan Carr <rocarr@microsoft.com>
Date: Mon, 26 Nov 2018 22:23:11 -0800
Subject: [PATCH 3/8] Addressing PR comments.

---
 .../Dynamic/PermutationFeatureImportance.cs               | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
index 396d654149..e514d64346 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -51,8 +51,8 @@ public static void PFI_Regression()
             // the "Features" column produced by concatenation as the features column.
             var labelName = "MedianHomeValue";
             var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
-                    "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
-                    "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio")
+                        "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
+                        "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio")
                     .Append(mlContext.Transforms.Normalize("Features"))
                     .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(
                         labelColumn: labelName, featureColumn: "Features"));
@@ -67,8 +67,6 @@ public static void PFI_Regression()
             var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
                 linearPredictor, transformedData, label: labelName, features: "Features");
 
-            var booooo = transformedData.Preview();
-
             // Now let's look at which features are most important to the model overall
             // First, we have to prepare the data:
             // Get the feature names as an IEnumerable
@@ -114,7 +112,7 @@ public static void PFI_Regression()
             var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array
             foreach (int i in sortedIndices)
             {
-                Console.WriteLine("{0}\t{1:0.00}\t{2:G4}", featureNames[i], weights[i], rSquared[i]);
+                Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i]:G4}");
             }
         }
 

From 6c9c01a636311a6f48157ab55a178b0c02b5092b Mon Sep 17 00:00:00 2001
From: Rogan Carr <rocarr@microsoft.com>
Date: Tue, 27 Nov 2018 12:30:23 -0800
Subject: [PATCH 4/8] removing blank line (nit)

---
 .../Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
index e514d64346..680161bd14 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -80,7 +80,6 @@ public static void PFI_Regression()
                 .OrderByDescending(feature => Math.Abs(feature.RSquared))
                 .Select(feature => feature.index);
 
-
             // Print out the permutation results, with the model weights, in order of their impact:
             // Expected console output:
             //    Feature            Model Weight    Change in R - Squared

From 3fe43f76cb060033637d82b203725ff1f37e1017 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin <wschin@outlook.com>
Date: Tue, 27 Nov 2018 13:53:15 -0800
Subject: [PATCH 5/8] Change normalization comment

Co-Authored-By: rogancarr <rogan.carr@hotmail.com>
---
 .../Dynamic/PermutationFeatureImportance.cs                     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
index 680161bd14..304bd4435d 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -46,7 +46,7 @@ public static void PFI_Regression()
 
             // Step 2: Pipeline
             // Concatenate the features to create a Feature vector.
-            // Normalize the values between 0 and 1
+            // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
             // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset,
             // the "Features" column produced by concatenation as the features column.
             var labelName = "MedianHomeValue";

From 2ec6b6a2ffed4233f5239026eda8f76cb924d19b Mon Sep 17 00:00:00 2001
From: Rogan Carr <rocarr@microsoft.com>
Date: Tue, 27 Nov 2018 16:27:39 -0800
Subject: [PATCH 6/8] Fixing comments

---
 .../Dynamic/PermutationFeatureImportance.cs                  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
index 680161bd14..ad508a2bd7 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -20,6 +20,9 @@ public static void PFI_Regression()
 
             // Step 1: Read the data as an IDataView.
             // First, we define the reader: specify the data columns and where to find them in the text file.
+            // The data looks like this:
+            //     24.00  0.00632  18.00  2.310  0  0.5380  6.5750  65.20  4.0900  1  296.0  15.30
+            //     21.60  0.02731   0.00  7.070  0  0.4690  6.4210  78.90  4.9671  2  242.0  17.80
             var reader = mlContext.Data.TextReader(new TextLoader.Arguments()
                 {
                     Separator = "tab",
@@ -47,7 +50,7 @@ public static void PFI_Regression()
             // Step 2: Pipeline
             // Concatenate the features to create a Feature vector.
             // Normalize the values between 0 and 1
-            // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset,
+            // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset,
             // the "Features" column produced by concatenation as the features column.
             var labelName = "MedianHomeValue";
             var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",

From 03bb41502bebfd5b374c98711a798978762a90cd Mon Sep 17 00:00:00 2001
From: Rogan Carr <rocarr@microsoft.com>
Date: Tue, 27 Nov 2018 16:37:01 -0800
Subject: [PATCH 7/8] Cleaning up the language in comments, and variable names

---
 .../Dynamic/PermutationFeatureImportance.cs               | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
index defdd29460..283cce6a1a 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -51,7 +51,7 @@ public static void PFI_Regression()
             // Concatenate the features to create a Feature vector.
             // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
             // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset,
-            // the "Features" column produced by concatenation as the features column.
+            // and the "Features" column produced by concatenation as the features of the dataset.
             var labelName = "MedianHomeValue";
             var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
                         "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
@@ -59,14 +59,14 @@ public static void PFI_Regression()
                     .Append(mlContext.Transforms.Normalize("Features"))
                     .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(
                         labelColumn: labelName, featureColumn: "Features"));
-            var fitPipeline = pipeline.Fit(data);
+            var model = pipeline.Fit(data);
 
             // Extract the model from the pipeline
-            var linearPredictor = fitPipeline.LastTransformer;
+            var linearPredictor = model.LastTransformer;
             var weights = GetLinearModelWeights(linearPredictor.Model);
 
             // Compute the permutation metrics using the properly-featurized data.
-            var transformedData = fitPipeline.Transform(data);
+            var transformedData = model.Transform(data);
             var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
                 linearPredictor, transformedData, label: labelName, features: "Features");
 

From 648473a4642447a94cab2b268207908332744eaa Mon Sep 17 00:00:00 2001
From: Rogan Carr <rocarr@microsoft.com>
Date: Tue, 27 Nov 2018 17:01:31 -0800
Subject: [PATCH 8/8] Updating comments around data load

---
 .../Dynamic/PermutationFeatureImportance.cs                 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
index 283cce6a1a..0c95abacb8 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs
@@ -20,15 +20,15 @@ public static void PFI_Regression()
 
             // Step 1: Read the data as an IDataView.
             // First, we define the reader: specify the data columns and where to find them in the text file.
-            // The data looks like this:
-            //     24.00  0.00632  18.00  2.310  0  0.5380  6.5750  65.20  4.0900  1  296.0  15.30
-            //     21.60  0.02731   0.00  7.070  0  0.4690  6.4210  78.90  4.9671  2  242.0  17.80
+            // The data file is composed of rows of data, with each row having 12 numerical columns
+            // (the label followed by 11 features) separated by tabs.
             var reader = mlContext.Data.TextReader(new TextLoader.Arguments()
                 {
                     Separator = "tab",
                     HasHeader = true,
                     Column = new[]
                     {
+                        // Read the first column (indexed by 0) in the data file as an R4 (float)
                         new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
                         new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
                         new TextLoader.Column("PercentResidental", DataKind.R4, 2),