From e1a5d3a95231e09ac7d0eba6da399beb64eb90f6 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Mon, 26 Nov 2018 16:11:33 -0800 Subject: [PATCH 1/8] Adding an example for using PFI --- .../Dynamic/PermutationFeatureImportance.cs | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs new file mode 100644 index 0000000000..9f8c59cc6b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -0,0 +1,97 @@ +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Learners; +using System; +using System.Linq; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class PFI_RegressionExample + { + public static void PFI_Regression() + { + // Download the dataset from github.com/dotnet/machinelearning. + // This will create a housing.txt file in the filesystem. + // You can open this file to see the data. + string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); + + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Step 1: Read the data as an IDataView. + // First, we define the reader: specify the data columns and where to find them in the text file. 
+ var reader = mlContext.Data.TextReader(new TextLoader.Arguments() + { + Separator = "tab", + HasHeader = true, + Column = new[] + { + new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), + new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1), + new TextLoader.Column("PercentResidental", DataKind.R4, 2), + new TextLoader.Column("PercentNonRetail", DataKind.R4, 3), + new TextLoader.Column("CharlesRiver", DataKind.R4, 4), + new TextLoader.Column("NitricOxides", DataKind.R4, 5), + new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6), + new TextLoader.Column("PercentPre40s", DataKind.R4, 7), + new TextLoader.Column("EmploymentDistance", DataKind.R4, 8), + new TextLoader.Column("HighwayDistance", DataKind.R4, 9), + new TextLoader.Column("TaxRate", DataKind.R4, 10), + new TextLoader.Column("TeacherRatio", DataKind.R4, 11), + } + }); + + // Read the data + var data = reader.Read(dataFile); + + // Step 2: Pipeline + // Concatenate the features to create a Feature vector. + // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset, + // the "Features" column produced by concatenation as the features column. + var labelName = "MedianHomeValue"; + var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental", + "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", + "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio") + .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( + labelColumn: labelName, featureColumn: "Features")); + var fitPipeline = pipeline.Fit(data); + + // Extract the model from the pipeline + var linearPredictor = fitPipeline.LastTransformer; + var weights = GetLinearModelWeights(linearPredictor.Model); + + // Compute the permutation metrics using the properly-featurized data. 
+ var transformedData = fitPipeline.Transform(data); + var permutationMetrics = mlContext.Regression.PermutationFeatureImportance( + linearPredictor, transformedData, label: labelName, features: "Features"); + + // Now let's look at which features are most important to the model overall + // First, we have to prepare the data: + // Get the feature names as an IEnumerable + var featureNames = data.Schema.GetColumns() + .Select(tuple => tuple.column.Name) // Get the column names + .Where(name => name != labelName) // Drop the Label + .ToArray(); + + // Get the feature indices sorted by their impact on R-Squared + var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared }) + .OrderByDescending(feature => Math.Abs(feature.RSquared)) + .Select(feature => feature.index); + + // Print out the permutation results, with the model weights, in order of their impact + Console.WriteLine("Feature\tModel Weight\tChange in R-Squared"); + var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array + foreach (int i in sortedIndices) + { + Console.WriteLine("{0}\t{1:0.00}\t{2:G4}", featureNames[i], weights[i], rSquared[i]); + } + } + + private static float[] GetLinearModelWeights(LinearRegressionPredictor linearModel) + { + var weights = new VBuffer(); + linearModel.GetFeatureWeights(ref weights); + return weights.GetValues().ToArray(); + } + } +} From b81965a7f3547e8f90e64b51ea1d68e73e7f9427 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Mon, 26 Nov 2018 17:00:02 -0800 Subject: [PATCH 2/8] Adding in the expected console output. 
--- .../Dynamic/PermutationFeatureImportance.cs | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index 9f8c59cc6b..396d654149 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -46,12 +46,14 @@ public static void PFI_Regression() // Step 2: Pipeline // Concatenate the features to create a Feature vector. + // Normalize the values between 0 and 1 // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset, // the "Features" column produced by concatenation as the features column. var labelName = "MedianHomeValue"; var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio") + .Append(mlContext.Transforms.Normalize("Features")) .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( labelColumn: labelName, featureColumn: "Features")); var fitPipeline = pipeline.Fit(data); @@ -65,6 +67,8 @@ public static void PFI_Regression() var permutationMetrics = mlContext.Regression.PermutationFeatureImportance( linearPredictor, transformedData, label: labelName, features: "Features"); + var booooo = transformedData.Preview(); + // Now let's look at which features are most important to the model overall // First, we have to prepare the data: // Get the feature names as an IEnumerable @@ -78,7 +82,34 @@ public static void PFI_Regression() .OrderByDescending(feature => Math.Abs(feature.RSquared)) .Select(feature => feature.index); - // Print out the permutation results, with the model weights, in order of their impact + + // Print out the 
permutation results, with the model weights, in order of their impact: + // Expected console output: + // Feature Model Weight Change in R - Squared + // RoomsPerDwelling 50.80 -0.3695 + // EmploymentDistance -17.79 -0.2238 + // TeacherRatio -19.83 -0.1228 + // TaxRate -8.60 -0.1042 + // NitricOxides -15.95 -0.1025 + // HighwayDistance 5.37 -0.09345 + // CrimesPerCapita -15.05 -0.05797 + // PercentPre40s -4.64 -0.0385 + // PercentResidental 3.98 -0.02184 + // CharlesRiver 3.38 -0.01487 + // PercentNonRetail -1.94 -0.007231 + // + // Let's dig into these results a little bit. First, if you look at the weights of the model, they generally correlate + // with the results of PFI, but there are some significant misorderings. For example, "Tax Rate" is weighted lower than + // "Nitric Oxides" and "Crimes Per Capita", but the permutation analysis shows this feature to have a larger effect + // on the accuracy of the model even though it has a relatively small weight. To understand why the weights don't + // reflect the same feature importance as PFI, we need to go back to the basics of linear models: one of the + // assumptions of a linear model is that the features are uncorrelated. Now, the features in this dataset are clearly + // correlated: the tax rate for a house and the student-to-teacher ratio at the nearest school, for example, are often + // coupled through school levies. The tax rate, presence of pollution (e.g. nitric oxides), and the crime rate would also + // seem to be correlated with each other through social dynamics. We could draw out similar relationships for all the + // variables in this dataset. The reason why the linear model weights don't reflect the same feature importance as PFI + // is that the solution to the linear model redistributes weights between correlated variables in unpredictable ways, so + // that the weights themselves are no longer a good measure of feature importance. 
Console.WriteLine("Feature\tModel Weight\tChange in R-Squared"); var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array foreach (int i in sortedIndices) From 89b6ffc2c56daa9daadbb53df9c0e064ea2ff8a5 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Mon, 26 Nov 2018 22:23:11 -0800 Subject: [PATCH 3/8] Addressing PR comments. --- .../Dynamic/PermutationFeatureImportance.cs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index 396d654149..e514d64346 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -51,8 +51,8 @@ public static void PFI_Regression() // the "Features" column produced by concatenation as the features column. var labelName = "MedianHomeValue"; var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental", - "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", - "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio") + "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", + "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio") .Append(mlContext.Transforms.Normalize("Features")) .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( labelColumn: labelName, featureColumn: "Features")); @@ -67,8 +67,6 @@ public static void PFI_Regression() var permutationMetrics = mlContext.Regression.PermutationFeatureImportance( linearPredictor, transformedData, label: labelName, features: "Features"); - var booooo = transformedData.Preview(); - // Now let's look at which features are most important to the model overall // First, we have to prepare the data: // Get the feature names as an
IEnumerable @@ -114,7 +112,7 @@ public static void PFI_Regression() var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array foreach (int i in sortedIndices) { - Console.WriteLine("{0}\t{1:0.00}\t{2:G4}", featureNames[i], weights[i], rSquared[i]); + Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i]:G4}"); } } From 6c9c01a636311a6f48157ab55a178b0c02b5092b Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 27 Nov 2018 12:30:23 -0800 Subject: [PATCH 4/8] removing blank line (nit) --- .../Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index e514d64346..680161bd14 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -80,7 +80,6 @@ public static void PFI_Regression() .OrderByDescending(feature => Math.Abs(feature.RSquared)) .Select(feature => feature.index); - // Print out the permutation results, with the model weights, in order of their impact: // Expected console output: // Feature Model Weight Change in R - Squared From 3fe43f76cb060033637d82b203725ff1f37e1017 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 27 Nov 2018 13:53:15 -0800 Subject: [PATCH 5/8] Change normalization comment Co-Authored-By: rogancarr --- .../Dynamic/PermutationFeatureImportance.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index 680161bd14..304bd4435d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -46,7 
+46,7 @@ public static void PFI_Regression() // Step 2: Pipeline // Concatenate the features to create a Feature vector. - // Normalize the values between 0 and 1 + // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0. // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset, // the "Features" column produced by concatenation as the features column. var labelName = "MedianHomeValue"; From 2ec6b6a2ffed4233f5239026eda8f76cb924d19b Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 27 Nov 2018 16:27:39 -0800 Subject: [PATCH 6/8] Fixing comments --- .../Dynamic/PermutationFeatureImportance.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index 680161bd14..ad508a2bd7 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -20,6 +20,9 @@ public static void PFI_Regression() // Step 1: Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. + // The data looks like this: + // 24.00 0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 + // 21.60 0.02731 0.00 7.070 0 0.4690 6.4210 78.90 4.9671 2 242.0 17.80 var reader = mlContext.Data.TextReader(new TextLoader.Arguments() { Separator = "tab", @@ -47,7 +50,7 @@ public static void PFI_Regression() // Step 2: Pipeline // Concatenate the features to create a Feature vector. 
// Normalize the values between 0 and 1 - // Then append a gam regressor, setting the "MedianHomeValue" column as the label of the dataset, + // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset, // the "Features" column produced by concatenation as the features column. var labelName = "MedianHomeValue"; var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental", From 03bb41502bebfd5b374c98711a798978762a90cd Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 27 Nov 2018 16:37:01 -0800 Subject: [PATCH 7/8] Cleaning up the language in comments, and variable names --- .../Dynamic/PermutationFeatureImportance.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index defdd29460..283cce6a1a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -51,7 +51,7 @@ public static void PFI_Regression() // Concatenate the features to create a Feature vector. // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0. // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset, - // the "Features" column produced by concatenation as the features column. + // the "Features" column produced by concatenation as the features of the dataset. 
var labelName = "MedianHomeValue"; var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", @@ -59,14 +59,14 @@ public static void PFI_Regression() .Append(mlContext.Transforms.Normalize("Features")) .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( labelColumn: labelName, featureColumn: "Features")); - var fitPipeline = pipeline.Fit(data); + var model = pipeline.Fit(data); // Extract the model from the pipeline - var linearPredictor = fitPipeline.LastTransformer; + var linearPredictor = model.LastTransformer; var weights = GetLinearModelWeights(linearPredictor.Model); // Compute the permutation metrics using the properly-featurized data. - var transformedData = fitPipeline.Transform(data); + var transformedData = model.Transform(data); var permutationMetrics = mlContext.Regression.PermutationFeatureImportance( linearPredictor, transformedData, label: labelName, features: "Features"); From 648473a4642447a94cab2b268207908332744eaa Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 27 Nov 2018 17:01:31 -0800 Subject: [PATCH 8/8] Updating comments around data load --- .../Dynamic/PermutationFeatureImportance.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index 283cce6a1a..0c95abacb8 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -20,15 +20,15 @@ public static void PFI_Regression() // Step 1: Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. 
- // The data looks like this: - // 24.00 0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 - // 21.60 0.02731 0.00 7.070 0 0.4690 6.4210 78.90 4.9671 2 242.0 17.80 + // The data file is composed of rows of data, with each row having 12 numerical columns + // (the label followed by 11 features) separated by tabs. var reader = mlContext.Data.TextReader(new TextLoader.Arguments() { Separator = "tab", HasHeader = true, Column = new[] { + // Read the first column (indexed by 0) in the data file as an R4 (float) new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1), new TextLoader.Column("PercentResidental", DataKind.R4, 2),