
Commit 3ff25e0

Add a sample for Permutation Feature Importance (#1728)
Adds an example showing how to use Permutation Feature Importance (PFI) and how to use its output to evaluate the features in a model.
1 parent eccab70 commit 3ff25e0

1 file changed: +128 lines, −0 lines

@@ -0,0 +1,128 @@
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Learners;
using System;
using System.Linq;

namespace Microsoft.ML.Samples.Dynamic
{
    public class PFI_RegressionExample
    {
        public static void PFI_Regression()
        {
            // Download the dataset from github.com/dotnet/machinelearning.
            // This will create a housing.txt file in the filesystem.
            // You can open this file to see the data.
            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations, and as the source of randomness.
            var mlContext = new MLContext();
            // Step 1: Read the data as an IDataView.
            // First, we define the reader: specify the data columns and where to find them in the text file.
            // The data file is composed of rows of data, with each row having 12 numerical columns
            // separated by tabs.
            var reader = mlContext.Data.TextReader(new TextLoader.Arguments()
            {
                Separator = "tab",
                HasHeader = true,
                Column = new[]
                {
                    // Read the first column (indexed by 0) in the data file as an R4 (float).
                    new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
                    new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
                    new TextLoader.Column("PercentResidental", DataKind.R4, 2),
                    new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
                    new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
                    new TextLoader.Column("NitricOxides", DataKind.R4, 5),
                    new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
                    new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
                    new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
                    new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
                    new TextLoader.Column("TaxRate", DataKind.R4, 10),
                    new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
                }
            });

            // Read the data.
            var data = reader.Read(dataFile);
            // Step 2: Pipeline.
            // Concatenate the features to create a Feature vector.
            // Normalize the dataset so that for each feature, its maximum value is 1 while its minimum value is 0.
            // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset
            // and the "Features" column produced by concatenation as the features of the dataset.
            var labelName = "MedianHomeValue";
            var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
                    "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
                    "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio")
                .Append(mlContext.Transforms.Normalize("Features"))
                .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(
                    labelColumn: labelName, featureColumn: "Features"));
            var model = pipeline.Fit(data);
            // Extract the model from the pipeline.
            var linearPredictor = model.LastTransformer;
            var weights = GetLinearModelWeights(linearPredictor.Model);

            // Compute the permutation metrics using the properly-featurized data.
            // (A standalone sketch of the permutation mechanism follows this listing.)
            var transformedData = model.Transform(data);
            var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
                linearPredictor, transformedData, label: labelName, features: "Features");
            // Now let's look at which features are most important to the model overall.
            // First, we have to prepare the data: get the feature names as an IEnumerable.
            var featureNames = data.Schema.GetColumns()
                .Select(tuple => tuple.column.Name) // Get the column names
                .Where(name => name != labelName) // Drop the label
                .ToArray();

            // Get the feature indices sorted by their impact on R-Squared.
            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared })
                .OrderByDescending(feature => Math.Abs(feature.RSquared))
                .Select(feature => feature.index);
            // Print out the permutation results, with the model weights, in order of their impact.
            // Expected console output:
            //    Feature              Model Weight    Change in R-Squared
            //    RoomsPerDwelling     50.80           -0.3695
            //    EmploymentDistance   -17.79          -0.2238
            //    TeacherRatio         -19.83          -0.1228
            //    TaxRate              -8.60           -0.1042
            //    NitricOxides         -15.95          -0.1025
            //    HighwayDistance      5.37            -0.09345
            //    CrimesPerCapita      -15.05          -0.05797
            //    PercentPre40s        -4.64           -0.0385
            //    PercentResidental    3.98            -0.02184
            //    CharlesRiver         3.38            -0.01487
            //    PercentNonRetail     -1.94           -0.007231
            //
            // Let's dig into these results a little bit. First, if you look at the weights of the model, they generally
            // correlate with the results of PFI, but there are some significant misorderings. For example, "TaxRate" is
            // weighted lower than "NitricOxides" and "CrimesPerCapita", yet the permutation analysis shows it to have a
            // larger effect on the accuracy of the model despite its relatively small weight. To understand why the
            // weights don't reflect the same feature importance as PFI, we need to go back to the basics of linear
            // models: one of the assumptions of a linear model is that the features are uncorrelated. The features in
            // this dataset, however, are clearly correlated: the tax rate for a house and the student-to-teacher ratio
            // at the nearest school, for example, are often coupled through school levies. The tax rate, the presence
            // of pollution (e.g. nitric oxides), and the crime rate would also seem to be correlated with each other
            // through social dynamics. We could draw out similar relationships for all the variables in this dataset.
            // The reason the linear model weights don't reflect the same feature importance as PFI is that the solution
            // to the linear model redistributes weights between correlated variables in unpredictable ways, so the
            // weights themselves are no longer a good measure of feature importance. (A standalone demonstration of
            // this weight redistribution follows the listing.)
            Console.WriteLine("Feature\tModel Weight\tChange in R-Squared");
            var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch R-Squared as an array
            foreach (int i in sortedIndices)
            {
                Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i]:G4}");
            }
        }

        private static float[] GetLinearModelWeights(LinearRegressionPredictor linearModel)
        {
            var weights = new VBuffer<float>();
            linearModel.GetFeatureWeights(ref weights);
            return weights.GetValues().ToArray();
        }
    }
}
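
To try the sample end to end, a minimal console entry point is enough. The Program class below is a hypothetical host, not part of this commit; it assumes the file above is compiled into a console project that references the same ML.NET packages.

using Microsoft.ML.Samples.Dynamic;

public static class Program
{
    public static void Main()
    {
        // Downloads the housing data, trains the model, and prints the PFI table shown above.
        PFI_RegressionExample.PFI_Regression();
    }
}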
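
The PermutationFeatureImportance call wraps the whole procedure, but the underlying mechanism is simple: hold the trained model fixed, shuffle one feature column at a time, and record how much a metric (here, R-Squared) degrades. The sketch below shows that mechanism on synthetic data in plain C#, with no ML.NET dependency; every name in it is illustrative.

using System;
using System.Linq;

public static class PfiSketch
{
    // R-Squared of a set of predictions against the true labels.
    private static double RSquared(double[] labels, double[] predictions)
    {
        double mean = labels.Average();
        double ssRes = labels.Zip(predictions, (y, p) => (y - p) * (y - p)).Sum();
        double ssTot = labels.Sum(y => (y - mean) * (y - mean));
        return 1 - ssRes / ssTot;
    }

    public static void Main()
    {
        var rng = new Random(1);
        int n = 1000;

        // Synthetic data: the label depends strongly on feature 0 and weakly on feature 1.
        var x0 = Enumerable.Range(0, n).Select(_ => rng.NextDouble()).ToArray();
        var x1 = Enumerable.Range(0, n).Select(_ => rng.NextDouble()).ToArray();
        var y = x0.Zip(x1, (a, b) => 5.0 * a + 0.5 * b).ToArray();

        // A fixed "model": here, simply the true linear function.
        Func<double, double, double> predict = (a, b) => 5.0 * a + 0.5 * b;
        double baseline = RSquared(y, x0.Zip(x1, (a, b) => predict(a, b)).ToArray());

        // Permute one feature at a time and measure the change in R-Squared.
        foreach (int feature in new[] { 0, 1 })
        {
            var shuffled = (feature == 0 ? x0 : x1).OrderBy(_ => rng.Next()).ToArray();
            var preds = feature == 0
                ? shuffled.Zip(x1, (a, b) => predict(a, b)).ToArray()
                : x0.Zip(shuffled, (a, b) => predict(a, b)).ToArray();
            Console.WriteLine($"Feature {feature}: change in R-Squared = {RSquared(y, preds) - baseline:G4}");
        }
    }
}

Shuffling feature 0 destroys most of the fit, while shuffling feature 1 barely moves R-Squared; that is the same signal the sample reads off permutationMetrics, just computed by hand.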
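
The comment block's point about correlated features can also be demonstrated directly. In the sketch below (again illustrative and self-contained, not part of the commit), two nearly identical feature columns share one signal. Ordinary least squares fits the data almost perfectly, but how the total weight is split between the twin columns depends on the noise, which is exactly why the sample treats PFI, rather than raw weights, as the measure of importance.

using System;
using System.Linq;

public static class CorrelatedWeights
{
    public static void Main()
    {
        var rng = new Random(7);
        int n = 500;

        // Two highly correlated features: x2 is x1 plus a tiny perturbation.
        var x1 = Enumerable.Range(0, n).Select(_ => rng.NextDouble()).ToArray();
        var x2 = x1.Select(v => v + 0.01 * (rng.NextDouble() - 0.5)).ToArray();

        // The label carries all of its signal on x1 (true coefficient 3.0), plus some noise.
        var y = x1.Select(v => 3.0 * v + 0.25 * (rng.NextDouble() - 0.5)).ToArray();

        // Ordinary least squares for two features, solved via the 2x2 normal equations.
        double a = x1.Sum(v => v * v);
        double b = x1.Zip(x2, (u, v) => u * v).Sum();
        double c = x2.Sum(v => v * v);
        double d = x1.Zip(y, (u, v) => u * v).Sum();
        double e = x2.Zip(y, (u, v) => u * v).Sum();
        double det = a * c - b * b;
        double w1 = (c * d - b * e) / det;
        double w2 = (a * e - b * d) / det;

        // The combined weight recovers the true coefficient, but the split between the two
        // correlated columns is arbitrary: rerun with a different seed and w1 and w2 wander
        // while w1 + w2 stays near 3.
        Console.WriteLine($"w1 = {w1:0.00}, w2 = {w2:0.00}, w1 + w2 = {w1 + w2:0.00}");
    }
}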
