|
1 |
| -using Microsoft.ML.Runtime.Data; |
2 |
| -using Microsoft.ML.Runtime.Learners; |
3 |
| -using Microsoft.ML.Trainers.HalLearners; |
4 |
| -using System; |
| 1 | +using System; |
5 | 2 | using System.Linq;
|
6 | 3 |
|
7 |
| -namespace Microsoft.ML.Samples.Dynamic |
| 4 | +namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance |
8 | 5 | {
|
9 |
| - public class PFI_RegressionExample |
| 6 | + public class PfiRegressionExample |
10 | 7 | {
|
11 |
| - public static void PFI_Regression() |
| 8 | + public static void RunExample() |
12 | 9 | {
|
13 |
| - // Download the dataset from github.com/dotnet/machinelearning. |
14 |
| - // This will create a housing.txt file in the filesystem. |
15 |
| - // You can open this file to see the data. |
16 |
| - string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); |
17 |
| - |
18 | 10 | // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
|
19 | 11 | // as a catalog of available operations and as the source of randomness.
|
20 | 12 | var mlContext = new MLContext();
|
21 | 13 |
|
22 |
| - // Step 1: Read the data as an IDataView. |
23 |
| - // First, we define the reader: specify the data columns and where to find them in the text file. |
24 |
| - // The data file is composed of rows of data, with each row having 11 numerical columns |
25 |
| - // separated by whitespace. |
26 |
| - var reader = mlContext.Data.CreateTextReader( |
27 |
| - columns: new[] |
28 |
| - { |
29 |
| - // Read the first column (indexed by 0) in the data file as an R4 (float) |
30 |
| - new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), |
31 |
| - new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1), |
32 |
| - new TextLoader.Column("PercentResidental", DataKind.R4, 2), |
33 |
| - new TextLoader.Column("PercentNonRetail", DataKind.R4, 3), |
34 |
| - new TextLoader.Column("CharlesRiver", DataKind.R4, 4), |
35 |
| - new TextLoader.Column("NitricOxides", DataKind.R4, 5), |
36 |
| - new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6), |
37 |
| - new TextLoader.Column("PercentPre40s", DataKind.R4, 7), |
38 |
| - new TextLoader.Column("EmploymentDistance", DataKind.R4, 8), |
39 |
| - new TextLoader.Column("HighwayDistance", DataKind.R4, 9), |
40 |
| - new TextLoader.Column("TaxRate", DataKind.R4, 10), |
41 |
| - new TextLoader.Column("TeacherRatio", DataKind.R4, 11) |
42 |
| - }, |
43 |
| - hasHeader: true |
44 |
| - ); |
45 |
| - |
46 |
| - // Read the data |
47 |
| - var data = reader.Read(dataFile); |
| 14 | + // Step 1: Read the data |
| 15 | + var data = PfiHelper.GetHousingRegressionIDataView(mlContext, out string labelName, out string[] featureNames); |
48 | 16 |
|
49 | 17 | // Step 2: Pipeline
|
50 | 18 | // Concatenate the features to create a Feature vector.
|
51 | 19 | // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
|
52 |
| - // Then append a linear regression trainer, setting the "MedianHomeValue" column as the label of the dataset, |
53 |
| - // the "Features" column produced by concatenation as the features of the dataset. |
54 |
| - var labelName = "MedianHomeValue"; |
55 |
| - var pipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental", |
56 |
| - "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s", |
57 |
| - "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio") |
| 20 | + // Then append a linear regression trainer. |
| 21 | + var pipeline = mlContext.Transforms.Concatenate("Features", featureNames) |
58 | 22 | .Append(mlContext.Transforms.Normalize("Features"))
|
59 | 23 | .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares(
|
60 | 24 | labelColumn: labelName, featureColumn: "Features"));
|
61 |
| - |
62 | 25 | var model = pipeline.Fit(data);
|
| 26 | + |
63 | 27 | // Extract the model from the pipeline
|
64 | 28 | var linearPredictor = model.LastTransformer;
|
65 |
| - var weights = GetLinearModelWeights(linearPredictor.Model); |
| 29 | + var weights = PfiHelper.GetLinearModelWeights(linearPredictor.Model); |
66 | 30 |
|
67 |
| - // Compute the permutation metrics using the properly-featurized data. |
| 31 | + // Compute the permutation metrics using the properly normalized data. |
68 | 32 | var transformedData = model.Transform(data);
|
69 | 33 | var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
|
70 | 34 | linearPredictor, transformedData, label: labelName, features: "Features", permutationCount: 3);
|
71 | 35 |
|
72 | 36 | // Now let's look at which features are most important to the model overall
|
73 |
| - // First, we have to prepare the data: |
74 |
| - // Get the feature names as an IEnumerable |
75 |
| - var featureNames = data.Schema |
76 |
| - .Select(column => column.Name) // Get the column names |
77 |
| - .Where(name => name != labelName) // Drop the Label |
78 |
| - .ToArray(); |
79 |
| - |
80 | 37 | // Get the feature indices sorted by their impact on R-Squared
|
81 | 38 | var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared })
|
82 | 39 | .OrderByDescending(feature => Math.Abs(feature.RSquared.Mean))
|
@@ -116,10 +73,5 @@ public static void PFI_Regression()
|
116 | 73 | Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i].Mean:G4}\t{1.96 * rSquared[i].StandardError:G4}");
|
117 | 74 | }
|
118 | 75 | }
|
119 |
| - |
120 |
| - private static float[] GetLinearModelWeights(OlsLinearRegressionModelParameters linearModel) |
121 |
| - { |
122 |
| - return linearModel.Weights.ToArray(); |
123 |
| - } |
124 | 76 | }
|
125 | 77 | }
|
0 commit comments