|
| 1 | +using Microsoft.ML.Data; |
| 2 | +using Microsoft.ML.Runtime.Data; |
| 3 | +using System; |
| 4 | +using System.Collections.Generic; |
| 5 | + |
| 6 | +namespace Microsoft.ML.Samples.Dynamic |
| 7 | +{ |
| 8 | + public class FeatureSelectionTransformExample |
| 9 | + { |
| 10 | + public static void FeatureSelectionTransform() |
| 11 | + { |
| 12 | + // Download a classification dataset (breast cancer) from github.com/dotnet/machinelearning. |
| 13 | + // It will be stored in the same path as the executable. |
| 14 | + string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); |
| 15 | + |
| 16 | + // Data Preview (column number, name, and value range): |
| 17 | + // 1. Label 0=benign, 1=malignant |
| 18 | + // 2. Clump Thickness 1 - 10 |
| 19 | + // 3. Uniformity of Cell Size 1 - 10 |
| 20 | + // 4. Uniformity of Cell Shape 1 - 10 |
| 21 | + // 5. Marginal Adhesion 1 - 10 |
| 22 | + // 6. Single Epithelial Cell Size 1 - 10 |
| 23 | + // 7. Bare Nuclei 1 - 10 |
| 24 | + // 8. Bland Chromatin 1 - 10 |
| 25 | + // 9. Normal Nucleoli 1 - 10 |
| 26 | + // 10. Mitoses 1 - 10 |
| 27 | + |
| 28 | + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, |
| 29 | + // as well as the source of randomness. |
| 30 | + var ml = new MLContext(); |
| 31 | + |
| 32 | + // First, we define the reader: specify the data columns and where to find them in the text file. Column 0 is read as |
| 33 | + // "Label", and the feature columns (1 through 9) are combined into a single vector column named "Features". |
| 34 | + var reader = ml.Data.TextReader(new TextLoader.Arguments() |
| 35 | + { |
| 36 | + Separator = "tab", |
| 37 | + HasHeader = true, |
| 38 | + Column = new[] |
| 39 | + { |
| 40 | + new TextLoader.Column("Label", DataKind.BL, 0), |
| 41 | + new TextLoader.Column("Features", DataKind.Num, new [] { new TextLoader.Range(1, 9) }) |
| 42 | + } |
| 43 | + }); |
| 44 | + |
| 45 | + // Then, we use the reader to read the data as an IDataView. |
| 46 | + var data = reader.Read(dataFilePath); |
| 47 | + |
| 48 | + // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data |
| 49 | + // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. |
| 50 | + |
| 51 | + // In this example we define a CountFeatureSelectingEstimator, which keeps only the slots of a feature vector whose number of |
| 52 | + // non-default values meets the specified count threshold (695 here). This can be used to remove slots with too many missing values. |
| 53 | + var countSelectEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnCount( |
| 54 | + inputColumn: "Features", outputColumn: "FeaturesCountSelect", count: 695); |
| 55 | + |
| 56 | + // We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots (here k = 5) in a feature |
| 57 | + // vector based on highest mutual information between that slot and a specified label. Notice that it is also possible to |
| 58 | + // specify the parameter `numBins`, which controls the number of bins used in the approximation of the mutual information |
| 59 | + // between features and label; it is not passed in this example. |
| 60 | + var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( |
| 61 | + inputColumn: "FeaturesCountSelect", outputColumn: "FeaturesMISelect", labelColumn: "Label", slotsInOutput: 5); |
| 62 | + |
| 63 | + // Now, we can put the previous two transformations together in a pipeline: the count-based selection runs first and its output column feeds the mutual-information selection. |
| 64 | + var pipeline = countSelectEst.Append(mutualInfoEst); |
| 65 | + |
| 66 | + // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. |
| 67 | + var transformedData = pipeline.Fit(data).Transform(data); |
| 68 | + |
| 69 | + // Small helper to print the values of a vector column to the console, one row per line. Only prints the first 10 rows. |
| 70 | + Action<string, IEnumerable<VBuffer<float>>> printHelper = (columnName, column) => |
| 71 | + { |
| 72 | + Console.WriteLine($"{columnName} column obtained post-transformation."); |
| 73 | + int count = 0; |
| 74 | + foreach (var row in column) |
| 75 | + { |
| 76 | + foreach (var value in row.GetValues()) |
| 77 | + Console.Write($"{value}\t"); |
| 78 | + Console.WriteLine(""); |
| 79 | + count++; |
| 80 | + if (count >= 10) |
| 81 | + break; |
| 82 | + } |
| 83 | + |
| 84 | + Console.WriteLine("==================================================="); |
| 85 | + }; |
| 86 | + |
| 87 | + // Print the data that results from the transformations. |
| 88 | + var countSelectColumn = transformedData.GetColumn<VBuffer<float>>(ml, "FeaturesCountSelect"); |
| 89 | + var MISelectColumn = transformedData.GetColumn<VBuffer<float>>(ml, "FeaturesMISelect"); |
| 90 | + printHelper("FeaturesCountSelect", countSelectColumn); |
| 91 | + printHelper("FeaturesMISelect", MISelectColumn); |
| 92 | + |
| 93 | + // Below is the output of this code. We see that some slots have been dropped by the first transformation. |
| 94 | + // Among the remaining slots, the second transformation only preserves the top 5 slots based on mutual information |
| 95 | + // with the label column. |
| 96 | + |
| 97 | + // FeaturesCountSelect column obtained post-transformation. |
| 98 | + // 5 4 4 5 7 3 2 1 |
| 99 | + // 3 1 1 1 2 3 1 1 |
| 100 | + // 6 8 8 1 3 3 7 1 |
| 101 | + // 4 1 1 3 2 3 1 1 |
| 102 | + // 8 10 10 8 7 9 7 1 |
| 103 | + // 1 1 1 1 2 3 1 1 |
| 104 | + // 2 1 2 1 2 3 1 1 |
| 105 | + // 2 1 1 1 2 1 1 5 |
| 106 | + // 4 2 1 1 2 2 1 1 |
| 107 | + // 1 1 1 1 1 3 1 1 |
| 108 | + // =================================================== |
| 109 | + // FeaturesMISelect column obtained post-transformation. |
| 110 | + // 4 4 7 3 2 |
| 111 | + // 1 1 2 3 1 |
| 112 | + // 8 8 3 3 7 |
| 113 | + // 1 1 2 3 1 |
| 114 | + // 10 10 7 9 7 |
| 115 | + // 1 1 2 3 1 |
| 116 | + // 1 2 2 3 1 |
| 117 | + // 1 1 2 1 1 |
| 118 | + // 2 1 2 2 1 |
| 119 | + // 1 1 1 3 1 |
| 120 | + // =================================================== |
| 121 | + } |
| 122 | + } |
| 123 | +} |
0 commit comments