Skip to content

Commit 3bf74ed

Browse files
authored
Conversion of DropSlots, MutualInformationFeatureSelection, and CountFeatureSelection into estimator and transformers (#1683)
1 parent 213ef9e commit 3bf74ed

32 files changed

+1779
-1086
lines changed

docs/code/MlNetCookBook.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1055,7 +1055,7 @@ var dynamicPipeline =
10551055
.Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalBag", CategoricalTransform.OutputKind.Bag))
10561056
// One-hot encode the workclass column, then drop all the categories that have fewer than 10 instances in the train set.
10571057
.Append(mlContext.Transforms.Categorical.OneHotEncoding("Workclass", "WorkclassOneHot"))
1058-
.Append(new CountFeatureSelector(mlContext, "WorkclassOneHot", "WorkclassOneHotTrimmed", count: 10));
1058+
.Append(mlContext.Transforms.FeatureSelection.CountFeatureSelectingEstimator("WorkclassOneHot", "WorkclassOneHotTrimmed", count: 10));
10591059

10601060
// Let's train our pipeline, and then apply it to the same data.
10611061
var transformedData = dynamicPipeline.Fit(data).Transform(data);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
using Microsoft.ML.Data;
2+
using Microsoft.ML.Runtime.Data;
3+
using System;
4+
using System.Collections.Generic;
5+
6+
namespace Microsoft.ML.Samples.Dynamic
7+
{
8+
public class FeatureSelectionTransformExample
9+
{
10+
public static void FeatureSelectionTransform()
11+
{
12+
            // Downloading a classification dataset from github.com/dotnet/machinelearning.
13+
// It will be stored in the same path as the executable
14+
string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset();
15+
16+
// Data Preview
17+
// 1. Label 0=benign, 1=malignant
18+
// 2. Clump Thickness 1 - 10
19+
// 3. Uniformity of Cell Size 1 - 10
20+
// 4. Uniformity of Cell Shape 1 - 10
21+
// 5. Marginal Adhesion 1 - 10
22+
// 6. Single Epithelial Cell Size 1 - 10
23+
// 7. Bare Nuclei 1 - 10
24+
// 8. Bland Chromatin 1 - 10
25+
// 9. Normal Nucleoli 1 - 10
26+
// 10. Mitoses 1 - 10
27+
28+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
29+
// as well as the source of randomness.
30+
var ml = new MLContext();
31+
32+
// First, we define the reader: specify the data columns and where to find them in the text file. Notice that we combine entries from
33+
// all the feature columns into entries of a vector of a single column named "Features".
34+
var reader = ml.Data.TextReader(new TextLoader.Arguments()
35+
{
36+
Separator = "tab",
37+
HasHeader = true,
38+
Column = new[]
39+
{
40+
new TextLoader.Column("Label", DataKind.BL, 0),
41+
new TextLoader.Column("Features", DataKind.Num, new [] { new TextLoader.Range(1, 9) })
42+
}
43+
});
44+
45+
// Then, we use the reader to read the data as an IDataView.
46+
var data = reader.Read(dataFilePath);
47+
48+
// Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data
49+
// directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data.
50+
51+
// In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default
52+
// values than the specified count. This transformation can be used to remove slots with too many missing values.
53+
var countSelectEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(
54+
inputColumn: "Features", outputColumn: "FeaturesCountSelect", count: 695);
55+
56+
// We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature
57+
// vector based on highest mutual information between that slot and a specified label. Notice that it is possible to
58+
            // specify the parameter `numBins`, which controls the number of bins used in the approximation of the mutual information
59+
// between features and label.
60+
var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(
61+
inputColumn: "FeaturesCountSelect", outputColumn: "FeaturesMISelect", labelColumn: "Label", slotsInOutput: 5);
62+
63+
// Now, we can put the previous two transformations together in a pipeline.
64+
var pipeline = countSelectEst.Append(mutualInfoEst);
65+
66+
// The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data.
67+
var transformedData = pipeline.Fit(data).Transform(data);
68+
69+
// Small helper to print the data inside a column, in the console. Only prints the first 10 rows.
70+
Action<string, IEnumerable<VBuffer<float>>> printHelper = (columnName, column) =>
71+
{
72+
Console.WriteLine($"{columnName} column obtained post-transformation.");
73+
int count = 0;
74+
foreach (var row in column)
75+
{
76+
foreach (var value in row.GetValues())
77+
Console.Write($"{value}\t");
78+
Console.WriteLine("");
79+
count++;
80+
if (count >= 10)
81+
break;
82+
}
83+
84+
Console.WriteLine("===================================================");
85+
};
86+
87+
// Print the data that results from the transformations.
88+
var countSelectColumn = transformedData.GetColumn<VBuffer<float>>(ml, "FeaturesCountSelect");
89+
var MISelectColumn = transformedData.GetColumn<VBuffer<float>>(ml, "FeaturesMISelect");
90+
printHelper("FeaturesCountSelect", countSelectColumn);
91+
printHelper("FeaturesMISelect", MISelectColumn);
92+
93+
            // Below is the output of this code. We see that some slots have been dropped by the first transformation.
94+
            // Among the remaining slots, the second transformation only preserves the top 5 slots based on mutual information
95+
// with the label column.
96+
97+
// FeaturesCountSelect column obtained post-transformation.
98+
// 5 4 4 5 7 3 2 1
99+
// 3 1 1 1 2 3 1 1
100+
// 6 8 8 1 3 3 7 1
101+
// 4 1 1 3 2 3 1 1
102+
// 8 10 10 8 7 9 7 1
103+
// 1 1 1 1 2 3 1 1
104+
// 2 1 2 1 2 3 1 1
105+
// 2 1 1 1 2 1 1 5
106+
// 4 2 1 1 2 2 1 1
107+
// 1 1 1 1 1 3 1 1
108+
// ===================================================
109+
// FeaturesMISelect column obtained post-transformation.
110+
// 4 4 7 3 2
111+
// 1 1 2 3 1
112+
// 8 8 3 3 7
113+
// 1 1 2 3 1
114+
// 10 10 7 9 7
115+
// 1 1 2 3 1
116+
// 1 2 2 3 1
117+
// 1 1 2 1 1
118+
// 2 1 2 2 1
119+
// 1 1 1 3 1
120+
// ===================================================
121+
}
122+
}
123+
}

docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using Microsoft.ML.StaticPipe;
33
using Microsoft.ML.Transforms;
44
using Microsoft.ML.Transforms.Categorical;
5+
using Microsoft.ML.Transforms.FeatureSelection;
56
using System;
67

78
namespace Microsoft.ML.Samples.Static
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
using Microsoft.ML.Data;
2+
using Microsoft.ML.Runtime.Data;
3+
using Microsoft.ML.StaticPipe;
4+
using System;
5+
using System.Collections.Generic;
6+
7+
namespace Microsoft.ML.Samples.Dynamic
8+
{
9+
public class FeatureSelectionTransformStaticExample
10+
{
11+
public static void FeatureSelectionTransform()
12+
{
13+
            // Downloading a classification dataset from github.com/dotnet/machinelearning.
14+
// It will be stored in the same path as the executable
15+
string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset();
16+
17+
// Data Preview
18+
// 1. Label 0=benign, 1=malignant
19+
// 2. Clump Thickness 1 - 10
20+
// 3. Uniformity of Cell Size 1 - 10
21+
// 4. Uniformity of Cell Shape 1 - 10
22+
// 5. Marginal Adhesion 1 - 10
23+
// 6. Single Epithelial Cell Size 1 - 10
24+
// 7. Bare Nuclei 1 - 10
25+
// 8. Bland Chromatin 1 - 10
26+
// 9. Normal Nucleoli 1 - 10
27+
// 10. Mitoses 1 - 10
28+
29+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
30+
// as well as the source of randomness.
31+
var ml = new MLContext();
32+
33+
// First, we define the reader: specify the data columns and where to find them in the text file. Notice that we combine entries from
34+
// all the feature columns into entries of a vector of a single column named "Features".
35+
var reader = TextLoader.CreateReader(ml, c => (
36+
Label: c.LoadBool(0),
37+
Features: c.LoadFloat(1, 9)
38+
),
39+
separator: '\t', hasHeader: true);
40+
41+
// Then, we use the reader to read the data as an IDataView.
42+
var data = reader.Read(dataFilePath);
43+
44+
// Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data
45+
// directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data.
46+
47+
// In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default
48+
// values than the specified count. This transformation can be used to remove slots with too many missing values.
49+
// We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature
50+
// vector based on highest mutual information between that slot and a specified label. Notice that it is possible to
51+
            // specify the parameter `numBins`, which controls the number of bins used in the approximation of the mutual information
52+
// between features and label.
53+
var pipeline = reader.MakeNewEstimator()
54+
.Append(r =>(
55+
FeaturesCountSelect: r.Features.SelectFeaturesBasedOnCount(count: 695),
56+
Label: r.Label
57+
))
58+
.Append(r => (
59+
FeaturesCountSelect: r.FeaturesCountSelect,
60+
FeaturesMISelect: r.FeaturesCountSelect.SelectFeaturesBasedOnMutualInformation(r.Label, slotsInOutput: 5),
61+
Label: r.Label
62+
));
63+
64+
65+
// The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data.
66+
var transformedData = pipeline.Fit(data).Transform(data);
67+
68+
// Small helper to print the data inside a column, in the console. Only prints the first 10 rows.
69+
Action<string, IEnumerable<VBuffer<float>>> printHelper = (columnName, column) =>
70+
{
71+
Console.WriteLine($"{columnName} column obtained post-transformation.");
72+
int count = 0;
73+
foreach (var row in column)
74+
{
75+
foreach (var value in row.GetValues())
76+
Console.Write($"{value}\t");
77+
Console.WriteLine("");
78+
count++;
79+
if (count >= 10)
80+
break;
81+
}
82+
83+
Console.WriteLine("===================================================");
84+
};
85+
86+
// Print the data that results from the transformations.
87+
var countSelectColumn = transformedData.AsDynamic.GetColumn<VBuffer<float>>(ml, "FeaturesCountSelect");
88+
var MISelectColumn = transformedData.AsDynamic.GetColumn<VBuffer<float>>(ml, "FeaturesMISelect");
89+
printHelper("FeaturesCountSelect", countSelectColumn);
90+
printHelper("FeaturesMISelect", MISelectColumn);
91+
92+
            // Below is the output of this code. We see that some slots have been dropped by the first transformation.
93+
            // Among the remaining slots, the second transformation only preserves the top 5 slots based on mutual information
94+
// with the label column.
95+
96+
// FeaturesCountSelect column obtained post-transformation.
97+
// 5 4 4 5 7 3 2 1
98+
// 3 1 1 1 2 3 1 1
99+
// 6 8 8 1 3 3 7 1
100+
// 4 1 1 3 2 3 1 1
101+
// 8 10 10 8 7 9 7 1
102+
// 1 1 1 1 2 3 1 1
103+
// 2 1 2 1 2 3 1 1
104+
// 2 1 1 1 2 1 1 5
105+
// 4 2 1 1 2 2 1 1
106+
// 1 1 1 1 1 3 1 1
107+
// ===================================================
108+
// FeaturesMISelect column obtained post-transformation.
109+
// 4 4 7 3 2
110+
// 1 1 2 3 1
111+
// 8 8 3 3 7
112+
// 1 1 2 3 1
113+
// 10 10 7 9 7
114+
// 1 1 2 3 1
115+
// 1 2 2 3 1
116+
// 1 1 2 1 1
117+
// 2 1 2 2 1
118+
// 1 1 1 3 1
119+
// ===================================================
120+
}
121+
}
122+
}

docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using Microsoft.ML.StaticPipe;
33
using Microsoft.ML.Transforms;
44
using Microsoft.ML.Transforms.Categorical;
5+
using Microsoft.ML.Transforms.FeatureSelection;
56
using System;
67

78
namespace Microsoft.ML.Samples.Static

docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using Microsoft.ML.StaticPipe;
33
using Microsoft.ML.Transforms;
44
using Microsoft.ML.Transforms.Categorical;
5+
using Microsoft.ML.Transforms.FeatureSelection;
56
using System;
67

78
namespace Microsoft.ML.Samples.Static

src/Microsoft.ML.Data/Evaluators/ClusteringEvaluator.cs

+3-39
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
using Microsoft.ML.Runtime.Internal.Utilities;
1111
using Microsoft.ML.Runtime.Model;
1212
using Microsoft.ML.Runtime.Numeric;
13-
using Microsoft.ML.Transforms;
13+
using Microsoft.ML.Transforms.FeatureSelection;
1414
using System;
1515
using System.Collections.Generic;
1616
using System.Linq;
@@ -877,50 +877,14 @@ protected override IDataView GetPerInstanceMetricsCore(IDataView perInst, RoleMa
877877
{
878878
var type = perInst.Schema.GetColumnType(index);
879879
if (_numTopClusters < type.VectorSize)
880-
{
881-
var args = new DropSlotsTransform.Arguments
882-
{
883-
Column = new DropSlotsTransform.Column[]
884-
{
885-
new DropSlotsTransform.Column()
886-
{
887-
Name = ClusteringPerInstanceEvaluator.SortedClusters,
888-
Slots = new[] {
889-
new DropSlotsTransform.Range()
890-
{
891-
Min = _numTopClusters
892-
}
893-
}
894-
}
895-
}
896-
};
897-
perInst = new DropSlotsTransform(Host, args, perInst);
898-
}
880+
perInst = new SlotsDroppingTransformer(Host, ClusteringPerInstanceEvaluator.SortedClusters, min: _numTopClusters).Transform(perInst);
899881
}
900882

901883
if (perInst.Schema.TryGetColumnIndex(ClusteringPerInstanceEvaluator.SortedClusterScores, out index))
902884
{
903885
var type = perInst.Schema.GetColumnType(index);
904886
if (_numTopClusters < type.VectorSize)
905-
{
906-
var args = new DropSlotsTransform.Arguments
907-
{
908-
Column = new DropSlotsTransform.Column[]
909-
{
910-
new DropSlotsTransform.Column()
911-
{
912-
Name = ClusteringPerInstanceEvaluator.SortedClusterScores,
913-
Slots = new[] {
914-
new DropSlotsTransform.Range()
915-
{
916-
Min = _numTopClusters
917-
}
918-
}
919-
}
920-
}
921-
};
922-
perInst = new DropSlotsTransform(Host, args, perInst);
923-
}
887+
perInst = new SlotsDroppingTransformer(Host, ClusteringPerInstanceEvaluator.SortedClusterScores, min: _numTopClusters).Transform(perInst);
924888
}
925889
return perInst;
926890
}

0 commit comments

Comments
 (0)