-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Conversion of DropSlots, MutualInformationFeatureSelection, and CountFeatureSelection into estimator and transformers #1683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 32 commits
e129445
4968169
ce6e733
4b0d716
733a8d2
5cd5211
75df87e
d9975bc
e2ffe99
50076cf
827700d
49e6f83
b05110b
7af2057
a5a91ba
a98c8a2
8ff7e23
15e2eb2
336732e
c7dccec
e5cc371
209492c
3e9217e
7156f6a
098bcf9
e40f42f
2539fa0
0d4605a
c3a5149
df9ef2b
7226216
f0bc2e6
f093fae
8e9c307
4675885
05ea809
fbb7475
0f2e815
3bc1fdd
318a1ae
8d19b36
695cdc9
eea19f4
2fbfbba
fb6120d
7e5ec2d
2669fa5
8000682
3fb399b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Runtime.Data; | ||
using System; | ||
using System.Collections.Generic; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
public class FeatureSelectionTransformExample | ||
{ | ||
public static void FeatureSelectionTransform() | ||
{ | ||
// Downloading a classification dataset from github.com/dotnet/machinelearning. | ||
// It will be stored in the same path as the executable | ||
string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); | ||
|
||
// Data Preview | ||
// 1. Label 0=benign, 1=malignant | ||
// 2. Clump Thickness 1 - 10 | ||
// 3. Uniformity of Cell Size 1 - 10 | ||
// 4. Uniformity of Cell Shape 1 - 10 | ||
// 5. Marginal Adhesion 1 - 10 | ||
// 6. Single Epithelial Cell Size 1 - 10 | ||
// 7. Bare Nuclei 1 - 10 | ||
// 8. Bland Chromatin 1 - 10 | ||
// 9. Normal Nucleoli 1 - 10 | ||
// 10. Mitoses 1 - 10 | ||
|
||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var ml = new MLContext(); | ||
|
||
// First, we define the reader: specify the data columns and where to find them in the text file. Notice that we combine entries from | ||
// all the feature columns into entries of a vector of a single column named "Features". | ||
var reader = ml.Data.TextReader(new TextLoader.Arguments() | ||
{ | ||
Separator = "tab", | ||
HasHeader = true, | ||
Column = new[] | ||
{ | ||
new TextLoader.Column("Label", DataKind.BL, 0), | ||
new TextLoader.Column("Features", DataKind.Num, new [] { new TextLoader.Range(1, 9) }) | ||
} | ||
}); | ||
|
||
// Then, we use the reader to read the data as an IDataView. | ||
var data = reader.Read(dataFilePath); | ||
|
||
// Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data | ||
// directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. | ||
|
||
// In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default | ||
// values than the specified count. This transformation can be used to remove slots with too many missing values. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
slots? #Closed |
||
var countSelectEst = ml.Transforms.FeatureSelection.CountFeatureSelectingEstimator( | ||
inputColumn: "Features", outputColumn: "FeaturesCountSelect", count: 695); | ||
|
||
// We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature | ||
// vector based on highest mutual information between that slot and a specified label. Notice that it is possible to | ||
// specify the parameter `numBins`, which controls the number of bins used in the approximation of the mutual information | ||
// between features and label. | ||
var mutualInfoEst = ml.Transforms.FeatureSelection.MutualInformationFeatureSelectingEstimator( | ||
inputColumn: "FeaturesCountSelect", outputColumn: "FeaturesMISelect", labelColumn: "Label", slotsInOutput: 5); | ||
|
||
// Now, we can put the previous two transformations together in a pipeline. | ||
var pipeline = countSelectEst.Append(mutualInfoEst); | ||
|
||
// The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. | ||
var transformedData = pipeline.Fit(data).Transform(data); | ||
|
||
// Small helper to print the data inside a column, in the console. Only prints the first 10 rows. | ||
Action<string, IEnumerable<VBuffer<float>>> printHelper = (columnName, column) => | ||
{ | ||
Console.WriteLine($"{columnName} column obtained post-transformation."); | ||
int count = 0; | ||
foreach (var row in column) | ||
{ | ||
foreach (var value in row.GetValues()) | ||
Console.Write($"{value}\t"); | ||
Console.WriteLine(""); | ||
count++; | ||
if (count >= 10) | ||
break; | ||
} | ||
|
||
Console.WriteLine("==================================================="); | ||
}; | ||
|
||
// Print the data that results from the transformations. | ||
var countSelectColumn = transformedData.GetColumn<VBuffer<float>>(ml, "FeaturesCountSelect"); | ||
var MISelectColumn = transformedData.GetColumn<VBuffer<float>>(ml, "FeaturesMISelect"); | ||
printHelper("FeaturesCountSelect", countSelectColumn); | ||
printHelper("FeaturesMISelect", MISelectColumn); | ||
|
||
// Below is the output of this code. We see that some slots have been dropped by the first transformation. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
typo #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
msitype #Resolved |
||
// Among the remaining slots, the second transformation only preserves the top 5 slots based on mutual information | ||
// with the label column. | ||
|
||
// FeaturesCountSelect column obtained post-transformation. | ||
// 5 4 4 5 7 3 2 1 | ||
// 3 1 1 1 2 3 1 1 | ||
// 6 8 8 1 3 3 7 1 | ||
// 4 1 1 3 2 3 1 1 | ||
// 8 10 10 8 7 9 7 1 | ||
// 1 1 1 1 2 3 1 1 | ||
// 2 1 2 1 2 3 1 1 | ||
// 2 1 1 1 2 1 1 5 | ||
// 4 2 1 1 2 2 1 1 | ||
// 1 1 1 1 1 3 1 1 | ||
// =================================================== | ||
// FeaturesMISelect column obtained post-transformation. | ||
// 4 4 7 3 2 | ||
// 1 1 2 3 1 | ||
// 8 8 3 3 7 | ||
// 1 1 2 3 1 | ||
// 10 10 7 9 7 | ||
// 1 1 2 3 1 | ||
// 1 2 2 3 1 | ||
// 1 1 2 1 1 | ||
// 2 1 2 2 1 | ||
// 1 1 1 3 1 | ||
// =================================================== | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
using Microsoft.ML.StaticPipe; | ||
using Microsoft.ML.Transforms; | ||
using Microsoft.ML.Transforms.Categorical; | ||
using Microsoft.ML.Transforms.FeatureSelection; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not needed #Resolved |
||
using System; | ||
|
||
namespace Microsoft.ML.Samples.Static | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.StaticPipe; | ||
using System; | ||
using System.Collections.Generic; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
public class FeatureSelectionTransformStaticExample | ||
{ | ||
public static void FeatureSelectionTransform() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
do we want to refer to this example in the FeatureSelectionCatalog.cs file ? |
||
{ | ||
// Downloading a classification dataset from github.com/dotnet/machinelearning. | ||
// It will be stored in the same path as the executable | ||
string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); | ||
|
||
// Data Preview | ||
// 1. Label 0=benign, 1=malignant | ||
// 2. Clump Thickness 1 - 10 | ||
// 3. Uniformity of Cell Size 1 - 10 | ||
// 4. Uniformity of Cell Shape 1 - 10 | ||
// 5. Marginal Adhesion 1 - 10 | ||
// 6. Single Epithelial Cell Size 1 - 10 | ||
// 7. Bare Nuclei 1 - 10 | ||
// 8. Bland Chromatin 1 - 10 | ||
// 9. Normal Nucleoli 1 - 10 | ||
// 10. Mitoses 1 - 10 | ||
|
||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var ml = new MLContext(); | ||
|
||
// First, we define the reader: specify the data columns and where to find them in the text file. Notice that we combine entries from | ||
// all the feature columns into entries of a vector of a single column named "Features". | ||
var reader = TextLoader.CreateReader(ml, c => ( | ||
Label: c.LoadBool(0), | ||
Features: c.LoadFloat(1, 9) | ||
), | ||
separator: '\t', hasHeader: true); | ||
|
||
// Then, we use the reader to read the data as an IDataView. | ||
var data = reader.Read(dataFilePath); | ||
|
||
// Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
msitype #Resolved |
||
// directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. | ||
|
||
// In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default | ||
// values than the specified count. This transformation can be used to remove slots with too many missing values. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
slots? #Resolved |
||
// We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature | ||
// vector based on highest mutual information between that slot and a specified label. Notice that it is possible to | ||
// specify the parameter `numBins`, which controls the number of bins used in the approximation of the mutual information | ||
// between features and label. | ||
var pipeline = reader.MakeNewEstimator() | ||
.Append(r =>( | ||
FeaturesCountSelect: r.Features.SelectFeaturesBasedOnCount(count: 695), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Is this the correct way to show the static api? Or should I use MLContext to find the estimator? #Resolved |
||
Label: r.Label | ||
)) | ||
.Append(r => ( | ||
FeaturesCountSelect: r.FeaturesCountSelect, | ||
FeaturesMISelect: r.FeaturesCountSelect.SelectFeaturesBasedOnMutualInformation(r.Label, slotsInOutput: 5), | ||
Label: r.Label | ||
)); | ||
|
||
|
||
// The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. | ||
var transformedData = pipeline.Fit(data).Transform(data); | ||
|
||
// Small helper to print the data inside a column, in the console. Only prints the first 10 rows. | ||
Action<string, IEnumerable<VBuffer<float>>> printHelper = (columnName, column) => | ||
{ | ||
Console.WriteLine($"{columnName} column obtained post-transformation."); | ||
int count = 0; | ||
foreach (var row in column) | ||
{ | ||
foreach (var value in row.GetValues()) | ||
Console.Write($"{value}\t"); | ||
Console.WriteLine(""); | ||
count++; | ||
if (count >= 10) | ||
break; | ||
} | ||
|
||
Console.WriteLine("==================================================="); | ||
}; | ||
|
||
// Print the data that results from the transformations. | ||
var countSelectColumn = transformedData.AsDynamic.GetColumn<VBuffer<float>>(ml, "FeaturesCountSelect"); | ||
var MISelectColumn = transformedData.AsDynamic.GetColumn<VBuffer<float>>(ml, "FeaturesMISelect"); | ||
printHelper("FeaturesCountSelect", countSelectColumn); | ||
printHelper("FeaturesMISelect", MISelectColumn); | ||
|
||
// Below is the output of this code. We see that some slots have been dropped by the first transformation. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
typo #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
// Among the remaining slots, the second transformation only preserves the top 5 slots based on mutual information | ||
// with the label column. | ||
|
||
// FeaturesCountSelect column obtained post-transformation. | ||
// 5 4 4 5 7 3 2 1 | ||
// 3 1 1 1 2 3 1 1 | ||
// 6 8 8 1 3 3 7 1 | ||
// 4 1 1 3 2 3 1 1 | ||
// 8 10 10 8 7 9 7 1 | ||
// 1 1 1 1 2 3 1 1 | ||
// 2 1 2 1 2 3 1 1 | ||
// 2 1 1 1 2 1 1 5 | ||
// 4 2 1 1 2 2 1 1 | ||
// 1 1 1 1 1 3 1 1 | ||
// =================================================== | ||
// FeaturesMISelect column obtained post-transformation. | ||
// 4 4 7 3 2 | ||
// 1 1 2 3 1 | ||
// 8 8 3 3 7 | ||
// 1 1 2 3 1 | ||
// 10 10 7 9 7 | ||
// 1 1 2 3 1 | ||
// 1 2 2 3 1 | ||
// 1 1 2 1 1 | ||
// 2 1 2 2 1 | ||
// 1 1 1 3 1 | ||
// =================================================== | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
msitype #Closed