-
Notifications
You must be signed in to change notification settings - Fork 1.9k
[AutoML] Add AutoML example code #3458
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
6b4f4da
e12f741
9565ce3
b8dc2c8
59da285
b9a0c1c
e85a782
c5a8d9e
b322082
4b5bfd5
86d1732
23241f9
0187416
e3eb1ed
3efe412
4fe5295
16939d6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
using System; | ||
using System.IO; | ||
using System.Linq; | ||
using Microsoft.ML.Auto; | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// AutoML sample for binary classification. Loads the Wikipedia detox
    /// train/test files, runs an AutoML experiment, evaluates the best model on
    /// the test set, saves it to disk, and scores a single hand-written example.
    /// </summary>
    public static class BinaryClassificationExperiment
    {
        // Sample configuration. None of these values is ever reassigned, so they
        // are constants / read-only rather than mutable statics.
        private const string BaseDatasetsLocation = "Data";
        private static readonly string TrainDataPath = Path.Combine(BaseDatasetsLocation, "wikipedia-detox-250-line-data.tsv");
        private static readonly string TestDataPath = Path.Combine(BaseDatasetsLocation, "wikipedia-detox-250-line-test.tsv");
        private static readonly string ModelPath = Path.Combine(BaseDatasetsLocation, "SentimentModel.zip");

        // Value handed to CreateBinaryClassificationExperiment; reported to the
        // user as a number of seconds.
        private const uint ExperimentTime = 60;

        /// <summary>
        /// Runs the sample from start to finish, writing progress and results to
        /// the console, and waits for a key press before returning.
        /// </summary>
        public static void Run()
        {
            MLContext mlContext = new MLContext();

            // STEP 1: Load data
            IDataView trainDataView = mlContext.Data.LoadFromTextFile<SentimentIssue>(TrainDataPath, hasHeader: true);
            IDataView testDataView = mlContext.Data.LoadFromTextFile<SentimentIssue>(TestDataPath, hasHeader: true);

            // STEP 2: Run AutoML experiment
            Console.WriteLine($"Running AutoML binary classification experiment for {ExperimentTime} seconds...");
            ExperimentResult<BinaryClassificationMetrics> experimentResult = mlContext.Auto()
                .CreateBinaryClassificationExperiment(ExperimentTime)
                .Execute(trainDataView);

            // STEP 3: Print metric from the best model
            RunDetail<BinaryClassificationMetrics> bestRun = experimentResult.BestRun;
            Console.WriteLine($"Total models produced: {experimentResult.RunDetails.Count()}");
            Console.WriteLine($"Best model's trainer: {bestRun.TrainerName}");
            Console.WriteLine($"Accuracy of best model from validation data: {bestRun.ValidationMetrics.Accuracy}");

            // STEP 4: Evaluate test data
            IDataView testDataViewWithBestScore = bestRun.Model.Transform(testDataView);
            BinaryClassificationMetrics testMetrics = mlContext.BinaryClassification.EvaluateNonCalibrated(testDataViewWithBestScore);
            Console.WriteLine($"Accuracy of best model on test data: {testMetrics.Accuracy}");

            // STEP 5: Save the best model for later deployment and inferencing
            using (FileStream fs = File.Create(ModelPath))
            {
                mlContext.Model.Save(bestRun.Model, trainDataView.Schema, fs);
            }

            // STEP 6: Create prediction engine from the best trained model
            var predictionEngine = mlContext.Model.CreatePredictionEngine<SentimentIssue, SentimentPrediction>(bestRun.Model);

            // STEP 7: Initialize a new sentiment issue, and get the predicted sentiment
            var testSentimentIssue = new SentimentIssue
            {
                Text = "I hope this helps."
            };
            var prediction = predictionEngine.Predict(testSentimentIssue);
            Console.WriteLine($"Predicted sentiment for test issue: {prediction.Prediction}");

            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// One input row of the optdigits data set as read by
    /// <c>LoadFromTextFile&lt;PixelData&gt;</c>.
    /// </summary>
    public class PixelData
    {
        /// <summary>
        /// Source columns 0 through 63, declared as a 64-element vector.
        /// </summary>
        [LoadColumn(0, 63), VectorType(64)]
        public float[] PixelValues;

        /// <summary>
        /// Source column 64. The multiclass sample passes this member's name
        /// ("Number") as the label column.
        /// </summary>
        [LoadColumn(64)]
        public float Number;
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// Output type used with the prediction engine for <see cref="PixelData"/> inputs.
    /// </summary>
    public class PixelPrediction
    {
        /// <summary>
        /// Bound to the "PredictedLabel" output column instead of a column
        /// named after this member.
        /// </summary>
        [ColumnName("PredictedLabel")]
        public float Prediction;
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// One input row of the Wikipedia detox data set as read by
    /// <c>LoadFromTextFile&lt;SentimentIssue&gt;</c>.
    /// </summary>
    public class SentimentIssue
    {
        /// <summary>Source column 0.</summary>
        [LoadColumn(0)]
        public bool Label { get; set; }

        /// <summary>Source column 1: the text to classify.</summary>
        [LoadColumn(1)]
        public string Text { get; set; }
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// Output type used with the prediction engine for
    /// <see cref="SentimentIssue"/> inputs.
    /// </summary>
    public class SentimentPrediction
    {
        /// <summary>
        /// Bound to the "PredictedLabel" column. The ColumnName attribute
        /// overrides the default column name, which would otherwise be the
        /// name of this property.
        /// </summary>
        [ColumnName("PredictedLabel")]
        public bool Prediction { get; set; }

        /// <summary>
        /// Bound by property name to the "Probability" column, so no
        /// ColumnName attribute is needed.
        /// </summary>
        // NOTE(review): the binary sample evaluates with EvaluateNonCalibrated,
        // which suggests the best model may not always emit a Probability
        // column - confirm this member still binds in that case.
        public float Probability { get; set; }

        /// <summary>Bound by property name to the "Score" column.</summary>
        public float Score { get; set; }
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// One input row of the taxi fare data set as read by
    /// <c>LoadFromTextFile&lt;TaxiTrip&gt;</c>. Each member maps to the source
    /// column index given in its LoadColumn attribute.
    /// </summary>
    public class TaxiTrip
    {
        /// <summary>Source column 0.</summary>
        [LoadColumn(0)] public string VendorId;

        /// <summary>Source column 1.</summary>
        [LoadColumn(1)] public float RateCode;

        /// <summary>Source column 2.</summary>
        [LoadColumn(2)] public float PassengerCount;

        /// <summary>Source column 3.</summary>
        [LoadColumn(3)] public float TripTimeInSeconds;

        /// <summary>Source column 4.</summary>
        [LoadColumn(4)] public float TripDistance;

        /// <summary>Source column 5.</summary>
        [LoadColumn(5)] public string PaymentType;

        /// <summary>
        /// Source column 6. The regression sample passes this member's name
        /// ("FareAmount") as the label column.
        /// </summary>
        [LoadColumn(6)] public float FareAmount;
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// Output type used with the prediction engine for <see cref="TaxiTrip"/> inputs.
    /// </summary>
    public class TaxiTripFarePrediction
    {
        /// <summary>
        /// Bound to the "Score" output column instead of a column named after
        /// this member.
        /// </summary>
        [ColumnName("Score")]
        public float FareAmount;
    }
}
Original file line number | Diff line number | Diff line change | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,46 @@ | ||||||||||||||||
<Project Sdk="Microsoft.NET.Sdk"> | ||||||||||||||||
|
||||||||||||||||
<PropertyGroup> | ||||||||||||||||
<OutputType>Exe</OutputType> | ||||||||||||||||
<TargetFramework>netcoreapp2.2</TargetFramework> | ||||||||||||||||
</PropertyGroup> | ||||||||||||||||
|
||||||||||||||||
<ItemGroup> | ||||||||||||||||
<ProjectReference Include="..\..\..\src\Microsoft.ML.Auto\Microsoft.ML.Auto.csproj" /> | ||||||||||||||||
</ItemGroup> | ||||||||||||||||
|
||||||||||||||||
<ItemGroup> | ||||||||||||||||
<Folder Include="Data\" /> | ||||||||||||||||
</ItemGroup> | ||||||||||||||||
|
||||||||||||||||
<ItemGroup> | ||||||||||||||||
<None Include="..\..\..\test\data\optdigits-test.csv" Link="Data\optdigits-test.csv"> | ||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
<None Include="..\..\..\test\data\optdigits-train.csv" Link="Data\optdigits-train.csv"> | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we reference the dataset through the Dataset Utils? Given it's part of the examples, I expect it's accessible through the NuGet packages. We could add optdigits to SamplesDatasetUtils.cs: machinelearning/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs Lines 18 to 24 in fde1ab7
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, the opt-digits dataset is part of the ML.NET samples. However, DatasetUtils doesn't download the dataset from NuGet, but instead pulls it from the ML.NET samples Git repository. You're right -- since this dataset is already in one Git repository (the ML.NET samples Git repo), there's no need to add it to another Git repository (the ML.NET one), too. Of course, questions about inconsistency come to mind (like, why have 2 out of 3 datasets for these samples (including two 25 MB files) checked in directly, and download the other dataset (< 1 MB) from another Git repo owned by our team), but I'm okay with that. Long-term, it'd be great to nail down a solid story around how we store datasets, of course. (Are datasets housed in their own repo? In Git LFS? Etc.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Correct. I was wondering if the DatasetUtils class is public in the NuGet package. The dataset files are downloaded via HTTPS. Longer term, I'm a fan of Git LFS, but we haven't set it up for the repo. There's an OK argument to keep all datasets in the samples repo too. Aside from unit test datasets, which tend to be trivially small, I like having the main repo be rather slim by keeping datasets elsewhere. The DatasetUtils method also lets us reference larger datasets for our examples. |
||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
<None Include="..\..\..\test\data\taxi-fare-test.csv" Link="Data\taxi-fare-test.csv"> | ||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
<None Include="..\..\..\test\data\taxi-fare-train.csv" Link="Data\taxi-fare-train.csv"> | ||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
<None Include="..\..\..\test\data\wikipedia-detox-250-line-data.tsv" Link="Data\wikipedia-detox-250-line-data.tsv"> | ||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
<None Include="..\..\..\test\data\wikipedia-detox-250-line-test.tsv" Link="Data\wikipedia-detox-250-line-test.tsv"> | ||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
</ItemGroup> | ||||||||||||||||
|
||||||||||||||||
<ItemGroup> | ||||||||||||||||
<None Update="Data\optdigits-test.csv"> | ||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
<None Update="Data\optdigits-train.csv"> | ||||||||||||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||||||||||||||
</None> | ||||||||||||||||
</ItemGroup> | ||||||||||||||||
|
||||||||||||||||
</Project> |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,62 @@ | ||||
using System; | ||||
using System.IO; | ||||
using System.Linq; | ||||
using Microsoft.ML.Auto; | ||||
using Microsoft.ML.Data; | ||||
|
||||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// AutoML sample for multiclass classification. Loads the optdigits
    /// train/test files, runs an AutoML experiment, evaluates the best model on
    /// the test set, saves it to disk, and scores a single hand-written example.
    /// </summary>
    public static class MulticlassClassificationExperiment
    {
        // Sample configuration. None of these values is ever reassigned, so they
        // are constants / read-only rather than mutable statics.
        private const string BaseDatasetsLocation = "Data";
        private static readonly string TrainDataPath = Path.Combine(BaseDatasetsLocation, "optdigits-train.csv");
        private static readonly string TestDataPath = Path.Combine(BaseDatasetsLocation, "optdigits-test.csv");
        private static readonly string ModelPath = Path.Combine(BaseDatasetsLocation, "OptDigits.zip");
        private const string LabelColumnName = "Number";

        // Value handed to CreateMulticlassClassificationExperiment; reported to
        // the user as a number of seconds.
        private const uint ExperimentTime = 60;

        /// <summary>
        /// Runs the sample from start to finish, writing progress and results to
        /// the console, and waits for a key press before returning.
        /// </summary>
        public static void Run()
        {
            MLContext mlContext = new MLContext();

            // STEP 1: Load data
            IDataView trainDataView = mlContext.Data.LoadFromTextFile<PixelData>(TrainDataPath, hasHeader: true, separatorChar: ',');
            IDataView testDataView = mlContext.Data.LoadFromTextFile<PixelData>(TestDataPath, hasHeader: true, separatorChar: ',');

            // STEP 2: Run AutoML experiment
            Console.WriteLine($"Running AutoML multiclass classification experiment for {ExperimentTime} seconds...");
            ExperimentResult<MulticlassClassificationMetrics> experimentResult = mlContext.Auto()
                .CreateMulticlassClassificationExperiment(ExperimentTime)
                .Execute(trainDataView, LabelColumnName);

            // STEP 3: Print metric from the best model.
            // MicroAccuracy is reported because it is the metric the experiment
            // sweeps to optimize; macro and micro accuracy can be at odds, so
            // printing MacroAccuracy here would describe a metric the best run
            // was not selected on.
            RunDetail<MulticlassClassificationMetrics> best = experimentResult.BestRun;
            Console.WriteLine($"Total models produced: {experimentResult.RunDetails.Count()}");
            Console.WriteLine($"Best model's trainer: {best.TrainerName}");
            Console.WriteLine($"MicroAccuracy of best model from validation data: {best.ValidationMetrics.MicroAccuracy}");

            // STEP 4: Evaluate test data (same metric as above so the two numbers are comparable)
            IDataView testDataViewWithBestScore = best.Model.Transform(testDataView);
            MulticlassClassificationMetrics testMetrics = mlContext.MulticlassClassification.Evaluate(testDataViewWithBestScore, labelColumnName: LabelColumnName);
            Console.WriteLine($"MicroAccuracy of best model on test data: {testMetrics.MicroAccuracy}");

            // STEP 5: Save the best model for later deployment and inferencing
            using (FileStream fs = File.Create(ModelPath))
            {
                mlContext.Model.Save(best.Model, trainDataView.Schema, fs);
            }

            // STEP 6: Create prediction engine from the best trained model
            var predictionEngine = mlContext.Model.CreatePredictionEngine<PixelData, PixelPrediction>(best.Model);

            // STEP 7: Initialize new pixel data, and get the predicted number
            var testPixelData = new PixelData
            {
                PixelValues = new float[] { 0, 0, 1, 8, 15, 10, 0, 0, 0, 3, 13, 15, 14, 14, 0, 0, 0, 5, 10, 0, 10, 12, 0, 0, 0, 0, 3, 5, 15, 10, 2, 0, 0, 0, 16, 16, 16, 16, 12, 0, 0, 1, 8, 12, 14, 8, 3, 0, 0, 0, 0, 10, 13, 0, 0, 0, 0, 0, 0, 11, 9, 0, 0, 0 }
            };
            var prediction = predictionEngine.Predict(testPixelData);
            Console.WriteLine($"Predicted number for test pixels: {prediction.Prediction}");

            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
using System; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// Console entry point that runs each AutoML sample in turn.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Runs the regression, binary classification and multiclass samples in
        /// that order, clearing the console after each one. Any exception is
        /// written to the console instead of propagating, and the process then
        /// waits for a line of input before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            // Order matters only for what the user sees on screen.
            Action[] experiments =
            {
                RegressionExperiment.Run,
                BinaryClassificationExperiment.Run,
                MulticlassClassificationExperiment.Run
            };

            try
            {
                foreach (Action runExperiment in experiments)
                {
                    runExperiment();
                    Console.Clear();
                }

                Console.WriteLine("Done");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Exception {ex}");
            }

            Console.ReadLine();
        }
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
using System; | ||
using System.IO; | ||
using System.Linq; | ||
using Microsoft.ML.Auto; | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.AutoML.Samples
{
    /// <summary>
    /// AutoML sample for regression. Loads the taxi fare train/test files, runs
    /// an AutoML experiment, evaluates the best model on the test set, saves it
    /// to disk, and predicts the fare for a single hand-written trip.
    /// </summary>
    public static class RegressionExperiment
    {
        // Sample configuration. None of these values is ever reassigned, so they
        // are constants / read-only rather than mutable statics.
        private const string BaseDatasetsLocation = "Data";
        private static readonly string TrainDataPath = Path.Combine(BaseDatasetsLocation, "taxi-fare-train.csv");
        private static readonly string TestDataPath = Path.Combine(BaseDatasetsLocation, "taxi-fare-test.csv");
        private static readonly string ModelPath = Path.Combine(BaseDatasetsLocation, "TaxiFareModel.zip");
        private const string LabelColumnName = "FareAmount";

        // Value handed to CreateRegressionExperiment; reported to the user as a
        // number of seconds.
        private const uint ExperimentTime = 60;

        /// <summary>
        /// Runs the sample from start to finish, writing progress and results to
        /// the console, and waits for a key press before returning.
        /// </summary>
        public static void Run()
        {
            MLContext mlContext = new MLContext();

            // STEP 1: Load data
            IDataView trainDataView = mlContext.Data.LoadFromTextFile<TaxiTrip>(TrainDataPath, hasHeader: true, separatorChar: ',');
            IDataView testDataView = mlContext.Data.LoadFromTextFile<TaxiTrip>(TestDataPath, hasHeader: true, separatorChar: ',');

            // STEP 2: Run AutoML experiment
            Console.WriteLine($"Running AutoML regression experiment for {ExperimentTime} seconds...");
            ExperimentResult<RegressionMetrics> experimentResult = mlContext.Auto()
                .CreateRegressionExperiment(ExperimentTime)
                .Execute(trainDataView, LabelColumnName);

            // STEP 3: Print metric from best model
            RunDetail<RegressionMetrics> best = experimentResult.BestRun;
            Console.WriteLine($"Total models produced: {experimentResult.RunDetails.Count()}");
            Console.WriteLine($"Best model's trainer: {best.TrainerName}");
            Console.WriteLine($"RSquared of best model from validation data: {best.ValidationMetrics.RSquared}");

            // STEP 4: Evaluate test data
            IDataView testDataViewWithBestScore = best.Model.Transform(testDataView);
            RegressionMetrics testMetrics = mlContext.Regression.Evaluate(testDataViewWithBestScore, labelColumnName: LabelColumnName);
            Console.WriteLine($"RSquared of best model on test data: {testMetrics.RSquared}");

            // STEP 5: Save the best model for later deployment and inferencing
            using (FileStream fs = File.Create(ModelPath))
            {
                mlContext.Model.Save(best.Model, trainDataView.Schema, fs);
            }

            // STEP 6: Create prediction engine from the best trained model
            var predictionEngine = mlContext.Model.CreatePredictionEngine<TaxiTrip, TaxiTripFarePrediction>(best.Model);

            // STEP 7: Initialize a new test taxi trip, and get the predicted fare.
            // FareAmount is left unset on the input: it is the label being predicted.
            var testTaxiTrip = new TaxiTrip
            {
                VendorId = "VTS",
                RateCode = 1,
                PassengerCount = 1,
                TripTimeInSeconds = 1140,
                TripDistance = 3.75f,
                PaymentType = "CRD"
            };
            var prediction = predictionEngine.Predict(testTaxiTrip);
            Console.WriteLine($"Predicted fare for test taxi trip: {prediction.FareAmount}");

            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure how to best do it, but printing the featurization would be nice.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great point. +1 -- I'm also not sure how to do it. Let's think this through together sometime, and then circle back and add it.