Skip to content

Lockdown of Microsoft.ML.LightGBM public surface. #2476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Feb 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
59d1a08
Lockdown of Microsoft.ML.LightGBM public surface.
zeahmed Feb 8, 2019
97a2e59
Merge remote-tracking branch 'upstream/master' into LightGBM_refact
zeahmed Feb 11, 2019
a6c3357
Addressed reviewers' comments.
zeahmed Feb 12, 2019
01f4188
Addressed reviewers' comments.
zeahmed Feb 12, 2019
85d53d1
Addressed reviewers' comments and added more samples.
zeahmed Feb 12, 2019
3f91b54
Undo changes in LightGbmStaticExtension.cs
zeahmed Feb 12, 2019
9245470
Addressed reviewers' comments.
zeahmed Feb 13, 2019
95de1a1
Merge remote-tracking branch 'upstream/master' into LightGBM_refact
zeahmed Feb 13, 2019
49509c1
Merged with the base and resolved merge conflicts.
zeahmed Feb 13, 2019
3e27a88
Addressed reviewers' comments.
zeahmed Feb 13, 2019
6c8ac35
Merge remote-tracking branch 'upstream/master' into LightGBM_refact
zeahmed Feb 14, 2019
571575f
Addressed reviewers' comments.
zeahmed Feb 14, 2019
112636b
Merge remote-tracking branch 'upstream/master' into LightGBM_refact
zeahmed Feb 14, 2019
fc60edd
Created two parameters for Booster. One for EntryPoint and one for cm…
zeahmed Feb 14, 2019
69e51bf
Merge branch 'LightGBM_refact' of https://github.com/zeahmed/machinel…
zeahmed Feb 14, 2019
31ceabb
Making all the Booster classes sealed.
zeahmed Feb 14, 2019
0a56dc8
Resolved merge conflicts.
zeahmed Feb 14, 2019
1b54da4
Resolved conflicts in SamplesDatasetUtils.cs
zeahmed Feb 14, 2019
ed2ea83
Reverted changes related to making `ISupportBoosterParameterFactory` …
zeahmed Feb 14, 2019
d1d4109
Changed Arguments classes to Options.
zeahmed Feb 15, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
using Microsoft.ML.Transforms.Categorical;

namespace Microsoft.ML.Samples.Dynamic
{
    public class LightGbmBinaryClassification
    {
        // This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
        public static void Example()
        {
            // Create the ML.NET context shared by all components of the pipeline.
            var mlContext = new MLContext();

            // Download and featurize the adult income dataset.
            var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);

            // Hold out 10% of the data for evaluation.
            var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

            // Define the LightGBM binary classification estimator.
            var pipeline = mlContext.BinaryClassification.Trainers.LightGbm("IsOver50K", "Features");

            // Train the model on the training portion.
            var model = pipeline.Fit(split.TrainSet);

            // Score the held-out test data.
            var scoredData = model.Transform(split.TestSet);

            // Compute and print the evaluation metrics.
            var metrics = mlContext.BinaryClassification.Evaluate(scoredData, "IsOver50K");
            SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

            // Output:
            // Accuracy: 0.88
            // AUC: 0.93
            // F1 Score: 0.71
            // Negative Precision: 0.90
            // Negative Recall: 0.94
            // Positive Precision: 0.76
            // Positive Recall: 0.66
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
using Microsoft.ML.LightGBM;
using Microsoft.ML.Transforms.Categorical;
using static Microsoft.ML.LightGBM.Options;

namespace Microsoft.ML.Samples.Dynamic
{
    class LightGbmBinaryClassificationWithOptions
    {
        // This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
        public static void Example()
        {
            // Create the ML.NET context shared by all components of the pipeline.
            var mlContext = new MLContext();

            // Download and featurize the adult income dataset.
            var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);

            // Hold out 10% of the data for evaluation.
            var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

            // Define the LightGBM estimator, this time configured through the
            // advanced Options object, including a GOSS booster.
            var trainerOptions = new Options
            {
                LabelColumn = "IsOver50K",
                FeatureColumn = "Features",
                Booster = new GossBooster.Options
                {
                    TopRate = 0.3,
                    OtherRate = 0.2
                }
            };
            var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(trainerOptions);

            // Train the model on the training portion.
            var model = pipeline.Fit(split.TrainSet);

            // Score the held-out test data.
            var scoredData = model.Transform(split.TestSet);

            // Compute and print the evaluation metrics.
            var metrics = mlContext.BinaryClassification.Evaluate(scoredData, "IsOver50K");
            SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

            // Output:
            // Accuracy: 0.88
            // AUC: 0.93
            // F1 Score: 0.71
            // Negative Precision: 0.90
            // Negative Recall: 0.94
            // Positive Precision: 0.76
            // Positive Recall: 0.67
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
using System;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.SamplesUtils;

namespace Microsoft.ML.Samples.Dynamic
{
    class LightGbmMulticlassClassification
    {
        // This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
        public static void Example()
        {
            // Create a general context for ML.NET operations. It can be used for
            // exception tracking and logging, as a catalog of available operations
            // and as the source of randomness.
            var mlContext = new MLContext();

            // Generate 1000 in-memory examples as a native C# class.
            var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);

            // Convert the native C# class to an IDataView, the format consumable by ML.NET functions.
            var dataView = mlContext.Data.ReadFromEnumerable(examples);

            //////////////////// Data Preview ////////////////////
            // Label    Features
            // AA       0.7262433,0.8173254,0.7680227,0.5581612,0.2060332,0.5588848,0.9060271,0.4421779,0.9775497,0.2737045
            // BB       0.4919063,0.6673147,0.8326591,0.6695119,1.182151,0.230367,1.06237,1.195347,0.8771811,0.5145918
            // CC       1.216908,1.248052,1.391902,0.4326252,1.099942,0.9262842,1.334019,1.08762,0.9468155,0.4811099
            // DD       0.7871246,1.053327,0.8971719,1.588544,1.242697,1.362964,0.6303943,0.9810045,0.9431419,1.557455

            // Build the pipeline:
            // - convert the string labels into key types,
            // - apply the LightGbm multiclass trainer,
            // - map predicted keys back to labels and copy the scores out.
            var pipeline = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label")
                .Append(mlContext.MulticlassClassification.Trainers.LightGbm(labelColumn: "LabelIndex"))
                .Append(mlContext.Transforms.Conversion.MapValueToKey("PredictedLabelIndex", "PredictedLabel"))
                .Append(mlContext.Transforms.CopyColumns("Scores", "Score"));

            // Split the data into training and test sets. Only the training set is
            // used for fitting; metrics are computed on the test set.
            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);

            // Train the model.
            var model = pipeline.Fit(split.TrainSet);

            // Do prediction on the test set.
            var dataWithPredictions = model.Transform(split.TestSet);

            // Evaluate the trained model using the test set.
            var metrics = mlContext.MulticlassClassification.Evaluate(dataWithPredictions, label: "LabelIndex");

            // Check if metrics are reasonable.
            Console.WriteLine($"Macro accuracy: {metrics.AccuracyMacro:F4}, Micro accuracy: {metrics.AccuracyMicro:F4}.");
            // Console output:
            // Macro accuracy: 0.8655, Micro accuracy: 0.8651.

            // Materialize the predictions back into the native example class.
            var nativePredictions = mlContext.CreateEnumerable<DatasetUtils.MulticlassClassificationExample>(dataWithPredictions, false).ToList();

            // The schema of the prediction carries metadata mapping a predicted
            // label index (e.g., 1) to its actual label (e.g., "AA"); use it to
            // recover all unique labels seen during training.
            var labelBuffer = new VBuffer<ReadOnlyMemory<char>>();
            dataWithPredictions.Schema["PredictedLabelIndex"].GetKeyValues(ref labelBuffer);
            // nativeLabels is { "AA", "BB", "CC", "DD" };
            // nativeLabels[nativePrediction.PredictedLabelIndex - 1] is the original label for that prediction.
            var nativeLabels = labelBuffer.DenseValues().ToArray();

            // Show the prediction result for the 3rd example.
            var nativePrediction = nativePredictions[2];
            // Console output:
            // Our predicted label to this example is "AA" with probability 0.9257.
            Console.WriteLine($"Our predicted label to this example is {nativeLabels[(int)nativePrediction.PredictedLabelIndex - 1]} " +
                $"with probability {nativePrediction.Scores[(int)nativePrediction.PredictedLabelIndex - 1]:F4}.");

            // Scores and nativeLabels are parallel: Scores[i] is the probability of being nativeLabels[i].
            // Console output:
            // The probability of being class "AA" is 0.9257.
            // The probability of being class "BB" is 0.0739.
            // The probability of being class "CC" is 0.0002.
            // The probability of being class "DD" is 0.0001.
            for (int i = 0; i < nativeLabels.Length; ++i)
                Console.WriteLine($"The probability of being class {nativeLabels[i]} is {nativePrediction.Scores[i]:F4}.");
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
using System;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.LightGBM;
using Microsoft.ML.SamplesUtils;
using static Microsoft.ML.LightGBM.Options;

namespace Microsoft.ML.Samples.Dynamic
{
    class LightGbmMulticlassClassificationWithOptions
    {
        // This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
        public static void Example()
        {
            // Create a general context for ML.NET operations, seeded for
            // reproducibility. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext(seed: 0);

            // Generate 1000 in-memory examples as a native C# class.
            var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);

            // Convert the native C# class to an IDataView, the format consumable by ML.NET functions.
            var dataView = mlContext.Data.ReadFromEnumerable(examples);

            //////////////////// Data Preview ////////////////////
            // Label    Features
            // AA       0.7262433,0.8173254,0.7680227,0.5581612,0.2060332,0.5588848,0.9060271,0.4421779,0.9775497,0.2737045
            // BB       0.4919063,0.6673147,0.8326591,0.6695119,1.182151,0.230367,1.06237,1.195347,0.8771811,0.5145918
            // CC       1.216908,1.248052,1.391902,0.4326252,1.099942,0.9262842,1.334019,1.08762,0.9468155,0.4811099
            // DD       0.7871246,1.053327,0.8971719,1.588544,1.242697,1.362964,0.6303943,0.9810045,0.9431419,1.557455

            // Build the pipeline:
            // - convert the string labels into key types,
            // - apply the LightGbm multiclass trainer configured through the
            //   advanced Options object, including a DART booster,
            // - map predicted keys back to labels and copy the scores out.
            var trainerOptions = new Options
            {
                LabelColumn = "LabelIndex",
                FeatureColumn = "Features",
                Booster = new DartBooster.Options
                {
                    DropRate = 0.15,
                    XgboostDartMode = false
                }
            };
            var pipeline = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label")
                .Append(mlContext.MulticlassClassification.Trainers.LightGbm(trainerOptions))
                .Append(mlContext.Transforms.Conversion.MapValueToKey("PredictedLabelIndex", "PredictedLabel"))
                .Append(mlContext.Transforms.CopyColumns("Scores", "Score"));

            // Split the data into training and test sets. Only the training set is
            // used for fitting; metrics are computed on the test set.
            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);

            // Train the model.
            var model = pipeline.Fit(split.TrainSet);

            // Do prediction on the test set.
            var dataWithPredictions = model.Transform(split.TestSet);

            // Evaluate the trained model using the test set.
            var metrics = mlContext.MulticlassClassification.Evaluate(dataWithPredictions, label: "LabelIndex");

            // Check if metrics are reasonable.
            Console.WriteLine($"Macro accuracy: {metrics.AccuracyMacro:F4}, Micro accuracy: {metrics.AccuracyMicro:F4}.");
            // Console output:
            // Macro accuracy: 0.8619, Micro accuracy: 0.8611.

            // Materialize the predictions back into the native example class.
            var nativePredictions = mlContext.CreateEnumerable<DatasetUtils.MulticlassClassificationExample>(dataWithPredictions, false).ToList();

            // The schema of the prediction carries metadata mapping a predicted
            // label index (e.g., 1) to its actual label (e.g., "AA"); use it to
            // recover all unique labels seen during training.
            var labelBuffer = new VBuffer<ReadOnlyMemory<char>>();
            dataWithPredictions.Schema["PredictedLabelIndex"].GetKeyValues(ref labelBuffer);
            // nativeLabels is { "AA", "BB", "CC", "DD" };
            // nativeLabels[nativePrediction.PredictedLabelIndex - 1] is the original label for that prediction.
            var nativeLabels = labelBuffer.DenseValues().ToArray();

            // Show the prediction result for the 3rd example.
            var nativePrediction = nativePredictions[2];
            // Console output:
            // Our predicted label to this example is AA with probability 0.8986.
            Console.WriteLine($"Our predicted label to this example is {nativeLabels[(int)nativePrediction.PredictedLabelIndex - 1]} " +
                $"with probability {nativePrediction.Scores[(int)nativePrediction.PredictedLabelIndex - 1]:F4}.");

            // Scores and nativeLabels are parallel: Scores[i] is the probability of being nativeLabels[i].
            // Console output:
            // The probability of being class AA is 0.8986.
            // The probability of being class BB is 0.0961.
            // The probability of being class CC is 0.0050.
            // The probability of being class DD is 0.0003.
            for (int i = 0; i < nativeLabels.Length; ++i)
                Console.WriteLine($"The probability of being class {nativeLabels[i]} is {nativePrediction.Scores[i]:F4}.");
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
using System;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic
{
    class LightGbmRegression
    {
        // This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
        public static void Example()
        {
            // Create a new ML context for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            var mlContext = new MLContext();

            // Download and load the housing dataset into an IDataView.
            var dataView = SamplesUtils.DatasetUtils.LoadHousingRegressionDataset(mlContext);

            //////////////////// Data Preview ////////////////////
            /// Only the first columns are displayed here.
            // MedianHomeValue CrimesPerCapita PercentResidental PercentNonRetail CharlesRiver NitricOxides RoomsPerDwelling PercentPre40s ...
            // 24.00           0.00632         18.00             2.310            0            0.5380       6.5750           65.20         ...
            // 21.60           0.02731         00.00             7.070            0            0.4690       6.4210           78.90         ...
            // 34.70           0.02729         00.00             7.070            0            0.4690       7.1850           61.10         ...

            // Hold out 10% of the data for evaluation.
            var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1);

            // Only the LightGbm trainer is needed here, as the data is already in
            // a form the trainer can consume. Every column except the label is a feature.
            var labelName = "MedianHomeValue";
            var featureNames = dataView.Schema
                .Select(column => column.Name)  // Get the column names
                .Where(name => name != labelName)  // Drop the Label
                .ToArray();
            var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
                .Append(mlContext.Regression.Trainers.LightGbm(
                    labelColumn: labelName,
                    numLeaves: 4,
                    minDataPerLeaf: 6,
                    learningRate: 0.001));

            // Fit this pipeline to the training data.
            var model = pipeline.Fit(split.TrainSet);

            // Get the feature importance based on the information gain used during training.
            VBuffer<float> weights = default;
            model.LastTransformer.Model.GetFeatureWeights(ref weights);
            var weightsValues = weights.DenseValues().ToArray();
            Console.WriteLine($"weight 0 - {weightsValues[0]}");  // CrimesPerCapita (weight 0) = 0.1898361
            Console.WriteLine($"weight 5 - {weightsValues[5]}");  // RoomsPerDwelling (weight 5) = 1

            // Evaluate how the model is doing on the test data.
            var dataWithPredictions = model.Transform(split.TestSet);
            var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName);
            SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

            // Output
            // L1: 4.97
            // L2: 51.37
            // LossFunction: 51.37
            // RMS: 7.17
            // RSquared: 0.08
        }
    }
}
Loading