From b57261463288e5089da7536ddc1e167dc71da1bd Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Tue, 19 Feb 2019 23:19:02 -0800 Subject: [PATCH 01/13] Adding a sample for LightGbm Ranking --- ...LightGBMBinaryClassificationWithOptions.cs | 1 - .../Trainers/Ranking/LightGBMRanking.cs | 42 +++++++++++++++ .../Ranking/LightGBMRankingWithOptions.cs | 44 ++++++++++++++++ docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- .../Evaluators/Metrics/RankerMetrics.cs | 2 +- src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs | 11 ++++ .../SamplesDatasetUtils.cs | 51 +++++++++++++++++++ 7 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs index 20924bc29f..904285aaee 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs @@ -1,5 +1,4 @@ using Microsoft.ML.LightGBM; -using Microsoft.ML.Transforms.Categorical; using static Microsoft.ML.LightGBM.Options; namespace Microsoft.ML.Samples.Dynamic diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs new file mode 100644 index 0000000000..8822a16630 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class LightGbmRanking + { + // This example requires installation of additional nuget package Microsoft.ML.LightGBM. + public static void Example() + { + // Creating the ML.Net IHostEnvironment object, needed for the pipeline. + var mlContext = new MLContext(); + + // Download and featurize the train and validation datasets. + (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + + // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. + var pipeline = mlContext.Ranking.Trainers.LightGbm( + labelColumn: "Label", + featureColumn: "Features", + groupIdColumn: "GroupId", + numLeaves: 4, + minDataPerLeaf: 10, + learningRate: 0.1, + numBoostRound: 2); + + // Fit this Pipeline to the Training Data. + var model = pipeline.Fit(trainData); + + // Evaluate how the model is doing on the test data. 
+ var dataWithPredictions = model.Transform(validationData); + + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + SamplesUtils.ConsoleUtils.PrintMetrics(metrics); + + // Output: + // DCG @N: 1.38, 3.11, 4.94 + // NDCG @N: 7.13, 10.12, 12.62 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs new file mode 100644 index 0000000000..d8f3da41ea --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs @@ -0,0 +1,44 @@ +using Microsoft.ML.LightGBM; +using static Microsoft.ML.LightGBM.Options; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class LightGbmRankingWithOptions + { + // This example requires installation of additional nuget package Microsoft.ML.LightGBM. + public static void Example() + { + // Creating the ML.Net IHostEnvironment object, needed for the pipeline. + var mlContext = new MLContext(); + + // Download and featurize the train and validation datasets. + (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + + // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. + var pipeline = mlContext.Ranking.Trainers.LightGbm( + new Options + { + LabelColumn = "Label", + FeatureColumn = "Features", + GroupIdColumn = "GroupId", + NumLeaves = 4, + MinDataPerLeaf = 10, + LearningRate = 0.1, + NumBoostRound = 2 + }); + + // Fit this Pipeline to the Training Data. + var model = pipeline.Fit(trainData); + + // Evaluate how the model is doing on the test data. + var dataWithPredictions = model.Transform(validationData); + + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + SamplesUtils.ConsoleUtils.PrintMetrics(metrics); + + // Output: + // DCG @N: 1.38, 3.11, 4.94 + // NDCG @N: 7.13, 10.12, 12.62 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index d28cdd4d77..6fa4e40705 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - TakeRows.Example(); + LightGbmRanking.Example(); } } } diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs index b9532fd31b..d3b9ef685f 100644 --- a/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs @@ -18,7 +18,7 @@ public sealed class RankerMetrics ///Array of discounted cumulative gains where i-th element represent DCG@i. /// Discounted Cumulative gain /// is the sum of the gains, for all the instances i, normalized by the natural logarithm of the instance + 1. - /// Note that unline the Wikipedia article, ML.Net uses the natural logarithm. + /// Note that unlike the Wikipedia article, ML.Net uses the natural logarithm. 
/// /// public double[] Dcg { get; } diff --git a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs index 93de658b1e..7c40d9a4c8 100644 --- a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs @@ -1,4 +1,5 @@ using System; +using System.Linq; using Microsoft.ML.Data; namespace Microsoft.ML.SamplesUtils @@ -35,5 +36,15 @@ public static void PrintMetrics(RegressionMetrics metrics) Console.WriteLine($"RMS: {metrics.Rms:F2}"); Console.WriteLine($"RSquared: {metrics.RSquared:F2}"); } + + /// + /// Pretty-print RankerMetrics objects. + /// + /// Ranker metrics. + public static void PrintMetrics(RankerMetrics metrics) + { + Console.WriteLine($"DCG@N: {string.Join(", ", metrics.Dcg.Select(d => Math.Round(d, 2)).ToArray())}"); + Console.WriteLine($"NDCG@N: {string.Join(", ", metrics.Ndcg.Select(d => Math.Round(d, 2)).ToArray())}"); + } } } diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 203bd6e6bd..a40212b0e6 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -146,6 +146,57 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) return featurizedData; } + public static string DownloadMslrWeb10kTrain() + { + var fileName = "MSLRWeb10KTrain720kRows.tsv"; + if (!File.Exists(fileName)) + Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.TRAIN.500MB_720k-rows.tsv", fileName); + return fileName; + } + + public static string DownloadMslrWeb10kValidate() + { + var fileName = "MSLRWeb10KValidate240kRows.tsv"; + if (!File.Exists(fileName)) + Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.VALIDATE.160MB_240k-rows.tsv", fileName); + return fileName; + } + + public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(MLContext mlContext) + { + // Download the training and validation files. + string trainDataFile = DownloadMslrWeb10kTrain(); + string validationDataFile = DownloadMslrWeb10kValidate(); + + // Create the reader to read the data. + var reader = mlContext.Data.CreateTextLoader( + columns: new[] + { + new TextLoader.Column("Label", DataKind.R4, 0), + new TextLoader.Column("GroupId", DataKind.TX, 1), + new TextLoader.Column("Features", DataKind.R4, new[] { new TextLoader.Range(2, 138) }) + } + ); + + // Load the raw training and validation datasets. + var trainData = reader.Read(trainDataFile); + var validationData = reader.Read(validationDataFile); + + // Create the featurization pipeline. First, hash the GroupId column. + var pipeline = mlContext.Transforms.Conversion.Hash("GroupId") + // Replace missing values in Features column with the default replacement value for its type. + .Append(mlContext.Transforms.ReplaceMissingValues("Features")); + + // Fit the pipeline on the training data. + var fittedPipeline = pipeline.Fit(trainData); + + // Use the fitted pipeline to transform the training and validation datasets. + var transformedTrainData = fittedPipeline.Transform(trainData); + var transformedValidationData = fittedPipeline.Transform(validationData); + + return (transformedTrainData, transformedValidationData); + } + /// /// Downloads the breast cancer dataset from the ML.NET repo. 
/// From f3d5d82ef1c1524f9d9f95c597ca40f52863a176 Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Fri, 22 Feb 2019 18:19:12 -0800 Subject: [PATCH 02/13] PR feedback + cleaning up namespaces in Microsoft.ML.Samples project --- .../LightGBMBinaryClassification.cs | 24 +++++++------- ...LightGBMBinaryClassificationWithOptions.cs | 24 +++++++------- .../SDCALogisticRegression.cs | 2 +- .../SDCASupportVectorMachine.cs | 2 +- .../SymbolicStochasticGradientDescent.cs | 20 ++++++------ ...licStochasticGradientDescentWithOptions.cs | 21 ++++++------ .../LightGBMMulticlassClassification.cs | 4 +-- ...tGBMMulticlassClassificationWithOptions.cs | 4 +-- .../Trainers/Ranking/LightGBMRanking.cs | 30 ++++++++--------- .../Ranking/LightGBMRankingWithOptions.cs | 27 ++++++++-------- .../Recommendation/MatrixFactorization.cs | 2 +- .../MatrixFactorizationWithOptions.cs | 2 +- .../Trainers/Regression/LightGBMRegression.cs | 16 +++++----- .../LightGBMRegressionWithOptions.cs | 16 +++++----- .../Regression/OrdinaryLeastSquares.cs | 14 ++++---- .../OrdinaryLeastSquaresWithOptions.cs | 14 ++++---- docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- .../Evaluators/Metrics/RankerMetrics.cs | 4 +-- src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs | 14 ++++++-- .../SamplesDatasetUtils.cs | 32 ++++++------------- 20 files changed, 138 insertions(+), 136 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs index edd4e31504..a6834d0082 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs @@ -1,8 +1,8 @@ using Microsoft.ML.Transforms.Categorical; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { - public class LightGbmBinaryClassification + public class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -17,7 +17,7 @@ public static void Example() var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1); // Create the Estimator. - var pipeline = mlContext.BinaryClassification.Trainers.LightGbm("IsOver50K", "Features"); + var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(); // Fit this Pipeline to the Training Data. var model = pipeline.Fit(split.TrainSet); @@ -25,17 +25,17 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // Accuracy: 0.88 - // AUC: 0.93 - // F1 Score: 0.71 - // Negative Precision: 0.90 - // Negative Recall: 0.94 - // Positive Precision: 0.76 - // Positive Recall: 0.66 + // Expected output: + // Accuracy: 0.88 + // AUC: 0.93 + // F1 Score: 0.71 + // Negative Precision: 0.90 + // Negative Recall: 0.94 + // Positive Precision: 0.76 + // Positive Recall: 0.66 } } } \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs index 904285aaee..7b0e21fed9 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs @@ -1,9 +1,9 @@ using Microsoft.ML.LightGBM; using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { - class LightGbmBinaryClassificationWithOptions + class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -21,8 +21,6 @@ public static void Example() var pipeline = mlContext.BinaryClassification.Trainers.LightGbm( new Options { - LabelColumn = "IsOver50K", - FeatureColumn = "Features", Booster = new GossBooster.Options { TopRate = 0.3, @@ -36,17 +34,17 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // Accuracy: 0.88 - // AUC: 0.93 - // F1 Score: 0.71 - // Negative Precision: 0.90 - // Negative Recall: 0.94 - // Positive Precision: 0.76 - // Positive Recall: 0.67 + // Expected output: + // Accuracy: 0.88 + // AUC: 0.93 + // F1 Score: 0.71 + // Negative Precision: 0.90 + // Negative Recall: 0.94 + // Positive Precision: 0.76 + // Positive Recall: 0.67 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs index 3ab3257638..da12242ce4 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs @@ -3,7 +3,7 @@ using Microsoft.ML.Data; using Microsoft.ML.Trainers; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SDCALogisticRegression { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs index d37c1cec1a..0730e3daee 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs @@ -2,7 +2,7 @@ using System.Linq; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SDCASupportVectorMachine { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs index 49b31342e0..dcdd331ab5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs @@ -1,4 +1,4 @@ -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SymbolicStochasticGradientDescent { @@ -24,15 +24,17 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Accuracy: 0.85 - // AUC: 0.90 - // F1 Score: 0.64 - // Negative Precision: 0.88 - // Negative Recall: 0.93 - // Positive Precision: 0.72 - // Positive Recall: 0.58 + + // Expected output: + // Accuracy: 0.85 + // AUC: 0.90 + // F1 Score: 0.64 + // Negative Precision: 0.88 + // Negative Recall: 0.93 + // Positive Precision: 0.72 + // Positive Recall: 0.58 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs index d05d64454c..e4363f29cc 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs @@ -1,4 +1,4 @@ -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SymbolicStochasticGradientDescentWithOptions { @@ -22,7 +22,6 @@ public static void Example() var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent( new ML.Trainers.HalLearners.SymSgdClassificationTrainer.Options() { - LabelColumn = "IsOver50K", LearningRate = 0.2f, NumberOfIterations = 10, NumberOfThreads = 1, @@ -33,15 +32,17 @@ public static void Example() // Evaluate how the model is doing on the test data. var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Accuracy: 0.84 - // AUC: 0.88 - // F1 Score: 0.60 - // Negative Precision: 0.87 - // Negative Recall: 0.93 - // Positive Precision: 0.69 - // Positive Recall: 0.53 + + // Expected output: + // Accuracy: 0.84 + // AUC: 0.88 + // F1 Score: 0.60 + // Negative Precision: 0.87 + // Negative Recall: 0.93 + // Positive Precision: 0.69 + // Positive Recall: 0.53 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs index 8731c6bc50..103d9f052f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs @@ -3,9 +3,9 @@ using Microsoft.ML.Data; using Microsoft.ML.SamplesUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification { - class LightGbmMulticlassClassification + class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. 
public static void Example() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs index 7d98c9318e..36de9b8fe1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs @@ -5,9 +5,9 @@ using Microsoft.ML.SamplesUtils; using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification { - class LightGbmMulticlassClassificationWithOptions + class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs index 8822a16630..b5857e4538 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs @@ -1,10 +1,8 @@ -using System; -using System.Collections.Generic; -using System.Text; +using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking { - public class LightGbmRanking + public class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -12,31 +10,33 @@ public static void Example() // Creating the ML.Net IHostEnvironment object, needed for the pipeline. var mlContext = new MLContext(); - // Download and featurize the train and validation datasets. - (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + // Download and featurize the dataset. + var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kDataset(mlContext); + + // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split + // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in + // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose. + var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( - labelColumn: "Label", - featureColumn: "Features", - groupIdColumn: "GroupId", numLeaves: 4, minDataPerLeaf: 10, learningRate: 0.1, numBoostRound: 2); // Fit this Pipeline to the Training Data. - var model = pipeline.Fit(trainData); + var model = pipeline.Fit(split.TrainSet); // Evaluate how the model is doing on the test data. 
- var dataWithPredictions = model.Transform(validationData); + var dataWithPredictions = model.Transform(split.TestSet); var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // DCG @N: 1.38, 3.11, 4.94 - // NDCG @N: 7.13, 10.12, 12.62 + // Expected output: + // DCG: @1:1.25, @2:2.69, @3:4.57 + // NDCG: @1:7.01, @2:9.57, @3:12.34 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs index d8f3da41ea..30087131d8 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs @@ -1,9 +1,8 @@ using Microsoft.ML.LightGBM; -using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking { - public class LightGbmRankingWithOptions + public class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -12,33 +11,35 @@ public static void Example() var mlContext = new MLContext(); // Download and featurize the train and validation datasets. - (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kDataset(mlContext); + + // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split + // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in + // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose. + var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( new Options { - LabelColumn = "Label", - FeatureColumn = "Features", - GroupIdColumn = "GroupId", NumLeaves = 4, MinDataPerLeaf = 10, LearningRate = 0.1, NumBoostRound = 2 }); - // Fit this Pipeline to the Training Data. - var model = pipeline.Fit(trainData); + // Fit this pipeline to the training Data. + var model = pipeline.Fit(split.TrainSet); // Evaluate how the model is doing on the test data. 
- var dataWithPredictions = model.Transform(validationData); + var dataWithPredictions = model.Transform(split.TestSet); var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // DCG @N: 1.38, 3.11, 4.94 - // NDCG @N: 7.13, 10.12, 12.62 + // Expected output: + // DCG: @1:1.25, @2:2.69, @3:4.57 + // NDCG: @1:7.01, @2:9.57, @3:12.34 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs index f630cceab9..a6d7e445fd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs @@ -3,7 +3,7 @@ using Microsoft.ML.Data; using static Microsoft.ML.SamplesUtils.DatasetUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation { public static class MatrixFactorization { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs index c73fd7fbcb..cbb11938a0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs @@ -4,7 +4,7 @@ using Microsoft.ML.Trainers; using static Microsoft.ML.SamplesUtils.DatasetUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation { public static class MatrixFactorizationWithOptions { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs index c4b6f9f68c..cb950e6832 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs @@ -2,9 +2,9 @@ using System.Linq; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { - class LightGbmRegression + class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. 
public static void Example() @@ -54,12 +54,12 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output - // L1: 4.97 - // L2: 51.37 - // LossFunction: 51.37 - // RMS: 7.17 - // RSquared: 0.08 + // Expected output + // L1: 4.97 + // L2: 51.37 + // LossFunction: 51.37 + // RMS: 7.17 + // RSquared: 0.08 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs index 3f73df053e..c1c82a9735 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs @@ -4,9 +4,9 @@ using Microsoft.ML.LightGBM; using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { - class LightGbmRegressionWithOptions + class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -64,12 +64,12 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output - // L1: 4.97 - // L2: 51.37 - // LossFunction: 51.37 - // RMS: 7.17 - // RSquared: 0.08 + // Expected output + // L1: 4.97 + // L2: 51.37 + // LossFunction: 51.37 + // RMS: 7.17 + // RSquared: 0.08 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs index 3a8a17952b..6cf99ad8ce 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs @@ -2,7 +2,7 @@ using Microsoft.ML.Data; using Microsoft.ML.SamplesUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { public static class OrdinaryLeastSquares { @@ -55,11 +55,13 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions); ConsoleUtils.PrintMetrics(metrics); - // L1: 4.15 - // L2: 31.98 - // LossFunction: 31.98 - // RMS: 5.65 - // RSquared: 0.56 + + // Expected output: + // L1: 4.15 + // L2: 31.98 + // LossFunction: 31.98 + // RMS: 5.65 + // RSquared: 0.56 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs index 519a9ef683..45a9704f47 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs @@ -3,7 +3,7 @@ using Microsoft.ML.SamplesUtils; using Microsoft.ML.Trainers.HalLearners; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { public static class OrdinaryLeastSquaresWithOptions { @@ -59,11 +59,13 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions); ConsoleUtils.PrintMetrics(metrics); - // L1: 4.14 - // L2: 32.35 - // LossFunction: 32.35 - // RMS: 5.69 - // 
RSquared: 0.56 + + // Expected output: + // L1: 4.14 + // L2: 32.35 + // LossFunction: 32.35 + // RMS: 5.69 + // RSquared: 0.56 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 6fa4e40705..d28cdd4d77 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - LightGbmRanking.Example(); + TakeRows.Example(); } } } diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs index d3b9ef685f..c9e6337070 100644 --- a/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs @@ -15,10 +15,10 @@ public sealed class RankerMetrics public double[] Ndcg { get; } /// - ///Array of discounted cumulative gains where i-th element represent DCG@i. + /// Array of discounted cumulative gains where i-th element represent DCG@i. /// Discounted Cumulative gain /// is the sum of the gains, for all the instances i, normalized by the natural logarithm of the instance + 1. - /// Note that unlike the Wikipedia article, ML.Net uses the natural logarithm. + /// Note that unlike the Wikipedia article, ML.NET uses the natural logarithm. /// /// public double[] Dcg { get; } diff --git a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs index 7c40d9a4c8..58ec0bcb6e 100644 --- a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs @@ -43,8 +43,18 @@ public static void PrintMetrics(RegressionMetrics metrics) /// Ranker metrics. public static void PrintMetrics(RankerMetrics metrics) { - Console.WriteLine($"DCG@N: {string.Join(", ", metrics.Dcg.Select(d => Math.Round(d, 2)).ToArray())}"); - Console.WriteLine($"NDCG@N: {string.Join(", ", metrics.Ndcg.Select(d => Math.Round(d, 2)).ToArray())}"); + Console.WriteLine($"DCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Dcg))}"); + Console.WriteLine($"NDCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Ndcg))}"); + } + + private static string[] RoundAndBeautifyRankerMetrics(double[] input) + { + string[] result = input.Select(d => Math.Round(d, 2).ToString()).ToArray(); + for (int i = 0; i < result.Length; i++) + { + result[i] = $"@{(i + 1).ToString()}:{result[i]}"; + } + return result; } } } diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index a40212b0e6..dab907f32a 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -138,7 +138,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) .Append(mlContext.Transforms.Concatenate("Features", "workclass", "education", "marital-status", "occupation", "relationship", "ethnicity", "native-country", "age", "education-num", "capital-gain", "capital-loss", "hours-per-week")) - // Min-max normalized all the features + // Min-max normalize all the features .Append(mlContext.Transforms.Normalize("Features")); var data = reader.Read(dataFile); @@ -146,7 +146,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) return featurizedData; } - public static string DownloadMslrWeb10kTrain() + public static string DownloadMslrWeb10k() { var fileName = "MSLRWeb10KTrain720kRows.tsv"; if (!File.Exists(fileName)) @@ -154,19 
+154,10 @@ public static string DownloadMslrWeb10kTrain() return fileName; } - public static string DownloadMslrWeb10kValidate() - { - var fileName = "MSLRWeb10KValidate240kRows.tsv"; - if (!File.Exists(fileName)) - Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.VALIDATE.160MB_240k-rows.tsv", fileName); - return fileName; - } - - public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(MLContext mlContext) + public static IDataView LoadFeaturizedMslrWeb10kDataset(MLContext mlContext) { // Download the training and validation files. - string trainDataFile = DownloadMslrWeb10kTrain(); - string validationDataFile = DownloadMslrWeb10kValidate(); + string dataFile = DownloadMslrWeb10k(); // Create the reader to read the data. var reader = mlContext.Data.CreateTextLoader( @@ -178,23 +169,18 @@ public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(ML } ); - // Load the raw training and validation datasets. - var trainData = reader.Read(trainDataFile); - var validationData = reader.Read(validationDataFile); + // Load the raw dataset. + var data = reader.Read(dataFile); // Create the featurization pipeline. First, hash the GroupId column. var pipeline = mlContext.Transforms.Conversion.Hash("GroupId") // Replace missing values in Features column with the default replacement value for its type. .Append(mlContext.Transforms.ReplaceMissingValues("Features")); - // Fit the pipeline on the training data. - var fittedPipeline = pipeline.Fit(trainData); - - // Use the fitted pipeline to transform the training and validation datasets. - var transformedTrainData = fittedPipeline.Transform(trainData); - var transformedValidationData = fittedPipeline.Transform(validationData); + // Fit the pipeline and transform the dataset. 
+            var transformedData = pipeline.Fit(data).Transform(data);
 
-            return (transformedTrainData, transformedValidationData);
+            return transformedData;
         }
 
         ///
- var dataWithPredictions = model.Transform(validationData); + var dataWithPredictions = model.Transform(split.TestSet); var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // DCG @N: 1.38, 3.11, 4.94 - // NDCG @N: 7.13, 10.12, 12.62 + // Expected output: + // DCG: @1:1.25, @2:2.69, @3:4.57 + // NDCG: @1:7.01, @2:9.57, @3:12.34 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs index d252eb489d..3737e751d5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs @@ -3,7 +3,7 @@ using Microsoft.ML.Data; using static Microsoft.ML.SamplesUtils.DatasetUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation { public static class MatrixFactorization { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs index c73fd7fbcb..cbb11938a0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs @@ -4,7 +4,7 @@ using Microsoft.ML.Trainers; using static Microsoft.ML.SamplesUtils.DatasetUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation { public static class MatrixFactorizationWithOptions { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs index c4b6f9f68c..cb950e6832 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs @@ -2,9 +2,9 @@ using System.Linq; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { - class LightGbmRegression + class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. 
public static void Example() @@ -54,12 +54,12 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output - // L1: 4.97 - // L2: 51.37 - // LossFunction: 51.37 - // RMS: 7.17 - // RSquared: 0.08 + // Expected output + // L1: 4.97 + // L2: 51.37 + // LossFunction: 51.37 + // RMS: 7.17 + // RSquared: 0.08 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs index 3f73df053e..c1c82a9735 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs @@ -4,9 +4,9 @@ using Microsoft.ML.LightGBM; using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { - class LightGbmRegressionWithOptions + class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -64,12 +64,12 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output - // L1: 4.97 - // L2: 51.37 - // LossFunction: 51.37 - // RMS: 7.17 - // RSquared: 0.08 + // Expected output + // L1: 4.97 + // L2: 51.37 + // LossFunction: 51.37 + // RMS: 7.17 + // RSquared: 0.08 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs index 3a8a17952b..6cf99ad8ce 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs @@ -2,7 +2,7 @@ using Microsoft.ML.Data; using Microsoft.ML.SamplesUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { public static class OrdinaryLeastSquares { @@ -55,11 +55,13 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions); ConsoleUtils.PrintMetrics(metrics); - // L1: 4.15 - // L2: 31.98 - // LossFunction: 31.98 - // RMS: 5.65 - // RSquared: 0.56 + + // Expected output: + // L1: 4.15 + // L2: 31.98 + // LossFunction: 31.98 + // RMS: 5.65 + // RSquared: 0.56 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs index 519a9ef683..45a9704f47 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs @@ -3,7 +3,7 @@ using Microsoft.ML.SamplesUtils; using Microsoft.ML.Trainers.HalLearners; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { public static class OrdinaryLeastSquaresWithOptions { @@ -59,11 +59,13 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions); ConsoleUtils.PrintMetrics(metrics); - // L1: 4.14 - // L2: 32.35 - // LossFunction: 32.35 - // RMS: 5.69 - // 
RSquared: 0.56 + + // Expected output: + // L1: 4.14 + // L2: 32.35 + // LossFunction: 32.35 + // RMS: 5.69 + // RSquared: 0.56 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 6fa4e40705..d28cdd4d77 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - LightGbmRanking.Example(); + TakeRows.Example(); } } } diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs index d3b9ef685f..c9e6337070 100644 --- a/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/RankerMetrics.cs @@ -15,10 +15,10 @@ public sealed class RankerMetrics public double[] Ndcg { get; } /// - ///Array of discounted cumulative gains where i-th element represent DCG@i. + /// Array of discounted cumulative gains where i-th element represent DCG@i. /// Discounted Cumulative gain /// is the sum of the gains, for all the instances i, normalized by the natural logarithm of the instance + 1. - /// Note that unlike the Wikipedia article, ML.Net uses the natural logarithm. + /// Note that unlike the Wikipedia article, ML.NET uses the natural logarithm. /// /// public double[] Dcg { get; } diff --git a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs index 7c40d9a4c8..58ec0bcb6e 100644 --- a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs @@ -43,8 +43,18 @@ public static void PrintMetrics(RegressionMetrics metrics) /// Ranker metrics. public static void PrintMetrics(RankerMetrics metrics) { - Console.WriteLine($"DCG@N: {string.Join(", ", metrics.Dcg.Select(d => Math.Round(d, 2)).ToArray())}"); - Console.WriteLine($"NDCG@N: {string.Join(", ", metrics.Ndcg.Select(d => Math.Round(d, 2)).ToArray())}"); + Console.WriteLine($"DCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Dcg))}"); + Console.WriteLine($"NDCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Ndcg))}"); + } + + private static string[] RoundAndBeautifyRankerMetrics(double[] input) + { + string[] result = input.Select(d => Math.Round(d, 2).ToString()).ToArray(); + for (int i = 0; i < result.Length; i++) + { + result[i] = $"@{(i + 1).ToString()}:{result[i]}"; + } + return result; } } } diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index a40212b0e6..dab907f32a 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -138,7 +138,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) .Append(mlContext.Transforms.Concatenate("Features", "workclass", "education", "marital-status", "occupation", "relationship", "ethnicity", "native-country", "age", "education-num", "capital-gain", "capital-loss", "hours-per-week")) - // Min-max normalized all the features + // Min-max normalize all the features .Append(mlContext.Transforms.Normalize("Features")); var data = reader.Read(dataFile); @@ -146,7 +146,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) return featurizedData; } - public static string DownloadMslrWeb10kTrain() + public static string DownloadMslrWeb10k() { var fileName = "MSLRWeb10KTrain720kRows.tsv"; if (!File.Exists(fileName)) @@ -154,19 
+154,10 @@ public static string DownloadMslrWeb10kTrain() return fileName; } - public static string DownloadMslrWeb10kValidate() - { - var fileName = "MSLRWeb10KValidate240kRows.tsv"; - if (!File.Exists(fileName)) - Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.VALIDATE.160MB_240k-rows.tsv", fileName); - return fileName; - } - - public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(MLContext mlContext) + public static IDataView LoadFeaturizedMslrWeb10kDataset(MLContext mlContext) { // Download the training and validation files. - string trainDataFile = DownloadMslrWeb10kTrain(); - string validationDataFile = DownloadMslrWeb10kValidate(); + string dataFile = DownloadMslrWeb10k(); // Create the reader to read the data. var reader = mlContext.Data.CreateTextLoader( @@ -178,23 +169,18 @@ public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(ML } ); - // Load the raw training and validation datasets. - var trainData = reader.Read(trainDataFile); - var validationData = reader.Read(validationDataFile); + // Load the raw dataset. + var data = reader.Read(dataFile); // Create the featurization pipeline. First, hash the GroupId column. var pipeline = mlContext.Transforms.Conversion.Hash("GroupId") // Replace missing values in Features column with the default replacement value for its type. .Append(mlContext.Transforms.ReplaceMissingValues("Features")); - // Fit the pipeline on the training data. - var fittedPipeline = pipeline.Fit(trainData); - - // Use the fitted pipeline to transform the training and validation datasets. - var transformedTrainData = fittedPipeline.Transform(trainData); - var transformedValidationData = fittedPipeline.Transform(validationData); + // Fit the pipeline and transform the dataset. + var transformedData = pipeline.Fit(data).Transform(data); - return (transformedTrainData, transformedValidationData); + return transformedData; } /// From d862c3bd8f53e31454d555a1ce31fa44ba15286f Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Fri, 22 Feb 2019 19:32:08 -0800 Subject: [PATCH 05/13] nit --- .../Dynamic/Trainers/Ranking/LightGBMRanking.cs | 2 +- .../Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs index b5857e4538..eccf87af8c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs @@ -31,7 +31,7 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); // Expected output: diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs index 30087131d8..c142881716 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs @@ -34,7 +34,7 @@ public static void Example() // Evaluate how the model is doing on the test data. var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); // Expected output: From 18801ab63aaf98dc295cc701df65619a9972b01b Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Tue, 19 Feb 2019 23:19:02 -0800 Subject: [PATCH 06/13] Adding a sample for LightGbm Ranking --- ...LightGBMBinaryClassificationWithOptions.cs | 1 - .../Trainers/Ranking/LightGBMRanking.cs | 42 +++++++++++++++ .../Ranking/LightGBMRankingWithOptions.cs | 44 ++++++++++++++++ docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- .../Evaluators/Metrics/RankingMetrics.cs | 2 +- src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs | 11 ++++ .../SamplesDatasetUtils.cs | 51 +++++++++++++++++++ 7 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs index 20924bc29f..904285aaee 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs @@ -1,5 +1,4 @@ using Microsoft.ML.LightGBM; -using Microsoft.ML.Transforms.Categorical; using static Microsoft.ML.LightGBM.Options; namespace Microsoft.ML.Samples.Dynamic diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs new file mode 100644 index 0000000000..8822a16630 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class LightGbmRanking + { + // This example requires installation of additional nuget package Microsoft.ML.LightGBM. + public static void Example() + { + // Creating the ML.Net IHostEnvironment object, needed for the pipeline. + var mlContext = new MLContext(); + + // Download and featurize the train and validation datasets. + (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + + // Create the Estimator pipeline. 
For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. + var pipeline = mlContext.Ranking.Trainers.LightGbm( + labelColumn: "Label", + featureColumn: "Features", + groupIdColumn: "GroupId", + numLeaves: 4, + minDataPerLeaf: 10, + learningRate: 0.1, + numBoostRound: 2); + + // Fit this Pipeline to the Training Data. + var model = pipeline.Fit(trainData); + + // Evaluate how the model is doing on the test data. + var dataWithPredictions = model.Transform(validationData); + + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + SamplesUtils.ConsoleUtils.PrintMetrics(metrics); + + // Output: + // DCG @N: 1.38, 3.11, 4.94 + // NDCG @N: 7.13, 10.12, 12.62 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs new file mode 100644 index 0000000000..d8f3da41ea --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs @@ -0,0 +1,44 @@ +using Microsoft.ML.LightGBM; +using static Microsoft.ML.LightGBM.Options; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class LightGbmRankingWithOptions + { + // This example requires installation of additional nuget package Microsoft.ML.LightGBM. + public static void Example() + { + // Creating the ML.Net IHostEnvironment object, needed for the pipeline. + var mlContext = new MLContext(); + + // Download and featurize the train and validation datasets. + (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + + // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. + var pipeline = mlContext.Ranking.Trainers.LightGbm( + new Options + { + LabelColumn = "Label", + FeatureColumn = "Features", + GroupIdColumn = "GroupId", + NumLeaves = 4, + MinDataPerLeaf = 10, + LearningRate = 0.1, + NumBoostRound = 2 + }); + + // Fit this Pipeline to the Training Data. + var model = pipeline.Fit(trainData); + + // Evaluate how the model is doing on the test data. + var dataWithPredictions = model.Transform(validationData); + + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + SamplesUtils.ConsoleUtils.PrintMetrics(metrics); + + // Output: + // DCG @N: 1.38, 3.11, 4.94 + // NDCG @N: 7.13, 10.12, 12.62 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index d28cdd4d77..6fa4e40705 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - TakeRows.Example(); + LightGbmRanking.Example(); } } } diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs index f5f4b3389a..82513280da 100644 --- a/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs @@ -18,7 +18,7 @@ public sealed class RankingMetrics ///Array of discounted cumulative gains where i-th element represent DCG@i. /// Discounted Cumulative gain /// is the sum of the gains, for all the instances i, normalized by the natural logarithm of the instance + 1. - /// Note that unline the Wikipedia article, ML.Net uses the natural logarithm. 
+ /// Note that unlike the Wikipedia article, ML.Net uses the natural logarithm. /// /// public double[] Dcg { get; } diff --git a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs index 16f72e3392..40407fe142 100644 --- a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs @@ -1,4 +1,5 @@ using System; +using System.Linq; using Microsoft.ML.Data; namespace Microsoft.ML.SamplesUtils @@ -47,5 +48,15 @@ public static void PrintMetrics(RegressionMetrics metrics) Console.WriteLine($"RMS: {metrics.Rms:F2}"); Console.WriteLine($"RSquared: {metrics.RSquared:F2}"); } + + /// + /// Pretty-print RankerMetrics objects. + /// + /// Ranker metrics. + public static void PrintMetrics(RankerMetrics metrics) + { + Console.WriteLine($"DCG@N: {string.Join(", ", metrics.Dcg.Select(d => Math.Round(d, 2)).ToArray())}"); + Console.WriteLine($"NDCG@N: {string.Join(", ", metrics.Ndcg.Select(d => Math.Round(d, 2)).ToArray())}"); + } } } diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 79ce680470..b123b604f4 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -145,6 +145,57 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) return featurizedData; } + public static string DownloadMslrWeb10kTrain() + { + var fileName = "MSLRWeb10KTrain720kRows.tsv"; + if (!File.Exists(fileName)) + Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.TRAIN.500MB_720k-rows.tsv", fileName); + return fileName; + } + + public static string DownloadMslrWeb10kValidate() + { + var fileName = "MSLRWeb10KValidate240kRows.tsv"; + if (!File.Exists(fileName)) + Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.VALIDATE.160MB_240k-rows.tsv", fileName); + return fileName; + } + + public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(MLContext mlContext) + { + // Download the training and validation files. + string trainDataFile = DownloadMslrWeb10kTrain(); + string validationDataFile = DownloadMslrWeb10kValidate(); + + // Create the reader to read the data. + var reader = mlContext.Data.CreateTextLoader( + columns: new[] + { + new TextLoader.Column("Label", DataKind.R4, 0), + new TextLoader.Column("GroupId", DataKind.TX, 1), + new TextLoader.Column("Features", DataKind.R4, new[] { new TextLoader.Range(2, 138) }) + } + ); + + // Load the raw training and validation datasets. + var trainData = reader.Read(trainDataFile); + var validationData = reader.Read(validationDataFile); + + // Create the featurization pipeline. First, hash the GroupId column. + var pipeline = mlContext.Transforms.Conversion.Hash("GroupId") + // Replace missing values in Features column with the default replacement value for its type. + .Append(mlContext.Transforms.ReplaceMissingValues("Features")); + + // Fit the pipeline on the training data. + var fittedPipeline = pipeline.Fit(trainData); + + // Use the fitted pipeline to transform the training and validation datasets. + var transformedTrainData = fittedPipeline.Transform(trainData); + var transformedValidationData = fittedPipeline.Transform(validationData); + + return (transformedTrainData, transformedValidationData); + } + /// /// Downloads the breast cancer dataset from the ML.NET repo. 
/// From 1e1a80399ee5c240d00fc6f5ae53d1f111e8a30c Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Fri, 22 Feb 2019 18:19:12 -0800 Subject: [PATCH 07/13] PR feedback + cleaning up namespaces in Microsoft.ML.Samples project --- .../LightGBMBinaryClassification.cs | 24 +++++++------- ...LightGBMBinaryClassificationWithOptions.cs | 24 +++++++------- .../SDCALogisticRegression.cs | 2 +- .../SDCASupportVectorMachine.cs | 2 +- .../SymbolicStochasticGradientDescent.cs | 20 ++++++------ ...licStochasticGradientDescentWithOptions.cs | 21 ++++++------ .../LightGBMMulticlassClassification.cs | 4 +-- ...tGBMMulticlassClassificationWithOptions.cs | 4 +-- .../Trainers/Ranking/LightGBMRanking.cs | 30 ++++++++--------- .../Ranking/LightGBMRankingWithOptions.cs | 27 ++++++++-------- .../Recommendation/MatrixFactorization.cs | 2 +- .../MatrixFactorizationWithOptions.cs | 2 +- .../Trainers/Regression/LightGBMRegression.cs | 16 +++++----- .../LightGBMRegressionWithOptions.cs | 16 +++++----- .../Regression/OrdinaryLeastSquares.cs | 14 ++++---- .../OrdinaryLeastSquaresWithOptions.cs | 14 ++++---- docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- .../Evaluators/Metrics/RankingMetrics.cs | 4 +-- src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs | 14 ++++++-- .../SamplesDatasetUtils.cs | 32 ++++++------------- 20 files changed, 138 insertions(+), 136 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs index edd4e31504..a6834d0082 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs @@ -1,8 +1,8 @@ using Microsoft.ML.Transforms.Categorical; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { - public class LightGbmBinaryClassification + public class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -17,7 +17,7 @@ public static void Example() var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1); // Create the Estimator. - var pipeline = mlContext.BinaryClassification.Trainers.LightGbm("IsOver50K", "Features"); + var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(); // Fit this Pipeline to the Training Data. var model = pipeline.Fit(split.TrainSet); @@ -25,17 +25,17 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // Accuracy: 0.88 - // AUC: 0.93 - // F1 Score: 0.71 - // Negative Precision: 0.90 - // Negative Recall: 0.94 - // Positive Precision: 0.76 - // Positive Recall: 0.66 + // Expected output: + // Accuracy: 0.88 + // AUC: 0.93 + // F1 Score: 0.71 + // Negative Precision: 0.90 + // Negative Recall: 0.94 + // Positive Precision: 0.76 + // Positive Recall: 0.66 } } } \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs index 904285aaee..7b0e21fed9 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs @@ -1,9 +1,9 @@ using Microsoft.ML.LightGBM; using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { - class LightGbmBinaryClassificationWithOptions + class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -21,8 +21,6 @@ public static void Example() var pipeline = mlContext.BinaryClassification.Trainers.LightGbm( new Options { - LabelColumn = "IsOver50K", - FeatureColumn = "Features", Booster = new GossBooster.Options { TopRate = 0.3, @@ -36,17 +34,17 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // Accuracy: 0.88 - // AUC: 0.93 - // F1 Score: 0.71 - // Negative Precision: 0.90 - // Negative Recall: 0.94 - // Positive Precision: 0.76 - // Positive Recall: 0.67 + // Expected output: + // Accuracy: 0.88 + // AUC: 0.93 + // F1 Score: 0.71 + // Negative Precision: 0.90 + // Negative Recall: 0.94 + // Positive Precision: 0.76 + // Positive Recall: 0.67 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs index 44a7a77534..0b5347ebc0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs @@ -3,7 +3,7 @@ using Microsoft.ML.Data; using Microsoft.ML.Trainers; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SDCALogisticRegression { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs index eede7b03cb..472e7390a8 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs @@ -2,7 +2,7 @@ using System.Linq; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SDCASupportVectorMachine { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs index c0687d6ee7..2b69730004 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs @@ -1,4 +1,4 @@ -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SymbolicStochasticGradientDescent { @@ -24,15 +24,17 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Accuracy: 0.85 - // AUC: 0.90 - // F1 Score: 0.64 - // Negative Precision: 0.88 - // Negative Recall: 0.93 - // Positive Precision: 0.72 - // Positive Recall: 0.58 + + // Expected output: + // Accuracy: 0.85 + // AUC: 0.90 + // F1 Score: 0.64 + // Negative Precision: 0.88 + // Negative Recall: 0.93 + // Positive Precision: 0.72 + // Positive Recall: 0.58 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs index 9dd4f50c87..f547cd9712 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs @@ -1,4 +1,4 @@ -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification { public static class SymbolicStochasticGradientDescentWithOptions { @@ -22,7 +22,6 @@ public static void Example() var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent( new ML.Trainers.HalLearners.SymSgdClassificationTrainer.Options() { - LabelColumn = "IsOver50K", LearningRate = 0.2f, NumberOfIterations = 10, NumberOfThreads = 1, @@ -33,15 +32,17 @@ public static void Example() // Evaluate how the model is doing on the test data. var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions, "IsOver50K"); + var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Accuracy: 0.84 - // AUC: 0.88 - // F1 Score: 0.60 - // Negative Precision: 0.87 - // Negative Recall: 0.93 - // Positive Precision: 0.69 - // Positive Recall: 0.53 + + // Expected output: + // Accuracy: 0.84 + // AUC: 0.88 + // F1 Score: 0.60 + // Negative Precision: 0.87 + // Negative Recall: 0.93 + // Positive Precision: 0.69 + // Positive Recall: 0.53 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs index 5c6ee5ad5f..f6f7b0d067 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs @@ -3,9 +3,9 @@ using Microsoft.ML.Data; using Microsoft.ML.SamplesUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification { - class LightGbmMulticlassClassification + class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. 
public static void Example() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs index 7d98c9318e..36de9b8fe1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs @@ -5,9 +5,9 @@ using Microsoft.ML.SamplesUtils; using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification { - class LightGbmMulticlassClassificationWithOptions + class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs index 8822a16630..b5857e4538 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs @@ -1,10 +1,8 @@ -using System; -using System.Collections.Generic; -using System.Text; +using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking { - public class LightGbmRanking + public class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -12,31 +10,33 @@ public static void Example() // Creating the ML.Net IHostEnvironment object, needed for the pipeline. var mlContext = new MLContext(); - // Download and featurize the train and validation datasets. - (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + // Download and featurize the dataset. + var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kDataset(mlContext); + + // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split + // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in + // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose. + var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( - labelColumn: "Label", - featureColumn: "Features", - groupIdColumn: "GroupId", numLeaves: 4, minDataPerLeaf: 10, learningRate: 0.1, numBoostRound: 2); // Fit this Pipeline to the Training Data. - var model = pipeline.Fit(trainData); + var model = pipeline.Fit(split.TrainSet); // Evaluate how the model is doing on the test data. 
- var dataWithPredictions = model.Transform(validationData); + var dataWithPredictions = model.Transform(split.TestSet); var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // DCG @N: 1.38, 3.11, 4.94 - // NDCG @N: 7.13, 10.12, 12.62 + // Expected output: + // DCG: @1:1.25, @2:2.69, @3:4.57 + // NDCG: @1:7.01, @2:9.57, @3:12.34 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs index d8f3da41ea..30087131d8 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs @@ -1,9 +1,8 @@ using Microsoft.ML.LightGBM; -using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking { - public class LightGbmRankingWithOptions + public class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -12,33 +11,35 @@ public static void Example() var mlContext = new MLContext(); // Download and featurize the train and validation datasets. - (var trainData, var validationData) = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kTrainAndValidate(mlContext); + var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kDataset(mlContext); + + // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split + // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in + // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose. + var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( new Options { - LabelColumn = "Label", - FeatureColumn = "Features", - GroupIdColumn = "GroupId", NumLeaves = 4, MinDataPerLeaf = 10, LearningRate = 0.1, NumBoostRound = 2 }); - // Fit this Pipeline to the Training Data. - var model = pipeline.Fit(trainData); + // Fit this pipeline to the training Data. + var model = pipeline.Fit(split.TrainSet); // Evaluate how the model is doing on the test data. 
- var dataWithPredictions = model.Transform(validationData); + var dataWithPredictions = model.Transform(split.TestSet); var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output: - // DCG @N: 1.38, 3.11, 4.94 - // NDCG @N: 7.13, 10.12, 12.62 + // Expected output: + // DCG: @1:1.25, @2:2.69, @3:4.57 + // NDCG: @1:7.01, @2:9.57, @3:12.34 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs index d252eb489d..3737e751d5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs @@ -3,7 +3,7 @@ using Microsoft.ML.Data; using static Microsoft.ML.SamplesUtils.DatasetUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation { public static class MatrixFactorization { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs index c73fd7fbcb..cbb11938a0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs @@ -4,7 +4,7 @@ using Microsoft.ML.Trainers; using static Microsoft.ML.SamplesUtils.DatasetUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation { public static class MatrixFactorizationWithOptions { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs index d67da241c9..ce9e27a0fc 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs @@ -2,9 +2,9 @@ using System.Linq; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { - class LightGbmRegression + class LightGbm { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. 
public static void Example() @@ -54,12 +54,12 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output - // L1: 4.97 - // L2: 51.37 - // LossFunction: 51.37 - // RMS: 7.17 - // RSquared: 0.08 + // Expected output + // L1: 4.97 + // L2: 51.37 + // LossFunction: 51.37 + // RMS: 7.17 + // RSquared: 0.08 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs index 3f73df053e..c1c82a9735 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs @@ -4,9 +4,9 @@ using Microsoft.ML.LightGBM; using static Microsoft.ML.LightGBM.Options; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { - class LightGbmRegressionWithOptions + class LightGbmWithOptions { // This example requires installation of additional nuget package Microsoft.ML.LightGBM. public static void Example() @@ -64,12 +64,12 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Output - // L1: 4.97 - // L2: 51.37 - // LossFunction: 51.37 - // RMS: 7.17 - // RSquared: 0.08 + // Expected output + // L1: 4.97 + // L2: 51.37 + // LossFunction: 51.37 + // RMS: 7.17 + // RSquared: 0.08 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs index 003962c5bc..0e75693547 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs @@ -2,7 +2,7 @@ using Microsoft.ML.Data; using Microsoft.ML.SamplesUtils; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { public static class OrdinaryLeastSquares { @@ -55,11 +55,13 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions); ConsoleUtils.PrintMetrics(metrics); - // L1: 4.15 - // L2: 31.98 - // LossFunction: 31.98 - // RMS: 5.65 - // RSquared: 0.56 + + // Expected output: + // L1: 4.15 + // L2: 31.98 + // LossFunction: 31.98 + // RMS: 5.65 + // RSquared: 0.56 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs index 21a6a9e1ae..6cc982b277 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs @@ -3,7 +3,7 @@ using Microsoft.ML.SamplesUtils; using Microsoft.ML.Trainers.HalLearners; -namespace Microsoft.ML.Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression { public static class OrdinaryLeastSquaresWithOptions { @@ -59,11 +59,13 @@ public static void Example() var metrics = mlContext.Regression.Evaluate(dataWithPredictions); ConsoleUtils.PrintMetrics(metrics); - // L1: 4.14 - // L2: 32.35 - // LossFunction: 32.35 - // RMS: 5.69 - // 
RSquared: 0.56 + + // Expected output: + // L1: 4.14 + // L2: 32.35 + // LossFunction: 32.35 + // RMS: 5.69 + // RSquared: 0.56 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 6fa4e40705..d28cdd4d77 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - LightGbmRanking.Example(); + TakeRows.Example(); } } } diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs index 82513280da..8c64adc842 100644 --- a/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/RankingMetrics.cs @@ -15,10 +15,10 @@ public sealed class RankingMetrics public double[] Ndcg { get; } /// - ///Array of discounted cumulative gains where i-th element represent DCG@i. + /// Array of discounted cumulative gains where i-th element represent DCG@i. /// Discounted Cumulative gain /// is the sum of the gains, for all the instances i, normalized by the natural logarithm of the instance + 1. - /// Note that unlike the Wikipedia article, ML.Net uses the natural logarithm. + /// Note that unlike the Wikipedia article, ML.NET uses the natural logarithm. /// /// public double[] Dcg { get; } diff --git a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs index 40407fe142..5c581c6856 100644 --- a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs @@ -55,8 +55,18 @@ public static void PrintMetrics(RegressionMetrics metrics) /// Ranker metrics. public static void PrintMetrics(RankerMetrics metrics) { - Console.WriteLine($"DCG@N: {string.Join(", ", metrics.Dcg.Select(d => Math.Round(d, 2)).ToArray())}"); - Console.WriteLine($"NDCG@N: {string.Join(", ", metrics.Ndcg.Select(d => Math.Round(d, 2)).ToArray())}"); + Console.WriteLine($"DCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Dcg))}"); + Console.WriteLine($"NDCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Ndcg))}"); + } + + private static string[] RoundAndBeautifyRankerMetrics(double[] input) + { + string[] result = input.Select(d => Math.Round(d, 2).ToString()).ToArray(); + for (int i = 0; i < result.Length; i++) + { + result[i] = $"@{(i + 1).ToString()}:{result[i]}"; + } + return result; } } } diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index b123b604f4..86e8b61ebe 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -137,7 +137,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) .Append(mlContext.Transforms.Concatenate("Features", "workclass", "education", "marital-status", "occupation", "relationship", "ethnicity", "native-country", "age", "education-num", "capital-gain", "capital-loss", "hours-per-week")) - // Min-max normalized all the features + // Min-max normalize all the features .Append(mlContext.Transforms.Normalize("Features")); var data = reader.Read(dataFile); @@ -145,7 +145,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) return featurizedData; } - public static string DownloadMslrWeb10kTrain() + public static string DownloadMslrWeb10k() { var fileName = "MSLRWeb10KTrain720kRows.tsv"; if (!File.Exists(fileName)) @@ 
-153,19 +153,10 @@ public static string DownloadMslrWeb10kTrain() return fileName; } - public static string DownloadMslrWeb10kValidate() - { - var fileName = "MSLRWeb10KValidate240kRows.tsv"; - if (!File.Exists(fileName)) - Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.VALIDATE.160MB_240k-rows.tsv", fileName); - return fileName; - } - - public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(MLContext mlContext) + public static IDataView LoadFeaturizedMslrWeb10kDataset(MLContext mlContext) { // Download the training and validation files. - string trainDataFile = DownloadMslrWeb10kTrain(); - string validationDataFile = DownloadMslrWeb10kValidate(); + string dataFile = DownloadMslrWeb10k(); // Create the reader to read the data. var reader = mlContext.Data.CreateTextLoader( @@ -177,23 +168,18 @@ public static (IDataView, IDataView) LoadFeaturizedMslrWeb10kTrainAndValidate(ML } ); - // Load the raw training and validation datasets. - var trainData = reader.Read(trainDataFile); - var validationData = reader.Read(validationDataFile); + // Load the raw dataset. + var data = reader.Read(dataFile); // Create the featurization pipeline. First, hash the GroupId column. var pipeline = mlContext.Transforms.Conversion.Hash("GroupId") // Replace missing values in Features column with the default replacement value for its type. .Append(mlContext.Transforms.ReplaceMissingValues("Features")); - // Fit the pipeline on the training data. - var fittedPipeline = pipeline.Fit(trainData); - - // Use the fitted pipeline to transform the training and validation datasets. - var transformedTrainData = fittedPipeline.Transform(trainData); - var transformedValidationData = fittedPipeline.Transform(validationData); + // Fit the pipeline and transform the dataset. + var transformedData = pipeline.Fit(data).Transform(data); - return (transformedTrainData, transformedValidationData); + return transformedData; } /// From 345cf60448caf4f0682798dbe4318de74c489c4e Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Tue, 19 Feb 2019 23:19:02 -0800 Subject: [PATCH 08/13] Adding a sample for LightGbm Ranking --- docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs | 2 +- src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index d28cdd4d77..6fa4e40705 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - TakeRows.Example(); + LightGbmRanking.Example(); } } } diff --git a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs index 5c581c6856..040d765b9b 100644 --- a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs @@ -53,7 +53,7 @@ public static void PrintMetrics(RegressionMetrics metrics) /// Pretty-print RankerMetrics objects. /// /// Ranker metrics. 
- public static void PrintMetrics(RankerMetrics metrics) + public static void PrintMetrics(RankingMetrics metrics) { Console.WriteLine($"DCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Dcg))}"); Console.WriteLine($"NDCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Ndcg))}"); diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 86e8b61ebe..f09e9ad779 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -162,9 +162,9 @@ public static IDataView LoadFeaturizedMslrWeb10kDataset(MLContext mlContext) var reader = mlContext.Data.CreateTextLoader( columns: new[] { - new TextLoader.Column("Label", DataKind.R4, 0), - new TextLoader.Column("GroupId", DataKind.TX, 1), - new TextLoader.Column("Features", DataKind.R4, new[] { new TextLoader.Range(2, 138) }) + new TextLoader.Column("Label", DataKind.Single, 0), + new TextLoader.Column("GroupId", DataKind.String, 1), + new TextLoader.Column("Features", DataKind.Single, new[] { new TextLoader.Range(2, 138) }) } ); From c25a3c33869efb34236bcd65d155fc50ffc798d2 Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Fri, 22 Feb 2019 18:19:12 -0800 Subject: [PATCH 09/13] PR feedback + cleaning up namespaces in Microsoft.ML.Samples project --- docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 6fa4e40705..d28cdd4d77 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - LightGbmRanking.Example(); + TakeRows.Example(); } } } From 34ecd4a6dfe71b168cca8e0836afe4fe6ca8cf55 Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Fri, 22 Feb 2019 19:32:08 -0800 Subject: [PATCH 10/13] nit --- .../Dynamic/Trainers/Ranking/LightGBMRanking.cs | 2 +- .../Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs index b5857e4538..eccf87af8c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs @@ -31,7 +31,7 @@ public static void Example() // Evaluate how the model is doing on the test data. var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); // Expected output: diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs index 30087131d8..c142881716 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs @@ -34,7 +34,7 @@ public static void Example() // Evaluate how the model is doing on the test data. 
var dataWithPredictions = model.Transform(split.TestSet); - var metrics = mlContext.Ranking.Evaluate(dataWithPredictions, "Label", "GroupId"); + var metrics = mlContext.Ranking.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); // Expected output: From 1c99a4f9e0df8ee2ea97a482f40c1927b9513b97 Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Mon, 25 Feb 2019 17:02:38 -0800 Subject: [PATCH 11/13] Changing dataset to small sample and other feedback --- .../Dynamic/Trainers/Ranking/LightGBMRanking.cs | 4 ++-- .../Ranking/LightGBMRankingWithOptions.cs | 15 +++++++++++---- .../SamplesDatasetUtils.cs | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs index eccf87af8c..c3bd9d604e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs @@ -35,8 +35,8 @@ public static void Example() SamplesUtils.ConsoleUtils.PrintMetrics(metrics); // Expected output: - // DCG: @1:1.25, @2:2.69, @3:4.57 - // NDCG: @1:7.01, @2:9.57, @3:12.34 + // DCG: @1:1.71, @2:3.88, @3:7.93 + // NDCG: @1:7.98, @2:12.14, @3:16.62 } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs index c142881716..ccacec1b58 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs @@ -1,4 +1,5 @@ using Microsoft.ML.LightGBM; +using static Microsoft.ML.LightGBM.Options; namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking { @@ -25,7 +26,11 @@ public static void Example() NumLeaves = 4, MinDataPerLeaf = 10, LearningRate = 0.1, - NumBoostRound = 2 + NumBoostRound = 2, + Booster = new TreeBooster.Options + { + FeatureFraction = 0.9 + } }); // Fit this pipeline to the training Data. @@ -37,9 +42,11 @@ public static void Example() var metrics = mlContext.Ranking.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // Expected output: - // DCG: @1:1.25, @2:2.69, @3:4.57 - // NDCG: @1:7.01, @2:9.57, @3:12.34 + // NOTE: + // + // This sample is currently broken due to a bug in setting the GroupId column in LightGbm when using Options. 
+ // + // Please follow GitHub issue 2652 to be notified of a fix: https://github.com/dotnet/machinelearning/issues/2652 } } } diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index f09e9ad779..cddba0238e 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -147,9 +147,9 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext) public static string DownloadMslrWeb10k() { - var fileName = "MSLRWeb10KTrain720kRows.tsv"; + var fileName = "MSLRWeb10KTrain10kRows.tsv"; if (!File.Exists(fileName)) - Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K_Fold1.TRAIN.500MB_720k-rows.tsv", fileName); + Download("https://tlcresources.blob.core.windows.net/datasets/MSLR-WEB10K/MSLR-WEB10K%2BFold1.TRAIN.SMALL_10k-rows.tsv", fileName); return fileName; } From 07bdf9b8728a111a41e712e9d3bec875d11ef0a8 Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Tue, 26 Feb 2019 10:48:36 -0800 Subject: [PATCH 12/13] Renaming LightGbm sample filenames --- .../{LightGBMBinaryClassification.cs => LightGbm.cs} | 0 ...MBinaryClassificationWithOptions.cs => LightGbmWithOptions.cs} | 0 .../{LightGBMMulticlassClassification.cs => LightGbm.cs} | 0 ...ticlassClassificationWithOptions.cs => LightGbmWithOptions.cs} | 0 .../Dynamic/Trainers/Ranking/{LightGBMRanking.cs => LightGbm.cs} | 0 .../{LightGBMRankingWithOptions.cs => LightGbmWithOptions.cs} | 0 .../Trainers/Regression/{LightGBMRegression.cs => LightGbm.cs} | 0 .../{LightGBMRegressionWithOptions.cs => LightGbmWithOptions.cs} | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/{LightGBMBinaryClassification.cs => LightGbm.cs} (100%) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/{LightGBMBinaryClassificationWithOptions.cs => LightGbmWithOptions.cs} (100%) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/{LightGBMMulticlassClassification.cs => LightGbm.cs} (100%) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/{LightGBMMulticlassClassificationWithOptions.cs => LightGbmWithOptions.cs} (100%) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/{LightGBMRanking.cs => LightGbm.cs} (100%) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/{LightGBMRankingWithOptions.cs => LightGbmWithOptions.cs} (100%) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/{LightGBMRegression.cs => LightGbm.cs} (100%) rename docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/{LightGBMRegressionWithOptions.cs => LightGbmWithOptions.cs} (100%) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassification.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs similarity index 100% rename from 
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGBMBinaryClassificationWithOptions.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassification.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGBMMulticlassClassificationWithOptions.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRanking.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGBMRankingWithOptions.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegression.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGBMRegressionWithOptions.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs From 2e9641813430ca44e1a847d037dd844bc530dfd7 Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Tue, 26 Feb 2019 11:04:22 -0800 Subject: [PATCH 13/13] Feedback --- .../Trainers/Ranking/LightGbmWithOptions.cs | 8 +++----- src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs | 14 ++------------ 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs index ccacec1b58..dc898fb4d3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs @@ -42,11 +42,9 @@ public static void Example() var metrics = 
mlContext.Ranking.Evaluate(dataWithPredictions); SamplesUtils.ConsoleUtils.PrintMetrics(metrics); - // NOTE: - // - // This sample is currently broken due to a bug in setting the GroupId column in LightGbm when using Options. - // - // Please follow GitHub issue 2652 to be notified of a fix: https://github.com/dotnet/machinelearning/issues/2652 + // Expected output: + // DCG: @1:1.71, @2:3.88, @3:7.93 + // NDCG: @1:7.98, @2:12.14, @3:16.62 } } } diff --git a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs index 040d765b9b..af707f5a32 100644 --- a/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs @@ -55,18 +55,8 @@ public static void PrintMetrics(RegressionMetrics metrics) /// Ranker metrics. public static void PrintMetrics(RankingMetrics metrics) { - Console.WriteLine($"DCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Dcg))}"); - Console.WriteLine($"NDCG: {string.Join(", ", RoundAndBeautifyRankerMetrics(metrics.Ndcg))}"); - } - - private static string[] RoundAndBeautifyRankerMetrics(double[] input) - { - string[] result = input.Select(d => Math.Round(d, 2).ToString()).ToArray(); - for (int i = 0; i < result.Length; i++) - { - result[i] = $"@{(i + 1).ToString()}:{result[i]}"; - } - return result; + Console.WriteLine($"DCG: {string.Join(", ", metrics.Dcg.Select((d, i) => $"@{i + 1}:{d:F2}").ToArray())}"); + Console.WriteLine($"NDCG: {string.Join(", ", metrics.Ndcg.Select((d, i) => $"@{i + 1}:{d:F2}").ToArray())}"); } } }
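
Note on the final ConsoleUtils.PrintMetrics shape: the indexed Select overload folds the rounding and the "@position:value" labeling that RoundAndBeautifyRankerMetrics used to do into a single expression per metric array. A minimal standalone sketch of that formatting, using made-up DCG values rather than real RankingMetrics output:

    using System;
    using System.Linq;

    internal static class RankingMetricFormattingSketch
    {
        private static void Main()
        {
            // Hypothetical DCG@1..@3 values, for illustration only.
            double[] dcg = { 1.7123, 3.8845, 7.9301 };

            // Same pattern as the refactored PrintMetrics: Select((d, i) => ...) pairs each
            // value with its zero-based index, and the "F2" format specifier prints two decimals.
            string formatted = string.Join(", ", dcg.Select((d, i) => $"@{i + 1}:{d:F2}"));

            Console.WriteLine($"DCG: {formatted}");
            // Prints: DCG: @1:1.71, @2:3.88, @3:7.93
        }
    }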
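
Usage note on the dataset consolidation: patches 08-11 replace the separate train/validate downloads with a single LoadFeaturizedMslrWeb10kDataset call over the 10k-row MSLR-WEB10K sample, and the ranking samples then carve their test set out of that one featurized view (the diffs above only show split.TestSet being consumed). A rough end-to-end sketch of that flow; the TrainTestSplit call, its testFraction argument, and split.TrainSet are assumptions not shown in the diffs, while the trainer options mirror the LightGbmWithOptions sample as of patch 11:

    using Microsoft.ML.LightGBM;
    using static Microsoft.ML.LightGBM.Options;

    namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking
    {
        public static class LightGbmRankingFlowSketch
        {
            public static void Run()
            {
                var mlContext = new MLContext();

                // One download, one featurized IDataView (hashed GroupId, missing Features replaced).
                var data = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kDataset(mlContext);

                // Assumed split API: a ranking-catalog TrainTestSplit returning TrainSet/TestSet.
                var split = mlContext.Ranking.TrainTestSplit(data, testFraction: 0.1);

                // Small tree, two boosting rounds, 90% of features sampled per tree,
                // matching the LightGbmWithOptions sample.
                var pipeline = mlContext.Ranking.Trainers.LightGbm(new Options
                {
                    LabelColumn = "Label",
                    FeatureColumn = "Features",
                    GroupIdColumn = "GroupId",
                    NumLeaves = 4,
                    MinDataPerLeaf = 10,
                    LearningRate = 0.1,
                    NumBoostRound = 2,
                    Booster = new TreeBooster.Options { FeatureFraction = 0.9 }
                });

                var model = pipeline.Fit(split.TrainSet);
                var dataWithPredictions = model.Transform(split.TestSet);

                // Label and GroupId column arguments are no longer passed explicitly (patch 10).
                var metrics = mlContext.Ranking.Evaluate(dataWithPredictions);
                SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
            }
        }
    }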