Commit a16eb30

Author: Shahab Moradi

Added samples & docs for BinaryClassification.StochasticGradientDescent (#2688)
* Added samples & docs for BinaryClassification.StochasticGradientDescent, plus a bunch of typo fixing.
* Addressed PR comments.
* Mentioned Hogwild.
* Updates to exampleWeightColumnName.
* Fixed trailing whitespaces.
1 parent 2ef0614 commit a16eb30

File tree: 16 files changed, +226 −45 lines


docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs (+1 −1)

@@ -5,7 +5,7 @@ namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
     public static class AveragedPerceptron
     {
         // In this example we will use the adult income dataset. The goal is to predict
-        // if a person's income is above $50K or not, based on different pieces of information about that person.
+        // if a person's income is above $50K or not, based on demographic information about that person.
         // For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
         public static void Example()
         {

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs (+1 −1)

@@ -6,7 +6,7 @@ namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
     public static class AveragedPerceptronWithOptions
     {
         // In this example we will use the adult income dataset. The goal is to predict
-        // if a person's income is above $50K or not, based on different pieces of information about that person.
+        // if a person's income is above $50K or not, based on demographic information about that person.
         // For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
         public static void Example()
         {
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs (+47 −0)

@@ -0,0 +1,47 @@
+using Microsoft.ML;
+
+namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
+{
+    public static class StochasticGradientDescent
+    {
+        // In this example we will use the adult income dataset. The goal is to predict
+        // if a person's income is above $50K or not, based on demographic information about that person.
+        // For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
+        public static void Example()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            // Setting the seed to a fixed number in this example to make outputs deterministic.
+            var mlContext = new MLContext(seed: 0);
+
+            // Download and featurize the dataset.
+            var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
+
+            // Leave out 10% of the data for testing.
+            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+
+            // Create the training pipeline.
+            var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescent();
+
+            // Fit this pipeline to the training data.
+            var model = pipeline.Fit(trainTestData.TrainSet);
+
+            // Evaluate how the model is doing on the test data.
+            var dataWithPredictions = model.Transform(trainTestData.TestSet);
+            var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions);
+            SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
+
+            // Expected output:
+            // Accuracy: 0.85
+            // AUC: 0.90
+            // F1 Score: 0.67
+            // Negative Precision: 0.90
+            // Negative Recall: 0.91
+            // Positive Precision: 0.68
+            // Positive Recall: 0.65
+            // LogLoss: 0.48
+            // LogLossReduction: 38.31
+            // Entropy: 0.78
+        }
+    }
+}
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs (+59 −0)

@@ -0,0 +1,59 @@
+using Microsoft.ML;
+using Microsoft.ML.Trainers;
+
+namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
+{
+    public static class StochasticGradientDescentWithOptions
+    {
+        // In this example we will use the adult income dataset. The goal is to predict
+        // if a person's income is above $50K or not, based on demographic information about that person.
+        // For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
+        public static void Example()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            // Setting the seed to a fixed number in this example to make outputs deterministic.
+            var mlContext = new MLContext(seed: 0);
+
+            // Download and featurize the dataset.
+            var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
+
+            // Leave out 10% of the data for testing.
+            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+
+            // Define the trainer options.
+            var options = new SgdBinaryTrainer.Options()
+            {
+                // Make the convergence tolerance tighter.
+                ConvergenceTolerance = 5e-5,
+                // Increase the maximum number of passes over the training data.
+                MaxIterations = 30,
+                // Give the instances of the positive class slightly more weight.
+                PositiveInstanceWeight = 1.2f,
+            };
+
+            // Create the training pipeline.
+            var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescent(options);
+
+            // Fit this pipeline to the training data.
+            var model = pipeline.Fit(trainTestData.TrainSet);
+
+            // Evaluate how the model is doing on the test data.
+            var dataWithPredictions = model.Transform(trainTestData.TestSet);
+            var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions);
+            SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
+
+            // Expected output:
+            // Accuracy: 0.85
+            // AUC: 0.90
+            // F1 Score: 0.67
+            // Negative Precision: 0.91
+            // Negative Recall: 0.89
+            // Positive Precision: 0.65
+            // Positive Recall: 0.70
+            // LogLoss: 0.48
+            // LogLossReduction: 37.52
+            // Entropy: 0.78
+        }
+    }
+}
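
A note on the options used above: PositiveInstanceWeight scales the loss contributed by each positive example, which helps when the classes are imbalanced, as they are in this dataset. The sketch below shows how such a weight typically enters a logistic loss; it is illustrative only, not ML.NET's internal implementation, and the class and method names are hypothetical.

using System;

internal static class WeightedLossSketch
{
    // Illustrative only: a positive-class weight scaling the logistic loss.
    // 'positiveInstanceWeight' plays the role of Options.PositiveInstanceWeight.
    public static double WeightedLogisticLoss(double score, bool isPositive, double positiveInstanceWeight)
    {
        double y = isPositive ? 1.0 : -1.0;                  // label encoded as -1/+1
        double loss = Math.Log(1.0 + Math.Exp(-y * score));  // standard logistic loss
        return isPositive ? positiveInstanceWeight * loss : loss;
    }
}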

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs (+1 −1)

@@ -4,7 +4,7 @@ public static class SymbolicStochasticGradientDescent
     {
         // This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.HalLearners/">Microsoft.ML.HalLearners</a>.
         // In this example we will use the adult income dataset. The goal is to predict
-        // if a person's income is above $50K or not, based on different pieces of information about that person.
+        // if a person's income is above $50K or not, based on demographic information about that person.
         // For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult
         public static void Example()
         {

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs (+1 −1)

@@ -4,7 +4,7 @@ public static class SymbolicStochasticGradientDescentWithOptions
     {
         // This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.HalLearners/">Microsoft.ML.HalLearners</a>.
         // In this example we will use the adult income dataset. The goal is to predict
-        // if a person's income is above $50K or not, based on different pieces of information about that person.
+        // if a person's income is above $50K or not, based on demographic information about that person.
        // For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult
         public static void Example()
         {

src/Microsoft.ML.Data/EntryPoints/InputBase.cs (+1 −1)

@@ -95,7 +95,7 @@ public abstract class LearnerInputBaseWithLabel : LearnerInputBase
     public abstract class LearnerInputBaseWithWeight : LearnerInputBaseWithLabel
    {
         /// <summary>
-        /// Column to use for example weight.
+        /// The name of the example weight column.
         /// </summary>
         [Argument(ArgumentType.AtMostOnce, HelpText = "Column to use for example weight", ShortName = "weight", SortOrder = 4, Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly)]
         public string WeightColumn = null;

src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs (+12 −0)

@@ -23,6 +23,18 @@ public static void PrintMetrics(BinaryClassificationMetrics metrics)
             Console.WriteLine($"Positive Recall: {metrics.PositiveRecall:F2}");
         }
 
+        /// <summary>
+        /// Pretty-print CalibratedBinaryClassificationMetrics objects.
+        /// </summary>
+        /// <param name="metrics"><see cref="CalibratedBinaryClassificationMetrics"/> object.</param>
+        public static void PrintMetrics(CalibratedBinaryClassificationMetrics metrics)
+        {
+            PrintMetrics(metrics as BinaryClassificationMetrics);
+            Console.WriteLine($"LogLoss: {metrics.LogLoss:F2}");
+            Console.WriteLine($"LogLossReduction: {metrics.LogLossReduction:F2}");
+            Console.WriteLine($"Entropy: {metrics.Entropy:F2}");
+        }
+
         /// <summary>
         /// Pretty-print RegressionMetrics objects.
         /// </summary>

src/Microsoft.ML.StandardLearners/Standard/Online/AveragedLinear.cs (+1 −1)

@@ -60,7 +60,7 @@ public abstract class AveragedLinearOptions : OnlineLinearOptions
         public bool DoLazyUpdates = true;
 
         /// <summary>
-        /// L2 weight for <a href='tmpurl_regularization'>regularization</a>.
+        /// The L2 weight for <a href='tmpurl_regularization'>regularization</a>.
         /// </summary>
         [Argument(ArgumentType.AtMostOnce, HelpText = "L2 Regularization Weight", ShortName = "reg", SortOrder = 50)]
         [TGUI(Label = "L2 Regularization Weight")]

src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs (+1 −1)

@@ -54,7 +54,7 @@ public sealed class AveragedPerceptronTrainer : AveragedLinearTrainer<BinaryPred
         private readonly Options _args;
 
         /// <summary>
-        /// Options for the averaged perceptron trainer.
+        /// Options for the <see cref="AveragedPerceptronTrainer"/>.
         /// </summary>
         public sealed class Options : AveragedLinearOptions
         {

src/Microsoft.ML.StandardLearners/Standard/Online/OnlineLinear.cs (+2 −2)

@@ -24,7 +24,7 @@ public abstract class OnlineLinearOptions : LearnerInputBaseWithLabel
         /// <summary>
         /// Number of passes through the training dataset.
         /// </summary>
-        [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter, numIterations", SortOrder = 50)]
+        [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter,numIterations", SortOrder = 50)]
         [TGUI(Label = "Number of Iterations", Description = "Number of training iterations through data", SuggestedSweeps = "1,10,100")]
         [TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize: 10, isLogScale: true)]
         public int NumberOfIterations = OnlineDefault.NumIterations;

@@ -43,7 +43,7 @@ public abstract class OnlineLinearOptions : LearnerInputBaseWithLabel
         /// This property is only used if the provided value is positive and <see cref="InitialWeights"/> is not specified.
         /// The weights and bias will be randomly selected from InitialWeights * [-0.5,0.5] interval with uniform distribution.
         /// </value>
-        [Argument(ArgumentType.AtMostOnce, HelpText = "Init weights diameter", ShortName = "initwts, initWtsDiameter", SortOrder = 140)]
+        [Argument(ArgumentType.AtMostOnce, HelpText = "Init weights diameter", ShortName = "initwts,initWtsDiameter", SortOrder = 140)]
         [TGUI(Label = "Initial Weights Scale", SuggestedSweeps = "0,0.1,0.5,1")]
         [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0.0f, 1.0f, numSteps: 5)]
         public float InitialWeightsDiameter = 0;

src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs (+52 −3)

@@ -1723,36 +1723,77 @@ public abstract class SgdBinaryTrainerBase<TModel> :
     {
         public class OptionsBase : LearnerInputBaseWithWeight
         {
+            /// <summary>
+            /// The L2 weight for <a href='tmpurl_regularization'>regularization</a>.
+            /// </summary>
             [Argument(ArgumentType.AtMostOnce, HelpText = "L2 Regularization constant", ShortName = "l2", SortOrder = 50)]
             [TGUI(Label = "L2 Regularization Constant", SuggestedSweeps = "1e-7,5e-7,1e-6,5e-6,1e-5")]
             [TlcModule.SweepableDiscreteParam("L2Const", new object[] { 1e-7f, 5e-7f, 1e-6f, 5e-6f, 1e-5f })]
             public float L2Weight = Defaults.L2Weight;
 
+            /// <summary>
+            /// The degree of lock-free parallelism used by SGD.
+            /// </summary>
+            /// <value>
+            /// Defaults to automatic depending on data sparseness. Determinism is not guaranteed.
+            /// </value>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed.", ShortName = "nt,t,threads", SortOrder = 50)]
             [TGUI(Label = "Number of threads", SuggestedSweeps = "1,2,4")]
             public int? NumThreads;
 
+            /// <summary>
+            /// The convergence tolerance. If the exponential moving average of loss reductions falls below this tolerance,
+            /// the algorithm is deemed to have converged and will stop.
+            /// </summary>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Exponential moving averaged improvement tolerance for convergence", ShortName = "tol")]
             [TGUI(SuggestedSweeps = "1e-2,1e-3,1e-4,1e-5")]
             [TlcModule.SweepableDiscreteParam("ConvergenceTolerance", new object[] { 1e-2f, 1e-3f, 1e-4f, 1e-5f })]
             public double ConvergenceTolerance = 1e-4;
 
+            /// <summary>
+            /// The maximum number of passes through the training dataset.
+            /// </summary>
+            /// <value>
+            /// Set to 1 to simulate online learning.
+            /// </value>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of iterations; set to 1 to simulate online learning.", ShortName = "iter")]
             [TGUI(Label = "Max number of iterations", SuggestedSweeps = "1,5,10,20")]
             [TlcModule.SweepableDiscreteParam("MaxIterations", new object[] { 1, 5, 10, 20 })]
             public int MaxIterations = Defaults.MaxIterations;
 
+            /// <summary>
+            /// The initial <a href="tmpurl_lr">learning rate</a> used by SGD.
+            /// </summary>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Initial learning rate (only used by SGD)", ShortName = "ilr,lr")]
             [TGUI(Label = "Initial Learning Rate (for SGD)")]
             public double InitLearningRate = Defaults.InitLearningRate;
 
+            /// <summary>
+            /// Determines whether to shuffle data for each training iteration.
+            /// </summary>
+            /// <value>
+            /// <see langword="true" /> to shuffle data for each training iteration; otherwise, <see langword="false" />.
+            /// Default is <see langword="true" />.
+            /// </value>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Shuffle data every epoch?", ShortName = "shuf")]
             [TlcModule.SweepableDiscreteParam("Shuffle", null, isBool: true)]
             public bool Shuffle = true;
 
+            /// <summary>
+            /// The weight to be applied to the positive class. This is useful for training with imbalanced data.
+            /// </summary>
+            /// <value>
+            /// The default value is 1, which means no extra weight.
+            /// </value>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Apply weight to the positive class, for imbalanced data", ShortName = "piw")]
             public float PositiveInstanceWeight = 1;
 
+            /// <summary>
+            /// Determines the frequency of checking for convergence in terms of number of iterations.
+            /// </summary>
+            /// <value>
+            /// Default equals <see cref="NumThreads"/>.
+            /// </value>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Convergence check frequency (in terms of number of iterations). Default equals number of threads", ShortName = "checkFreq")]
             public int? CheckFrequency;
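
The ConvergenceTolerance documented above is a threshold on an exponential moving average of per-pass loss improvements. The sketch below shows that style of stopping rule; the 0.5 decay factor and the 'runOnePass' callback are assumptions for illustration, not ML.NET's internals.

using System;

internal static class ConvergenceSketch
{
    // Illustrative stopping rule: track an exponential moving average (EMA)
    // of per-pass loss improvements and stop once it drops below the
    // tolerance. 'runOnePass' stands in for one SGD pass over the data that
    // returns the average training loss.
    public static int TrainUntilConverged(Func<double> runOnePass, double convergenceTolerance, int maxIterations)
    {
        double previousLoss = double.NaN;
        double emaImprovement = double.NaN;
        int iter = 1;
        for (; iter <= maxIterations; iter++)
        {
            double loss = runOnePass();
            if (!double.IsNaN(previousLoss))
            {
                double improvement = previousLoss - loss;
                emaImprovement = double.IsNaN(emaImprovement)
                    ? improvement
                    : 0.5 * emaImprovement + 0.5 * improvement;
                if (emaImprovement < convergenceTolerance)
                    break; // deemed converged
            }
            previousLoss = loss;
        }
        return iter; // number of passes actually run
    }
}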

@@ -1802,7 +1843,7 @@ internal static class Defaults
         /// <param name="env">The environment to use.</param>
         /// <param name="featureColumn">The name of the feature column.</param>
         /// <param name="labelColumn">The name of the label column.</param>
-        /// <param name="weightColumn">The name for the example weight column.</param>
+        /// <param name="weightColumn">The name of the example weight column.</param>
         /// <param name="maxIterations">The maximum number of iterations; set to 1 to simulate online learning.</param>
         /// <param name="initLearningRate">The initial learning rate used by SGD.</param>
         /// <param name="l2Weight">The L2 regularizer constant.</param>
@@ -2077,13 +2118,21 @@ private protected override void CheckLabel(RoleMappedData examples, out int weig
     }
 
     /// <summary>
-    /// Train logistic regression using a parallel stochastic gradient method.
+    /// The <see cref="IEstimator{TTransformer}"/> for training logistic regression using a parallel stochastic gradient method.
+    /// The trained model is <a href='tmpurl_calib'>calibrated</a> and can produce probabilities by feeding the output of the
+    /// linear function to a <see cref="PlattCalibrator"/>.
     /// </summary>
+    /// <remarks>
+    /// Stochastic Gradient Descent (SGD) is a popular stochastic optimization procedure that can be integrated
+    /// into many machine learning tasks to achieve state-of-the-art performance. This trainer implements Hogwild SGD for binary classification,
+    /// which supports multi-threading without any locking. If the associated optimization problem is sparse, Hogwild SGD achieves a nearly optimal
+    /// rate of convergence. For more details about Hogwild SGD, please refer to http://arxiv.org/pdf/1106.5730v2.pdf.
+    /// </remarks>
     public sealed class SgdBinaryTrainer :
         SgdBinaryTrainerBase<CalibratedModelParametersBase<LinearBinaryModelParameters, PlattCalibrator>>
     {
         /// <summary>
-        /// Options available to training logistic regression using the implemented stochastic gradient method.
+        /// Options for the <see cref="SgdBinaryTrainer"/>.
         /// </summary>
         public sealed class Options : OptionsBase
         {
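
The Hogwild reference in the new remarks describes lock-free parallel SGD: threads read and update a shared weight vector without synchronization, and on sparse problems the resulting races are rare enough not to hurt convergence. Below is a minimal sketch of the idea in C#; it is illustrative only and is not ML.NET's actual implementation.

using System;
using System.Threading.Tasks;

internal static class HogwildSketch
{
    // One lock-free parallel pass of SGD over a logistic loss, in the spirit
    // of Hogwild (http://arxiv.org/pdf/1106.5730v2.pdf). Threads update the
    // shared 'weights' array without locks; races on individual components
    // are tolerated.
    public static void TrainOnePass(float[][] features, float[] labels, float[] weights, float learningRate)
    {
        Parallel.For(0, features.Length, i =>
        {
            // Score with a possibly stale snapshot of the weights.
            float score = 0;
            for (int j = 0; j < weights.Length; j++)
                score += weights[j] * features[i][j];

            // Gradient scale of the logistic loss; labels are in {-1, +1}.
            float gradScale = -labels[i] / (1 + (float)Math.Exp(labels[i] * score));

            // Unsynchronized update of the shared weight vector.
            for (int j = 0; j < weights.Length; j++)
                weights[j] -= learningRate * gradScale * features[i][j];
        });
    }
}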
