
Commit 3343e94

sfilipi authored and eerhardt committed
Adding xml style documentation for trainers (dotnet#393)
* Adding xml style documentation for lbfgs, sdca and averagedPerceptron trainers, to improve what's currently on docs.microsoft.com
* Regenerating the C#Api file
* Removing the control characters from the description when generating the ep_list.tsv, so that they have one line per entry point. Formatting.
* Spaces
* The epList.tsv file and the manifest should not have the platform specific new line characters.
* Merge fix
1 parent 474dc70 commit 3343e94

File tree

11 files changed: +165 -21 lines changed


src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs

+16
@@ -222,6 +222,22 @@ internal virtual void Check(IHostEnvironment env)
         }
     }

+        internal const string SDCADetailedSummary = @"This classifier is a trainer based on the Stochastic Dual Coordinate
+Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions.
+The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation
+that supports multi-threading.
+Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
+Several choices of loss functions are also provided.
+The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
+For more information on SDCA, see:
+<see href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</see>.
+<see href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</see>.
+Note that SDCA is a stochastic and streaming optimization algorithm.
+The results depend on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to
+`False` and `NumThreads` to `1`.
+Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence.
+In general, the larger the l2_weight, the faster SDCA converges.";
+
         // The order of these matter, since they are used as indices into arrays.
         protected enum MetricKind
         {
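The SDCA method described in the summary above can be made concrete with a small sketch. This is an illustrative toy, not ML.NET's implementation: it runs dual coordinate ascent for an L2-regularized linear SVM (hinge loss) only, with no L1 term, no bias, dense features, and a single thread. Here `lam` plays the role of the l2_weight, and the fixed RNG seed reflects the reproducibility note: the visiting order of examples is the only source of randomness.

```python
# Toy SDCA sketch for an L2-regularized linear SVM (hinge loss).
# Maintains dual variables alpha (one per example, clipped to [0, 1])
# and keeps the primal weights w in sync with them.
import random

def sdca_train(xs, ys, lam=0.1, epochs=50, seed=0):
    n, d = len(xs), len(xs[0])
    alpha = [0.0] * n               # dual variables
    w = [0.0] * d                   # primal weights, w = sum_i alpha_i*y_i*x_i/(lam*n)
    rng = random.Random(seed)       # fixed seed => reproducible visiting order
    order = list(range(n))
    for _ in range(epochs):
        rng.shuffle(order)          # stochastic: results depend on data order
        for i in order:
            xi, yi = xs[i], ys[i]
            margin = yi * sum(wj * xj for wj, xj in zip(w, xi))
            norm2 = sum(xj * xj for xj in xi)
            if norm2 == 0.0:
                continue
            # Closed-form maximization of the dual in coordinate i, clipped to [0, 1].
            delta = min(1.0, max(0.0, alpha[i] + (1.0 - margin) * lam * n / norm2)) - alpha[i]
            alpha[i] += delta
            # Primal update keeps w consistent with the new alpha.
            scale = delta * yi / (lam * n)
            w = [wj + scale * xj for wj, xj in zip(w, xi)]
    return w

def predict(w, x):
    return 1 if sum(wj * xj for wj, xj in zip(w, x)) >= 0 else -1
```

On a linearly separable toy set the interleaved primal and dual updates converge quickly, which is the behavior the synchronization in the multi-threaded implementation is designed to preserve.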

src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs

+22
@@ -94,6 +94,28 @@ public abstract class ArgumentsBase : LearnerInputBaseWithWeight
             public bool EnforceNonNegativity = false;
         }

+        internal const string DetailedSummary = @"Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution.
+If the dependent variable has only two possible values (success/failure), then the logistic regression is binary.
+If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
+The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method.
+Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps.
+But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables.
+The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
+This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations.
+Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevent overfitting by penalizing models with extreme coefficient values.
+This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis.
+An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
+l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated with features that are relatively unimportant towards 0.
+l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero.
+Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms.
+The default values of x and y are both 1.
+Aggressive regularization can harm predictive capacity by excluding important variables from the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
+<see href='http://en.wikipedia.org/wiki/L-BFGS'>Wikipedia: L-BFGS</see>.
+<see href='http://en.wikipedia.org/wiki/Logistic_regression'>Wikipedia: Logistic regression</see>.
+<see href='http://research.microsoft.com/apps/pubs/default.aspx?id=78900'>Scalable Training of L1-Regularized Log-Linear Models</see>.
+<see href='https://msdn.microsoft.com/en-us/magazine/dn904675.aspx'>Test Run - L1 and L2 Regularization for Machine Learning</see>.
+";
+
         protected int NumFeatures;
         protected VBuffer<Float> CurrentWeights;
         protected long NumGoodRows;
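The elastic-net objective the summary above describes can be sketched in a few lines. This toy does not use L-BFGS (which approximates curvature from stored past positions and gradients, as the memory_size note explains); it uses plain proximal gradient descent instead, purely to make the combined penalty loss(w) + l1_weight * |w|_1 + (l2_weight / 2) * |w|_2^2 concrete. Labels are assumed to be 0/1 and there is no bias term.

```python
# Simplified elastic-net-regularized logistic regression via proximal
# gradient descent (an illustrative stand-in for L-BFGS/OWL-QN).
import math

def train_logreg(xs, ys, l1_weight=0.01, l2_weight=0.01, lr=0.1, epochs=500):
    n, d = len(xs), len(xs[0])
    w = [0.0] * d
    for _ in range(epochs):
        # Average gradient of the log-loss over the data set.
        grad = [0.0] * d
        for xi, yi in zip(xs, ys):                    # yi in {0, 1}
            p = 1.0 / (1.0 + math.exp(-sum(wj * xj for wj, xj in zip(w, xi))))
            for j in range(d):
                grad[j] += (p - yi) * xi[j] / n
        for j in range(d):
            # Gradient step on the smooth part (log-loss + L2 ridge term)...
            wj = w[j] - lr * (grad[j] + l2_weight * w[j])
            # ...then a soft-threshold (proximal) step for the L1 lasso term,
            # which pulls small, unimportant weights exactly to zero.
            w[j] = math.copysign(max(abs(wj) - lr * l1_weight, 0.0), wj)
    return w
```

The soft-threshold step is what gives L1 its sparsifying effect, while the ridge term shrinks large weights smoothly, matching the l1_weight / l2_weight description in the summary.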

src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs

+1 -1

@@ -386,7 +386,7 @@ public override ParameterMixingCalibratedPredictor CreatePredictor()
                 new PlattCalibrator(Host, -1, 0));
         }

-        [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionBinaryClassifier", Desc = "Train a logistic regression binary model", UserName = UserNameValue, ShortName = ShortName)]
+        [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionBinaryClassifier", Desc = DetailedSummary, UserName = UserNameValue, ShortName = ShortName)]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs

+1 -1

@@ -961,7 +961,7 @@ public IRow GetStatsIRowOrNull(RoleMappedSchema schema)
     /// </summary>
     public partial class LogisticRegression
     {
-        [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionClassifier", Desc = "Train a logistic regression multi class model", UserName = MulticlassLogisticRegression.UserNameValue, ShortName = MulticlassLogisticRegression.ShortName)]
+        [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionClassifier", Desc = DetailedSummary, UserName = MulticlassLogisticRegression.UserNameValue, ShortName = MulticlassLogisticRegression.ShortName)]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, MulticlassLogisticRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs

+12 -1

@@ -37,6 +37,17 @@ public sealed class AveragedPerceptronTrainer :
         internal const string UserNameValue = "Averaged Perceptron";
         internal const string ShortName = "ap";
         internal const string Summary = "Perceptron is a binary classification algorithm that makes its predictions based on a linear function.";
+        internal const string DetailedSummary = @"Perceptron is a classification algorithm that makes its predictions based on a linear function.
+I.e., for an instance with feature values f_0, f_1, ..., f_(D-1), the prediction is given by the sign of sigma[0,D-1] (w_i * f_i), where w_0, w_1, ..., w_(D-1) are the weights computed by the algorithm.
+Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.
+The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0,D-1] (w_i * f_i) is computed.
+If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,
+the weight vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,
+multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate,
+and by the gradient of some loss function (in the specific case described above, the loss is the hinge loss, whose gradient is 1 when it is non-zero).
+In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored,
+together with a weight that counts the number of iterations each survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).
+The prediction is then calculated by taking the weighted average of all the sums sigma[0,D-1] (w_i * f_i) of the different weight vectors.";

         public class Arguments : AveragedLinearArguments
         {
@@ -91,7 +102,7 @@ public override LinearBinaryPredictor CreatePredictor()
             return new LinearBinaryPredictor(Host, ref weights, bias);
         }

-        [TlcModule.EntryPoint(Name = "Trainers.AveragedPerceptronBinaryClassifier", Desc = "Train a Average perceptron.", UserName = UserNameValue, ShortName = ShortName)]
+        [TlcModule.EntryPoint(Name = "Trainers.AveragedPerceptronBinaryClassifier", Desc = DetailedSummary, UserName = UserNameValue, ShortName = ShortName)]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
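The averaged perceptron update rule described in the summary above can be sketched directly. This is an illustrative toy, not ML.NET's trainer: labels are +1/-1, there is no bias term, and the learning rate is fixed at 1.0. Instead of storing every weight vector with its survival count, it keeps an equivalent running sum of the weights across all iterations, exactly the equivalence the summary notes.

```python
# Toy averaged perceptron: predict with the average of the weight
# vector over every training iteration, not just the final one.
def averaged_perceptron(xs, ys, epochs=10):
    d = len(xs[0])
    w = [0.0] * d          # current weight vector
    w_sum = [0.0] * d      # running sum of w over all iterations
    for _ in range(epochs):
        for xi, yi in zip(xs, ys):
            score = sum(wj * xj for wj, xj in zip(w, xi))
            if yi * score <= 0:      # prediction has the wrong sign: update
                # Add (positive label) or subtract (negative label) the example.
                w = [wj + yi * xj for wj, xj in zip(w, xi)]
            # Accumulate after every example, updated or not -- equivalent to
            # weighting each weight vector by how many iterations it survived.
            w_sum = [sj + wj for sj, wj in zip(w_sum, w)]
    return w_sum           # the sign of w_sum . x is the averaged prediction

def ap_predict(w_avg, x):
    return 1 if sum(wj * xj for wj, xj in zip(w_avg, x)) >= 0 else -1
```

Averaging damps the oscillation of the online updates, which is why the averaged variant typically generalizes better than the last weight vector alone.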

src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs

+1 -1

@@ -386,7 +386,7 @@ protected override Float GetInstanceWeight(FloatLabelCursor cursor)
     /// </summary>
     public static partial class Sdca
     {
-        [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentClassifier", Desc = "Train an SDCA multi class model", UserName = SdcaMultiClassTrainer.UserNameValue, ShortName = SdcaMultiClassTrainer.ShortName)]
+        [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentClassifier", Desc = SdcaMultiClassTrainer.SDCADetailedSummary, UserName = SdcaMultiClassTrainer.UserNameValue, ShortName = SdcaMultiClassTrainer.ShortName)]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, SdcaMultiClassTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs

+1 -1

@@ -131,7 +131,7 @@ protected override Float TuneDefaultL2(IChannel ch, int maxIterations, long rowC
     /// </summary>
     public static partial class Sdca
     {
-        [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentRegressor", Desc = "Train an SDCA regression model", UserName = SdcaRegressionTrainer.UserNameValue, ShortName = SdcaRegressionTrainer.ShortName)]
+        [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentRegressor", Desc = SdcaRegressionTrainer.SDCADetailedSummary, UserName = SdcaRegressionTrainer.UserNameValue, ShortName = SdcaRegressionTrainer.ShortName)]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, SdcaRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
