diff --git a/docs/api-reference/algo-details-sdca.md b/docs/api-reference/algo-details-sdca.md index df935b07ba..69e5eaf586 100644 --- a/docs/api-reference/algo-details-sdca.md +++ b/docs/api-reference/algo-details-sdca.md @@ -1,12 +1,24 @@ ### Training Algorithm Details This trainer is based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions. The -algorithm can be scaled for use on large out-of-memory data sets due to a -semi-asynchronized implementation that supports multi-threading. +algorithm can be scaled because it's a streaming training algorithm as described +in a [KDD best +paper.](https://www.csie.ntu.edu.tw/~cjlin/papers/disk_decomposition/tkdd_disk_decomposition.pdf) Convergence is underwritten by periodically enforcing synchronization between primal and dual variables in a separate thread. Several choices of loss -functions are also provided. +functions are also provided such as +[hinge-loss](https://en.wikipedia.org/wiki/Hinge_loss) and [logistic +loss](http://www.hongliangjie.com/wp-content/uploads/2011/10/logistic.pdf). +Depending on the loss used, the trained model can be, for example, [support +vector machine](https://en.wikipedia.org/wiki/Support-vector_machine) or +[logistic regression](https://en.wikipedia.org/wiki/Logistic_regression). The +SDCA method combines several of the best properties such as the ability to do +streaming learning (without fitting the entire data set into your memory), +reaching a reasonable result with a few scans of the whole data set (for +example, see experiments in [this +paper](https://www.csie.ntu.edu.tw/~cjlin/papers/cddual.pdf)), and spending no +computation on zeros in sparse data sets. Note that SDCA is a stochastic and streaming optimization algorithm. 
The result depends on the order of training data because the stopping tolerance is not @@ -17,6 +29,25 @@ For reproducible results, it is recommended that one sets 'Shuffle' to False and 'NumThreads' to 1. Elastic net regularization can be specified by the 'L2Const' and 'L1Threshold' parameters. Note that the 'L2Const' has an effect on the rate of convergence. In general, the larger the 'L2Const', the faster SDCA converges. +Regularization is a method that can render an ill-posed problem more tractable +by imposing constraints that provide information to supplement the data and that +prevents overfitting by penalizing the model's magnitude usually measured by some +norm functions. This can improve the generalization of the model learned by +selecting the optimal complexity in the bias-variance tradeoff. Regularization +works by adding the penalty that is associated with coefficient values to the +error of the hypothesis. An accurate model with extreme coefficient values would +be penalized more, but a less accurate model with more conservative values would +be penalized less. This learner supports [elastic net +regularization](https://en.wikipedia.org/wiki/Elastic_net_regularization): a +linear combination of L1-norm (LASSO), $|| \boldsymbol{w} ||_1$, and L2-norm +(ridge), $|| \boldsymbol{w} ||_2^2$ regularizations. L1-norm and L2-norm +regularizations have different effects and uses that are complementary in +certain respects. Using L1-norm can increase sparsity of the trained +$\boldsymbol{w}$. When working with high-dimensional data, it shrinks small +weights of irrelevant features to 0 and therefore no resource will be spent on +those bad features when making predictions. L2-norm regularization is preferable +for data that is not sparse and it largely penalizes the existence of large +weights. 
For more information, see: * [Scaling Up Stochastic Dual Coordinate diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs index bbe982b6cb..79e75ce774 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs @@ -1550,12 +1550,34 @@ private protected override BinaryPredictionTransformer MakeTra /// The trained model is calibrated and can produce probability by feeding the output value of the /// linear function to a . /// - /// + /// + /// + /// + /// + /// + /// + /// public sealed class SdcaLogisticRegressionBinaryTrainer : SdcaBinaryTrainerBase> { /// - /// Options for the . + /// Options for the as used in + /// [SdcaLogisticRegression(Options)](xref:Microsoft.ML.StandardTrainersCatalog.SdcaLogisticRegression(Microsoft.ML.BinaryClassificationCatalog.BinaryClassificationTrainers,Microsoft.ML.Trainers.SdcaLogisticRegressionBinaryTrainer.Options)). /// public sealed class Options : BinaryOptionsBase { @@ -1614,7 +1636,38 @@ private protected override SchemaShape.Column[] ComputeSdcaBinaryClassifierSchem /// /// The for training a binary logistic regression classification model using the stochastic dual coordinate ascent method. /// - /// + /// + /// + /// + /// + /// + /// + /// public sealed class SdcaNonCalibratedBinaryTrainer : SdcaBinaryTrainerBase { /// diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs index 5398cf4e0b..949688c79d 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs @@ -39,7 +39,7 @@ namespace Microsoft.ML.Trainers /// | Is caching required? 
| No | /// | Required NuGet in addition to Microsoft.ML | None | /// - /// [!include[io](~/../docs/samples/docs/api-reference/algo-details-sdca.md)] + /// [!include[algorithm](~/../docs/samples/docs/api-reference/algo-details-sdca.md)] /// ]]> /// /// diff --git a/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs b/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs index 486a8dfd9f..9daeb1f28d 100644 --- a/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs +++ b/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs @@ -211,7 +211,7 @@ public static SdcaLogisticRegressionBinaryTrainer SdcaLogisticRegression( } /// - /// Create using advanced options, which predicts a target using a linear classification model. + /// Create with advanced options, which predicts a target using a linear classification model. /// /// The binary classification catalog trainer object. /// Trainer options. @@ -233,11 +233,11 @@ public static SdcaLogisticRegressionBinaryTrainer SdcaLogisticRegression( } /// - /// Predict a target using a linear classification model trained with . + /// Create , which predicts a target using a linear classification model. /// /// The binary classification catalog trainer object. - /// The name of the label column. - /// The name of the feature column. + /// The name of the label column. The column data must be . + /// The name of the feature column. The column data must be a known-sized vector of . /// The name of the example weight column (optional). /// The loss function minimized in the training process. Defaults to if not specified. /// The L2 weight for regularization. @@ -265,7 +265,7 @@ public static SdcaNonCalibratedBinaryTrainer SdcaNonCalibrated( } /// - /// Predict a target using a linear classification model trained with and advanced options. + /// Create using advanced options, which predicts a target using a linear classification model trained over boolean label data. 
/// /// The binary classification catalog trainer object. /// Trainer options.