From 48a7303126878640a3cd5f5c3f52e40a831bf9db Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Wed, 17 Apr 2019 23:50:47 -0700 Subject: [PATCH 1/4] XML documentation for Calibrated and Non Calibrated SDCA Trainers. --- .../Standard/SdcaBinary.cs | 66 ++++++++++++++++++- .../StandardTrainersCatalog.cs | 10 +-- 2 files changed, 69 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs index bbe982b6cb..2552000afb 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs @@ -1550,7 +1550,38 @@ private protected override BinaryPredictionTransformer MakeTra /// The trained model is calibrated and can produce probability by feeding the output value of the /// linear function to a . /// - /// + /// + /// + /// + /// + /// + /// + /// public sealed class SdcaLogisticRegressionBinaryTrainer : SdcaBinaryTrainerBase> { @@ -1614,7 +1645,38 @@ private protected override SchemaShape.Column[] ComputeSdcaBinaryClassifierSchem /// /// The for training a binary logistic regression classification model using the stochastic dual coordinate ascent method. /// - /// + /// + /// + /// + /// + /// + /// + /// public sealed class SdcaNonCalibratedBinaryTrainer : SdcaBinaryTrainerBase { /// diff --git a/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs b/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs index 486a8dfd9f..1c052bd60b 100644 --- a/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs +++ b/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs @@ -211,7 +211,7 @@ public static SdcaLogisticRegressionBinaryTrainer SdcaLogisticRegression( } /// - /// Create using advanced options, which predicts a target using a linear classification model. + /// Create with advanced options, which predicts a target using a linear classification model. 
/// /// The binary classification catalog trainer object. /// Trainer options. @@ -233,11 +233,11 @@ public static SdcaLogisticRegressionBinaryTrainer SdcaLogisticRegression( } /// - /// Predict a target using a linear classification model trained with . + /// Creates a that predicts a target using a linear classification model. /// /// The binary classification catalog trainer object. - /// The name of the label column. - /// The name of the feature column. + /// The name of the label column. The column data must be . + /// The name of the feature column. The column data must be a known-sized vector of . /// The name of the example weight column (optional). /// The loss function minimized in the training process. Defaults to if not specified. /// The L2 weight for regularization. @@ -265,7 +265,7 @@ public static SdcaNonCalibratedBinaryTrainer SdcaNonCalibrated( } /// - /// Predict a target using a linear classification model trained with and advanced options. + /// Creates a that predicts a target using a linear classification model trained over boolean label data with advanced options. /// /// The binary classification catalog trainer object. /// Trainer options. From b8d34294a1cded989f5947fa012a4c5bf7285ad2 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 21 Apr 2019 12:06:55 -0700 Subject: [PATCH 2/4] PR feedback. --- docs/api-reference/algo-details-sdca.md | 33 +++++++++++++++++-- .../Standard/SdcaBinary.cs | 19 +++-------- .../Standard/SdcaRegression.cs | 2 +- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/docs/api-reference/algo-details-sdca.md b/docs/api-reference/algo-details-sdca.md index df935b07ba..90ee692f3c 100644 --- a/docs/api-reference/algo-details-sdca.md +++ b/docs/api-reference/algo-details-sdca.md @@ -1,12 +1,20 @@ ### Training Algorithm Details This trainer is based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions. 
The -algorithm can be scaled for use on large out-of-memory data sets due to a -semi-asynchronized implementation that supports multi-threading. +algorithm can be scaled because it's a streaming training algorithm as described +in a [KDD best +paper.](https://www.csie.ntu.edu.tw/~cjlin/papers/disk_decomposition/tkdd_disk_decomposition.pdf) Convergence is underwritten by periodically enforcing synchronization between primal and dual variables in a separate thread. Several choices of loss -functions are also provided. +functions are also provided such as [hinge-loss](…) and [logistic loss](..). +Depending on the loss used, the trained model can be, for example, [support +vector machine](…) or [logistic regression](…). The SDCA method combines several +of the best properties such the ability to do streaming learning (without +fitting the entire data set into your memory), reaching a reasonable result with +a few scans of the whole data set (for example, see experiments in [this +paper](https://www.csie.ntu.edu.tw/~cjlin/papers/cddual.pdf)), and spending no +computation on zeros in sparse data sets. Note that SDCA is a stochastic and streaming optimization algorithm. The result depends on the order of training data because the stopping tolerance is not @@ -17,6 +25,25 @@ For reproducible results, it is recommended that one sets 'Shuffle' to False and 'NumThreads' to 1. Elastic net regularization can be specified by the 'L2Const' and 'L1Threshold' parameters. Note that the 'L2Const' has an effect on the rate of convergence. In general, the larger the 'L2Const', the faster SDCA converges. +Regularization is a method that can render an ill-posed problem more tractable +by imposing constraints that provide information to supplement the data and that +prevents overfitting by penalizing model's magnitude usually measured by some +norm functions. This can improve the generalization of the model learned by +selecting the optimal complexity in the bias-variance tradeoff. 
Regularization +works by adding the penalty that is associated with coefficient values to the +error of the hypothesis. An accurate model with extreme coefficient values would +be penalized more, but a less accurate model with more conservative values would +be penalized less. This learner supports [elastic net +regularization](https://en.wikipedia.org/wiki/Elastic_net_regularization): a +linear combination of L1-norm (LASSO), $|| \boldsymbol{w} ||_1$, and L2-norm +(ridge), $|| \boldsymbol{w} ||_2^2$ regularizations. L1-nrom and L2-norm +regularizations have different effects and uses that are complementary in +certain respects. Using L1-norm can increase sparsity of the trained +$\boldsymbol{w}$. When working with high-dimensional data, it shrinks small +weights of irrevalent features to 0 and therefore no reource will be spent on +those bad features when making prediction. L2-norm regularization is preferable +for data that is not sparse and it largely penalizes the existence of large +weights. For more information, see: * [Scaling Up Stochastic Dual Coordinate diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs index 2552000afb..79e75ce774 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaBinary.cs @@ -1565,28 +1565,19 @@ private protected override BinaryPredictionTransformer MakeTra /// | Is caching required? | No | /// | Required NuGet in addition to Microsoft.ML | None | /// - /// ### Training Algorithm Details - /// This trainer is based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions. - /// The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading. 
- /// Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread. - /// Several choices of loss functions are also provided.The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms. - /// Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. - /// For reproducible results, it is recommended that one sets 'Shuffle' to False and 'NumThreads' to 1. - /// Elastic net regularization can be specified by the 'L2Const' and 'L1Threshold' parameters. Note that the 'L2Const' has an effect on the rate of convergence. - /// In general, the larger the 'L2Const', the faster SDCA converges. - /// For more information, see: [Scaling Up Stochastic Dual Coordinate Ascent](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf ) and - /// [Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization](http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf). + /// [!include[algorithm](~/../docs/samples/docs/api-reference/algo-details-sdca.md)] /// ]]> /// /// - /// - /// + /// + /// /// public sealed class SdcaLogisticRegressionBinaryTrainer : SdcaBinaryTrainerBase> { /// - /// Options for the . + /// Options for the as used in + /// [SdcaLogisticRegression(Options)](xref:Microsoft.ML.StandardTrainersCatalog.SdcaLogisticRegression(Microsoft.ML.BinaryClassificationCatalog.BinaryClassificationTrainers,Microsoft.ML.Trainers.SdcaLogisticRegressionBinaryTrainer.Options)). 
/// public sealed class Options : BinaryOptionsBase { diff --git a/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs index 5398cf4e0b..949688c79d 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/SdcaRegression.cs @@ -39,7 +39,7 @@ namespace Microsoft.ML.Trainers /// | Is caching required? | No | /// | Required NuGet in addition to Microsoft.ML | None | /// - /// [!include[io](~/../docs/samples/docs/api-reference/algo-details-sdca.md)] + /// [!include[algorithm](~/../docs/samples/docs/api-reference/algo-details-sdca.md)] /// ]]> /// /// From e98fa2b9b2e0422c7e7f9d5b384d031adae6b8d5 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 21 Apr 2019 12:27:57 -0700 Subject: [PATCH 3/4] PR feedback. --- src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs b/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs index 1c052bd60b..9daeb1f28d 100644 --- a/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs +++ b/src/Microsoft.ML.StandardTrainers/StandardTrainersCatalog.cs @@ -233,7 +233,7 @@ public static SdcaLogisticRegressionBinaryTrainer SdcaLogisticRegression( } /// - /// Creates a that predicts a target using a linear classification model. + /// Create , which predicts a target using a linear classification model. /// /// The binary classification catalog trainer object. /// The name of the label column. The column data must be . @@ -265,7 +265,7 @@ public static SdcaNonCalibratedBinaryTrainer SdcaNonCalibrated( } /// - /// Creates a that predicts a target using a linear classification model trained over boolean label data with advanced options. 
+ /// Create using advanced options, which predicts a target using a linear classification model trained over boolean label data. /// /// The binary classification catalog trainer object. /// Trainer options. From d21ddc3af74f2cd5facca3a9b150ee3351350ed7 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 21 Apr 2019 13:03:30 -0700 Subject: [PATCH 4/4] PR feedback. --- docs/api-reference/algo-details-sdca.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/api-reference/algo-details-sdca.md b/docs/api-reference/algo-details-sdca.md index 90ee692f3c..69e5eaf586 100644 --- a/docs/api-reference/algo-details-sdca.md +++ b/docs/api-reference/algo-details-sdca.md @@ -7,12 +7,16 @@ paper.](https://www.csie.ntu.edu.tw/~cjlin/papers/disk_decomposition/tkdd_disk_d Convergence is underwritten by periodically enforcing synchronization between primal and dual variables in a separate thread. Several choices of loss -functions are also provided such as [hinge-loss](…) and [logistic loss](..). +functions are also provided such as +[hinge-loss](https://en.wikipedia.org/wiki/Hinge_loss) and [logistic +loss](http://www.hongliangjie.com/wp-content/uploads/2011/10/logistic.pdf). Depending on the loss used, the trained model can be, for example, [support -vector machine](…) or [logistic regression](…). The SDCA method combines several -of the best properties such the ability to do streaming learning (without -fitting the entire data set into your memory), reaching a reasonable result with -a few scans of the whole data set (for example, see experiments in [this +vector machine](https://en.wikipedia.org/wiki/Support-vector_machine) or +[logistic regression](https://en.wikipedia.org/wiki/Logistic_regression). 
The
+SDCA method combines several of the best properties such as the ability to do
+streaming learning (without fitting the entire data set into your memory),
+reaching a reasonable result with a few scans of the whole data set (for
+example, see experiments in [this
 paper](https://www.csie.ntu.edu.tw/~cjlin/papers/cddual.pdf)), and spending no
 computation on zeros in sparse data sets.