|
25 | 25 | namespace Microsoft.ML.Trainers
|
26 | 26 | {
|
27 | 27 | /// <summary>
|
28 |
| - /// The <see cref="IEstimator{TTransformer}"/> for training a multiclass linear classification model using the stochastic dual coordinate ascent method. |
| 28 | + /// The <see cref="IEstimator{TTransformer}"/> to predict a target using a linear multiclass classifier model trained with a coordinate descent method. |
| 29 | + /// Depending on the used loss function, the trained model can be, for example, maximum entropy classifier or multi-class support vector machine. |
29 | 30 | /// </summary>
|
30 |
| - /// <include file='doc.xml' path='doc/members/member[@name="SDCA_remarks"]/*' /> |
| 31 | + /// <remarks> |
| 32 | + /// <format type="text/markdown"><![CDATA[ |
| 33 | + /// To create this trainer, use [SdcaMaximumEntropy](xref:Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,System.String,System.String,System.String,System.Nullable{System.Single},System.Nullable{System.Single},System.Nullable{System.Int32})) or |
| 34 | + /// [SdcaMaximumEntropy(Options)](xref:Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,Microsoft.ML.Trainers.SdcaMaximumEntropyMulticlassTrainer.Options)). |
| 35 | + /// To create this trainer for a [loss function](xref:Microsoft.ML.Trainers.ISupportSdcaClassificationLoss) (such as support vector machine's [hinge loss](xref:Microsoft.ML.Trainers.HingeLoss)) of your choice, |
| 36 | + /// use [SdcaNonCalibrated](xref:Microsoft.ML.StandardTrainersCatalog.SdcaNonCalibrated(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,System.String,System.String,System.String,Microsoft.ML.Trainers.ISupportSdcaClassificationLoss,System.Nullable{System.Single},System.Nullable{System.Single},System.Nullable{System.Int32})) or |
| 37 | + /// [SdcaNonCalibrated(Options)](xref:Microsoft.ML.StandardTrainersCatalog.SdcaNonCalibrated(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,Microsoft.ML.Trainers.SdcaNonCalibratedMulticlassTrainer.Options)). |
| 38 | + /// |
| 39 | + /// [!include[io](~/../docs/samples/docs/api-reference/io-columns-multiclass-classification.md)] |
| 40 | + /// |
| 41 | + /// ### Trainer Characteristics |
| 42 | + /// | | | |
| 43 | + /// | -- | -- | |
| 44 | + /// | Machine learning task | Multiclass classification | |
| 45 | + /// | Is normalization required? | Yes | |
| 46 | + /// | Is caching required? | No | |
| 47 | + /// | Required NuGet in addition to Microsoft.ML | None | |
| 48 | + /// |
| 49 | + /// ### Scoring Function |
| 50 | + /// This trains a linear model to solve multiclass classification problems. |
| 51 | + /// Assume that the number of classes is $m$ and number of features is $n$. |
| 52 | + /// It assigns the $c$-th class a coefficient vector $\boldsymbol{w}_c \in {\mathbb R}^n$ and a bias $b_c \in {\mathbb R}$, for $c=1,\dots,m$. |
| 53 | + /// Given a feature vector $\boldsymbol{x} \in {\mathbb R}^n$, the $c$-th class's score would be $\hat{y}^c = \boldsymbol{w}_c^T \boldsymbol{x} + b_c$. |
| 54 | + /// If $\boldsymbol{x}$ belongs to class $c$, then $\hat{y}^c$ should be much larger than 0. |
| 55 | + /// In contrast, a $\hat{y}^c$ much smaller than 0 means the desired label should not be $c$. |
| 56 | + /// |
| 57 | + /// If and only if the trained model is a maximum entropy classifier, you can interpret the output score vector as the predicted class probabilities because [softmax function](https://en.wikipedia.org/wiki/Softmax_function) may be applied to post-process all classes' scores. |
| 58 | + /// More specifically, the probability of $\boldsymbol{x}$ belonging to class $c$ is computed by $\tilde{P}(c|\boldsymbol{x}) = \frac{ e^{\hat{y}^c} }{ \sum_{c' = 1}^m e^{\hat{y}^{c'}} }$ and stored at the $c$-th element in the score vector. |
| 59 | + /// In other cases, the output score vector is just $[\hat{y}^1, \dots, \hat{y}^m]$. |
| 60 | + /// |
| 61 | + /// ### Training Algorithm Details |
| 62 | + /// The optimization algorithm is an extension of [a coordinate descent method](http://jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf) following a similar path proposed in an earlier [paper](https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf). |
| 63 | + /// It is usually much faster than [L-BFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS) and [truncated Newton methods](https://en.wikipedia.org/wiki/Truncated_Newton_method) for large-scale and sparse data sets. |
| 64 | + /// |
| 65 | + /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing model's magnitude usually measured by some norm functions. |
| 66 | + /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. |
| 67 | + /// Regularization works by adding the penalty on the magnitude of $\boldsymbol{w}_c$, $c=1,\dots,m$ to the error of the hypothesis. |
| 68 | + /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. |
| 69 | + /// |
| 70 | + /// This trainer supports [elastic net regularization](https://en.wikipedia.org/wiki/Elastic_net_regularization): a linear combination of L1-norm (LASSO), $|| \boldsymbol{w}_c ||_1$, and L2-norm (ridge), $|| \boldsymbol{w}_c ||_2^2$ regularizations. |
| 71 | + /// L1-norm and L2-norm regularizations have different effects and uses that are complementary in certain respects. |
| 72 | + /// Using L1-norm can increase sparsity of the trained $\boldsymbol{w}_c$. |
| 73 | + /// When working with high-dimensional data, it shrinks small weights of irrelevant features to 0 and therefore no resource will be spent on those bad features when making prediction. |
| 74 | + /// L2-norm regularization is preferable for data that is not sparse and it largely penalizes the existence of large weights. |
| 75 | + /// |
| 76 | + /// An aggressive regularization (that is, assigning large coefficients to L1-norm or L2-norm regularization terms) can harm predictive capacity by excluding important variables out of the model. |
| 77 | + /// Therefore, choosing the right regularization coefficients is important in practice. |
| 78 | + /// ]]> |
| 79 | + /// </format> |
| 80 | + /// </remarks> |
| 81 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(MulticlassClassificationCatalog.MulticlassClassificationTrainers, SdcaMaximumEntropyMulticlassTrainer.Options)"/> |
| 82 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(MulticlassClassificationCatalog.MulticlassClassificationTrainers, string, string, string, float?, float?, int?)"/> |
| 83 | + /// <seealso cref="Microsoft.ML.Trainers.SdcaMaximumEntropyMulticlassTrainer.Options"/> |
| 84 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaNonCalibrated(MulticlassClassificationCatalog.MulticlassClassificationTrainers, SdcaNonCalibratedMulticlassTrainer.Options)"/> |
| 85 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaNonCalibrated(MulticlassClassificationCatalog.MulticlassClassificationTrainers, string, string, string, ISupportSdcaClassificationLoss, float?, float?, int?)"/> |
| 86 | + /// <seealso cref="Microsoft.ML.Trainers.SdcaNonCalibratedMulticlassTrainer.Options"/> |
31 | 87 | public abstract class SdcaMulticlassTrainerBase<TModel> : SdcaTrainerBase<SdcaMulticlassTrainerBase<TModel>.MulticlassOptions, MulticlassPredictionTransformer<TModel>, TModel>
|
32 | 88 | where TModel : class
|
33 | 89 | {
|
@@ -433,12 +489,46 @@ private protected override float GetInstanceWeight(FloatLabelCursor cursor)
|
433 | 489 | }
|
434 | 490 |
|
435 | 491 | /// <summary>
|
436 |
| - /// The <see cref="IEstimator{TTransformer}"/> for training a maximum entropy classification model using the stochastic dual coordinate ascent method. |
| 492 | + /// The <see cref="IEstimator{TTransformer}"/> to predict a target using a maximum entropy multiclass classifier. |
437 | 493 | /// The trained model <see cref="MaximumEntropyModelParameters"/> produces probabilities of classes.
|
438 | 494 | /// </summary>
|
439 |
| - /// <include file='doc.xml' path='doc/members/member[@name="SDCA_remarks"]/*' /> |
| 495 | + /// <remarks> |
| 496 | + /// <format type="text/markdown"><) or |
| 498 | + /// [SdcaMaximumEntropy(Options)](xref:Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,Microsoft.ML.Trainers.SdcaMaximumEntropyMulticlassTrainer.Options)). |
| 499 | + /// |
| 500 | + /// [!include[io](~/../docs/samples/docs/api-reference/io-columns-multiclass-classification.md)] |
| 501 | + /// |
| 502 | + /// ### Trainer Characteristics |
| 503 | + /// | | | |
| 504 | + /// | -- | -- | |
| 505 | + /// | Machine learning task | Multiclass classification | |
| 506 | + /// | Is normalization required? | Yes | |
| 507 | + /// | Is caching required? | No | |
| 508 | + /// | Required NuGet in addition to Microsoft.ML | None | |
| 509 | + /// |
| 510 | + /// ### Scoring Function |
| 511 | + /// This trains a linear model to solve multiclass classification problems. |
| 512 | + /// Assume that the number of classes is $m$ and number of features is $n$. |
| 513 | + /// It assigns the $c$-th class a coefficient vector $\boldsymbol{w}_c \in {\mathbb R}^n$ and a bias $b_c \in {\mathbb R}$, for $c=1,\dots,m$. |
| 514 | + /// Given a feature vector $\boldsymbol{x} \in {\mathbb R}^n$, the $c$-th class's score would be $\tilde{P}(c|\boldsymbol{x}) = \frac{ e^{\hat{y}^c} }{ \sum_{c' = 1}^m e^{\hat{y}^{c'}} }$, where $\hat{y}^c = \boldsymbol{w}_c^T \boldsymbol{x} + b_c$. |
| 515 | + /// Note that $\tilde{P}(c|\boldsymbol{x})$ is the probability of observing class $c$ when the feature vector is $\boldsymbol{x}$. |
| 516 | + /// |
| 517 | + /// ### Training Algorithm Details |
| 518 | + /// See the documentation of [SdcaMulticlassTrainerBase](xref:Microsoft.ML.Trainers.SdcaMulticlassTrainerBase). |
| 519 | + /// |
| 520 | + /// ]]> |
| 521 | + /// </format> |
| 522 | + /// </remarks> |
| 523 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(MulticlassClassificationCatalog.MulticlassClassificationTrainers, SdcaMaximumEntropyMulticlassTrainer.Options)"/> |
| 524 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(MulticlassClassificationCatalog.MulticlassClassificationTrainers, string, string, string, float?, float?, int?)"/> |
| 525 | + /// <seealso cref="Microsoft.ML.Trainers.SdcaMaximumEntropyMulticlassTrainer.Options"/> |
440 | 526 | public sealed class SdcaMaximumEntropyMulticlassTrainer : SdcaMulticlassTrainerBase<MaximumEntropyModelParameters>
|
441 | 527 | {
|
| 528 | + /// <summary> |
| 529 | + /// <see cref="Options"/> for <see cref="SdcaMaximumEntropyMulticlassTrainer"/> as used in |
| 530 | + /// <see cref="Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(MulticlassClassificationCatalog.MulticlassClassificationTrainers, string, string, string, float?, float?, int?)"/> |
| 531 | + /// </summary> |
442 | 532 | public sealed class Options : MulticlassOptions
|
443 | 533 | {
|
444 | 534 | }
|
@@ -482,13 +572,46 @@ private protected override MulticlassPredictionTransformer<MaximumEntropyModelPa
|
482 | 572 | }
|
483 | 573 |
|
484 | 574 | /// <summary>
|
485 |
| - /// The <see cref="IEstimator{TTransformer}"/> for training a multiclass linear model using the stochastic dual coordinate ascent method. |
486 |
| - /// The trained model <see cref="LinearMulticlassModelParameters"/> does not produces probabilities of classes, but we can still make decisions |
487 |
| - /// by choosing the class associated with the largest score. |
| 575 | + /// The <see cref="IEstimator{TTransformer}"/> to predict a target using a linear multiclass classifier. |
| 576 | + /// The trained model <see cref="LinearMulticlassModelParameters"/> does not produce probabilities of classes, but we can still make decisions by choosing the class associated with the largest score. |
488 | 577 | /// </summary>
|
489 |
| - /// <include file='doc.xml' path='doc/members/member[@name="SDCA_remarks"]/*' /> |
| 578 | + /// <remarks> |
| 579 | + /// <format type="text/markdown"><) or |
| 581 | + /// [SdcaMaximumEntropy(Options)](xref:Microsoft.ML.StandardTrainersCatalog.SdcaMaximumEntropy(Microsoft.ML.MulticlassClassificationCatalog.MulticlassClassificationTrainers,Microsoft.ML.Trainers.SdcaMaximumEntropyMulticlassTrainer.Options)). |
| 582 | + /// |
| 583 | + /// [!include[io](~/../docs/samples/docs/api-reference/io-columns-multiclass-classification.md)] |
| 584 | + /// |
| 585 | + /// ### Trainer Characteristics |
| 586 | + /// | | | |
| 587 | + /// | -- | -- | |
| 588 | + /// | Machine learning task | Multiclass classification | |
| 589 | + /// | Is normalization required? | Yes | |
| 590 | + /// | Is caching required? | No | |
| 591 | + /// | Required NuGet in addition to Microsoft.ML | None | |
| 592 | + /// |
| 593 | + /// ### Scoring Function |
| 594 | + /// This trains a linear model to solve multiclass classification problems. |
| 595 | + /// Assume that the number of classes is $m$ and number of features is $n$. |
| 596 | + /// It assigns the $c$-th class a coefficient vector $\boldsymbol{w}_c \in {\mathbb R}^n$ and a bias $b_c \in {\mathbb R}$, for $c=1,\dots,m$. |
| 597 | + /// Given a feature vector $\boldsymbol{x} \in {\mathbb R}^n$, the $c$-th class's score would be $\hat{y}^c = \boldsymbol{w}_c^T \boldsymbol{x} + b_c$. |
| 598 | + /// Note that the $c$-th value in the output score column is just $\hat{y}^c$. |
| 599 | + /// |
| 600 | + /// ### Training Algorithm Details |
| 601 | + /// See the documentation of [SdcaMulticlassTrainerBase](xref:Microsoft.ML.Trainers.SdcaMulticlassTrainerBase). |
| 602 | + /// |
| 603 | + /// ]]> |
| 604 | + /// </format> |
| 605 | + /// </remarks> |
| 606 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaNonCalibrated(MulticlassClassificationCatalog.MulticlassClassificationTrainers, SdcaNonCalibratedMulticlassTrainer.Options)"/> |
| 607 | + /// <seealso cref="Microsoft.ML.StandardTrainersCatalog.SdcaNonCalibrated(MulticlassClassificationCatalog.MulticlassClassificationTrainers, string, string, string, ISupportSdcaClassificationLoss, float?, float?, int?)"/> |
| 608 | + /// <seealso cref="Microsoft.ML.Trainers.SdcaNonCalibratedMulticlassTrainer.Options"/> |
490 | 609 | public sealed class SdcaNonCalibratedMulticlassTrainer : SdcaMulticlassTrainerBase<LinearMulticlassModelParameters>
|
491 | 610 | {
|
| 611 | + /// <summary> |
| 612 | + /// <see cref="Options"/> for <see cref="SdcaNonCalibratedMulticlassTrainer"/> as used in |
| 613 | + /// <see cref="Microsoft.ML.StandardTrainersCatalog.SdcaNonCalibrated(MulticlassClassificationCatalog.MulticlassClassificationTrainers, string, string, string, ISupportSdcaClassificationLoss, float?, float?, int?)"/>. |
| 614 | + /// </summary> |
492 | 615 | public sealed class Options : MulticlassOptions
|
493 | 616 | {
|
494 | 617 | /// <summary>
|
|
0 commit comments