From 0bef3e56e34c3e3bf0f9004f60ef49ef76d41747 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Thu, 18 Apr 2019 18:01:31 -0700 Subject: [PATCH 1/3] XML documentation for Randomized PCA trainer. --- src/Microsoft.ML.PCA/PCACatalog.cs | 6 +++--- src/Microsoft.ML.PCA/PcaTrainer.cs | 25 ++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index 3c1aafe6f7..dfca5c8f82 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -40,10 +40,10 @@ internal static PrincipalComponentAnalyzer ProjectToPrincipalComponents(this Tra => new PrincipalComponentAnalyzer(CatalogUtils.GetEnvironment(catalog), columns); /// - /// Trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. + /// Creates a , which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. /// /// The anomaly detection catalog trainer object. - /// The name of the feature column. + /// The name of the feature column. The column data must be a known-sized vector of . /// The name of the example weight column (optional). /// The number of components in the PCA. /// Oversampling parameter for randomized PCA training. @@ -69,7 +69,7 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An } /// - /// Trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. + /// TCreates a , which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm with advanced options. /// /// The anomaly detection catalog trainer object. /// Advanced options to the algorithm. 
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index 810bf1b5ea..6b090f6ef8 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -33,12 +33,31 @@ namespace Microsoft.ML.Trainers // REVIEW: make RFF transformer an option here. /// - /// This trainer trains an approximate PCA using Randomized SVD algorithm - /// Reference: https://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf + /// The for training an approximate PCA using Randomized SVD algorithm. /// /// - /// This PCA can be made into Kernel PCA by using Random Fourier Features transform + /// + /// /// + /// + /// + /// public sealed class RandomizedPcaTrainer : TrainerEstimatorBase, PcaModelParameters> { internal const string LoadNameValue = "pcaAnomaly"; From 4dd116cc3a99144f27889204ebaf124dd41b2c72 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Fri, 19 Apr 2019 17:54:46 -0700 Subject: [PATCH 2/3] PR feedback. --- src/Microsoft.ML.PCA/PCACatalog.cs | 4 ++-- src/Microsoft.ML.PCA/PcaTrainer.cs | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index dfca5c8f82..f2b26b9f7f 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -40,7 +40,7 @@ internal static PrincipalComponentAnalyzer ProjectToPrincipalComponents(this Tra => new PrincipalComponentAnalyzer(CatalogUtils.GetEnvironment(catalog), columns); /// - /// Creates a , which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. + /// Create , which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. /// /// The anomaly detection catalog trainer object. /// The name of the feature column. The column data must be a known-sized vector of . 
@@ -69,7 +69,7 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An } /// - /// TCreates a , which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm with advanced options. + /// Create using advanced options, which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. /// /// The anomaly detection catalog trainer object. /// Advanced options to the algorithm. diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index 6b090f6ef8..8c1e1c25ca 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -46,7 +46,7 @@ namespace Microsoft.ML.Trainers /// | | | /// | -- | -- | /// | Machine learning task | Anomaly Detection | - /// | Is normalization required? | Yes | + /// | Is normalization required? | No | /// | Is caching required? | No | /// | Required NuGet in addition to Microsoft.ML | None | /// @@ -55,8 +55,8 @@ namespace Microsoft.ML.Trainers /// ]]> /// /// - /// - /// + /// + /// /// public sealed class RandomizedPcaTrainer : TrainerEstimatorBase, PcaModelParameters> { From 029b34e0c5dbb10c8bac7e9665f7843f07a97e28 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Sun, 21 Apr 2019 10:09:00 -0700 Subject: [PATCH 3/3] PR feedback.: --- .../io-columns-anomaly-detection.md | 6 +++++ .../io-columns-binary-classification.md | 9 +++---- src/Microsoft.ML.PCA/PCACatalog.cs | 7 +++--- src/Microsoft.ML.PCA/PcaTrainer.cs | 24 ++++++++++++++++--- 4 files changed, 36 insertions(+), 10 deletions(-) create mode 100644 docs/api-reference/io-columns-anomaly-detection.md diff --git a/docs/api-reference/io-columns-anomaly-detection.md b/docs/api-reference/io-columns-anomaly-detection.md new file mode 100644 index 0000000000..1bcc6a2a9d --- /dev/null +++ b/docs/api-reference/io-columns-anomaly-detection.md @@ -0,0 +1,6 @@ +### Input and Output Columns +The input features column data must be a 
known-sized vector of . This trainer outputs the following columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Score` | | The non-negative, unbounded score that was calculated by the anomaly detection model.| \ No newline at end of file diff --git a/docs/api-reference/io-columns-binary-classification.md b/docs/api-reference/io-columns-binary-classification.md index ae261b230b..59e49d7da6 100644 --- a/docs/api-reference/io-columns-binary-classification.md +++ b/docs/api-reference/io-columns-binary-classification.md @@ -1,8 +1,9 @@ ### Input and Output Columns -The input label column data must be . This trainer outputs the following columns: +The input label column data must be . +The input features column data must be a known-sized vector of . This trainer outputs the following columns: | Output Column Name | Column Type | Description| | -- | -- | -- | -| `Score` | | The unbounded score that was calculated by the trainer to determine the prediction.| -| `PredictedLabel` | | The label predicted by the trainer. `false` maps to negative score and `true` maps to positive score.| -| `Probability` | | The probability of having true as the label. Probability value is in range [0, 1].|| \ No newline at end of file +| `Score` | | The unbounded score that was calculated by the model.| +| `PredictedLabel` | | The predicted label, based on the sign of the score. A negative score maps to `false` and a positive score maps to `true`.| +| `Probability` | | The probability calculated by calibrating the score of having true as the label. 
Probability value is in range [0, 1].| \ No newline at end of file diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index f2b26b9f7f..05f98a6678 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -40,11 +40,12 @@ internal static PrincipalComponentAnalyzer ProjectToPrincipalComponents(this Tra => new PrincipalComponentAnalyzer(CatalogUtils.GetEnvironment(catalog), columns); /// - /// Create , which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. + /// Create , which trains an approximate principal component analysis (PCA) model using randomized singular value decomposition (SVD) algorithm. /// /// The anomaly detection catalog trainer object. /// The name of the feature column. The column data must be a known-sized vector of . - /// The name of the example weight column (optional). + /// The name of the example weight column (optional). To use the weight column, the column data + /// must be of type . /// The number of components in the PCA. /// Oversampling parameter for randomized PCA training. /// If enabled, data is centered to be zero mean. @@ -69,7 +70,7 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An } /// - /// Create using advanced options, which trains an approximate principal component analysis (PCA) model using randomized SVD algorithm. + /// Create with advanced options, which trains an approximate principal component analysis (PCA) model using randomized singular value decomposition (SVD) algorithm. /// /// The anomaly detection catalog trainer object. /// Advanced options to the algorithm. 
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index 8c1e1c25ca..cd2680fb88 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -40,7 +40,7 @@ namespace Microsoft.ML.Trainers /// To create this trainer, use [RandomizedPca](xref:Microsoft.ML.PcaCatalog.RandomizedPca(Microsoft.ML.AnomalyDetectionCatalog.AnomalyDetectionTrainers,System.String,System.String,System.Int32,System.Int32,System.Boolean,System.Nullable{System.Int32})) /// or [RandomizedPca(Options)](xref:Microsoft.ML.PcaCatalog.RandomizedPca(Microsoft.ML.AnomalyDetectionCatalog.AnomalyDetectionTrainers,Microsoft.ML.Trainers.RandomizedPcaTrainer.Options)). /// - /// [!include[io](~/../docs/samples/docs/api-reference/io-columns-regression.md)] + /// [!include[io](~/../docs/samples/docs/api-reference/io-columns-anomaly-detection.md)] /// /// ### Trainer Characteristics /// | | | @@ -51,12 +51,26 @@ namespace Microsoft.ML.Trainers /// | Required NuGet in addition to Microsoft.ML | None | /// /// ### Training Algorithm Details - /// This PCA can be made into Kernel PCA by using Random Fourier Features transform. [Reference](https://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf) + /// This trainer trains an approximate PCA using a randomized method for computing the singular value decomposition (SVD) of + /// the matrix whose rows are the input vectors. + /// The model generated by this trainer contains three parameters: + /// - A projection matrix $U$ + /// - The mean vector in the original feature space $m$ + /// - The mean vector in the projected feature space $p$ + /// + /// For an input feature vector $x$, the anomaly score is computed by comparing the $L_2$ + /// norm of the original input vector, and the $L_2$ norm of the projected vector: + /// $\sqrt{\left(\|x-m\|_2^2 - \|Ux-p\|_2^2\right)\|x-m\|_2^2}$. + /// + /// The method is described [here](https://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf). 
+ /// + /// Note that the algorithm can be made into Kernel PCA by applying the Random Fourier Features transform + /// to the data before passing it to the trainer. /// ]]> /// /// /// - /// + /// /// public sealed class RandomizedPcaTrainer : TrainerEstimatorBase, PcaModelParameters> { @@ -66,6 +80,10 @@ public sealed class RandomizedPcaTrainer : TrainerEstimatorBase + /// Options for the <see cref="RandomizedPcaTrainer"/> as used in + /// [RandomizedPca(Options)](xref:Microsoft.ML.PcaCatalog.RandomizedPca(Microsoft.ML.AnomalyDetectionCatalog.AnomalyDetectionTrainers,Microsoft.ML.Trainers.RandomizedPcaTrainer.Options)). + /// public sealed class Options : UnsupervisedTrainerInputBaseWithWeight { [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k", SortOrder = 50)]