PCA Anomaly Detection Threshold (#4039)

artidoro · web-flow · commit bb00e07b30e9 · 2019-08-02T15:04:22.000-07:00
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs
@@ -19,12 +19,12 @@ public static void Example()
             // Training data.
             var samples = new List<DataPoint>()
             {
-                new DataPoint(){ Features = new float[3] {1, 0, 0} },
                 new DataPoint(){ Features = new float[3] {0, 2, 1} },
-                new DataPoint(){ Features = new float[3] {1, 2, 3} },
-                new DataPoint(){ Features = new float[3] {0, 1, 0} },
                 new DataPoint(){ Features = new float[3] {0, 2, 1} },
-                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {0, 1, 2} },
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {2, 0, 0} }
             };
 
             // Convert the List<DataPoint> to IDataView, a consumble format to
@@ -57,23 +57,23 @@ public static void Example()
                 var featuresInText = string.Join(',', samples[i].Features);
 
                 if (result.PredictedLabel)
-                    // The i-th sample is predicted as an inlier.
-                    Console.WriteLine("The {0}-th example with features [{1}]" +
-                        "is an inlier with a score of being inlier {2}", i,
-                            featuresInText, result.Score);
-                else
                     // The i-th sample is predicted as an outlier.
-                    Console.WriteLine("The {0}-th example with features [{1}] is" +
-                        "an outlier with a score of being inlier {2}", i,
+                    Console.WriteLine("The {0}-th example with features [{1}] is " +
+                        "an outlier with a score of being outlier {2}", i,
+                            featuresInText, result.Score);
+                else
+                    // The i-th sample is predicted as an inlier.
+                    Console.WriteLine("The {0}-th example with features [{1}] is " +
+                        "an inlier with a score of being outlier {2}", i,
                         featuresInText, result.Score);
             }
             // Lines printed out should be
-            //   The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707
-            //   The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
-            //   The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122
-            //   The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905
-            //   The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
-            //   The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0
+            // The 0 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
+            // The 1 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
+            // The 2 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
+            // The 3 - th example with features[0, 1, 2] is an outlier with a score of being outlier 0.5082728
+            // The 4 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
+            // The 5 - th example with features[2, 0, 0] is an outlier with a score of being outlier 1
         }
 
         // Example with 3 feature values. A training data set is a collection of
@@ -87,9 +87,9 @@ private class DataPoint
         // Class used to capture prediction of DataPoint.
         private class Result
         {
-            // Outlier gets false while inlier has true.
+            // True when the sample is predicted to be an outlier, false when it is predicted to be an inlier.
             public bool PredictedLabel { get; set; }
-            // Outlier gets smaller score.
+            // Outlier score between 0 and 1; inliers get smaller scores.
             public float Score { get; set; }
         }
     }
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs
@@ -19,12 +19,14 @@ public static void Example()
             // Training data.
             var samples = new List<DataPoint>()
             {
-                new DataPoint(){ Features = new float[3] {1, 0, 0} },
                 new DataPoint(){ Features = new float[3] {0, 2, 1} },
-                new DataPoint(){ Features = new float[3] {1, 2, 3} },
-                new DataPoint(){ Features = new float[3] {0, 1, 0} },
+                new DataPoint(){ Features = new float[3] {0, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 2, 4} },
                 new DataPoint(){ Features = new float[3] {0, 2, 1} },
-                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
+                new DataPoint(){ Features = new float[3] {0, 2, 2} },
+                new DataPoint(){ Features = new float[3] {0, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 2, 4} },
+                new DataPoint(){ Features = new float[3] {1, 0, 0} }
             };
 
             // Convert the List<DataPoint> to IDataView, a consumble format to
@@ -63,23 +65,25 @@ public static void Example()
                 var featuresInText = string.Join(',', samples[i].Features);
 
                 if (result.PredictedLabel)
-                    // The i-th sample is predicted as an inlier.
-                    Console.WriteLine("The {0}-th example with features [{1}] is" +
-                        "an inlier with a score of being inlier {2}", i, 
+                    // The i-th sample is predicted as an outlier.
+                    Console.WriteLine("The {0}-th example with features [{1}] is " +
+                        "an outlier with a score of being outlier {2}", i,
                         featuresInText, result.Score);
                 else
-                    // The i-th sample is predicted as an outlier.
-                    Console.WriteLine("The {0}-th example with features [{1}] is" +
-                        "an outlier with a score of being inlier {2}",
+                    // The i-th sample is predicted as an inlier.
+                    Console.WriteLine("The {0}-th example with features [{1}] is " +
+                        "an inlier with a score of being outlier {2}",
                         i, featuresInText, result.Score);
             }
             // Lines printed out should be
-            //   The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707
-            //   The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
-            //   The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122
-            //   The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905
-            //   The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
-            //   The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0
+            // The 0 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.2264826
+            // The 1 - th example with features[0, 2, 3] is an inlier with a score of being outlier 0.1739471
+            // The 2 - th example with features[0, 2, 4] is an inlier with a score of being outlier 0.05711612
+            // The 3 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.2264826
+            // The 4 - th example with features[0, 2, 2] is an inlier with a score of being outlier 0.3868995
+            // The 5 - th example with features[0, 2, 3] is an inlier with a score of being outlier 0.1739471
+            // The 6 - th example with features[0, 2, 4] is an inlier with a score of being outlier 0.05711612
+            // The 7 - th example with features[1, 0, 0] is an outlier with a score of being outlier 0.6260795
         }
 
         // Example with 3 feature values. A training data set is a collection of
@@ -93,9 +97,9 @@ private class DataPoint
         // Class used to capture prediction of DataPoint.
         private class Result
         {
-            // Outlier gets false while inlier has true.
+            // True when the sample is predicted to be an outlier, false when it is predicted to be an inlier.
             public bool PredictedLabel { get; set; }
-            // Outlier gets smaller score.
+            // Outlier score between 0 and 1; inliers get smaller scores.
             public float Score { get; set; }
         }
     }
diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs
@@ -267,7 +267,7 @@ public sealed class AnomalyPredictionTransformer<TModel> : SingleFeaturePredicti
 
         [BestFriend]
         internal AnomalyPredictionTransformer(IHostEnvironment env, TModel model, DataViewSchema inputSchema, string featureColumn,
-            float threshold = 0f, string thresholdColumn = DefaultColumnNames.Score)
+            float threshold = 0.5f, string thresholdColumn = DefaultColumnNames.Score)
             : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(AnomalyPredictionTransformer<TModel>)), model, inputSchema, featureColumn)
         {
             Host.CheckNonEmpty(thresholdColumn, nameof(thresholdColumn));
diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs
@@ -703,6 +703,22 @@ public AnomalyDetectionMetrics Evaluate(IDataView data, string labelColumnName =
             var eval = new AnomalyDetectionEvaluator(Environment, args);
             return eval.Evaluate(data, labelColumnName, scoreColumnName, predictedLabelColumnName);
         }
+
+        /// <summary>
+        /// Returns an <see cref="AnomalyPredictionTransformer{TModel}"/> that applies <paramref name="threshold"/> to the scores of <paramref name="model"/>.
+        /// When <paramref name="model"/> already uses exactly this <paramref name="threshold"/>, <paramref name="model"/> itself is returned.
+        /// The default threshold is 0.5, and valid scores lie between 0 and 1.
+        /// </summary>
+        /// <param name="model">The trained <see cref="AnomalyPredictionTransformer{TModel}"/> whose threshold is to be replaced.</param>
+        /// <param name="threshold">The score threshold the returned transformer uses
+        /// to decide the label of a data point.</param>
+        public AnomalyPredictionTransformer<TModel> ChangeModelThreshold<TModel>(AnomalyPredictionTransformer<TModel> model, float threshold)
+            where TModel : class
+        {
+            return model.Threshold == threshold
+                ? model
+                : new AnomalyPredictionTransformer<TModel>(Environment, model.Model, model.TrainSchema, model.FeatureColumnName, threshold, model.ThresholdColumn);
+        }
     }
 
     /// <summary>
diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs
@@ -54,6 +54,10 @@ internal static PrincipalComponentAnalyzer ProjectToPrincipalComponents(this Tra
         /// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
         /// <param name="ensureZeroMean">If enabled, data is centered to be zero mean.</param>
         /// <param name="seed">The seed for random number generation.</param>
+        /// <remarks>
+        /// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted
+        /// score higher than 0.5 is considered an outlier. Use <see cref="AnomalyDetectionCatalog.ChangeModelThreshold"/> to change this threshold.
+        /// </remarks>
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
@@ -78,6 +82,10 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An
         /// </summary>
         /// <param name="catalog">The anomaly detection catalog trainer object.</param>
         /// <param name="options">Advanced options to the algorithm.</param>
+        /// <remarks>
+        /// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted
+        /// score higher than 0.5 is considered an outlier. Use <see cref="AnomalyDetectionCatalog.ChangeModelThreshold"/> to change this threshold.
+        /// </remarks>
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs
@@ -52,7 +52,7 @@ public void NoAnomalyTest()
         [Fact]
         public static void RandomizedPcaInMemory()
         {
-            // Create a new context for ML.NET operations. It can be used for exception tracking and logging, 
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
             // as a catalog of available operations and as the source of randomness.
             // Setting the seed to a fixed number in this example to make outputs deterministic.
             var mlContext = new MLContext(seed: 0);
@@ -68,7 +68,38 @@ public static void RandomizedPcaInMemory()
             {
                 FeatureColumnName = nameof(DataPoint.Features),
                 Rank = 1,
-                EnsureZeroMean = false
+                EnsureZeroMean = false,
+                Seed = 10
+            };
+
+            // Create another anomaly detector. Its underlying algorithm is randomized PCA.
+            var trainer2 = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);
+
+            // Test the second detector.
+            ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer2);
+        }
+
+        [Fact]
+        public static void RandomizedPcaChangeThreshold()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            // Setting the seed to a fixed number in this example to make outputs deterministic.
+            var mlContext = new MLContext(seed: 0);
+
+            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
+            var trainer1 = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, ensureZeroMean: false);
+
+            // Test the first detector.
+            ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer1);
+
+            // Object required in the creation of another detector.
+            var options = new Trainers.RandomizedPcaTrainer.Options()
+            {
+                FeatureColumnName = nameof(DataPoint.Features),
+                Rank = 1,
+                EnsureZeroMean = false,
+                Seed = 10
             };
 
             // Create anther anomaly detector. Its underlying algorithm is randomized PCA.
@@ -105,12 +136,14 @@ private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlCon
         {
             var samples = new List<DataPoint>()
             {
-                new DataPoint(){ Features= new float[3] {1, 0, 0} },
-                new DataPoint(){ Features= new float[3] {0, 2, 1} },
-                new DataPoint(){ Features= new float[3] {1, 2, 3} },
-                new DataPoint(){ Features= new float[3] {0, 1, 0} },
-                new DataPoint(){ Features= new float[3] {0, 2, 1} },
-                new DataPoint(){ Features= new float[3] {-100, 50, -100} }
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {0, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 2, 4} },
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {0, 2, 2} },
+                new DataPoint(){ Features = new float[3] {0, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 2, 4} },
+                new DataPoint(){ Features = new float[3] {1, 0, 0} }
             };
 
             // Convert the List<DataPoint> to IDataView, a consumble format to ML.NET functions.
@@ -126,17 +159,78 @@ private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlCon
             var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();
 
-            // First 5 examples are inliers.
+            // First 7 examples are inliers.
-            for (int i = 0; i < 5; ++i)
+            for (int i = 0; i < 7; ++i)
             {
-                // Inlier should be predicted as true.
-                Assert.True(results[i].PredictedLabel);
+                // Inlier should be predicted as false.
+                Assert.False(results[i].PredictedLabel);
-                // Higher score means closer to inlier.
+                // Lower score means closer to inlier.
-                Assert.InRange(results[i].Score, 0.3, 1);
+                Assert.InRange(results[i].Score, 0, 0.5);
             }
 
-            // Last example is outlier. Note that outlier should be predicted as false.
+            // Last example is outlier. Note that outlier should be predicted as true.
+            Assert.True(results[7].PredictedLabel);
+            Assert.InRange(results[7].Score, 0.5, 1);
+        }
+
+
+        /// <summary>
+        /// Trains <paramref name="trainer"/>, lowers the decision threshold to 0.3 and checks the predictions. NOTE(review): presumably meant for <see cref="RandomizedPcaChangeThreshold"/>, whose visible code calls the other helper instead; confirm.
+        /// </summary>
+        private static void ExecuteRandomizedPcaTrainerChangeThreshold(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
+        {
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {0, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 2, 4} },
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {0, 2, 2} },
+                new DataPoint(){ Features = new float[3] {0, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 2, 4} },
+                new DataPoint(){ Features = new float[3] {1, 0, 0} }
+            };
+
+            // Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // Train the anomaly detector.
+            var model = trainer.Fit(data);
+
+            var transformer = mlContext.AnomalyDetection.ChangeModelThreshold(model, 0.3f);
+
+            // Apply the re-thresholded model (threshold 0.3) to the training data.
+            var transformed = transformer.Transform(data);
+
+            // Read ML.NET predictions into IEnumerable<Result>.
+            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();
+
+            // Inlier should be predicted as false, with a score below the 0.3 threshold.
+            Assert.False(results[0].PredictedLabel);
+            Assert.InRange(results[0].Score, 0, 0.3);
+            // Inlier should be predicted as false.
+            Assert.False(results[1].PredictedLabel);
+            Assert.InRange(results[1].Score, 0, 0.3);
+            // Inlier should be predicted as false.
+            Assert.False(results[2].PredictedLabel);
+            Assert.InRange(results[2].Score, 0, 0.3);
+            // Inlier should be predicted as false.
+            Assert.False(results[3].PredictedLabel);
+            Assert.InRange(results[3].Score, 0, 0.3);
+
+            // [0, 2, 2] is expected to score at least 0.3, so the lowered threshold flags it as an outlier.
+            Assert.True(results[4].PredictedLabel);
+            Assert.InRange(results[4].Score, 0.3, 1);
+
+            // Inlier should be predicted as false.
             Assert.False(results[5].PredictedLabel);
             Assert.InRange(results[5].Score, 0, 0.3);
+            // Inlier should be predicted as false.
+            Assert.False(results[6].PredictedLabel);
+            Assert.InRange(results[6].Score, 0, 0.3);
+
+            // [1, 0, 0] should be predicted as an outlier (true).
+            Assert.True(results[7].PredictedLabel);
+            Assert.InRange(results[7].Score, 0.3, 1);
         }
 
         private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)

Original file line number	Diff line number	Diff line change
`@@ -267,7 +267,7 @@ public sealed class AnomalyPredictionTransformer<TModel> : SingleFeaturePredicti`
`267`	`267`
`268`	`268`	`[BestFriend]`
`269`	`269`	`internal AnomalyPredictionTransformer(IHostEnvironment env, TModel model, DataViewSchema inputSchema, string featureColumn,`
`270`		`- float threshold = 0f, string thresholdColumn = DefaultColumnNames.Score)`
	`270`	`+ float threshold = 0.5f, string thresholdColumn = DefaultColumnNames.Score)`
`271`	`271`	`: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(AnomalyPredictionTransformer<TModel>)), model, inputSchema, featureColumn)`
`272`	`272`	`{`
`273`	`273`	`Host.CheckNonEmpty(thresholdColumn, nameof(thresholdColumn));`