diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs index adccee9e81..175371d4a0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs @@ -19,12 +19,12 @@ public static void Example() // Training data. var samples = new List() { - new DataPoint(){ Features = new float[3] {1, 0, 0} }, new DataPoint(){ Features = new float[3] {0, 2, 1} }, - new DataPoint(){ Features = new float[3] {1, 2, 3} }, - new DataPoint(){ Features = new float[3] {0, 1, 0} }, new DataPoint(){ Features = new float[3] {0, 2, 1} }, - new DataPoint(){ Features = new float[3] {-100, 50, -100} } + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {0, 1, 2} }, + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {2, 0, 0} } }; // Convert the List to IDataView, a consumble format to @@ -57,23 +57,23 @@ public static void Example() var featuresInText = string.Join(',', samples[i].Features); if (result.PredictedLabel) - // The i-th sample is predicted as an inlier. - Console.WriteLine("The {0}-th example with features [{1}]" + - "is an inlier with a score of being inlier {2}", i, - featuresInText, result.Score); - else // The i-th sample is predicted as an outlier. - Console.WriteLine("The {0}-th example with features [{1}] is" + + Console.WriteLine("The {0}-th example with features [{1}] is " + - "an outlier with a score of being inlier {2}", i, + "an outlier with a score of being outlier {2}", i, + featuresInText, result.Score); + else + // The i-th sample is predicted as an inlier. 
+ Console.WriteLine("The {0}-th example with features [{1}] is " + + "an inlier with a score of being outlier {2}", i, featuresInText, result.Score); } // Lines printed out should be - // The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707 - // The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999 - // The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122 - // The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905 - // The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999 - // The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0 + // The 0 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028 + // The 1 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028 + // The 2 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028 + // The 3 - th example with features[0, 1, 2] is an outlier with a score of being outlier 0.5082728 + // The 4 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028 + // The 5 - th example with features[2, 0, 0] is an outlier with a score of being outlier 1 } // Example with 3 feature values. A training data set is a collection of @@ -87,9 +87,9 @@ private class DataPoint // Class used to capture prediction of DataPoint. private class Result { - // Outlier gets false while inlier has true. + // Outlier gets true while inlier has false. public bool PredictedLabel { get; set; } - // Outlier gets smaller score. + // Inlier gets smaller score. Score is between 0 and 1. 
public float Score { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs index 7a281880b7..dc7f81accd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs @@ -19,12 +19,14 @@ public static void Example() // Training data. var samples = new List() { - new DataPoint(){ Features = new float[3] {1, 0, 0} }, new DataPoint(){ Features = new float[3] {0, 2, 1} }, - new DataPoint(){ Features = new float[3] {1, 2, 3} }, - new DataPoint(){ Features = new float[3] {0, 1, 0} }, + new DataPoint(){ Features = new float[3] {0, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 2, 4} }, new DataPoint(){ Features = new float[3] {0, 2, 1} }, - new DataPoint(){ Features = new float[3] {-100, 50, -100} } + new DataPoint(){ Features = new float[3] {0, 2, 2} }, + new DataPoint(){ Features = new float[3] {0, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 2, 4} }, + new DataPoint(){ Features = new float[3] {1, 0, 0} } }; // Convert the List to IDataView, a consumble format to @@ -63,23 +65,25 @@ public static void Example() var featuresInText = string.Join(',', samples[i].Features); if (result.PredictedLabel) - // The i-th sample is predicted as an inlier. + // The i-th sample is predicted as an outlier. - Console.WriteLine("The {0}-th example with features [{1}] is" + + Console.WriteLine("The {0}-th example with features [{1}] is " + - "an inlier with a score of being inlier {2}", i, + "an outlier with a score of being outlier {2}", i, featuresInText, result.Score); else - // The i-th sample is predicted as an outlier. + // The i-th sample is predicted as an inlier. 
- Console.WriteLine("The {0}-th example with features [{1}] is" + + Console.WriteLine("The {0}-th example with features [{1}] is " + - "an outlier with a score of being inlier {2}", + "an inlier with a score of being outlier {2}", i, featuresInText, result.Score); } // Lines printed out should be - // The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707 - // The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999 - // The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122 - // The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905 - // The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999 - // The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0 + // The 0 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.2264826 + // The 1 - th example with features[0, 2, 3] is an inlier with a score of being outlier 0.1739471 + // The 2 - th example with features[0, 2, 4] is an inlier with a score of being outlier 0.05711612 + // The 3 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.2264826 + // The 4 - th example with features[0, 2, 2] is an inlier with a score of being outlier 0.3868995 + // The 5 - th example with features[0, 2, 3] is an inlier with a score of being outlier 0.1739471 + // The 6 - th example with features[0, 2, 4] is an inlier with a score of being outlier 0.05711612 + // The 7 - th example with features[1, 0, 0] is an outlier with a score of being outlier 0.6260795 } // Example with 3 feature values. A training data set is a collection of @@ -93,9 +97,9 @@ private class DataPoint // Class used to capture prediction of DataPoint. private class Result { - // Outlier gets false while inlier has true. + // Outlier gets true while inlier has false. public bool PredictedLabel { get; set; } - // Outlier gets smaller score. 
+ // Inlier gets smaller score. Score is between 0 and 1. public float Score { get; set; } } } diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index b4467b9f8b..a3b3ee10c1 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -267,7 +267,7 @@ public sealed class AnomalyPredictionTransformer : SingleFeaturePredicti [BestFriend] internal AnomalyPredictionTransformer(IHostEnvironment env, TModel model, DataViewSchema inputSchema, string featureColumn, - float threshold = 0f, string thresholdColumn = DefaultColumnNames.Score) + float threshold = 0.5f, string thresholdColumn = DefaultColumnNames.Score) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(AnomalyPredictionTransformer)), model, inputSchema, featureColumn) { Host.CheckNonEmpty(thresholdColumn, nameof(thresholdColumn)); diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index 2331b51792..b135e42e86 100644 --- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -703,6 +703,22 @@ public AnomalyDetectionMetrics Evaluate(IDataView data, string labelColumnName = var eval = new AnomalyDetectionEvaluator(Environment, args); return eval.Evaluate(data, labelColumnName, scoreColumnName, predictedLabelColumnName); } + + /// <summary> + /// Creates a new <see cref="AnomalyPredictionTransformer{TModel}"/> with the specified <paramref name="threshold"/>. + /// If the provided <paramref name="threshold"/> is the same as the <paramref name="model"/> threshold it simply returns <paramref name="model"/>. + /// Note that by default the threshold is 0.5 and valid scores range from 0 to 1. + /// </summary> + /// <param name="model">A trained <see cref="AnomalyPredictionTransformer{TModel}"/>.</param> + /// <param name="threshold">The new threshold value that will be used to determine the label of a data point + /// based on the predicted score by the model.</param> 
+ public AnomalyPredictionTransformer<TModel> ChangeModelThreshold<TModel>(AnomalyPredictionTransformer<TModel> model, float threshold) + where TModel : class + { + if (model.Threshold == threshold) + return model; + return new AnomalyPredictionTransformer<TModel>(Environment, model.Model, model.TrainSchema, model.FeatureColumnName, threshold, model.ThresholdColumn); + } } /// diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index c68b9d0f85..fb28765a00 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -54,6 +54,10 @@ internal static PrincipalComponentAnalyzer ProjectToPrincipalComponents(this Tra /// Oversampling parameter for randomized PCA training. /// If enabled, data is centered to be zero mean. /// The seed for random number generation. + /// + /// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted + /// score higher than 0.5 is considered an outlier. Use <c>ChangeModelThreshold</c> to change this threshold. + /// /// /// /// /// The anomaly detection catalog trainer object. /// Advanced options to the algorithm. + /// + /// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted + /// score higher than 0.5 is considered an outlier. Use <c>ChangeModelThreshold</c> to change this threshold. 
+ /// /// /// /// () { - new DataPoint(){ Features= new float[3] {1, 0, 0} }, - new DataPoint(){ Features= new float[3] {0, 2, 1} }, - new DataPoint(){ Features= new float[3] {1, 2, 3} }, - new DataPoint(){ Features= new float[3] {0, 1, 0} }, - new DataPoint(){ Features= new float[3] {0, 2, 1} }, - new DataPoint(){ Features= new float[3] {-100, 50, -100} } + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {0, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 2, 4} }, + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {0, 2, 2} }, + new DataPoint(){ Features = new float[3] {0, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 2, 4} }, + new DataPoint(){ Features = new float[3] {1, 0, 0} } }; // Convert the List to IDataView, a consumble format to ML.NET functions. @@ -126,17 +159,78 @@ private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlCon var results = mlContext.Data.CreateEnumerable(transformed, reuseRowObject: false).ToList(); // First 5 examples are inliers. - for (int i = 0; i < 5; ++i) + for (int i = 0; i < 7; ++i) { - // Inlier should be predicted as true. - Assert.True(results[i].PredictedLabel); + // Inlier should be predicted as false. + Assert.False(results[i].PredictedLabel); // Higher score means closer to inlier. - Assert.InRange(results[i].Score, 0.3, 1); + Assert.InRange(results[i].Score, 0, 0.5); } - // Last example is outlier. Note that outlier should be predicted as false. + // Last example is outlier. Note that outlier should be predicted as true. + Assert.True(results[7].PredictedLabel); + Assert.InRange(results[7].Score, 0.5, 1); + } + + + /// + /// Help function used to execute trainers defined in . 
+ /// + private static void ExecuteRandomizedPcaTrainerChangeThreshold(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer) + { + var samples = new List() + { + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {0, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 2, 4} }, + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {0, 2, 2} }, + new DataPoint(){ Features = new float[3] {0, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 2, 4} }, + new DataPoint(){ Features = new float[3] {1, 0, 0} } + }; + + // Convert the List to IDataView, a consumable format to ML.NET functions. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Train the anomaly detector. + var model = trainer.Fit(data); + + var transformer = mlContext.AnomalyDetection.ChangeModelThreshold(model, 0.3f); + + // Apply the trained model on the training data. + var transformed = transformer.Transform(data); + + // Read ML.NET predictions into IEnumerable. + var results = mlContext.Data.CreateEnumerable(transformed, reuseRowObject: false).ToList(); + + // Inlier should be predicted as false. + Assert.False(results[0].PredictedLabel); + Assert.InRange(results[0].Score, 0, 0.3); + // Inlier should be predicted as false. + Assert.False(results[1].PredictedLabel); + Assert.InRange(results[1].Score, 0, 0.3); + // Inlier should be predicted as false. + Assert.False(results[2].PredictedLabel); + Assert.InRange(results[2].Score, 0, 0.3); + // Inlier should be predicted as false. + Assert.False(results[3].PredictedLabel); + Assert.InRange(results[3].Score, 0, 0.3); + + // Outlier should be predicted as true. + Assert.True(results[4].PredictedLabel); + Assert.InRange(results[4].Score, 0.3, 1); + + // Inlier should be predicted as false. Assert.False(results[5].PredictedLabel); Assert.InRange(results[5].Score, 0, 0.3); + // Inlier should be predicted as false. 
+ Assert.False(results[6].PredictedLabel); + Assert.InRange(results[6].Score, 0, 0.3); + + // Outlier should be predicted as true. + Assert.True(results[7].PredictedLabel); + Assert.InRange(results[7].Score, 0.3, 1); } private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)