Skip to content

Commit bb00e07

Browse files
authored
PCA Anomaly Detection Threshold (#4039)
1 parent c2d69f0 commit bb00e07

File tree

6 files changed

+170
-48
lines changed

6 files changed

+170
-48
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs

+18-18
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ public static void Example()
1919
// Training data.
2020
var samples = new List<DataPoint>()
2121
{
22-
new DataPoint(){ Features = new float[3] {1, 0, 0} },
2322
new DataPoint(){ Features = new float[3] {0, 2, 1} },
24-
new DataPoint(){ Features = new float[3] {1, 2, 3} },
25-
new DataPoint(){ Features = new float[3] {0, 1, 0} },
2623
new DataPoint(){ Features = new float[3] {0, 2, 1} },
27-
new DataPoint(){ Features = new float[3] {-100, 50, -100} }
24+
new DataPoint(){ Features = new float[3] {0, 2, 1} },
25+
new DataPoint(){ Features = new float[3] {0, 1, 2} },
26+
new DataPoint(){ Features = new float[3] {0, 2, 1} },
27+
new DataPoint(){ Features = new float[3] {2, 0, 0} }
2828
};
2929

3030
// Convert the List<DataPoint> to IDataView, a consumable format to
@@ -57,23 +57,23 @@ public static void Example()
5757
var featuresInText = string.Join(',', samples[i].Features);
5858

5959
if (result.PredictedLabel)
60-
// The i-th sample is predicted as an inlier.
61-
Console.WriteLine("The {0}-th example with features [{1}]" +
62-
"is an inlier with a score of being inlier {2}", i,
63-
featuresInText, result.Score);
64-
else
6560
// The i-th sample is predicted as an outlier.
66-
Console.WriteLine("The {0}-th example with features [{1}] is" +
61+
Console.WriteLine("The {0}-th example with features [{1}] is " +
6762
"an outlier with a score of being inlier {2}", i,
63+
featuresInText, result.Score);
64+
else
65+
// The i-th sample is predicted as an inlier.
66+
Console.WriteLine("The {0}-th example with features [{1}] is " +
67+
"an inlier with a score of being inlier {2}", i,
6868
featuresInText, result.Score);
6969
}
7070
// Lines printed out should be
71-
// The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707
72-
// The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
73-
// The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122
74-
// The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905
75-
// The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
76-
// The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0
71+
// The 0 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
72+
// The 1 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
73+
// The 2 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
74+
// The 3 - th example with features[0, 1, 2] is an outlier with a score of being outlier 0.5082728
75+
// The 4 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
76+
// The 5 - th example with features[2, 0, 0] is an outlier with a score of being outlier 1
7777
}
7878

7979
// Example with 3 feature values. A training data set is a collection of
@@ -87,9 +87,9 @@ private class DataPoint
8787
// Class used to capture prediction of DataPoint.
8888
private class Result
8989
{
90-
// Outlier gets false while inlier has true.
90+
// Outlier gets true while inlier has false.
9191
public bool PredictedLabel { get; set; }
92-
// Outlier gets smaller score.
92+
// Inlier gets smaller score. Score is between 0 and 1.
9393
public float Score { get; set; }
9494
}
9595
}

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs

+20-16
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@ public static void Example()
1919
// Training data.
2020
var samples = new List<DataPoint>()
2121
{
22-
new DataPoint(){ Features = new float[3] {1, 0, 0} },
2322
new DataPoint(){ Features = new float[3] {0, 2, 1} },
24-
new DataPoint(){ Features = new float[3] {1, 2, 3} },
25-
new DataPoint(){ Features = new float[3] {0, 1, 0} },
23+
new DataPoint(){ Features = new float[3] {0, 2, 3} },
24+
new DataPoint(){ Features = new float[3] {0, 2, 4} },
2625
new DataPoint(){ Features = new float[3] {0, 2, 1} },
27-
new DataPoint(){ Features = new float[3] {-100, 50, -100} }
26+
new DataPoint(){ Features = new float[3] {0, 2, 2} },
27+
new DataPoint(){ Features = new float[3] {0, 2, 3} },
28+
new DataPoint(){ Features = new float[3] {0, 2, 4} },
29+
new DataPoint(){ Features = new float[3] {1, 0, 0} }
2830
};
2931

3032
// Convert the List<DataPoint> to IDataView, a consumable format to
@@ -63,23 +65,25 @@ public static void Example()
6365
var featuresInText = string.Join(',', samples[i].Features);
6466

6567
if (result.PredictedLabel)
66-
// The i-th sample is predicted as an inlier.
68+
// The i-th sample is predicted as an outlier.
6769
Console.WriteLine("The {0}-th example with features [{1}] is" +
68-
"an inlier with a score of being inlier {2}", i,
70+
"an outlier with a score of being outlier {2}", i,
6971
featuresInText, result.Score);
7072
else
71-
// The i-th sample is predicted as an outlier.
73+
// The i-th sample is predicted as an inlier.
7274
Console.WriteLine("The {0}-th example with features [{1}] is" +
73-
"an outlier with a score of being inlier {2}",
75+
"an inlier with a score of being outlier {2}",
7476
i, featuresInText, result.Score);
7577
}
7678
// Lines printed out should be
77-
// The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707
78-
// The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
79-
// The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122
80-
// The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905
81-
// The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
82-
// The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0
79+
// The 0 - th example with features[0, 2, 1] isan inlier with a score of being outlier 0.2264826
80+
// The 1 - th example with features[0, 2, 3] isan inlier with a score of being outlier 0.1739471
81+
// The 2 - th example with features[0, 2, 4] isan inlier with a score of being outlier 0.05711612
82+
// The 3 - th example with features[0, 2, 1] isan inlier with a score of being outlier 0.2264826
83+
// The 4 - th example with features[0, 2, 2] isan inlier with a score of being outlier 0.3868995
84+
// The 5 - th example with features[0, 2, 3] isan inlier with a score of being outlier 0.1739471
85+
// The 6 - th example with features[0, 2, 4] isan inlier with a score of being outlier 0.05711612
86+
// The 7 - th example with features[1, 0, 0] isan outlier with a score of being outlier 0.6260795
8387
}
8488

8589
// Example with 3 feature values. A training data set is a collection of
@@ -93,9 +97,9 @@ private class DataPoint
9397
// Class used to capture prediction of DataPoint.
9498
private class Result
9599
{
96-
// Outlier gets false while inlier has true.
100+
// Outlier gets true while inlier has false.
97101
public bool PredictedLabel { get; set; }
98-
// Outlier gets smaller score.
102+
// Inlier gets smaller score. Score is between 0 and 1.
99103
public float Score { get; set; }
100104
}
101105
}

src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ public sealed class AnomalyPredictionTransformer<TModel> : SingleFeaturePredicti
267267

268268
[BestFriend]
269269
internal AnomalyPredictionTransformer(IHostEnvironment env, TModel model, DataViewSchema inputSchema, string featureColumn,
270-
float threshold = 0f, string thresholdColumn = DefaultColumnNames.Score)
270+
float threshold = 0.5f, string thresholdColumn = DefaultColumnNames.Score)
271271
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(AnomalyPredictionTransformer<TModel>)), model, inputSchema, featureColumn)
272272
{
273273
Host.CheckNonEmpty(thresholdColumn, nameof(thresholdColumn));

src/Microsoft.ML.Data/TrainCatalog.cs

+16
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,22 @@ public AnomalyDetectionMetrics Evaluate(IDataView data, string labelColumnName =
703703
var eval = new AnomalyDetectionEvaluator(Environment, args);
704704
return eval.Evaluate(data, labelColumnName, scoreColumnName, predictedLabelColumnName);
705705
}
706+
707+
/// <summary>
708+
/// Creates a new <see cref="AnomalyPredictionTransformer{TModel}"/> with the specified <paramref name="threshold"/>.
709+
/// If the provided <paramref name="threshold"/> is the same as the <paramref name="model"/> threshold it simply returns <paramref name="model"/>.
710+
/// Note that by default the threshold is 0.5 and valid scores range from 0 to 1.
711+
/// </summary>
712+
/// <param name="model">A trained <see cref="AnomalyPredictionTransformer{TModel}"/>.</param>
713+
/// <param name="threshold">The new threshold value that will be used to determine the label of a data point
714+
/// based on the predicted score by the model.</param>
715+
public AnomalyPredictionTransformer<TModel> ChangeModelThreshold<TModel>(AnomalyPredictionTransformer<TModel> model, float threshold)
716+
where TModel : class
717+
{
718+
if (model.Threshold == threshold)
719+
return model;
720+
return new AnomalyPredictionTransformer<TModel>(Environment, model.Model, model.TrainSchema, model.FeatureColumnName, threshold, model.ThresholdColumn);
721+
}
706722
}
707723

708724
/// <summary>

src/Microsoft.ML.PCA/PCACatalog.cs

+8
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ internal static PrincipalComponentAnalyzer ProjectToPrincipalComponents(this Tra
5454
/// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
5555
/// <param name="ensureZeroMean">If enabled, data is centered to be zero mean.</param>
5656
/// <param name="seed">The seed for random number generation.</param>
57+
/// <remarks>
58+
/// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted
59+
/// score higher than 0.5 is considered an outlier. Use <see cref="AnomalyDetectionCatalog.ChangeModelThreshold"/> to change this threshold.
60+
/// </remarks>
5761
/// <example>
5862
/// <format type="text/markdown">
5963
/// <![CDATA[
@@ -78,6 +82,10 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An
7882
/// </summary>
7983
/// <param name="catalog">The anomaly detection catalog trainer object.</param>
8084
/// <param name="options">Advanced options to the algorithm.</param>
85+
/// <remarks>
86+
/// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted
87+
/// score higher than 0.5 is considered an outlier. Use <see cref="AnomalyDetectionCatalog.ChangeModelThreshold"/> to change this threshold.
88+
/// </remarks>
8189
/// <example>
8290
/// <format type="text/markdown">
8391
/// <![CDATA[

test/Microsoft.ML.Tests/AnomalyDetectionTests.cs

+107-13
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public void NoAnomalyTest()
5252
[Fact]
5353
public static void RandomizedPcaInMemory()
5454
{
55-
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
55+
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
5656
// as a catalog of available operations and as the source of randomness.
5757
// Setting the seed to a fixed number in this example to make outputs deterministic.
5858
var mlContext = new MLContext(seed: 0);
@@ -68,7 +68,38 @@ public static void RandomizedPcaInMemory()
6868
{
6969
FeatureColumnName = nameof(DataPoint.Features),
7070
Rank = 1,
71-
EnsureZeroMean = false
71+
EnsureZeroMean = false,
72+
Seed = 10
73+
};
74+
75+
// Create another anomaly detector. Its underlying algorithm is randomized PCA.
76+
var trainer2 = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);
77+
78+
// Test the second detector.
79+
ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer2);
80+
}
81+
82+
[Fact]
83+
public static void RandomizedPcaChangeThreshold()
84+
{
85+
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
86+
// as a catalog of available operations and as the source of randomness.
87+
// Setting the seed to a fixed number in this example to make outputs deterministic.
88+
var mlContext = new MLContext(seed: 0);
89+
90+
// Create an anomaly detector. Its underlying algorithm is randomized PCA.
91+
var trainer1 = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, ensureZeroMean: false);
92+
93+
// Test the first detector.
94+
ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer1);
95+
96+
// Object required in the creation of another detector.
97+
var options = new Trainers.RandomizedPcaTrainer.Options()
98+
{
99+
FeatureColumnName = nameof(DataPoint.Features),
100+
Rank = 1,
101+
EnsureZeroMean = false,
102+
Seed = 10
72103
};
73104

74105
// Create another anomaly detector. Its underlying algorithm is randomized PCA.
@@ -105,12 +136,14 @@ private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlCon
105136
{
106137
var samples = new List<DataPoint>()
107138
{
108-
new DataPoint(){ Features= new float[3] {1, 0, 0} },
109-
new DataPoint(){ Features= new float[3] {0, 2, 1} },
110-
new DataPoint(){ Features= new float[3] {1, 2, 3} },
111-
new DataPoint(){ Features= new float[3] {0, 1, 0} },
112-
new DataPoint(){ Features= new float[3] {0, 2, 1} },
113-
new DataPoint(){ Features= new float[3] {-100, 50, -100} }
139+
new DataPoint(){ Features = new float[3] {0, 2, 1} },
140+
new DataPoint(){ Features = new float[3] {0, 2, 3} },
141+
new DataPoint(){ Features = new float[3] {0, 2, 4} },
142+
new DataPoint(){ Features = new float[3] {0, 2, 1} },
143+
new DataPoint(){ Features = new float[3] {0, 2, 2} },
144+
new DataPoint(){ Features = new float[3] {0, 2, 3} },
145+
new DataPoint(){ Features = new float[3] {0, 2, 4} },
146+
new DataPoint(){ Features = new float[3] {1, 0, 0} }
114147
};
115148

116149
// Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
@@ -126,17 +159,78 @@ private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlCon
126159
var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();
127160

128161
// All examples before the last one are inliers.
129-
for (int i = 0; i < 5; ++i)
162+
for (int i = 0; i < 7; ++i)
130163
{
131-
// Inlier should be predicted as true.
132-
Assert.True(results[i].PredictedLabel);
164+
// Inlier should be predicted as false.
165+
Assert.False(results[i].PredictedLabel);
133166
// Lower score means closer to inlier.
134-
Assert.InRange(results[i].Score, 0.3, 1);
167+
Assert.InRange(results[i].Score, 0, 0.5);
135168
}
136169

137-
// Last example is outlier. Note that outlier should be predicted as false.
170+
// Last example is outlier. Note that outlier should be predicted as true.
171+
Assert.True(results[7].PredictedLabel);
172+
Assert.InRange(results[7].Score, 0.5, 1);
173+
}
174+
175+
176+
/// <summary>
177+
/// Helper function used to execute trainers defined in <see cref="RandomizedPcaInMemory"/>.
178+
/// </summary>
179+
private static void ExecuteRandomizedPcaTrainerChangeThreshold(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
180+
{
181+
var samples = new List<DataPoint>()
182+
{
183+
new DataPoint(){ Features = new float[3] {0, 2, 1} },
184+
new DataPoint(){ Features = new float[3] {0, 2, 3} },
185+
new DataPoint(){ Features = new float[3] {0, 2, 4} },
186+
new DataPoint(){ Features = new float[3] {0, 2, 1} },
187+
new DataPoint(){ Features = new float[3] {0, 2, 2} },
188+
new DataPoint(){ Features = new float[3] {0, 2, 3} },
189+
new DataPoint(){ Features = new float[3] {0, 2, 4} },
190+
new DataPoint(){ Features = new float[3] {1, 0, 0} }
191+
};
192+
193+
// Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
194+
var data = mlContext.Data.LoadFromEnumerable(samples);
195+
196+
// Train the anomaly detector.
197+
var model = trainer.Fit(data);
198+
199+
var transformer = mlContext.AnomalyDetection.ChangeModelThreshold(model, 0.3f);
200+
201+
// Apply the trained model on the training data.
202+
var transformed = transformer.Transform(data);
203+
204+
// Read ML.NET predictions into IEnumerable<Result>.
205+
var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();
206+
207+
// Inlier should be predicted as false.
208+
Assert.False(results[0].PredictedLabel);
209+
Assert.InRange(results[0].Score, 0, 0.3);
210+
// Inlier should be predicted as false.
211+
Assert.False(results[1].PredictedLabel);
212+
Assert.InRange(results[1].Score, 0, 0.3);
213+
// Inlier should be predicted as false.
214+
Assert.False(results[2].PredictedLabel);
215+
Assert.InRange(results[2].Score, 0, 0.3);
216+
// Inlier should be predicted as false.
217+
Assert.False(results[3].PredictedLabel);
218+
Assert.InRange(results[3].Score, 0, 0.3);
219+
220+
// Outlier should be predicted as true.
221+
Assert.True(results[4].PredictedLabel);
222+
Assert.InRange(results[4].Score, 0.3, 1);
223+
224+
// Inlier should be predicted as false.
138225
Assert.False(results[5].PredictedLabel);
139226
Assert.InRange(results[5].Score, 0, 0.3);
227+
// Inlier should be predicted as false.
228+
Assert.False(results[6].PredictedLabel);
229+
Assert.InRange(results[6].Score, 0, 0.3);
230+
231+
// Outlier should be predicted as true.
232+
Assert.True(results[7].PredictedLabel);
233+
Assert.InRange(results[7].Score, 0.3, 1);
140234
}
141235

142236
private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)

0 commit comments

Comments
 (0)