Skip to content

PCA Anomaly Detection Threshold #4039

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ public static void Example()
// Training data.
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[3] {1, 0, 0} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {1, 2, 3} },
new DataPoint(){ Features = new float[3] {0, 1, 0} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {-100, 50, -100} }
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {0, 1, 2} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {2, 0, 0} }
};

// Convert the List<DataPoint> to IDataView, a consumable format to
Expand Down Expand Up @@ -57,23 +57,23 @@ public static void Example()
var featuresInText = string.Join(',', samples[i].Features);

if (result.PredictedLabel)
// The i-th sample is predicted as an inlier.
Console.WriteLine("The {0}-th example with features [{1}]" +
"is an inlier with a score of being inlier {2}", i,
featuresInText, result.Score);
else
// The i-th sample is predicted as an outlier.
Console.WriteLine("The {0}-th example with features [{1}] is" +
Console.WriteLine("The {0}-th example with features [{1}] is " +
"an outlier with a score of being inlier {2}", i,
featuresInText, result.Score);
else
// The i-th sample is predicted as an inlier.
Console.WriteLine("The {0}-th example with features [{1}] is " +
"an inlier with a score of being inlier {2}", i,
featuresInText, result.Score);
}
// Lines printed out should be
// The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707
// The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
// The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122
// The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905
// The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
// The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0
// The 0 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
// The 1 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
// The 2 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
// The 3 - th example with features[0, 1, 2] is an outlier with a score of being outlier 0.5082728
// The 4 - th example with features[0, 2, 1] is an inlier with a score of being outlier 0.1101028
// The 5 - th example with features[2, 0, 0] is an outlier with a score of being outlier 1
}

// Example with 3 feature values. A training data set is a collection of
Expand All @@ -87,9 +87,9 @@ private class DataPoint
// Class used to capture prediction of DataPoint.
private class Result
{
// Outlier gets false while inlier has true.
// Outlier gets true while inlier has false.
public bool PredictedLabel { get; set; }
// Outlier gets smaller score.
// Inlier gets smaller score. Score is between 0 and 1.
public float Score { get; set; }
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ public static void Example()
// Training data.
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[3] {1, 0, 0} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {1, 2, 3} },
new DataPoint(){ Features = new float[3] {0, 1, 0} },
new DataPoint(){ Features = new float[3] {0, 2, 3} },
new DataPoint(){ Features = new float[3] {0, 2, 4} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {-100, 50, -100} }
new DataPoint(){ Features = new float[3] {0, 2, 2} },
new DataPoint(){ Features = new float[3] {0, 2, 3} },
new DataPoint(){ Features = new float[3] {0, 2, 4} },
new DataPoint(){ Features = new float[3] {1, 0, 0} }
};

// Convert the List<DataPoint> to IDataView, a consumable format to
Expand Down Expand Up @@ -63,23 +65,25 @@ public static void Example()
var featuresInText = string.Join(',', samples[i].Features);

if (result.PredictedLabel)
// The i-th sample is predicted as an inlier.
// The i-th sample is predicted as an outlier.
Console.WriteLine("The {0}-th example with features [{1}] is" +
"an inlier with a score of being inlier {2}", i,
"an outlier with a score of being outlier {2}", i,
featuresInText, result.Score);
else
// The i-th sample is predicted as an outlier.
// The i-th sample is predicted as an inlier.
Console.WriteLine("The {0}-th example with features [{1}] is" +
"an outlier with a score of being inlier {2}",
"an inlier with a score of being outlier {2}",
i, featuresInText, result.Score);
}
// Lines printed out should be
// The 0 - th example with features[1, 0, 0] is an inlier with a score of being inlier 0.7453707
// The 1 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
// The 2 - th example with features[1, 2, 3] is an inlier with a score of being inlier 0.8450122
// The 3 - th example with features[0, 1, 0] is an inlier with a score of being inlier 0.9428905
// The 4 - th example with features[0, 2, 1] is an inlier with a score of being inlier 0.9999999
// The 5 - th example with features[-100, 50, -100] is an outlier with a score of being inlier 0
// The 0 - th example with features[0, 2, 1] isan inlier with a score of being outlier 0.2264826
// The 1 - th example with features[0, 2, 3] isan inlier with a score of being outlier 0.1739471
// The 2 - th example with features[0, 2, 4] isan inlier with a score of being outlier 0.05711612
// The 3 - th example with features[0, 2, 1] isan inlier with a score of being outlier 0.2264826
// The 4 - th example with features[0, 2, 2] isan inlier with a score of being outlier 0.3868995
// The 5 - th example with features[0, 2, 3] isan inlier with a score of being outlier 0.1739471
// The 6 - th example with features[0, 2, 4] isan inlier with a score of being outlier 0.05711612
// The 7 - th example with features[1, 0, 0] isan outlier with a score of being outlier 0.6260795
}

// Example with 3 feature values. A training data set is a collection of
Expand All @@ -93,9 +97,9 @@ private class DataPoint
// Class used to capture prediction of DataPoint.
private class Result
{
// Outlier gets false while inlier has true.
// Outlier gets true while inlier has false.
public bool PredictedLabel { get; set; }
// Outlier gets smaller score.
// Inlier gets smaller score. Score is between 0 and 1.
public float Score { get; set; }
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ public sealed class AnomalyPredictionTransformer<TModel> : SingleFeaturePredicti

[BestFriend]
internal AnomalyPredictionTransformer(IHostEnvironment env, TModel model, DataViewSchema inputSchema, string featureColumn,
float threshold = 0f, string thresholdColumn = DefaultColumnNames.Score)
float threshold = 0.5f, string thresholdColumn = DefaultColumnNames.Score)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float threshold = 0.5f, [](start = 12, length = 23)

Why 0.5? Why not 0.1 or 0.05?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you point out it is arbitrary. What do you suggest?

: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(AnomalyPredictionTransformer<TModel>)), model, inputSchema, featureColumn)
{
Host.CheckNonEmpty(thresholdColumn, nameof(thresholdColumn));
Expand Down
16 changes: 16 additions & 0 deletions src/Microsoft.ML.Data/TrainCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,22 @@ public AnomalyDetectionMetrics Evaluate(IDataView data, string labelColumnName =
var eval = new AnomalyDetectionEvaluator(Environment, args);
return eval.Evaluate(data, labelColumnName, scoreColumnName, predictedLabelColumnName);
}

/// <summary>
/// Creates a new <see cref="AnomalyPredictionTransformer{TModel}"/> with the specified <paramref name="threshold"/>.
/// If the provided <paramref name="threshold"/> is the same as the <paramref name="model"/> threshold it simply returns <paramref name="model"/>.
/// Note that by default the threshold is 0.5 and valid scores range from 0 to 1.
/// </summary>
/// <param name="model">A trained <see cref="AnomalyPredictionTransformer{TModel}"/>.</param>
/// <param name="threshold">The new threshold value that will be used to determine the label of a data point
/// based on the predicted score by the model.</param>
public AnomalyPredictionTransformer<TModel> ChangeModelThreshold<TModel>(AnomalyPredictionTransformer<TModel> model, float threshold)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float threshold [](start = 125, length = 15)

I suggest adding the threshold as an option to the constructor, like we do for the majority of other transformers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can also add that to a constructor, but that will change the signature of the constructor, so I can only add a new one with this additional parameter. Is that what you would like me to do?

Where else are we setting the threshold from the constructor? I don't remember seeing that.

where TModel : class
{
if (model.Threshold == threshold)
return model;
return new AnomalyPredictionTransformer<TModel>(Environment, model.Model, model.TrainSchema, model.FeatureColumnName, threshold, model.ThresholdColumn);
}
}

/// <summary>
Expand Down
8 changes: 8 additions & 0 deletions src/Microsoft.ML.PCA/PCACatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ internal static PrincipalComponentAnalyzer ProjectToPrincipalComponents(this Tra
/// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
/// <param name="ensureZeroMean">If enabled, data is centered to be zero mean.</param>
/// <param name="seed">The seed for random number generation.</param>
/// <remarks>
/// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted
/// score higher than 0.5 is considered an outlier. Use <see cref="AnomalyDetectionCatalog.ChangeModelThreshold"/> to change this threshold.
/// </remarks>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
Expand All @@ -78,6 +82,10 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An
/// </summary>
/// <param name="catalog">The anomaly detection catalog trainer object.</param>
/// <param name="options">Advanced options to the algorithm.</param>
/// <remarks>
/// By default the threshold used to determine the label of a data point based on the predicted score is 0.5. Scores range from 0 to 1. A data point with predicted
/// score higher than 0.5 is considered an outlier. Use <see cref="AnomalyDetectionCatalog.ChangeModelThreshold"/> to change this threshold.
/// </remarks>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
Expand Down
120 changes: 107 additions & 13 deletions test/Microsoft.ML.Tests/AnomalyDetectionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public void NoAnomalyTest()
[Fact]
public static void RandomizedPcaInMemory()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);
Expand All @@ -68,7 +68,38 @@ public static void RandomizedPcaInMemory()
{
FeatureColumnName = nameof(DataPoint.Features),
Rank = 1,
EnsureZeroMean = false
EnsureZeroMean = false,
Seed = 10
};

// Create another anomaly detector. Its underlying algorithm is randomized PCA.
var trainer2 = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);

// Test the second detector.
ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer2);
}

[Fact]
public static void RandomizedPcaChangeThreshold()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create an anomaly detector. Its underlying algorithm is randomized PCA.
var trainer1 = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, ensureZeroMean: false);

// Test the first detector.
ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer1);

// Object required in the creation of another detector.
var options = new Trainers.RandomizedPcaTrainer.Options()
{
FeatureColumnName = nameof(DataPoint.Features),
Rank = 1,
EnsureZeroMean = false,
Seed = 10
};

// Create another anomaly detector. Its underlying algorithm is randomized PCA.
Expand Down Expand Up @@ -105,12 +136,14 @@ private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlCon
{
var samples = new List<DataPoint>()
{
new DataPoint(){ Features= new float[3] {1, 0, 0} },
new DataPoint(){ Features= new float[3] {0, 2, 1} },
new DataPoint(){ Features= new float[3] {1, 2, 3} },
new DataPoint(){ Features= new float[3] {0, 1, 0} },
new DataPoint(){ Features= new float[3] {0, 2, 1} },
new DataPoint(){ Features= new float[3] {-100, 50, -100} }
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {0, 2, 3} },
new DataPoint(){ Features = new float[3] {0, 2, 4} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {0, 2, 2} },
new DataPoint(){ Features = new float[3] {0, 2, 3} },
new DataPoint(){ Features = new float[3] {0, 2, 4} },
new DataPoint(){ Features = new float[3] {1, 0, 0} }
};

// Convert the List<DataPoint> to IDataView, a consumable format to ML.NET functions.
Expand All @@ -126,17 +159,78 @@ private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlCon
var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

// First 7 examples are inliers.
for (int i = 0; i < 5; ++i)
for (int i = 0; i < 7; ++i)
{
// Inlier should be predicted as true.
Assert.True(results[i].PredictedLabel);
// Inlier should be predicted as false.
Assert.False(results[i].PredictedLabel);
// Lower score means closer to inlier.
Assert.InRange(results[i].Score, 0.3, 1);
Assert.InRange(results[i].Score, 0, 0.5);
}

// Last example is outlier. Note that outlier should be predicted as false.
// Last example is outlier. Note that outlier should be predicted as true.
Assert.True(results[7].PredictedLabel);
Assert.InRange(results[7].Score, 0.5, 1);
}


/// <summary>
/// Fits <paramref name="trainer"/> on a small in-memory data set, re-thresholds the fitted model at 0.3
/// via <c>ChangeModelThreshold</c>, and asserts the predicted label and score range of every example.
/// </summary>
/// <param name="mlContext">Context used to load the data, change the threshold and read back predictions.</param>
/// <param name="trainer">Randomized PCA trainer to fit and verify.</param>
private static void ExecuteRandomizedPcaTrainerChangeThreshold(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
{
    // Eight 3-dimensional examples used for both training and scoring.
    var samples = new List<DataPoint>()
    {
        new DataPoint(){ Features = new float[3] {0, 2, 1} },
        new DataPoint(){ Features = new float[3] {0, 2, 3} },
        new DataPoint(){ Features = new float[3] {0, 2, 4} },
        new DataPoint(){ Features = new float[3] {0, 2, 1} },
        new DataPoint(){ Features = new float[3] {0, 2, 2} },
        new DataPoint(){ Features = new float[3] {0, 2, 3} },
        new DataPoint(){ Features = new float[3] {0, 2, 4} },
        new DataPoint(){ Features = new float[3] {1, 0, 0} }
    };

    // Expose the in-memory list as an IDataView, the format ML.NET functions consume.
    var data = mlContext.Data.LoadFromEnumerable(samples);

    // Train the anomaly detector, then obtain a transformer that uses 0.3 as its threshold.
    var transformer = mlContext.AnomalyDetection.ChangeModelThreshold(trainer.Fit(data), 0.3f);

    // Score the training data and materialize the predictions as a list of Result.
    var results = mlContext.Data
        .CreateEnumerable<Result>(transformer.Transform(data), reuseRowObject: false)
        .ToList();

    // Expected label per example, in order: true = outlier, false = inlier.
    var expectedOutlier = new[] { false, false, false, false, true, false, false, true };

    for (int index = 0; index < expectedOutlier.Length; ++index)
    {
        if (expectedOutlier[index])
        {
            // Outlier should be predicted as true, with a score in [0.3, 1].
            Assert.True(results[index].PredictedLabel);
            Assert.InRange(results[index].Score, 0.3, 1);
        }
        else
        {
            // Inlier should be predicted as false, with a score in [0, 0.3].
            Assert.False(results[index].PredictedLabel);
            Assert.InRange(results[index].Score, 0, 0.3);
        }
    }
}

private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)
Expand Down