Skip to content

RandomizedPCA Anomaly Detection fraud detection sample #589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Aug 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .vsts-dotnet-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ phases:
inputs:
projects: '.\samples\csharp\getting-started\BinaryClassification_CreditCardFraudDetection\CreditCardFraudDetection.sln'

# CI phase for the AnomalyDetection variant of the CreditCardFraudDetection sample
# (see displayName); it points the DotNetCoreCLI task at that sample's solution file.
- phase: CreditCardFraudDetection2
queue: Hosted VS2017
steps:
- task: DotNetCoreCLI@2
displayName: Build CreditCardFraudDetection (AnomalyDetection)
inputs:
projects: '.\samples\csharp\getting-started\AnomalyDetection_CreditCardFraudDetection\CreditCardFraudDetection.sln'

- phase: SentimentAnalysis
queue: Hosted VS2017
steps:
Expand Down
10 changes: 10 additions & 0 deletions samples/csharp/common/ConsoleHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@ public static void PrintBinaryClassificationMetrics(string name, CalibratedBinar
Console.WriteLine($"************************************************************");
}

/// <summary>
/// Writes a framed summary of anomaly detection metrics to the console.
/// </summary>
/// <param name="name">Model name shown in the report header.</param>
/// <param name="metrics">
/// Metrics to report: the area under the ROC curve (printed as a percentage with two
/// decimals) and the detection rate at false positive count.
/// </param>
public static void PrintAnomalyDetectionMetrics(string name, AnomalyDetectionMetrics metrics)
{
    // Frame rows reused at the top, middle and bottom of the report.
    const string frame = "************************************************************";
    const string separator = "*-----------------------------------------------------------";

    string header = $"* Metrics for {name} anomaly detection model ";
    string areaUnderCurve = $"* Area Under Curve: {metrics.AreaUnderRocCurve:P2}";
    string detectionRate = $"* Detection rate at false positive count: {metrics.DetectionRateAtFalsePositiveCount}";

    Console.WriteLine(frame);
    Console.WriteLine(header);
    Console.WriteLine(separator);
    Console.WriteLine(areaUnderCurve);
    Console.WriteLine(detectionRate);
    Console.WriteLine(frame);
}

public static void PrintMultiClassClassificationMetrics(string name, MulticlassClassificationMetrics metrics)
{
Console.WriteLine($"************************************************************");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Targets .NET Standard 2.0. -->
<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
</PropertyGroup>

<!-- Removes everything under Assets\ from the Compile, EmbeddedResource and None item lists. -->
<ItemGroup>
<Compile Remove="Assets\**" />
<EmbeddedResource Remove="Assets\**" />
<None Remove="Assets\**" />
</ItemGroup>

<!-- The Microsoft.ML package version comes from the MicrosoftMLVersion property, which is not defined in this file. -->
<ItemGroup>
<PackageReference Include="Microsoft.ML" Version="$(MicrosoftMLVersion)" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System;

namespace CreditCardFraudDetection.Common.DataModels
{
/// <summary>
/// Output of the anomaly detection model for a single transaction.
/// </summary>
public class TransactionFraudPrediction : IModelEntity
{
    /// <summary>
    /// Score above which <see cref="PrintToConsole"/> reports a transaction as an anomaly.
    /// Kept private so it does not become part of the type's public fields.
    /// </summary>
    private const float AnomalyScoreThreshold = 0.2f;

    /// <summary>
    /// Label value of the transaction.
    /// NOTE(review): presumably carried through from the input row's label - confirm.
    /// </summary>
    public float Label;

    /// <summary>
    /// The non-negative, unbounded score that was calculated by the anomaly detection model.
    /// Fraudulent transactions (anomalies) will have higher scores than normal transactions.
    /// </summary>
    public float Score;

    /// <summary>
    /// The predicted label, based on the score. A value of true indicates an anomaly.
    /// </summary>
    public bool PredictedLabel;

    /// <summary>
    /// Prints the predicted label and the raw score to the console.
    /// </summary>
    public void PrintToConsole()
    {
        // There is currently an issue where PredictedLabel is always set to true.
        // Because of it, the printed label is derived from a manually chosen score
        // threshold instead of PredictedLabel.
        // Issue: https://github.com/dotnet/machinelearning/issues/3990
        // Once that issue is resolved, PredictedLabel can be printed directly.
        Console.WriteLine($"Predicted Label: {Score > AnomalyScoreThreshold} (Score: {Score})");
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
using System;

using Microsoft.ML.Data;

namespace CreditCardFraudDetection.Common.DataModels
{
/// <summary>
/// Contract for data-model types that can print a description of themselves.
/// </summary>
public interface IModelEntity
{
    /// <summary>
    /// Prints this entity to the console.
    /// </summary>
    void PrintToConsole();
}

/// <summary>
/// One row of the transaction dataset. Each field is bound to a zero-based column
/// index of the input file through <c>LoadColumn</c>: column 0 is <see cref="Time"/>,
/// columns 1-28 are <see cref="V1"/>..<see cref="V28"/>, column 29 is
/// <see cref="Amount"/> and column 30 is <see cref="Label"/>.
/// </summary>
public class TransactionObservation : IModelEntity
{
    [LoadColumn(0)] public float Time;

    [LoadColumn(1)] public float V1;
    [LoadColumn(2)] public float V2;
    [LoadColumn(3)] public float V3;
    [LoadColumn(4)] public float V4;
    [LoadColumn(5)] public float V5;
    [LoadColumn(6)] public float V6;
    [LoadColumn(7)] public float V7;
    [LoadColumn(8)] public float V8;
    [LoadColumn(9)] public float V9;
    [LoadColumn(10)] public float V10;
    [LoadColumn(11)] public float V11;
    [LoadColumn(12)] public float V12;
    [LoadColumn(13)] public float V13;
    [LoadColumn(14)] public float V14;
    [LoadColumn(15)] public float V15;
    [LoadColumn(16)] public float V16;
    [LoadColumn(17)] public float V17;
    [LoadColumn(18)] public float V18;
    [LoadColumn(19)] public float V19;
    [LoadColumn(20)] public float V20;
    [LoadColumn(21)] public float V21;
    [LoadColumn(22)] public float V22;
    [LoadColumn(23)] public float V23;
    [LoadColumn(24)] public float V24;
    [LoadColumn(25)] public float V25;
    [LoadColumn(26)] public float V26;
    [LoadColumn(27)] public float V27;
    [LoadColumn(28)] public float V28;

    [LoadColumn(29)] public float Amount;

    [LoadColumn(30)] public float Label;

    /// <summary>
    /// Prints the label followed by a shortened feature summary
    /// (V1, V2, V3, V28 and Amount) to the console.
    /// </summary>
    public void PrintToConsole()
    {
        string labelLine = $"Label: {Label}";
        string featuresLine = $"Features: [V1] {V1} [V2] {V2} [V3] {V3} ... [V28] {V28} Amount: {Amount}";

        Console.WriteLine(labelLine);
        Console.WriteLine(featuresLine);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
using System.IO;
using System.Linq;

using Microsoft.ML.Data;

namespace CreditCardFraudDetection.Common
{
/// <summary>
/// Helpers for locating and deleting files relative to the directory that
/// contains this assembly.
/// </summary>
public static class LocalConsoleHelper
{
    /// <summary>
    /// Combines the directory of this assembly with the given path segments.
    /// </summary>
    /// <param name="paths">Path segments to append to the assembly directory.</param>
    /// <returns>
    /// The combined path, or <c>null</c> when <paramref name="paths"/> is null or empty.
    /// </returns>
    public static string GetAssetsPath(params string[] paths)
    {
        // The assembly file is resolved before the argument check on purpose,
        // so the order of evaluation stays as it always was.
        var assemblyFile = new FileInfo(typeof(LocalConsoleHelper).Assembly.Location);

        bool hasSegments = paths != null && paths.Length != 0;
        if (!hasSegments)
        {
            return null;
        }

        // Assembly directory first, then the caller's segments in their given order.
        var segments = new string[paths.Length + 1];
        segments[0] = assemblyFile.Directory.FullName;
        paths.CopyTo(segments, 1);

        return Path.Combine(segments);
    }

    /// <summary>
    /// Deletes the file addressed by <see cref="GetAssetsPath"/> for the given
    /// segments, if such a file exists.
    /// </summary>
    /// <param name="paths">Path segments identifying the file.</param>
    /// <returns>The resolved path (possibly <c>null</c>), whether or not a file was deleted.</returns>
    public static string DeleteAssets(params string[] paths)
    {
        string target = GetAssetsPath(paths);

        if (string.IsNullOrWhiteSpace(target) || !File.Exists(target))
        {
            return target;
        }

        File.Delete(target);
        return target;
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Executable targeting .NET Core 2.1, compiled with C# language version 7.2. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.1</TargetFramework>
<LangVersion>7.2</LangVersion>
</PropertyGroup>

<!-- Declares the assets\input folder as a project folder item. -->
<ItemGroup>
<Folder Include="assets\input\" />
</ItemGroup>

<!-- Removes the .gitignore inside assets\input from the None item list. -->
<ItemGroup>
<None Remove="assets\input\.gitignore" />
</ItemGroup>

<!-- The Microsoft.ML package version comes from the MicrosoftMLVersion property, which is not defined in this file. -->
<ItemGroup>
<PackageReference Include="Microsoft.ML" Version="$(MicrosoftMLVersion)" />
</ItemGroup>

<!-- Reference to the CreditCardFraudDetection.Common project in the sibling folder. -->
<ItemGroup>
<ProjectReference Include="..\CreditCardFraudDetection.Common\CreditCardFraudDetection.Common.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
using System;
using System.Linq;

using Microsoft.ML;

using CreditCardFraudDetection.Common.DataModels;

namespace CreditCardFraudDetection.Predictor
{
/// <summary>
/// Loads a saved model and prints predictions for sample transactions read
/// from a comma-separated dataset file.
/// </summary>
public class Predictor
{
    private readonly string _modelfile;
    private readonly string _datasetFile;

    /// <summary>
    /// Creates a predictor for the given model and dataset files.
    /// </summary>
    /// <param name="modelfile">Path of the saved model file.</param>
    /// <param name="dasetFile">Path of the comma-separated dataset file (with header row).</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public Predictor(string modelfile, string dasetFile)
    {
        _modelfile = modelfile ?? throw new ArgumentNullException(nameof(modelfile));
        _datasetFile = dasetFile ?? throw new ArgumentNullException(nameof(dasetFile));
    }

    /// <summary>
    /// Prints predictions for up to <paramref name="numberOfPredictions"/> transactions
    /// whose label is greater than 0, followed by up to the same number of transactions
    /// whose label is less than 1.
    /// </summary>
    /// <param name="numberOfPredictions">Maximum number of transactions printed per group.</param>
    public void RunMultiplePredictions(int numberOfPredictions)
    {
        var mlContext = new MLContext();

        // Load data as input for predictions
        IDataView inputDataForPredictions = mlContext.Data.LoadFromTextFile<TransactionObservation>(_datasetFile, separatorChar: ',', hasHeader: true);

        Console.WriteLine($"Predictions from saved model:");

        // The input schema returned alongside the model is not needed here.
        ITransformer model = mlContext.Model.Load(_modelfile, out _);

        // The prediction engine is disposable; release it once both groups are printed.
        using (var predictionEngine = mlContext.Model.CreatePredictionEngine<TransactionObservation, TransactionFraudPrediction>(model))
        {
            // Shared by both groups: pick the first matching rows, then print each
            // observation followed by its prediction.
            void PredictAndPrint(Func<TransactionObservation, bool> labelFilter)
            {
                var transactions = mlContext.Data.CreateEnumerable<TransactionObservation>(inputDataForPredictions, reuseRowObject: false)
                    .Where(labelFilter)
                    .Take(numberOfPredictions)
                    .ToList();

                foreach (var transaction in transactions)
                {
                    Console.WriteLine($"--- Transaction ---");
                    transaction.PrintToConsole();
                    predictionEngine.Predict(transaction).PrintToConsole();
                    Console.WriteLine($"-------------------");
                }
            }

            Console.WriteLine($"\n \n Test {numberOfPredictions} transactions, from the test datasource, that should be predicted as fraud (true):");
            PredictAndPrint(x => x.Label > 0);

            Console.WriteLine($"\n \n Test {numberOfPredictions} transactions, from the test datasource, that should NOT be predicted as fraud (false):");
            PredictAndPrint(x => x.Label < 1);
        }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
using System;
using System.IO;

using CreditCardFraudDetection.Common;

namespace CreditCardFraudDetection.Predictor
{
class Program
{
/// <summary>
/// Entry point: copies the trainer's output into this project's assets folder,
/// runs five sample predictions per group and waits for a key press.
/// </summary>
static void Main(string[] args)
{
    // Both folders are resolved relative to the directory of the built assembly.
    string trainOutput = GetAbsolutePath(@"../../../../CreditCardFraudDetection.Trainer/assets/output");
    string assetsPath = GetAbsolutePath(@"../../../assets");

    CopyModelAndDatasetFromTrainingProject(trainOutput, assetsPath);

    // The model and the test data both live in the "input" sub-folder.
    string inputFolder = Path.Combine(assetsPath, "input");
    string modelFilePath = Path.Combine(inputFolder, "randomizedPca.zip");
    string inputDatasetForPredictions = Path.Combine(inputFolder, "testData.csv");

    // Create model predictor and perform a few predictions
    new Predictor(modelFilePath, inputDatasetForPredictions)
        .RunMultiplePredictions(numberOfPredictions: 5);

    Console.WriteLine("=============== Press any key ===============");
    Console.ReadKey();
}


/// <summary>
/// Copies the files produced by the training project into this project's
/// <c>input</c> asset folder.
/// </summary>
/// <remarks>
/// If either <c>testData.csv</c> or <c>randomizedPca.zip</c> is missing from
/// <paramref name="trainOutput"/>, a message is printed, the method waits for a key
/// press and then terminates the process with exit code 0.
/// NOTE(review): only <paramref name="assetsPath"/> itself is created; the <c>input</c>
/// sub-folder that File.Copy writes into is assumed to exist already.
/// NOTE(review): every file found in <paramref name="trainOutput"/> is copied, not just
/// the two files checked above.
/// </remarks>
/// <param name="trainOutput">Folder containing the trainer's output files.</param>
/// <param name="assetsPath">Assets folder of this project; files are copied to its <c>input</c> sub-folder.</param>
public static void CopyModelAndDatasetFromTrainingProject(string trainOutput, string assetsPath)
{
// Both the test dataset and the saved model must already exist in the trainer output.
if (!File.Exists(Path.Combine(trainOutput, "testData.csv")) ||
!File.Exists(Path.Combine(trainOutput, "randomizedPca.zip")))
{
Console.WriteLine("***** YOU NEED TO RUN THE TRAINING PROJECT FIRST *****");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason we don't include a copy of randomizedPca.zip for the user already (i.e. is it too large of a file to upload to GitHub)? For the other samples, I believe we include a copy of the model/files produced from training, and the user doesn't have to train first themselves to get the predictor to work.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I based this on how it was done in the BinaryClassification version of this sample. Although, it actually looks like that sample does commit the model into the input directory of the sample's Predictor project. However, as mentioned above, both the test data csv and the model are generated by the Trainer project (in both samples) and looks like the BinaryClassification version of this sample doesn't commit that csv either. So even in the BinaryClassification sample, the Trainer project will need to be run before the Predictor sample will work.

I'm happy to follow you guidance here. If we want to commit the model and test data set into the Predictor project, I can definitely make the change, but we should probably do it for both samples. I believe the longer-term goal is to have all these samples run as a single project vs. separate Trainer/Predictor projects. But again, that's a change that should be made in both the CreditCardFraudDetection samples, and may be outside the scope of this PR.

Let me know how you want to move forward.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@CESARDELATORRE Would love to get your thoughts here!

Copy link
Contributor

@CESARDELATORRE CESARDELATORRE Aug 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This concrete sample has two projects. The predictor/scoring project should have the model.ZIP file plus the Test dataset for doing multiple predictions, so it works out-of-the-box without running the training project.

But the training project doesn't need to have the model.zip file since it will generate it after training and since the code is split in two projects it is even better that the training project doesn't have it.

About the dataset, we're only committing/pushing a dataset .zip file (instead of directly the .csv files) because this concrete dataset is larger than 100MB and that's not allowed by GitHub, therefore we have the .zip file for the dataset which is a bit smaller than 100MB.

But for the predictor/scored I think we could include both the model .zip and the test dataset so a user could just try predictions, if desired, and it'll work out-of-the-box instead of raising an error and saying that you first need to run the training project.

In any case we can merge as it is now and we can change those details while reviewing it further. It is not critical. . :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, never mind. For this case it might be better if the scoring/client app copies the model so it takes the latest training. We'll change the code so it is only copying the .zip model and the test dataset. The scoring project doesn't need the training dataset and git ignore files that are being copied currently. But this is a minor improvement for clarity.

Thanks! 👍

Console.WriteLine("=============== Press any key ===============");
Console.ReadKey();
// Terminates the whole process (exit code 0) instead of returning to the caller.
Environment.Exit(0);
}

// Copy every file from the train output into <assetsPath>/input
Directory.CreateDirectory(assetsPath);

foreach (var file in Directory.GetFiles(trainOutput))
{
var fileDestination = Path.Combine(Path.Combine(assetsPath, "input"), Path.GetFileName(file));

// Remove a previous copy first; File.Copy below is called without an overwrite flag.
if (File.Exists(fileDestination))
{
LocalConsoleHelper.DeleteAssets(fileDestination);
}

File.Copy(file, Path.Combine(Path.Combine(assetsPath, "input"), Path.GetFileName(file)));
}
}


/// <summary>
/// Combines the directory that contains this program's assembly with
/// <paramref name="relativePath"/>.
/// </summary>
/// <param name="relativePath">Path relative to the assembly's directory.</param>
/// <returns>The combined path; <c>..</c> segments are not collapsed by this method.</returns>
public static string GetAbsolutePath(string relativePath)
{
    var assemblyFile = new FileInfo(typeof(Program).Assembly.Location);

    return Path.Combine(assemblyFile.Directory.FullName, relativePath);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Do not ignore this .gitignore file itself.
!.gitignore

# Ignore all CSV files.
*.csv
Loading