-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Scrubbing FieldAwareFactorizationMachine learner. #2730
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
19d25e2
b89c63b
0bc0c92
4051d6b
17f83a9
7d4e7ed
3f7cf53
30faca7
c2746e2
003b6a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
using System; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary> | ||
/// Sample that trains a 'FieldAwareFactorizationMachine' binary classifier on the featurized sentiment dataset, | ||
/// prints the learned model parameters and evaluates the model on the test data. | ||
/// </summary> | ||
public static class FFMBinaryClassification | ||
{ | ||
/// <summary> | ||
/// Runs the sample: loads the data, fits the pipeline, prints the model parameters and prints the evaluation metrics. | ||
/// </summary> | ||
public static void Example() | ||
{ | ||
// Creating the ML.Net IHostEnvironment object, needed for the pipeline. | ||
var mlContext = new MLContext(); | ||
|
||
// Download and featurize the train and test datasets. | ||
(var trainData, var testData) = SamplesUtils.DatasetUtils.LoadFeaturizedSentimentDataset(mlContext); | ||
|
||
// ML.NET doesn't cache data sets by default. Therefore, if one reads a data set from a file and accesses it many times, it can be slow due to | ||
// expensive featurization and disk operations. When the considered data can fit into memory, a solution is to cache the data in memory. Caching is especially | ||
// helpful when working with iterative algorithms which need many data passes, so we cache the training data here. Inserting a | ||
// cache step in a pipeline is also possible, please see the construction of pipeline below. | ||
// NOTE(review): this comment used to name SDCA as the iterative algorithm, but the trainer used below is | ||
// FieldAwareFactorizationMachine; confirm that it also makes multiple passes over the data. | ||
trainData = mlContext.Data.Cache(trainData); | ||
|
||
// Define the pipeline. | ||
// Create the 'FieldAwareFactorizationMachine' binary classifier, setting the "Sentiment" column as the label of the dataset, and | ||
// the "Features" column as the features column. AppendCacheCheckpoint is the in-pipeline cache step mentioned above. | ||
var pipeline = new EstimatorChain<ITransformer>().AppendCacheCheckpoint(mlContext) | ||
.Append(mlContext.BinaryClassification.Trainers. | ||
FieldAwareFactorizationMachine(labelColumnName: "Sentiment", featureColumnNames: new[] { "Features" })); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. indentation #Resolved |
||
|
||
// Fit the model. | ||
var model = pipeline.Fit(trainData); | ||
|
||
// Let's get the model parameters from the model. | ||
var modelParams = model.LastTransformer.Model; | ||
|
||
// Let's inspect the model parameters. | ||
var featureCount = modelParams.GetFeatureCount(); | ||
var fieldCount = modelParams.GetFieldCount(); | ||
var latentDim = modelParams.GetLatentDim(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
brrr #Resolved |
||
var linearWeights = modelParams.GetLinearWeights(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think right now I can do following:
X1 != X2 Which is awful. Can you check I also don't understand |
||
var latentWeights = modelParams.GetLatentWeights(); | ||
|
||
Console.WriteLine("The feature count is: " + featureCount); | ||
Console.WriteLine("The number of fields is: " + fieldCount); | ||
Console.WriteLine("The latent dimension is: " + latentDim); | ||
// Only the entries at indices 1 through 10 of each weight collection are printed (index 0 is skipped). | ||
Console.WriteLine("The linear weights of some of the features are: " + | ||
string.Concat(Enumerable.Range(1, 10).Select(i => $"{linearWeights[i]:F4} "))); | ||
Console.WriteLine("The weights of some of the latent features are: " + | ||
string.Concat(Enumerable.Range(1, 10).Select(i => $"{latentWeights[i]:F4} "))); | ||
|
||
// Expected output: | ||
// The feature count is: 9374 | ||
// The number of fields is: 1 | ||
// The latent dimension is: 20 | ||
// The linear weights of some of the features are: 0.0196 0.0000 -0.0045 -0.0205 0.0000 0.0032 0.0682 0.0091 -0.0151 0.0089 | ||
// The weights of some of the latent features are: 0.3316 0.2140 0.0752 0.0908 -0.0495 -0.0810 0.0761 0.0966 0.0090 -0.0962 | ||
|
||
// Evaluate how the model is doing on the test data. | ||
var dataWithPredictions = model.Transform(testData); | ||
|
||
// "Sentiment" is the label column the predictions are compared against. | ||
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "Sentiment"); | ||
SamplesUtils.ConsoleUtils.PrintMetrics(metrics); | ||
|
||
// Expected output: | ||
// Accuracy: 0.72 | ||
// AUC: 0.75 | ||
// F1 Score: 0.74 | ||
// Negative Precision: 0.75 | ||
// Negative Recall: 0.67 | ||
// Positive Precision: 0.70 | ||
// Positive Recall: 0.78 | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
using System; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.FactorizationMachine; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary> | ||
/// Sample that trains a 'FieldAwareFactorizationMachine' binary classifier, configured through an Options object, | ||
/// on the featurized sentiment dataset, prints the learned model parameters and evaluates the model on the test data. | ||
/// </summary> | ||
public static class FFMBinaryClassificationWithOptions | ||
{ | ||
/// <summary> | ||
/// Runs the sample: loads the data, fits the pipeline, prints the model parameters and prints the evaluation metrics. | ||
/// </summary> | ||
public static void Example() | ||
{ | ||
// Creating the ML.Net IHostEnvironment object, needed for the pipeline. | ||
var mlContext = new MLContext(); | ||
|
||
// Download and featurize the train and test datasets. | ||
(var trainData, var testData) = SamplesUtils.DatasetUtils.LoadFeaturizedSentimentDataset(mlContext); | ||
|
||
// ML.NET doesn't cache data sets by default. Therefore, if one reads a data set from a file and accesses it many times, it can be slow due to | ||
// expensive featurization and disk operations. When the considered data can fit into memory, a solution is to cache the data in memory. Caching is especially | ||
// helpful when working with iterative algorithms which need many data passes. The trainer below is configured with Iters = 10, so we cache. Inserting a | ||
// cache step in a pipeline is also possible, please see the construction of pipeline below. | ||
// NOTE(review): this comment used to name SDCA as the iterative algorithm, but the trainer used below is | ||
// FieldAwareFactorizationMachine. | ||
trainData = mlContext.Data.Cache(trainData); | ||
|
||
// Define the pipeline. | ||
// Create the 'FieldAwareFactorizationMachine' binary classifier, setting the "Sentiment" column as the label of the dataset, and | ||
// the "Features" column as the features column. The Options object also sets LearningRate = 0.1 and Iters = 10. | ||
// AppendCacheCheckpoint is the in-pipeline cache step mentioned above. | ||
var pipeline = new EstimatorChain<ITransformer>().AppendCacheCheckpoint(mlContext) | ||
.Append(mlContext.BinaryClassification.Trainers. | ||
FieldAwareFactorizationMachine( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. one more #Resolved |
||
new FieldAwareFactorizationMachineTrainer.Options | ||
{ | ||
FeatureColumn = "Features", | ||
LabelColumn = "Sentiment", | ||
LearningRate = 0.1f, | ||
Iters = 10 | ||
})); | ||
|
||
// Fit the model. | ||
var model = pipeline.Fit(trainData); | ||
|
||
// Let's get the model parameters from the model. | ||
var modelParams = model.LastTransformer.Model; | ||
|
||
// Let's inspect the model parameters. | ||
var featureCount = modelParams.GetFeatureCount(); | ||
var fieldCount = modelParams.GetFieldCount(); | ||
var latentDim = modelParams.GetLatentDim(); | ||
var linearWeights = modelParams.GetLinearWeights(); | ||
var latentWeights = modelParams.GetLatentWeights(); | ||
|
||
Console.WriteLine("The feature count is: " + featureCount); | ||
Console.WriteLine("The number of fields is: " + fieldCount); | ||
Console.WriteLine("The latent dimension is: " + latentDim); | ||
// Only the entries at indices 1 through 10 of each weight collection are printed (index 0 is skipped). | ||
Console.WriteLine("The linear weights of some of the features are: " + | ||
string.Concat(Enumerable.Range(1, 10).Select(i => $"{linearWeights[i]:F4} "))); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
Console.WriteLine("The weights of some of the latent features are: " + | ||
string.Concat(Enumerable.Range(1, 10).Select(i => $"{latentWeights[i]:F4} "))); | ||
|
||
// Expected output: | ||
// The feature count is: 9374 | ||
// The number of fields is: 1 | ||
// The latent dimension is: 20 | ||
// The linear weights of some of the features are: 0.0410 0.0000 -0.0078 -0.0285 0.0000 0.0114 0.1313 0.0183 -0.0224 0.0166 | ||
// The weights of some of the latent features are: -0.0326 0.1127 0.0621 0.1446 0.2038 0.1608 0.2084 0.0141 0.2458 -0.0625 | ||
|
||
// Evaluate how the model is doing on the test data. | ||
var dataWithPredictions = model.Transform(testData); | ||
|
||
// "Sentiment" is the label column the predictions are compared against. | ||
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "Sentiment"); | ||
SamplesUtils.ConsoleUtils.PrintMetrics(metrics); | ||
|
||
// Expected output: | ||
// Accuracy: 0.78 | ||
// AUC: 0.81 | ||
// F1 Score: 0.78 | ||
// Negative Precision: 0.78 | ||
// Negative Recall: 0.78 | ||
// Positive Precision: 0.78 | ||
// Positive Recall: 0.78 | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,14 +78,48 @@ public sealed class HousingRegression | |
/// <summary> | ||
/// Downloads the wikipedia detox train and test datasets from the ML.NET repo. | ||
/// </summary> | ||
/// <returns>The results of the two Download calls, train file first and test file second (presumably local file paths; confirm against Download).</returns> | ||
public static string DownloadSentimentDataset() | ||
=> Download("https://raw.githubusercontent.com/dotnet/machinelearning/76cb2cdf5cc8b6c88ca44b8969153836e589df04/test/data/wikipedia-detox-250-line-data.tsv", "sentiment.tsv"); | ||
public static (string trainFile, string testFile) DownloadSentimentDataset() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Can we not use value tuples? They nice, but are they really necessary? |
||
{ | ||
// Both URLs are pinned to the same commit of the dotnet/machinelearning repo. | ||
var trainFile = Download("https://raw.githubusercontent.com/dotnet/machinelearning/76cb2cdf5cc8b6c88ca44b8969153836e589df04/test/data/wikipedia-detox-250-line-data.tsv", "sentiment.tsv"); | ||
var testFile = Download("https://raw.githubusercontent.com/dotnet/machinelearning/76cb2cdf5cc8b6c88ca44b8969153836e589df04/test/data/wikipedia-detox-250-line-test.tsv", "sentimenttest.tsv"); | ||
return (trainFile, testFile); | ||
} | ||
|
||
/// <summary> | ||
/// Downloads the adult dataset from the ML.NET repo. | ||
/// </summary> | ||
/// <returns>The result of the Download call for "adult.txt" (presumably the local file path; confirm against Download).</returns> | ||
public static string DownloadAdultDataset() | ||
=> Download("https://raw.githubusercontent.com/dotnet/machinelearning/244a8c2ac832657af282aa312d568211698790aa/test/data/adult.train", "adult.txt"); | ||
|
||
/// <summary> | ||
/// Downloads the adult dataset from the ML.NET repo. | ||
/// Downloads the wikipedia detox dataset and featurizes it to be suitable for sentiment classification tasks. | ||
/// </summary> | ||
public static string DownloadAdultDataset() | ||
=> Download("https://raw.githubusercontent.com/dotnet/machinelearning/244a8c2ac832657af282aa312d568211698790aa/test/data/adult.train", "adult.txt"); | ||
/// <param name="mlContext"><see cref="MLContext"/> used for data loading and processing.</param> | ||
/// <returns>The featurized train dataset and the featurized test dataset, in that order.</returns> | ||
public static (IDataView trainData, IDataView testData) LoadFeaturizedSentimentDataset(MLContext mlContext) | ||
{ | ||
// Download the train and test files | ||
(string trainFile, string testFile) = DownloadSentimentDataset(); | ||
|
||
// Define the two columns to read, "Sentiment" and "SentimentText"; the files are read with hasHeader: true | ||
var reader = mlContext.Data.CreateTextLoader( | ||
columns: new[] | ||
{ | ||
new TextLoader.Column("Sentiment", DataKind.BL, 0), | ||
new TextLoader.Column("SentimentText", DataKind.Text, 1) | ||
}, | ||
hasHeader: true | ||
); | ||
|
||
// Create the data featurizing pipeline (presumably produces the "Features" column from "SentimentText"; confirm the argument order of FeaturizeText) | ||
var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText"); | ||
|
||
// Fit the featurizer on the training data only, then apply the fitted model to both the train and the test data | ||
var data = reader.Read(trainFile); | ||
var model = pipeline.Fit(data); | ||
var featurizedDataTrain = model.Transform(data); | ||
var featurizedDataTest = model.Transform(reader.Read(testFile)); | ||
return (featurizedDataTrain, featurizedDataTest); | ||
} | ||
|
||
/// <summary> | ||
/// Downloads the Adult UCI dataset and featurizes it to be suitable for classification tasks. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is our default comment. #Resolved