From 7629533cad66990723d14e7a6c235f8d1bb835f4 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 10:46:37 -0800 Subject: [PATCH 01/14] Fixed "How do I look at the intermediate data?" --- docs/code/MlNetCookBook.md | 54 +++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 542c745e56..9f2df6aca5 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -148,12 +148,12 @@ Label Workclass education marital-status This is how you can read this data: ```csharp -// Create a new environment for ML.NET operations. It can be used for exception tracking and logging, -// as well as the source of randomness. -var env = new LocalEnvironment(); +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = TextLoader.CreateReader(env, ctx => ( +var reader = mlContext.Data.TextReader(ctx => ( // A boolean column depicting the 'target label'. IsOver50K: ctx.LoadBool(14), // Three text columns. @@ -303,6 +303,52 @@ private class InspectedRow } ``` +You can also use the dynamic API to create the equivalent of the previous pipeline. +```csharp +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Create the reader: define the data columns and where to find them in the text file. +var reader = new TextLoader(mlContext, new TextLoader.Arguments +{ + Column = new[] { + // A boolean column depicting the 'label'. + new TextLoader.Column("IsOver50k", DataKind.BL, 0), + // Three text columns. 
+ new TextLoader.Column("Workclass", DataKind.TX, 1), + new TextLoader.Column("Education", DataKind.TX, 2), + new TextLoader.Column("MaritalStatus", DataKind.TX, 3) + }, + // First line of the file is a header, not a data row. + HasHeader = true +}); + +// Start creating our processing pipeline. For now, let's just concatenate all the text columns +// together into one. +var dynamicPipeline = mlContext.Transforms.Concatenate("AllFeatures", "Education", "MaritalStatus"); + +// Let's verify that the data has been read correctly. +// First, we read the data file. +var data = reader.Read(dataPath); + +// Fit our data pipeline and transform data with it. +var transformedData = dynamicPipeline.Fit(data).Transform(data); + +// 'transformedData' is a 'promise' of data. Let's actually read it. +var someRows = transformedData + // Convert to an enumerable of user-defined type. + .AsEnumerable(mlContext, reuseRowObject: false) + // Take a couple values as an array. + .Take(4).ToArray(); + +// Extract the 'AllFeatures' column. +// This will give the entire dataset: make sure to only take several rows +// in case the dataset is huge. This is similar to the static API, except +// you have to specify the column name and type. +var featureColumns = transformedData.GetColumn(mlContext, "AllFeatures") + .Take(20).ToArray(); +``` ## How do I train a regression model? Generally, in order to train any model in ML.NET, you will go through three steps: From 7a51c137ab086aa01e3c2dd6f03efb22ed317158 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 11:03:48 -0800 Subject: [PATCH 02/14] Fixed "How do I train a regression model?" 
--- docs/code/MlNetCookBook.md | 39 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 9f2df6aca5..2f77fba353 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -405,6 +405,45 @@ var learningPipeline = reader.MakeNewEstimator() var model = learningPipeline.Fit(trainData); ``` +You can also use the dynamic API to create the equivalent of the previous pipeline. +```csharp +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Step one: read the data as an IDataView. +// First, we define the reader: specify the data columns and where to find them in the text file. +var reader = new TextLoader(mlContext, new TextLoader.Arguments +{ + Column = new[] { + // We read the first 11 values as a single float vector. + new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), + + // Separately, read the target variable. + new TextLoader.Column("Target", DataKind.R4, 11), + }, + // First line of the file is a header, not a data row. + HasHeader = true, + // Default separator is tab, but we need a semicolon. + Separator = ";" +}); + +// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). +var trainData = reader.Read(trainDataPath); + +// Step two: define the learning pipeline. + +// We 'start' the pipeline with the output of the reader. +var dynamicPipeline = + // First 'normalize' the data (rescale to be + // between -1 and 1 for all examples), and then train the model. + mlContext.Transforms.Normalize("FeatureVector") + // Add the SDCA regression trainer. + .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector")) + +// Step three. Train the pipeline. 
+var model = dynamicPipeline.Fit(trainData); +``` ## How do I verify the model quality? This is the first question that arises after you train the model: how good it actually is? From 59c4f0da561464b6f8a7dcaf20c44106ac0eff75 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 11:56:07 -0800 Subject: [PATCH 03/14] Fixed "How do I use the model to make one prediction?" --- docs/code/MlNetCookBook.md | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 2f77fba353..1a9e3b2d1f 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -532,6 +532,47 @@ var learningPipeline = reader.MakeNewEstimator() var model = learningPipeline.Fit(trainData).AsDynamic; ``` +You can also use the dynamic API to create the equivalent of the previous pipeline. +```csharp +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Step one: read the data as an IDataView. +// First, we define the reader: specify the data columns and where to find them in the text file. +var reader = new TextLoader(mlContext, new TextLoader.Arguments +{ + Column = new[] { + // We read the first 11 values as a single float vector. + new TextLoader.Column("SepalLength", DataKind.R4, 0), + new TextLoader.Column("SepalWidth", DataKind.R4, 1), + new TextLoader.Column("PetalLength", DataKind.R4, 2), + new TextLoader.Column("PetalWidth", DataKind.R4, 3), + // Label: kind of iris. + new TextLoader.Column("Label", DataKind.TX, 4), + }, + // Default separator is tab, but the dataset has comma. + Separator = "," +}); + +// Retrieve the training data. +var trainData = reader.Read(irisDataPath); + +// Build the training pipeline. +var dynamicPipeline = + // Concatenate all the features together into one column 'Features'. 
+ mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") + // Note that the label is text, so it needs to be converted to key. + .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + // Use the multi-class SDCA model to predict the label using features. + .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) + // Apply the inverse conversion from 'PredictedLabel' key back to string value. + .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); + +// Train the model. +var model = dynamicPipeline.Fit(trainData); +``` + Now, in order to use [schema comprehension](SchemaComprehension.md) for prediction, we define a pair of classes like following: ```csharp private class IrisInput From 3f05a018097287af0715a7c51f5de638be48280e Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 13:08:16 -0800 Subject: [PATCH 04/14] Small fix --- docs/code/MlNetCookBook.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 1a9e3b2d1f..bdf7d32a26 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -664,7 +664,7 @@ var trainData = mlContext.CreateStreamingDataView(churnData); // We apply our FastTree binary classifier to predict the 'HasChurned' label. 
var dynamicLearningPipeline = mlContext.Transforms.Categorical.OneHotEncoding("DemographicCategory") - .Append(new ColumnConcatenatingEstimator(mlContext, "Features", "DemographicCategory", "LastVisits")) + .Append(mlContext.Transforms.Concatenate("Features", "DemographicCategory", "LastVisits")) .Append(mlContext.BinaryClassification.Trainers.FastTree("HasChurned", "Features", numTrees: 20)); var dynamicModel = dynamicLearningPipeline.Fit(trainData); From 60451660257e90c9acf6c9b7bc7b390689d9a352 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 13:16:16 -0800 Subject: [PATCH 05/14] Fixed normalization examples --- docs/code/MlNetCookBook.md | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index bdf7d32a26..028ca85822 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -543,7 +543,6 @@ var mlContext = new MLContext(); var reader = new TextLoader(mlContext, new TextLoader.Arguments { Column = new[] { - // We read the first 11 values as a single float vector. new TextLoader.Column("SepalLength", DataKind.R4, 0), new TextLoader.Column("SepalWidth", DataKind.R4, 1), new TextLoader.Column("PetalLength", DataKind.R4, 2), @@ -827,6 +826,42 @@ var normalizedData = pipeline.Fit(trainData).Transform(trainData); var meanVarValues = normalizedData.GetColumn(r => r.MeanVarNormalized).ToArray(); ``` +You can achive the same results using the dynamic API. +```csharp +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Define the reader: specify the data columns and where to find them in the text file. 
+var reader = new TextLoader(mlContext, new TextLoader.Arguments +{ + Column = new[] { + // The four features of the Iris dataset will be grouped together as one Features column. + new TextLoader.Column("Features", DataKind.R4, 0, 3), + // Label: kind of iris. + new TextLoader.Column("Label", DataKind.TX, 4), + }, + // Default separator is tab, but the dataset has comma. + Separator = "," +}); + +// Read the training data. +var trainData = reader.Read(dataPath); + +// Apply all kinds of standard ML.NET normalization to the raw features. +var pipeline = + mlContext.Transforms.Normalize( + new NormalizingEstimator.MinMaxColumn("Features", "MinMaxNormalized", fixZero: true), + new NormalizingEstimator.MeanVarColumn("Features", "MeanVarNormalized", fixZero: true), + new NormalizingEstimator.BinningColumn("Features", "BinNormalized", numBins: 256)); + +// Let's train our pipeline of normalizers, and then apply it to the same data. +var normalizedData = pipeline.Fit(trainData).Transform(trainData); + +// Inspect one column of the resulting dataset. +var meanVarValues = normalizedData.GetColumn(mlContext, "MeanVarNormalized").ToArray(); +``` + ## How do I train my model on categorical data? Generally speaking, *all ML.NET learners expect the features as a float vector*. So, if some of your data is not natively a float, you will need to convert to floats. From 63d0536c051e6e28aebb2de5b0b0807e2d6266d0 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 14:32:52 -0800 Subject: [PATCH 06/14] Updated categorical examples --- docs/code/MlNetCookBook.md | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 028ca85822..53c442f348 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -943,6 +943,63 @@ var fullLearningPipeline = learningPipeline var model = fullLearningPipeline.Fit(data); ``` +You can achive the same results using the dynamic API. 
+```csharp +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Define the reader: specify the data columns and where to find them in the text file. +var reader = new TextLoader(mlContext, new TextLoader.Arguments +{ + Column = new[] { + new TextLoader.Column("Label", DataKind.BL, 0), + // We will load all the categorical features into one vector column of size 8. + new TextLoader.Column("CategoricalFeatures", DataKind.TX, 1, 8), + // Similarly, load all numerical features into one vector of size 6. + new TextLoader.Column("NumericalFeatures", DataKind.R4, 9, 14), + // Let's also separately load the 'Workclass' column. + new TextLoader.Column("Workclass", DataKind.TX, 1), + }, + HasHeader = true +}); + +// Read the data. +var data = reader.Read(dataPath); + +// Inspect the categorical columns to check that they are correctly read. +var catColumns = data.GetColumn(mlContext, "CategoricalFeatures").Take(10).ToArray(); + +// Build several alternative featurization pipelines. +var dynamicPipeline = + // Convert each categorical feature into one-hot encoding independently. + mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalOneHot") + // Convert all categorical features into indices, and build a 'word bag' of these. + .Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalOneHot", CategoricalTransform.OutputKind.Bag)) + // One-hot encode the workclass column, then drop all the categories that have fewer than 10 instances in the train set. + .Append(mlContext.Transforms.Categorical.OneHotEncoding("Workclass", "WorkclassOneHot")) + .Append(new CountFeatureSelector(mlContext, "WorkclassOneHot", "WorkclassOneHotTrimmed", count: 10)); + +// Let's train our pipeline, and then apply it to the same data. 
+var transformedData = dynamicPipeline.Fit(data).Transform(data); + +// Inspect some columns of the resulting dataset. +var categoricalBags = transformedData.GetColumn(mlContext, "CategoricalBag").Take(10).ToArray(); +var workclasses = transformedData.GetColumn(mlContext, "WorkclassOneHotTrimmed").Take(10).ToArray(); + +// Of course, if we want to train the model, we will need to compose a single float vector of all the features. +// Here's how we could do this: + +var fullLearningPipeline = dynamicPipeline + // Concatenate two of the 3 categorical pipelines, and the numeric features. + .Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoricalBag", "WorkclassOneHotTrimmed")) + // Now we're ready to train. We chose our FastTree trainer for this classification task. + .Append(mlContext.BinaryClassification.Trainers.FastTree(numTrees: 50)); + +// Train the model. +var model = fullLearningPipeline.Fit(data); +``` + ## How do I train my model on textual data? Generally speaking, *all ML.NET learners expect the features as a float vector*. So, if some of your data is not natively a float, you will need to convert to floats. From c6e0284b42ead62e4335cf6d3f3240bbfd2bbced Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 16:46:35 -0800 Subject: [PATCH 07/14] Updated text example --- docs/code/MlNetCookBook.md | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 53c442f348..4b5bb11df8 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -1070,6 +1070,60 @@ var embeddings = transformedData.GetColumn(x => x.Embeddings).Take(10).ToArray() var unigrams = transformedData.GetColumn(x => x.BagOfWords).Take(10).ToArray(); ``` +You can achive the same results using the dynamic API. +```csharp +// Create a new context for ML.NET operations. 
It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Define the reader: specify the data columns and where to find them in the text file. +var reader = new TextLoader(mlContext, new TextLoader.Arguments +{ + Column = new[] { + new TextLoader.Column("IsToxic", DataKind.BL, 0), + new TextLoader.Column("Message", DataKind.TX, 1), + }, + HasHeader = true +}); + +// Read the data. +var data = reader.Read(dataPath); + +// Inspect the message texts that are read from the file. +var messageTexts = data.GetColumn(mlContext, "Message").Take(20).ToArray(); + +// Apply various kinds of text operations supported by ML.NET. +var dynamicPipeline = + // One-stop shop to run the full text featurization. + mlContext.Transforms.Text.FeaturizeText("Message", "TextFeatures") + + // Normalize the message for later transforms + .Append(mlContext.Transforms.Text.NormalizeText("Message", "NormalizedMessage")) + + // NLP pipeline 1: bag of words. + .Append(new WordBagEstimator(mlContext, "NormalizedMessage", "BagOfWords")) + + // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. + .Append(new WordHashBagEstimator(mlContext, "NormalizedMessage", "BagOfBigrams", + ngramLength: 2, allLengths: false)) + + // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. + .Append(mlContext.Transforms.Text.TokenizeCharacters("Message", "MessageChars")) + .Append(new WordBagEstimator(mlContext, "MessageChars", "BagOfTrichar", + ngramLength: 3, weighting: NgramTransform.WeightingCriteria.TfIdf)) + + // NLP pipeline 4: word embeddings. + .Append(mlContext.Transforms.Text.ExtractWordEmbeedings("NormalizedMessage", "Embeddings", + WordEmbeddingsTransform.PretrainedModelKind.GloVeTwitter25D)); + +// Let's train our pipeline, and then apply it to the same data. 
+// Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train. +var transformedData = dynamicPipeline.Fit(data).Transform(data); + +// Inspect some columns of the resulting dataset. +var embeddings = transformedData.GetColumn(mlContext, "Embeddings").Take(10).ToArray(); +var unigrams = transformedData.GetColumn(mlContext, "BagOfWords").Take(10).ToArray(); +``` ## How do I train using cross-validation? [Cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) is a useful technique for ML applications. It helps estimate the variance of the model quality from one run to another and also eliminates the need to extract a separate test set for evaluation. From cccf6c9498d1600e346f859543de2a03394baffe Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 5 Nov 2018 16:55:48 -0800 Subject: [PATCH 08/14] Updated CV --- docs/code/MlNetCookBook.md | 59 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 4b5bb11df8..45a0078ef7 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -826,7 +826,7 @@ var normalizedData = pipeline.Fit(trainData).Transform(trainData); var meanVarValues = normalizedData.GetColumn(r => r.MeanVarNormalized).ToArray(); ``` -You can achive the same results using the dynamic API. +You can achieve the same results using the dynamic API. ```csharp // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. @@ -943,7 +943,7 @@ var fullLearningPipeline = learningPipeline var model = fullLearningPipeline.Fit(data); ``` -You can achive the same results using the dynamic API. +You can achieve the same results using the dynamic API. ```csharp // Create a new context for ML.NET operations. 
It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. @@ -1070,7 +1070,7 @@ var embeddings = transformedData.GetColumn(x => x.Embeddings).Take(10).ToArray() var unigrams = transformedData.GetColumn(x => x.BagOfWords).Take(10).ToArray(); ``` -You can achive the same results using the dynamic API. +You can achieve the same results using the dynamic API. ```csharp // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. @@ -1188,6 +1188,59 @@ var microAccuracies = cvResults.Select(r => r.metrics.AccuracyMicro); Console.WriteLine(microAccuracies.Average()); ``` +You can achieve the same results using the dynamic API. +```csharp +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Step one: read the data as an IDataView. +// First, we define the reader: specify the data columns and where to find them in the text file. +var reader = new TextLoader(mlContext, new TextLoader.Arguments +{ + Column = new[] { + // We read the first 11 values as a single float vector. + new TextLoader.Column("SepalLength", DataKind.R4, 0), + new TextLoader.Column("SepalWidth", DataKind.R4, 1), + new TextLoader.Column("PetalLength", DataKind.R4, 2), + new TextLoader.Column("PetalWidth", DataKind.R4, 3), + // Label: kind of iris. + new TextLoader.Column("Label", DataKind.TX, 4), + }, + // Default separator is tab, but the dataset has comma. + Separator = "," +}); + +// Read the data. +var data = reader.Read(dataPath); + +// Build the training pipeline. +var dynamicPipeline = + // Concatenate all the features together into one column 'Features'. 
+ mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") + // Note that the label is text, so it needs to be converted to key. + .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + // Use the multi-class SDCA model to predict the label using features. + .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()); + +// Split the data 90:10 into train and test sets, train and evaluate. +var (trainData, testData) = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1); + +// Train the model. +var model = dynamicPipeline.Fit(trainData); +// Compute quality metrics on the test set. +var metrics = mlContext.MulticlassClassification.Evaluate(model.Transform(testData)); +Console.WriteLine(metrics.AccuracyMicro); + +// Now run the 5-fold cross-validation experiment, using the same pipeline. +var cvResults = mlContext.MulticlassClassification.CrossValidate(data, dynamicPipeline, numFolds: 5); + +// The results object is an array of 5 elements. For each of the 5 folds, we have metrics, model and scored test data. +// Let's compute the average micro-accuracy. +var microAccuracies = cvResults.Select(r => r.metrics.AccuracyMicro); +Console.WriteLine(microAccuracies.Average()); + +``` ## Can I mix and match static and dynamic pipelines? Yes, we can have both of them in our codebase. The static pipelines are just a statically-typed way to build dynamic pipelines. 
From eb5f02ba36a525ccc46a2949687fb868e97fc324 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 6 Nov 2018 08:25:51 -0800 Subject: [PATCH 09/14] Changed all the readers to mlContext.Data.TextReader --- docs/code/MlNetCookBook.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 45a0078ef7..fd02a695ae 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -113,7 +113,7 @@ If the schema of the data is not known at compile time, or too cumbersome, you c var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { // A boolean column depicting the 'label'. @@ -310,7 +310,7 @@ You can also use the dynamic API to create the equivalent of the previous pipeli var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { // A boolean column depicting the 'label'. @@ -413,7 +413,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { // We read the first 11 values as a single float vector. @@ -540,7 +540,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. 
-var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { new TextLoader.Column("SepalLength", DataKind.R4, 0), @@ -833,7 +833,7 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { // The four features of the Iris dataset will be grouped together as one Features column. @@ -950,7 +950,7 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { new TextLoader.Column("Label", DataKind.BL, 0), @@ -1077,7 +1077,7 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { new TextLoader.Column("IsToxic", DataKind.BL, 0), @@ -1196,7 +1196,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = new TextLoader(mlContext, new TextLoader.Arguments +var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { // We read the first 11 values as a single float vector. 
From 1f21c84eec3c8a0fe7ef1102ca03c54d4d48d910 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 6 Nov 2018 09:12:39 -0800 Subject: [PATCH 10/14] Some PR comment fixes --- docs/code/MlNetCookBook.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index fd02a695ae..e0e6c19488 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -401,7 +401,7 @@ var learningPipeline = reader.MakeNewEstimator() // the the same call. Prediction: mlContext.Regression.Trainers.Sdca(label: r.Target, features: r.FeatureVector.Normalize()))); -// Step three. Train the pipeline. +// Step three. Fit the pipeline to the training data. var model = learningPipeline.Fit(trainData); ``` @@ -441,7 +441,7 @@ var dynamicPipeline = // Add the SDCA regression trainer. .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector")) -// Step three. Train the pipeline. +// Step three. Fit the pipeline to the training data. var model = dynamicPipeline.Fit(trainData); ``` ## How do I verify the model quality? @@ -565,7 +565,7 @@ var dynamicPipeline = .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) - // Apply the inverse conversion from 'PredictedLabel' key back to string value. + // Apply the inverse conversion from 'PredictedLabel' column back to string value. .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); // Train the model. @@ -967,7 +967,7 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments // Read the data. var data = reader.Read(dataPath); -// Inspect the categorical columns to check that they are correctly read. +// Inspect the first 10 records of the categorical columns to check that they are correctly read. 
var catColumns = data.GetColumn(mlContext, "CategoricalFeatures").Take(10).ToArray(); // Build several alternative featurization pipelines. From bf4bc63add142ab0cb482aee781ad0ad1938242e Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 6 Nov 2018 10:49:21 -0800 Subject: [PATCH 11/14] Updated static cookbook tests to match cookbook --- .../Api/CookbookSamples/CookbookSamples.cs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 62cd9e0708..3f35815549 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -67,6 +67,13 @@ private void IntermediateData(string dataPath) // Fit our data pipeline and transform data with it. var transformedData = dataPipeline.Fit(data).Transform(data); + // 'transformedData' is a 'promise' of data. Let's actually read it. + var someRows = transformedData.AsDynamic + // Convert to an enumerable of user-defined type. + .AsEnumerable(mlContext, reuseRowObject: false) + // Take a couple values as an array. + .Take(4).ToArray(); + // Extract the 'AllFeatures' column. // This will give the entire dataset: make sure to only take several row // in case the dataset is huge. @@ -695,5 +702,14 @@ private class IrisPrediction [ColumnName("Data")] public string PredictedClass { get; set; } } + + private class InspectedRow + { + public bool IsOver50K { get; set; } + public string Workclass { get; set; } + public string Education { get; set; } + public string MaritalStatus { get; set; } + public string[] AllFeatures { get; set; } + } } } From 04bafe21c0c7947df487029cbee69850c7994bd9 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 6 Nov 2018 12:01:11 -0800 Subject: [PATCH 12/14] Added tests for dynamic API snippets, and fixed the bugs in md file. 
--- docs/code/MlNetCookBook.md | 60 +- .../Api/CookbookSamples/CookbookSamples.cs | 29 +- .../CookbookSamplesDynamicApi.cs | 514 ++++++++++++++++++ 3 files changed, 568 insertions(+), 35 deletions(-) create mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index e0e6c19488..a730015fcf 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -117,7 +117,7 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { // A boolean column depicting the 'label'. - new TextLoader.Column("IsOver50k", DataKind.BL, 0), + new TextLoader.Column("IsOver50K", DataKind.BL, 0), // Three text columns. new TextLoader.Column("Workclass", DataKind.TX, 1), new TextLoader.Column("Education", DataKind.TX, 2), @@ -166,6 +166,29 @@ var reader = mlContext.Data.TextReader(ctx => ( var data = reader.Read(exampleFile1, exampleFile2); ``` +The code is very similar using the dynamic API: +```csharp +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Create the reader: define the data columns and where to find them in the text file. +var reader = mlContext.Data.TextReader(new TextLoader.Arguments +{ + Column = new[] { + // A boolean column depicting the 'label'. + new TextLoader.Column("IsOver50k", DataKind.BL, 0), + // Three text columns. + new TextLoader.Column("Workclass", DataKind.TX, 1), + new TextLoader.Column("Education", DataKind.TX, 2), + new TextLoader.Column("MaritalStatus", DataKind.TX, 3) + }, + // First line of the file is a header, not a data row. + HasHeader = true +}); + +var data = reader.Read(exampleFile1, exampleFile2); +``` ## How do I load data with many columns from a CSV? `TextLoader` is used to load data from text files. 
You will need to specify what are the data columns, what are their types, and where to find them in the text file. @@ -439,7 +462,7 @@ var dynamicPipeline = // between -1 and 1 for all examples), and then train the model. mlContext.Transforms.Normalize("FeatureVector") // Add the SDCA regression trainer. - .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector")) + .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector")); // Step three. Fit the pipeline to the training data. var model = dynamicPipeline.Fit(trainData); @@ -458,6 +481,13 @@ var testData = reader.Read(testDataPath); // Calculate metrics of the model on the test data. var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: r => r.Target, score: r => r.Prediction); ``` +Calculating the metrics with the dynamic API is as follows. +```csharp +// Read the test dataset. +var testData = reader.Read(testDataPath); +// Calculate metrics of the model on the test data. +var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); +``` ## How do I save and load the model? @@ -480,6 +510,21 @@ using (var stream = File.OpenRead(modelPath)) loadedModel = mlContext.Model.Load(stream); ``` +You can use the dynamic API to achieve the same. +```csharp +using (var stream = File.Create(modelPath)) +{ + // Saving and loading happens to 'dynamic' models. + mlContext.Model.Save(model, stream); +} + +// Potentially, the lines below can be in a different process altogether. + +// When you load the model, it's a 'dynamic' transformer. +ITransformer loadedModel; +using (var stream = File.OpenRead(modelPath)) + loadedModel = mlContext.Model.Load(stream); +``` ## How do I use the model to make one prediction? Since any ML.NET model is a transformer, you can of course use `model.Transform` to apply the model to the 'data view' and obtain predictions this way. 
@@ -566,7 +611,7 @@ var dynamicPipeline = // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) // Apply the inverse conversion from 'PredictedLabel' column back to string value. - .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); + .Append(mlContext.Transforms.Conversion.MapKeyToValue(("PredictedLabel", "Data"))); // Train the model. var model = dynamicPipeline.Fit(trainData); @@ -975,7 +1020,7 @@ var dynamicPipeline = // Convert each categorical feature into one-hot encoding independently. mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalOneHot") // Convert all categorical features into indices, and build a 'word bag' of these. - .Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalOneHot", CategoricalTransform.OutputKind.Bag)) + .Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalBag", CategoricalTransform.OutputKind.Bag)) // One-hot encode the workclass column, then drop all the categories that have fewer than 10 instances in the train set. .Append(mlContext.Transforms.Categorical.OneHotEncoding("Workclass", "WorkclassOneHot")) .Append(new CountFeatureSelector(mlContext, "WorkclassOneHot", "WorkclassOneHotTrimmed", count: 10)); @@ -1104,16 +1149,17 @@ var dynamicPipeline = .Append(new WordBagEstimator(mlContext, "NormalizedMessage", "BagOfWords")) // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. - .Append(new WordHashBagEstimator(mlContext, "NormalizedMessage", "BagOfBigrams", + .Append(new WordHashBagEstimator(mlContext, "NormalizedMessage", "BagOfBigrams", ngramLength: 2, allLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. 
.Append(mlContext.Transforms.Text.TokenizeCharacters("Message", "MessageChars")) - .Append(new WordBagEstimator(mlContext, "MessageChars", "BagOfTrichar", + .Append(new NgramEstimator(mlContext, "MessageChars", "BagOfTrichar", ngramLength: 3, weighting: NgramTransform.WeightingCriteria.TfIdf)) // NLP pipeline 4: word embeddings. - .Append(mlContext.Transforms.Text.ExtractWordEmbeedings("NormalizedMessage", "Embeddings", + .Append(mlContext.Transforms.Text.TokenizeWords("NormalizedMessage", "TokenizedMessage")) + .Append(mlContext.Transforms.Text.ExtractWordEmbeedings("TokenizedMessage", "Embeddings", WordEmbeddingsTransform.PretrainedModelKind.GloVeTwitter25D)); // Let's train our pipeline, and then apply it to the same data. diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 3f35815549..eaf42e4e96 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -6,13 +6,13 @@ using Microsoft.ML.Data; using Microsoft.ML.Runtime.Api; using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.RunTests; using Microsoft.ML.StaticPipe; using Microsoft.ML.TestFramework; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Categorical; +using Microsoft.ML.Transforms.Conversions; using Microsoft.ML.Transforms.Text; using System; using System.Collections.Generic; @@ -21,7 +21,6 @@ using System.Linq; using Xunit; using Xunit.Abstractions; -using Microsoft.ML.Transforms.Conversions; namespace Microsoft.ML.Tests.Scenarios.Api.CookbookSamples { @@ -661,32 +660,6 @@ private void MixMatch(string dataPath) // Now 'dynamicModel', and 'model.AsDynamic' are equivalent. 
} - [Fact] - public void ReadData() - { - ReadDataDynamic(GetDataPath("generated_regression_dataset.csv")); - } - - private void ReadDataDynamic(string dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new[] { - // We read the first 10 values as a single float vector. - new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), - // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 10) - }, - // Default separator is tab, but we need a comma. - s => s.Separator = ","); - - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var data = reader.Read(new MultiFileSource(dataPath)); - } - private class CustomerChurnInfo { public string CustomerID { get; set; } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs new file mode 100644 index 0000000000..626dfb77b2 --- /dev/null +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -0,0 +1,514 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.ML.Core.Data; +using Microsoft.ML.Data; +using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.Categorical; +using Microsoft.ML.Transforms.Normalizers; +using Microsoft.ML.Transforms.Text; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Tests.Scenarios.Api.CookbookSamples +{ + /// <summary> + /// Samples that are written as part of 'ML.NET Cookbook' are also added here as tests. + /// These tests don't actually test anything, other than the fact that the code compiles and + /// doesn't throw when it is executed. + /// </summary> + public sealed class CookbookSamplesDynamicApi : BaseTestClass + { + public CookbookSamplesDynamicApi(ITestOutputHelper output) : base(output) + { + } + + private void IntermediateData(string dataPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Create the reader: define the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new TextLoader.Arguments + { + Column = new[] { + // A boolean column depicting the 'label'. + new TextLoader.Column("IsOver50K", DataKind.BL, 0), + // Three text columns. + new TextLoader.Column("Workclass", DataKind.TX, 1), + new TextLoader.Column("Education", DataKind.TX, 2), + new TextLoader.Column("MaritalStatus", DataKind.TX, 3) + }, + // First line of the file is a header, not a data row. + HasHeader = true + }); + + // Start creating our processing pipeline. For now, let's just concatenate all the text columns + // together into one.
+ var dynamicPipeline = mlContext.Transforms.Concatenate("AllFeatures", "Education", "MaritalStatus"); + + // Let's verify that the data has been read correctly. + // First, we read the data file. + var data = reader.Read(dataPath); + + // Fit our data pipeline and transform data with it. + var transformedData = dynamicPipeline.Fit(data).Transform(data); + + // 'transformedData' is a 'promise' of data. Let's actually read it. + var someRows = transformedData + // Convert to an enumerable of user-defined type. + .AsEnumerable(mlContext, reuseRowObject: false) + // Take a couple values as an array. + .Take(4).ToArray(); + + // Extract the 'AllFeatures' column. + // This will give the entire dataset: make sure to only take several rows + // in case the dataset is huge. This is similar to the static API, except + // you have to specify the column name and type. + var featureColumns = transformedData.GetColumn(mlContext, "AllFeatures") + .Take(20).ToArray(); + } + + [Fact] + public void InspectIntermediateDataGetColumn() + => IntermediateData(GetDataPath("adult.tiny.with-schema.txt")); + + private void TrainRegression(string trainDataPath, string testDataPath, string modelPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Step one: read the data as an IDataView. + // First, we define the reader: specify the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new TextLoader.Arguments + { + Column = new[] { + // We read the first 11 values as a single float vector. + new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), + + // Separately, read the target variable. + new TextLoader.Column("Target", DataKind.R4, 11), + }, + // First line of the file is a header, not a data row. + HasHeader = true, + // Default separator is tab, but we need a semicolon.
+ Separator = ";" + }); + + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var trainData = reader.Read(trainDataPath); + + // Step two: define the learning pipeline. + + // We 'start' the pipeline with the output of the reader. + var dynamicPipeline = + // First 'normalize' the data (rescale to be + // between -1 and 1 for all examples), and then train the model. + mlContext.Transforms.Normalize("FeatureVector") + // Add the SDCA regression trainer. + .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector")); + + // Step three. Fit the pipeline to the training data. + var model = dynamicPipeline.Fit(trainData); + + // Read the test dataset. + var testData = reader.Read(testDataPath); + // Calculate metrics of the model on the test data. + var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); + + using (var stream = File.Create(modelPath)) + { + // Saving and loading happens to 'dynamic' models. + mlContext.Model.Save(model, stream); + } + + // Potentially, the lines below can be in a different process altogether. + + // When you load the model, it's a 'dynamic' transformer. + ITransformer loadedModel; + using (var stream = File.OpenRead(modelPath)) + loadedModel = mlContext.Model.Load(stream); + } + + [Fact] + public void TrainRegressionModel() + => TrainRegression(GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename), GetDataPath(TestDatasets.generatedRegressionDataset.testFilename), + DeleteOutputPath("cook_model.zip")); + + private ITransformer TrainOnIris(string irisDataPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Step one: read the data as an IDataView. 
+ // First, we define the reader: specify the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new TextLoader.Arguments + { + Column = new[] { + new TextLoader.Column("SepalLength", DataKind.R4, 0), + new TextLoader.Column("SepalWidth", DataKind.R4, 1), + new TextLoader.Column("PetalLength", DataKind.R4, 2), + new TextLoader.Column("PetalWidth", DataKind.R4, 3), + // Label: kind of iris. + new TextLoader.Column("Label", DataKind.TX, 4), + }, + // Default separator is tab, but the dataset has comma. + Separator = "," + }); + + // Retrieve the training data. + var trainData = reader.Read(irisDataPath); + + // Build the training pipeline. + var dynamicPipeline = + // Concatenate all the features together into one column 'Features'. + mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") + // Note that the label is text, so it needs to be converted to key. + .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + // Use the multi-class SDCA model to predict the label using features. + .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) + // Apply the inverse conversion from 'PredictedLabel' column back to string value. + .Append(mlContext.Transforms.Conversion.MapKeyToValue(("PredictedLabel", "Data"))); + + // Train the model. + var model = dynamicPipeline.Fit(trainData); + return model; + } + + private void PredictOnIris(ITransformer model) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Use the model for one-time prediction. + // Make the prediction function object. 
Note that, on average, this call takes around 200x longer + // than one prediction, so you might want to cache and reuse the prediction function, instead of + // creating one per prediction. + var predictionFunc = model.MakePredictionFunction(mlContext); + + // Obtain the prediction. Remember that 'Predict' is not reentrant. If you want to use multiple threads + // for simultaneous prediction, make sure each thread is using its own PredictionFunction. + var prediction = predictionFunc.Predict(new IrisInput + { + SepalLength = 4.1f, + SepalWidth = 0.1f, + PetalLength = 3.2f, + PetalWidth = 1.4f + }); + } + + [Fact] + public void TrainAndPredictOnIris() + => PredictOnIris(TrainOnIris(GetDataPath("iris.data"))); + + private void NormalizationWorkout(string dataPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Define the reader: specify the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new TextLoader.Arguments + { + Column = new[] { + // The four features of the Iris dataset will be grouped together as one Features column. + new TextLoader.Column("Features", DataKind.R4, 0, 3), + // Label: kind of iris. + new TextLoader.Column("Label", DataKind.TX, 4), + }, + // Default separator is tab, but the dataset has comma. + Separator = "," + }); + + // Read the training data. + var trainData = reader.Read(dataPath); + + // Apply all kinds of standard ML.NET normalization to the raw features. 
+ var pipeline = + mlContext.Transforms.Normalize( + new NormalizingEstimator.MinMaxColumn("Features", "MinMaxNormalized", fixZero: true), + new NormalizingEstimator.MeanVarColumn("Features", "MeanVarNormalized", fixZero: true), + new NormalizingEstimator.BinningColumn("Features", "BinNormalized", numBins: 256)); + + // Let's train our pipeline of normalizers, and then apply it to the same data. + var normalizedData = pipeline.Fit(trainData).Transform(trainData); + + // Inspect one column of the resulting dataset. + var meanVarValues = normalizedData.GetColumn(mlContext, "MeanVarNormalized").ToArray(); + } + + [Fact] + public void Normalization() + => NormalizationWorkout(GetDataPath("iris.data")); + + private class IrisInput + { + // Unfortunately, we still need the dummy 'Label' column to be present. + [ColumnName("Label")] + public string IgnoredLabel { get; set; } + public float SepalLength { get; set; } + public float SepalWidth { get; set; } + public float PetalLength { get; set; } + public float PetalWidth { get; set; } + } + + private IEnumerable GetChurnInfo() + { + var r = new Random(454); + return Enumerable.Range(0, 500) + .Select(x => new CustomerChurnInfo + { + HasChurned = x % 2 == 0 || (r.NextDouble() < 0.05), + DemographicCategory = (x % 10).ToString(), + LastVisits = new float[] { x, x * 2, x * 3, x * 4, x * 5 } + }); + } + + private void TextFeaturizationOn(string dataPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Define the reader: specify the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new TextLoader.Arguments + { + Column = new[] { + new TextLoader.Column("IsToxic", DataKind.BL, 0), + new TextLoader.Column("Message", DataKind.TX, 1), + }, + HasHeader = true + }); + + // Read the data. 
+ var data = reader.Read(dataPath); + + // Inspect the message texts that are read from the file. + var messageTexts = data.GetColumn(mlContext, "Message").Take(20).ToArray(); + + // Apply various kinds of text operations supported by ML.NET. + var dynamicPipeline = + // One-stop shop to run the full text featurization. + mlContext.Transforms.Text.FeaturizeText("Message", "TextFeatures") + + // Normalize the message for later transforms + .Append(mlContext.Transforms.Text.NormalizeText("Message", "NormalizedMessage")) + + // NLP pipeline 1: bag of words. + .Append(new WordBagEstimator(mlContext, "NormalizedMessage", "BagOfWords")) + + // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. + .Append(new WordHashBagEstimator(mlContext, "NormalizedMessage", "BagOfBigrams", + ngramLength: 2, allLengths: false)) + + // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. + .Append(mlContext.Transforms.Text.TokenizeCharacters("Message", "MessageChars")) + .Append(new NgramEstimator(mlContext, "MessageChars", "BagOfTrichar", + ngramLength: 3, weighting: NgramTransform.WeightingCriteria.TfIdf)) + + // NLP pipeline 4: word embeddings. + .Append(mlContext.Transforms.Text.TokenizeWords("NormalizedMessage", "TokenizedMessage")) + .Append(mlContext.Transforms.Text.ExtractWordEmbeedings("TokenizedMessage", "Embeddings", + WordEmbeddingsTransform.PretrainedModelKind.GloVeTwitter25D)); + + // Let's train our pipeline, and then apply it to the same data. + // Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train. + var transformedData = dynamicPipeline.Fit(data).Transform(data); + + // Inspect some columns of the resulting dataset. 
+ var embeddings = transformedData.GetColumn(mlContext, "Embeddings").Take(10).ToArray(); + var unigrams = transformedData.GetColumn(mlContext, "BagOfWords").Take(10).ToArray(); + } + + [Fact (Skip = "This test is running for one minute")] + public void TextFeaturization() + => TextFeaturizationOn(GetDataPath("wikipedia-detox-250-line-data.tsv")); + + [Fact] + public void CategoricalFeaturization() + => CategoricalFeaturizationOn(GetDataPath("adult.tiny.with-schema.txt")); + + [Fact] + public void ReadMultipleFiles() + => CategoricalFeaturizationOn(GetDataPath("adult.tiny.with-schema.txt"), GetDataPath("adult.tiny.with-schema.txt")); + + private void CategoricalFeaturizationOn(params string[] dataPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Define the reader: specify the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new TextLoader.Arguments + { + Column = new[] { + new TextLoader.Column("Label", DataKind.BL, 0), + // We will load all the categorical features into one vector column of size 8. + new TextLoader.Column("CategoricalFeatures", DataKind.TX, 1, 8), + // Similarly, load all numerical features into one vector of size 6. + new TextLoader.Column("NumericalFeatures", DataKind.R4, 9, 14), + // Let's also separately load the 'Workclass' column. + new TextLoader.Column("Workclass", DataKind.TX, 1), + }, + HasHeader = true + }); + + // Read the data. + var data = reader.Read(dataPath); + + // Inspect the first 10 records of the categorical columns to check that they are correctly read. + var catColumns = data.GetColumn(mlContext, "CategoricalFeatures").Take(10).ToArray(); + + // Build several alternative featurization pipelines. + var dynamicPipeline = + // Convert each categorical feature into one-hot encoding independently. 
+ mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalOneHot") + // Convert all categorical features into indices, and build a 'word bag' of these. + .Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoricalFeatures", "CategoricalBag", CategoricalTransform.OutputKind.Bag)) + // One-hot encode the workclass column, then drop all the categories that have fewer than 10 instances in the train set. + .Append(mlContext.Transforms.Categorical.OneHotEncoding("Workclass", "WorkclassOneHot")) + .Append(new CountFeatureSelector(mlContext, "WorkclassOneHot", "WorkclassOneHotTrimmed", count: 10)); + + // Let's train our pipeline, and then apply it to the same data. + var transformedData = dynamicPipeline.Fit(data).Transform(data); + + // Inspect some columns of the resulting dataset. + var categoricalBags = transformedData.GetColumn(mlContext, "CategoricalBag").Take(10).ToArray(); + var workclasses = transformedData.GetColumn(mlContext, "WorkclassOneHotTrimmed").Take(10).ToArray(); + + // Of course, if we want to train the model, we will need to compose a single float vector of all the features. + // Here's how we could do this: + + var fullLearningPipeline = dynamicPipeline + // Concatenate two of the 3 categorical pipelines, and the numeric features. + .Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoricalBag", "WorkclassOneHotTrimmed")) + // Now we're ready to train. We chose our FastTree trainer for this classification task. + .Append(mlContext.BinaryClassification.Trainers.FastTree(numTrees: 50)); + + // Train the model. + var model = fullLearningPipeline.Fit(data); + } + + [Fact] + public void CrossValidationIris() + => CrossValidationOn(GetDataPath("iris.data")); + + private void CrossValidationOn(string dataPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. 
+ var mlContext = new MLContext(); + + // Step one: read the data as an IDataView. + // First, we define the reader: specify the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new TextLoader.Arguments + { + Column = new[] { + // We read the first 11 values as a single float vector. + new TextLoader.Column("SepalLength", DataKind.R4, 0), + new TextLoader.Column("SepalWidth", DataKind.R4, 1), + new TextLoader.Column("PetalLength", DataKind.R4, 2), + new TextLoader.Column("PetalWidth", DataKind.R4, 3), + // Label: kind of iris. + new TextLoader.Column("Label", DataKind.TX, 4), + }, + // Default separator is tab, but the dataset has comma. + Separator = "," + }); + + // Read the data. + var data = reader.Read(dataPath); + + // Build the training pipeline. + var dynamicPipeline = + // Concatenate all the features together into one column 'Features'. + mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") + // Note that the label is text, so it needs to be converted to key. + .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + // Use the multi-class SDCA model to predict the label using features. + .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()); + + // Split the data 90:10 into train and test sets, train and evaluate. + var (trainData, testData) = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1); + + // Train the model. + var model = dynamicPipeline.Fit(trainData); + // Compute quality metrics on the test set. + var metrics = mlContext.MulticlassClassification.Evaluate(model.Transform(testData)); + Console.WriteLine(metrics.AccuracyMicro); + + // Now run the 5-fold cross-validation experiment, using the same pipeline. + var cvResults = mlContext.MulticlassClassification.CrossValidate(data, dynamicPipeline, numFolds: 5); + + // The results object is an array of 5 elements. 
For each of the 5 folds, we have metrics, model and scored test data. + // Let's compute the average micro-accuracy. + var microAccuracies = cvResults.Select(r => r.metrics.AccuracyMicro); + Console.WriteLine(microAccuracies.Average()); + } + + + [Fact] + public void ReadData() + { + ReadDataDynamic(GetDataPath("generated_regression_dataset.csv")); + } + + private void ReadDataDynamic(string dataPath) + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Create the reader: define the data columns and where to find them in the text file. + var reader = mlContext.Data.TextReader(new[] { + // We read the first 10 values as a single float vector. + new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), + // Separately, read the target variable. + new TextLoader.Column("Target", DataKind.R4, 10) + }, + // Default separator is tab, but we need a comma. + s => s.Separator = ","); + + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var data = reader.Read(dataPath); + } + + private class CustomerChurnInfo + { + public string CustomerID { get; set; } + public bool HasChurned { get; set; } + public string DemographicCategory { get; set; } + // Visits during last 5 days, latest to newest. 
+ [VectorType(5)] + public float[] LastVisits { get; set; } + } + + private class IrisPrediction + { + [ColumnName("Data")] + public string PredictedClass { get; set; } + } + + private class InspectedRow + { + public bool IsOver50K { get; set; } + public string Workclass { get; set; } + public string Education { get; set; } + public string MaritalStatus { get; set; } + public string[] AllFeatures { get; set; } + } + } +} From ef26ab73096b2a8113b2528dd819882c14b8623e Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 6 Nov 2018 12:06:05 -0800 Subject: [PATCH 13/14] Cosmetic changes --- .../CookbookSamplesDynamicApi.cs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 626dfb77b2..7b4dd3aef7 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -417,14 +417,14 @@ private void CrossValidationOn(string dataPath) var reader = mlContext.Data.TextReader(new TextLoader.Arguments { Column = new[] { - // We read the first 11 values as a single float vector. - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + // We read the first 11 values as a single float vector. + new TextLoader.Column("SepalLength", DataKind.R4, 0), + new TextLoader.Column("SepalWidth", DataKind.R4, 1), + new TextLoader.Column("PetalLength", DataKind.R4, 2), + new TextLoader.Column("PetalWidth", DataKind.R4, 3), + // Label: kind of iris. 
+ new TextLoader.Column("Label", DataKind.TX, 4), + }, // Default separator is tab, but the dataset has comma. Separator = "," }); @@ -458,7 +458,6 @@ private void CrossValidationOn(string dataPath) var microAccuracies = cvResults.Select(r => r.metrics.AccuracyMicro); Console.WriteLine(microAccuracies.Average()); } - [Fact] public void ReadData() From e3a34ae6ae1b25ac96faa0317308703ce943ff95 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 6 Nov 2018 13:33:40 -0800 Subject: [PATCH 14/14] Addressed the final comments. --- docs/code/MlNetCookBook.md | 8 ++++---- .../Api/CookbookSamples/CookbookSamplesDynamicApi.cs | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index a730015fcf..92c589d3a8 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -416,7 +416,7 @@ var trainData = reader.Read(trainDataPath); // We 'start' the pipeline with the output of the reader. var learningPipeline = reader.MakeNewEstimator() // Now we can add any 'training steps' to it. In our case we want to 'normalize' the data (rescale to be - // between -1 and 1 for all examples), and then train the model. + // between -1 and 1 for all examples) .Append(r => ( // Retain the 'Target' column for evaluation purposes. r.Target, @@ -459,7 +459,7 @@ var trainData = reader.Read(trainDataPath); // We 'start' the pipeline with the output of the reader. var dynamicPipeline = // First 'normalize' the data (rescale to be - // between -1 and 1 for all examples), and then train the model. + // between -1 and 1 for all examples) mlContext.Transforms.Normalize("FeatureVector") // Add the SDCA regression trainer. .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector")); @@ -607,7 +607,7 @@ var dynamicPipeline = // Concatenate all the features together into one column 'Features'. 
mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. - .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Categorical.MapValueToKey("Label"), TransformerScope.TrainTest) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) // Apply the inverse conversion from 'PredictedLabel' column back to string value. @@ -1265,7 +1265,7 @@ var dynamicPipeline = // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. - .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Categorical.MapValueToKey("Label"), TransformerScope.TrainTest) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 7b4dd3aef7..5d6bf62eeb 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -176,7 +176,7 @@ private ITransformer TrainOnIris(string irisDataPath) // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. 
- .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Categorical.MapValueToKey("Label"), TransformerScope.TrainTest) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) // Apply the inverse conversion from 'PredictedLabel' column back to string value. @@ -437,7 +437,7 @@ private void CrossValidationOn(string dataPath) // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. - .Append(new ValueToKeyMappingEstimator(mlContext, "Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Categorical.MapValueToKey("Label"), TransformerScope.TrainTest) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent());