Scrubbing FieldAwareFactorizationMachine learner.

zeahmed · zeahmed · commit 19d25e2e87cd · 2019-02-25T16:56:39.000-08:00
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Calibrator.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Calibrator.cs
@@ -16,7 +16,7 @@ public static void Example()
             // This will create a sentiment.tsv file in the filesystem.
             // The string, dataFile, is the path to the downloaded file.
             // You can open this file, if you want to see the data. 
-            string dataFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
+            (string dataFile, _ ) = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
 
             // A preview of the data. 
             // Sentiment	SentimentText
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FieldAwareFactorizationMachine.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FieldAwareFactorizationMachine.cs
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/FieldAwareFactorizationMachine.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/FieldAwareFactorizationMachine.cs
@@ -0,0 +1,71 @@
+﻿using System;
+using System.Linq;
+using Microsoft.ML.Data;
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class FFMBinaryClassification
+    {
+        public static void Example()
+        {
+            // Create the ML.NET context (MLContext), which is needed to load the data and build the pipeline.
+            var mlContext = new MLContext();
+
+            // Download and featurize the dataset.
+            (var trainData, var testData) = SamplesUtils.DatasetUtils.LoadFeaturizedSentimentDataset(mlContext);
+
+            // ML.NET doesn't cache data sets by default. Therefore, if one reads a data set from a file and accesses it many times, it can be slow due to
+            // expensive featurization and disk operations. When the considered data can fit into memory, a solution is to cache the data in memory. Caching is especially
+            // helpful when working with iterative algorithms that need many passes over the data. Since the field-aware factorization machine trainer is such an algorithm, we cache. Inserting a
+            // cache step in a pipeline is also possible; please see the construction of the pipeline below.
+            trainData = mlContext.Data.Cache(trainData);
+
+            // Build the training pipeline.
+            // Create the 'FieldAwareFactorizationMachine' binary classifier, setting the "Sentiment" column as the label of the dataset, and
+            // the "Features" column as the features column.
+            var pipeline = new EstimatorChain<ITransformer>().AppendCacheCheckpoint(mlContext)
+                .Append(mlContext.BinaryClassification.Trainers.
+                FieldAwareFactorizationMachine(labelColumnName: "Sentiment", featureColumnNames: new[] { "Features" }));
+
+            // Fit the model.
+            var model = pipeline.Fit(trainData);
+
+            // Let's get the model parameters from the model.
+            var modelParams = model.LastTransformer.Model;
+
+            // Let's inspect the model parameters.
+            var featureCount = modelParams.GetFeatureCount();
+            var fieldCount = modelParams.GetFieldCount();
+            var latentDim = modelParams.GetLatentDim();
+            var linearWeights = modelParams.GetLinearWeights();
+            var latentWeights = modelParams.GetLatentWeights();
+
+            Console.WriteLine("The feature count is: " + featureCount);
+            Console.WriteLine("The number of fields is: " + fieldCount);
+            Console.WriteLine("The latent dimension is: " + latentDim);
+            Console.WriteLine("The linear weights of some of the features are: " + 
+                string.Concat(Enumerable.Range(1, 10).Select(i => $"{linearWeights[i]:F4} ")));
+            Console.WriteLine("The weights of some of the latent features are: " + 
+                 string.Concat(Enumerable.Range(1, 10).Select(i => $"{latentWeights[i]:F4} ")));
+
+            //  The feature count is: 9374
+            //  The number of fields is: 1
+            //  The latent dimension is: 20
+            //  The linear weights of some of the features are: 0.0196  0.0000 -0.0045 -0.0205  0.0000  0.0032  0.0682  0.0091 -0.0151  0.0089
+            //  The weights of some of the latent features are: 0.3316  0.2140  0.0752  0.0908 -0.0495 -0.0810  0.0761  0.0966  0.0090 -0.0962
+
+            // Evaluate how the model is doing on the test data.
+            var dataWithPredictions = model.Transform(testData);
+
+            var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "Sentiment");
+            SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
+
+            //  Accuracy: 0.72
+            //  AUC: 0.75
+            //  F1 Score: 0.74
+            //  Negative Precision: 0.75
+            //  Negative Recall: 0.67
+            //  Positive Precision: 0.70
+            //  Positive Recall: 0.78
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/FieldAwareFactorizationMachineWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/FieldAwareFactorizationMachineWithOptions.cs
@@ -0,0 +1,80 @@
+﻿using System;
+using System.Linq;
+using Microsoft.ML.Data;
+using Microsoft.ML.FactorizationMachine;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class FFMBinaryClassificationWithOptions
+    {
+        public static void Example()
+        {
+            // Create the ML.NET context (MLContext), which is needed to load the data and build the pipeline.
+            var mlContext = new MLContext();
+
+            // Download and featurize the dataset.
+            (var trainData, var testData) = SamplesUtils.DatasetUtils.LoadFeaturizedSentimentDataset(mlContext);
+
+            // ML.NET doesn't cache data sets by default. Therefore, if one reads a data set from a file and accesses it many times, it can be slow due to
+            // expensive featurization and disk operations. When the considered data can fit into memory, a solution is to cache the data in memory. Caching is especially
+            // helpful when working with iterative algorithms that need many passes over the data. Since the field-aware factorization machine trainer is such an algorithm, we cache. Inserting a
+            // cache step in a pipeline is also possible; please see the construction of the pipeline below.
+            trainData = mlContext.Data.Cache(trainData);
+
+            // Build the training pipeline.
+            // Create the 'FieldAwareFactorizationMachine' binary classifier from an options object, setting the "Sentiment" column as the label of the dataset,
+            // the "Features" column as the features column, and the learning rate and number of iterations explicitly.
+            var pipeline = new EstimatorChain<ITransformer>().AppendCacheCheckpoint(mlContext)
+                .Append(mlContext.BinaryClassification.Trainers.
+                FieldAwareFactorizationMachine(
+                    new FieldAwareFactorizationMachineTrainer.Options
+                    {
+                        FeatureColumn = "Features",
+                        LabelColumn = "Sentiment",
+                        LearningRate = 0.1f,
+                        Iters = 10
+                    }));
+
+            // Fit the model.
+            var model = pipeline.Fit(trainData);
+
+            // Let's get the model parameters from the model.
+            var modelParams = model.LastTransformer.Model;
+
+            // Let's inspect the model parameters.
+            var featureCount = modelParams.GetFeatureCount();
+            var fieldCount = modelParams.GetFieldCount();
+            var latentDim = modelParams.GetLatentDim();
+            var linearWeights = modelParams.GetLinearWeights();
+            var latentWeights = modelParams.GetLatentWeights();
+
+            Console.WriteLine("The feature count is: " + featureCount);
+            Console.WriteLine("The number of fields is: " + fieldCount);
+            Console.WriteLine("The latent dimension is: " + latentDim);
+            Console.WriteLine("The linear weights of some of the features are: " +
+                string.Concat(Enumerable.Range(1, 10).Select(i => $"{linearWeights[i]:F4} ")));
+            Console.WriteLine("The weights of some of the latent features are: " +
+                 string.Concat(Enumerable.Range(1, 10).Select(i => $"{latentWeights[i]:F4} ")));
+
+            //  The feature count is: 9374
+            //  The number of fields is: 1
+            //  The latent dimension is: 20
+            //  The linear weights of some of the features are:  0.0410  0.0000 -0.0078 -0.0285  0.0000  0.0114  0.1313  0.0183 -0.0224  0.0166
+            //  The weights of some of the latent features are: -0.0326  0.1127  0.0621  0.1446  0.2038  0.1608  0.2084  0.0141  0.2458 -0.0625
+
+            // Evaluate how the model is doing on the test data.
+            var dataWithPredictions = model.Transform(testData);
+
+            var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "Sentiment");
+            SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
+
+            //  Accuracy: 0.78
+            //  AUC: 0.81
+            //  F1 Score: 0.78
+            //  Negative Precision: 0.78
+            //  Negative Recall: 0.78
+            //  Positive Precision: 0.78
+            //  Positive Recall: 0.78
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs
@@ -12,7 +12,7 @@ public static void Example()
             // Downloading the dataset from github.com/dotnet/machinelearning.
             // This will create a sentiment.tsv file in the filesystem.
             // You can open this file, if you want to see the data. 
-            string dataFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
+            (var dataFile, _ ) = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
 
             // A preview of the data. 
             // Sentiment	SentimentText
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/PriorTrainerSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/PriorTrainerSample.cs
@@ -10,7 +10,7 @@ public static void Example()
             // Downloading the dataset from github.com/dotnet/machinelearning.
             // This will create a sentiment.tsv file in the filesystem.
             // You can open this file, if you want to see the data. 
-            string dataFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
+            (string dataFile, _ ) = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
 
             // A preview of the data. 
             // Sentiment	SentimentText
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/RandomTrainerSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/RandomTrainerSample.cs
@@ -10,7 +10,7 @@ public static void Example()
             // Downloading the dataset from github.com/dotnet/machinelearning.
             // This will create a sentiment.tsv file in the filesystem.
             // You can open this file, if you want to see the data. 
-            string dataFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
+            (string dataFile, _ ) = SamplesUtils.DatasetUtils.DownloadSentimentDataset();
 
             // A preview of the data. 
             // Sentiment	SentimentText
diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
@@ -22,6 +22,7 @@
     <NativeAssemblyReference Include="CpuMathNative" />
     <NativeAssemblyReference Include="FastTreeNative" />
     <NativeAssemblyReference Include="MatrixFactorizationNative" />
+    <NativeAssemblyReference Include="FactorizationMachineNative" />
     <NativeAssemblyReference Include="LdaNative" />
     <NativeAssemblyReference Include="SymSgdNative" />
     <PackageReference Include="Microsoft.ML.TensorFlow.Redist" Version="0.10.0" />
diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
@@ -78,14 +78,48 @@ public sealed class HousingRegression
         /// <summary>
         /// Downloads the wikipedia detox dataset from the ML.NET repo.
         /// </summary>
-        public static string DownloadSentimentDataset()
-         => Download("https://raw.githubusercontent.com/dotnet/machinelearning/76cb2cdf5cc8b6c88ca44b8969153836e589df04/test/data/wikipedia-detox-250-line-data.tsv", "sentiment.tsv");
+        public static (string trainFile, string testFile) DownloadSentimentDataset()
+        {
+            var trainFile = Download("https://raw.githubusercontent.com/dotnet/machinelearning/76cb2cdf5cc8b6c88ca44b8969153836e589df04/test/data/wikipedia-detox-250-line-data.tsv", "sentiment.tsv");
+            var testFile = Download("https://raw.githubusercontent.com/dotnet/machinelearning/76cb2cdf5cc8b6c88ca44b8969153836e589df04/test/data/wikipedia-detox-250-line-test.tsv", "sentimenttest.tsv");
+            return (trainFile, testFile);
+        }
+
+            /// <summary>
+            /// Downloads the adult dataset from the ML.NET repo.
+            /// </summary>
+            public static string DownloadAdultDataset()
+            => Download("https://raw.githubusercontent.com/dotnet/machinelearning/244a8c2ac832657af282aa312d568211698790aa/test/data/adult.train", "adult.txt");
 
         /// <summary>
-        /// Downloads the adult dataset from the ML.NET repo.
+        /// Downloads the wikipedia detox dataset and featurizes it to be suitable for sentiment classification tasks.
         /// </summary>
-        public static string DownloadAdultDataset()
-            => Download("https://raw.githubusercontent.com/dotnet/machinelearning/244a8c2ac832657af282aa312d568211698790aa/test/data/adult.train", "adult.txt");
+        /// <param name="mlContext"><see cref="MLContext"/> used for data loading and processing.</param>
+        /// <returns>A tuple of the featurized training data and the featurized test data.</returns>
+        public static (IDataView trainData, IDataView testData) LoadFeaturizedSentimentDataset(MLContext mlContext)
+        {
+            // Download the train and test files.
+            (string trainFile, string testFile) = DownloadSentimentDataset();
+
+            // Define the columns to read
+            var reader = mlContext.Data.CreateTextLoader(
+                columns: new[]
+                    {
+                        new TextLoader.Column("Sentiment", DataKind.BL, 0),
+                        new TextLoader.Column("SentimentText", DataKind.Text, 1)
+                    },
+                hasHeader: true
+            );
+
+            // Create data featurizing pipeline
+            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText");
+
+            var data = reader.Read(trainFile);
+            var model = pipeline.Fit(data);
+            var featurizedDataTrain = model.Transform(data);
+            var featurizedDataTest = model.Transform(reader.Read(testFile));
+            return (featurizedDataTrain, featurizedDataTest);
+        }
 
         /// <summary>
         /// Downloads the Adult UCI dataset and featurizes it to be suitable for classification tasks.
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineCatalog.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineCatalog.cs
@@ -23,7 +23,7 @@ public static class FactorizationMachineExtensions
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        ///  [!code-csharp[FieldAwareFactorizationMachine](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/FieldAwareFactorizationMachine.cs)]
+        ///  [!code-csharp[FieldAwareFactorizationMachine](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/FieldAwareFactorizationMachine.cs)]
         /// ]]></format>
         /// </example>
         public static FieldAwareFactorizationMachineTrainer FieldAwareFactorizationMachine(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog,
@@ -41,6 +41,12 @@ public static FieldAwareFactorizationMachineTrainer FieldAwareFactorizationMachi
         /// </summary>
         /// <param name="catalog">The binary classification catalog trainer object.</param>
         /// <param name="options">Advanced arguments to the algorithm.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        ///  [!code-csharp[FieldAwareFactorizationMachine](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/FieldAwareFactorizationMachineWithOptions.cs)]
+        /// ]]></format>
+        /// </example>
         public static FieldAwareFactorizationMachineTrainer FieldAwareFactorizationMachine(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog,
             FieldAwareFactorizationMachineTrainer.Options options)
         {
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs