Add V1 Scenario tests for data transformation (#2803)

rogancarr · web-flow · commit a10050506d0d · 2019-03-04T11:23:21.000-08:00
* Add tests for data transformation scenarios
diff --git a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
@@ -0,0 +1,198 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using Microsoft.ML.Functional.Tests.Datasets;
+using Microsoft.ML.RunTests;
+using Microsoft.ML.TestFramework;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Transforms;
+using Microsoft.ML.Transforms.Text;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Functional.Tests
+{
+    public class DataTransformation : BaseTestClass
+    {
+        /// <summary>
+        /// Creates the test class and forwards the xUnit output helper to the base class constructor.
+        /// </summary>
+        /// <param name="output">The xUnit output helper passed on to <see cref="BaseTestClass"/>.</param>
+        public DataTransformation(ITestOutputHelper output)
+            : base(output)
+        {
+        }
+
+        /// <summary>
+        /// Extensibility: Add a new column that is a function of other columns.
+        /// </summary>
+        /// <remarks>
+        /// A custom mapping copies every Iris column and fills <c>Float1</c> with the cosine of the angle
+        /// between the (petal length, petal width) and (sepal length, sepal width) vectors. The test then
+        /// recomputes that value from each transformed row and checks that it matches the stored one.
+        /// </remarks>
+        [Fact]
+        void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
+        {
+            // Concurrency must be 1 to ensure that the mapping is done sequentially.
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Load the Iris dataset.
+            var data = mlContext.Data.LoadFromTextFile<Iris>(
+                GetDataPath(TestDatasets.iris.trainFilename),
+                hasHeader: TestDatasets.iris.fileHasHeader,
+                separatorChar: TestDatasets.iris.fileSeparator);
+
+            // Subsample it down to the first 10 rows.
+            const int numSamples = 10;
+            data = mlContext.Data.TakeRows(data, numSamples);
+
+            // Stand-alone function computing the cosine between the petal and sepal measurement vectors.
+            // The parameter order matches the argument order used at both call sites below.
+            float AngiospermCosine(float petalLength, float petalWidth, float sepalLength, float sepalWidth)
+            {
+                var petalMagnitude = Math.Sqrt(petalLength * petalLength + petalWidth * petalWidth);
+                var sepalMagnitude = Math.Sqrt(sepalLength * sepalLength + sepalWidth * sepalWidth);
+                var dotProduct = petalLength * sepalLength + petalWidth * sepalWidth;
+                return (float)(dotProduct / (petalMagnitude * sepalMagnitude));
+            }
+
+            // Mapping that copies the Iris columns and stores the cosine in Float1.
+            Action<Iris, IrisWithOneExtraColumn> addCosineColumn = (input, output) =>
+            {
+                output.Label = input.Label;
+                output.Float1 = AngiospermCosine(input.PetalLength, input.PetalWidth, input.SepalLength, input.SepalWidth);
+                output.PetalLength = input.PetalLength;
+                output.PetalWidth = input.PetalWidth;
+                output.SepalLength = input.SepalLength;
+                output.SepalWidth = input.SepalWidth;
+            };
+
+            // Create a pipeline to execute the custom function.
+            var pipeline = mlContext.Transforms.CustomMapping(addCosineColumn, null);
+
+            // Transform the data.
+            var transformedData = pipeline.Fit(data).Transform(data);
+
+            // Verify that the column has the correct data. The expected value comes from the same function
+            // applied to the same float inputs, so an exact equality check is used.
+            var transformedRows = mlContext.Data.CreateEnumerable<IrisWithOneExtraColumn>(transformedData, reuseRowObject: true);
+            foreach (var row in transformedRows)
+            {
+                var expectedCosine = AngiospermCosine(row.PetalLength, row.PetalWidth, row.SepalLength, row.SepalWidth);
+                Assert.Equal(expectedCosine, row.Float1);
+            }
+        }
+
+        /// <summary>
+        /// Extensibility: Add multiple new columns.
+        /// </summary>
+        /// <remarks>
+        /// A custom mapping copies every Iris column and fills <c>Float1</c> and <c>Float2</c> with values
+        /// from <see cref="GetRandomNumber"/>, seeded from the row's own contents. The test recomputes both
+        /// values from each transformed row and checks that they match.
+        /// </remarks>
+        [Fact]
+        void ExtensibilityAddingTwoColumns()
+        {
+            // Concurrency must be 1 so that the mapping is done sequentially.
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Load the Iris dataset and subsample it down to the first 10 rows.
+            var irisData = mlContext.Data.LoadFromTextFile<Iris>(
+                GetDataPath(TestDatasets.iris.trainFilename),
+                hasHeader: TestDatasets.iris.fileHasHeader,
+                separatorChar: TestDatasets.iris.fileSeparator);
+            var data = mlContext.Data.TakeRows(irisData, 10);
+
+            // Mapping that copies the Iris columns and generates the two extra columns.
+            Action<Iris, IrisWithTwoExtraColumns> addRandomColumns = (source, destination) =>
+            {
+                destination.Label = source.Label;
+                destination.PetalLength = source.PetalLength;
+                destination.PetalWidth = source.PetalWidth;
+                destination.SepalLength = source.SepalLength;
+                destination.SepalWidth = source.SepalWidth;
+                destination.Float1 = GetRandomNumber(1 + source.Label + source.PetalLength + source.PetalWidth + source.SepalLength + source.SepalWidth);
+                destination.Float2 = GetRandomNumber(2 + source.Label + source.PetalLength + source.PetalWidth + source.SepalLength + source.SepalWidth);
+            };
+
+            // Run the mapping over the data through a fitted pipeline.
+            var estimator = mlContext.Transforms.CustomMapping(addRandomColumns, null);
+            var transformer = estimator.Fit(data);
+            var transformedData = transformer.Transform(data);
+
+            // Verify that both columns have the correct data.
+            var transformedRows = mlContext.Data.CreateEnumerable<IrisWithTwoExtraColumns>(transformedData, reuseRowObject: true);
+            foreach (var row in transformedRows)
+            {
+                var expectedFloat1 = GetRandomNumber(1 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
+                var expectedFloat2 = GetRandomNumber(2 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
+                Assert.Equal(expectedFloat1, row.Float1);
+                Assert.Equal(expectedFloat2, row.Float2);
+            }
+        }
+
+        /// <summary>
+        /// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
+        /// </summary>
+        /// <remarks>
+        /// Featurizes the <c>SentimentText</c> column with both extractors enabled and L1 vector
+        /// normalization, trains a single-threaded SDCA binary classifier on the result, and checks
+        /// the evaluation metrics with <c>Common.AssertMetrics</c>.
+        /// </remarks>
+        [Fact]
+        void ExtensibilityModifyTextFeaturization()
+        {
+            // Use a fixed seed and a concurrency of 1.
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Load the sentiment dataset.
+            var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(
+                GetDataPath(TestDatasets.Sentiment.trainFilename),
+                hasHeader: TestDatasets.Sentiment.fileHasHeader,
+                separatorChar: TestDatasets.Sentiment.fileSeparator);
+
+            // Create a training pipeline.
+            // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams.
+            var featurizerOptions = new TextFeaturizingEstimator.Options
+            {
+                UseCharExtractor = true,
+                UseWordExtractor = true,
+                VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1
+            };
+
+            // The stages are created in the same order as a single chained expression would create them.
+            var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", new string[] { "SentimentText" }, featurizerOptions);
+            var cachedFeaturizer = featurizer.AppendCacheCheckpoint(mlContext);
+            var trainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
+                new SdcaBinaryTrainer.Options { NumThreads = 1 });
+            var pipeline = cachedFeaturizer.Append(trainer);
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Score the training data and evaluate the model on it.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.BinaryClassification.Evaluate(scoredData);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Extensibility: Apply a normalizer to columns in the dataset.
+        /// </summary>
+        /// <remarks>
+        /// Concatenates the columns listed in <c>Iris.Features</c> into a <c>Features</c> column, applies
+        /// min-max normalization to it, and checks that every slot of every row lies between -1 and 1.
+        /// </remarks>
+        [Fact]
+        void ExtensibilityNormalizeColumns()
+        {
+            // Use a fixed seed and a concurrency of 1.
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Load the Iris dataset.
+            var data = mlContext.Data.LoadFromTextFile<Iris>(
+                GetDataPath(TestDatasets.iris.trainFilename),
+                hasHeader: TestDatasets.iris.fileHasHeader,
+                separatorChar: TestDatasets.iris.fileSeparator);
+
+            // Compose the transformation: build the "Features" column, then normalize it.
+            var concatenate = mlContext.Transforms.Concatenate("Features", Iris.Features);
+            var normalize = mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax);
+            var pipeline = concatenate.Append(normalize);
+
+            // Transform the data.
+            var transformedData = pipeline.Fit(data).Transform(data);
+
+            // Validate that the data was normalized to between -1 and 1.
+            var normalizedRows = mlContext.Data.CreateEnumerable<FeatureColumn>(transformedData, reuseRowObject: true);
+            foreach (var row in normalizedRows)
+            {
+                // Verify per-slot normalization.
+                foreach (var slotValue in row.Features)
+                {
+                    Assert.InRange(slotValue, -1, 1);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Maps a number to a pseudo-random float in a repeatable way.
+        /// </summary>
+        /// <param name="number">Value used to derive the seed: it is multiplied by 10 and cast to <see cref="int"/>.</param>
+        /// <returns>The first <see cref="Random.NextDouble"/> sample of a <see cref="Random"/> created with that seed, cast to <see cref="float"/>.</returns>
+        private float GetRandomNumber(float number)
+            // A new, explicitly seeded generator is created on every call, so equal inputs give equal outputs.
+            => (float)new Random((int)(10 * number)).NextDouble();
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs b/test/Microsoft.ML.Functional.Tests/Datasets/FeatureColumn.cs
@@ -0,0 +1,14 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+    /// <summary>
+    /// A class to hold a feature column.
+    /// </summary>
+    /// <remarks>
+    /// The DataTransformation tests use this type with <c>CreateEnumerable</c> to read the
+    /// "Features" column of a transformed data view back as one <see cref="float"/> array per row.
+    /// </remarks>
+    internal sealed class FeatureColumn
+    {
+        // Per-row feature vector. Presumably bound to the "Features" column by property name; confirm
+        // against the CreateEnumerable documentation.
+        public float[] Features { get; set; }
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs
@@ -75,4 +75,31 @@ internal sealed class IrisWithGroup
         public float PetalLength { get; set; }
         public float PetalWidth { get; set; }
     }
+
+    /// <summary>
+    /// A class for the Iris dataset with an extra float column.
+    /// </summary>
+    /// <remarks>
+    /// Used as the output type of the custom mapping in the DataTransformation tests, which copies
+    /// the label and the four measurements from the input row and writes a computed value to
+    /// <see cref="Float1"/>.
+    /// </remarks>
+    internal sealed class IrisWithOneExtraColumn
+    {
+        // Label and measurements, copied unchanged from the input Iris row by the custom mapping.
+        public float Label { get; set; }
+        public float SepalLength { get; set; }
+        public float SepalWidth { get; set; }
+        public float PetalLength { get; set; }
+        public float PetalWidth { get; set; }
+        // Extra column filled in by the custom mapping.
+        public float Float1 { get; set; }
+    }
+
+    /// <summary>
+    /// A class for the Iris dataset with two extra float columns.
+    /// </summary>
+    /// <remarks>
+    /// Used as the output type of the custom mapping in the DataTransformation tests, which copies
+    /// the label and the four measurements from the input row and writes generated values to
+    /// <see cref="Float1"/> and <see cref="Float2"/>.
+    /// </remarks>
+    internal sealed class IrisWithTwoExtraColumns
+    {
+        // Label and measurements, copied unchanged from the input Iris row by the custom mapping.
+        public float Label { get; set; }
+        public float SepalLength { get; set; }
+        public float SepalWidth { get; set; }
+        public float PetalLength { get; set; }
+        public float PetalWidth { get; set; }
+        // Extra columns filled in by the custom mapping.
+        public float Float1 { get; set; }
+        public float Float2 { get; set; }
+    }
 }
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs