Checking in the samples generated during bug bash for MissingNa, Repl… (#2960)

sfilipi · web-flow · commit 9cd9a8c348e2 · 2019-03-14T21:02:10.000-07:00
* Checking in the samples generated during bug bash for MissingNa, ReplaceNA and OneHot
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs
@@ -0,0 +1,85 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+using static Microsoft.ML.Transforms.OneHotEncodingEstimator;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class OneHotEncoding
+    {
+        // Fits and applies one hot encoding to a text column twice: once as an indicator vector (Bag), once as a key (Key).
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Get a small dataset as an IEnumerable.
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Label = 0, Education = "0-5yrs" },
+                new DataPoint(){ Label = 1, Education = "0-5yrs" },
+                new DataPoint(){ Label = 45, Education = "6-11yrs" },
+                new DataPoint(){ Label = 50, Education = "6-11yrs" },
+                new DataPoint(){ Label = 50, Education = "11-15yrs" },
+            };
+
+            // Convert training data to IDataView.
+            var trainData = mlContext.Data.LoadFromEnumerable(samples);
+
+            // A pipeline for one hot encoding the Education column as an indicator vector.
+            var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag);
+            // Fit to data.
+            var bagTransformer = bagPipeline.Fit(trainData);
+
+            // Get transformed data.
+            var bagTransformedData = bagTransformer.Transform(trainData);
+            // Getting the data of the newly created column, so we can preview it.
+            var bagEncodedColumn = bagTransformedData.GetColumn<float[]>("EducationOneHotEncoded");
+
+            // A second pipeline that encodes the same column as a key value instead.
+            var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key);
+            // Fit to data.
+            var keyTransformer = keyPipeline.Fit(trainData);
+
+            // Get transformed data.
+            var keyTransformedData = keyTransformer.Transform(trainData);
+            // Getting the data of the newly created column, so we can preview it.
+            var keyEncodedColumn = keyTransformedData.GetColumn<uint>("EducationOneHotEncoded");
+
+            Console.WriteLine("One Hot Encoding based on the bagging strategy.");
+            // Print one encoded row per line so the console output matches the layout documented below.
+            foreach (var row in bagEncodedColumn)
+                Console.WriteLine(string.Join(" ", row));
+
+            // Expected output:
+            // Since there are three categories in the Education column of the trainData, the output vector
+            // for one hot will have three slots.
+            //
+            // 1 0 0
+            // 1 0 0
+            // 0 1 0
+            // 0 1 0
+            // 0 0 1
+
+            Console.WriteLine("One Hot Encoding with key type output.");
+            foreach (var element in keyEncodedColumn)
+                Console.WriteLine(element);
+
+            // Expected output:
+            // 1
+            // 1
+            // 2
+            // 2
+            // 3
+        }
+
+        // One row of the sample dataset: a numeric label and the categorical text value to encode.
+        private class DataPoint
+        {
+            public float Label { get; set; }
+            public string Education { get; set; }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs
@@ -0,0 +1,74 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class IndicateMissingValues
+    {
+        // Shows how IndicateMissingValues flags the NaN slots of a vector column.
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
+                new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
+                new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} },
+            };
+
+            // Convert training data to IDataView, the general data type used in ML.NET.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // IndicateMissingValues is used to create a boolean vector containing
+            // 'true' in every slot where the value in the input column is NaN. This column can be used
+            // to replace missing values with other values.
+            IEstimator<ITransformer> pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features");
+
+            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
+            // This operation doesn't actually evaluate data until we read the data below.
+            var transformer = pipeline.Fit(data);
+            var transformedData = transformer.Transform(data);
+
+            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
+            var rowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false);
+
+            // A small printing utility: joins the slots with single spaces inside square brackets,
+            // without a trailing separator, so the result matches the expected output below.
+            Func<object[], string> vectorPrinter = vector => "[" + string.Join(" ", vector) + "]";
+
+            // And finally, we can write out the rows of the dataset, looking at the columns of interest.
+            foreach (var row in rowEnumerable)
+            {
+                var features = vectorPrinter(row.Features.Cast<object>().ToArray());
+                var indicator = vectorPrinter(row.MissingIndicator.Cast<object>().ToArray());
+                Console.WriteLine($"Label: {row.Label} Features: {features} MissingIndicator: {indicator}");
+            }
+
+            // Expected output:
+            //
+            // Label: 3 Features: [1 1 0] MissingIndicator: [False False False]
+            // Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False]
+            // Label: NaN Features: [-1 NaN -3] MissingIndicator: [False True False]
+        }
+
+        // Input row: a label and a fixed-size vector of three features.
+        private class DataPoint
+        {
+            public float Label { get; set; }
+
+            [VectorType(3)]
+            public float[] Features { get; set; }
+        }
+
+        // Output row: the input columns plus the MissingIndicator column produced by the transform.
+        private sealed class SampleDataTransformed : DataPoint
+        {
+            public bool[] MissingIndicator { get; set; }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs
@@ -0,0 +1,102 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+using static Microsoft.ML.Transforms.MissingValueReplacingEstimator.ColumnOptions;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class ReplaceMissingValues
+    {
+        // Shows how ReplaceMissingValues fills NaN slots using the Mean and the DefaultValue replacement modes.
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
+                new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
+                new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} },
+                new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} },
+            };
+            // Convert training data to IDataView, the general data type used in ML.NET.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode.
+            var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.Mean);
+
+            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
+            // This operation doesn't actually evaluate data until we read the data below.
+            var meanTransformer = meanPipeline.Fit(data);
+            var meanTransformedData = meanTransformer.Transform(data);
+
+            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
+            var meanRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(meanTransformedData, reuseRowObject: false);
+
+            // The same transform again, this time with the DefaultValue replacement mode.
+            var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.DefaultValue);
+
+            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
+            // This operation doesn't actually evaluate data until we read the data below.
+            var defaultTransformer = defaultPipeline.Fit(data);
+            var defaultTransformedData = defaultTransformer.Transform(data);
+
+            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
+            var defaultRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(defaultTransformedData, reuseRowObject: false);
+
+            // A small printing utility: joins the slots with single spaces inside square brackets, without a trailing separator.
+            Func<object[], string> vectorPrinter = vector => "[" + string.Join(" ", vector) + "]";
+
+            // Write out the rows produced by the Mean replacement, looking at the columns of interest.
+            foreach (var row in meanRowEnumerable)
+            {
+                var features = vectorPrinter(row.Features.Cast<object>().ToArray());
+                var replaced = vectorPrinter(row.MissingReplaced.Cast<object>().ToArray());
+                Console.WriteLine($"Label: {row.Label} Features: {features} MissingReplaced: {replaced}");
+            }
+
+            // Expected output:
+            // Notice how the NaN in the second row is replaced by 3, the mean of the non-missing values (1, 2, 6) in that slot across all rows.
+            //
+            // Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
+            // Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1]
+            // Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
+            // Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]
+
+            // Write out the rows produced by the DefaultValue replacement, looking at the columns of interest.
+            foreach (var row in defaultRowEnumerable)
+            {
+                var features = vectorPrinter(row.Features.Cast<object>().ToArray());
+                var replaced = vectorPrinter(row.MissingReplaced.Cast<object>().ToArray());
+                Console.WriteLine($"Label: {row.Label} Features: {features} MissingReplaced: {replaced}");
+            }
+
+            // Expected output:
+            // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats.
+            //
+            // Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
+            // Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1]
+            // Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
+            // Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]
+        }
+
+        // Input row: a label and a fixed-size vector of three features.
+        private class DataPoint
+        {
+            public float Label { get; set; }
+
+            [VectorType(3)]
+            public float[] Features { get; set; }
+        }
+
+        // Output row: the input columns plus the MissingReplaced column produced by the transform.
+        private sealed class SampleDataTransformed : DataPoint
+        {
+            [VectorType(3)]
+            public float[] MissingReplaced { get; set; }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs
@@ -6,7 +6,7 @@ internal static class Program
     {
         static void Main(string[] args)
         {
-            CustomMapping.Example();
+            ReplaceMissingValues.Example();
         }
     }
 }
diff --git a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs
@@ -20,6 +20,12 @@ public static class CategoricalCatalog
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
         /// <param name="outputKind">The conversion mode.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        ///  [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
+        /// ]]></format>
+        /// </example>
         public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
                 string outputColumnName,
                 string inputColumnName = null,
diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs
@@ -29,6 +29,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
         /// If left to <value>null</value> the <paramref name="inputColumnName"/> will get replaced.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        ///  [!code-csharp[IndicateMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs)]
+        /// ]]></format>
+        /// </example>
         public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog,
             string outputColumnName,
             string inputColumnName = null)
@@ -46,6 +52,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
         /// If not provided, the <paramref name="inputColumnName"/> will be replaced with the results of the transforms.</param>
         /// <param name="replacementMode">The type of replacement to use as specified in <see cref="MissingValueReplacingEstimator.ColumnOptions.ReplacementMode"/></param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        ///  [!code-csharp[ReplaceMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs)]
+        /// ]]></format>
+        /// </example>
         public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog,
             string outputColumnName,
             string inputColumnName = null,

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@ internal static class Program`
`6`	`6`	`{`
`7`	`7`	`static void Main(string[] args)`
`8`	`8`	`{`
`9`		`- CustomMapping.Example();`
	`9`	`+ ReplaceMissingValues.Example();`
`10`	`10`	`}`
`11`	`11`	`}`
`12`	`12`	`}`