|
| 1 | +using System; |
| 2 | +using System.Collections.Generic; |
| 3 | +using System.Linq; |
| 4 | +using Microsoft.ML.Data; |
| 5 | +using static Microsoft.ML.Transforms.MissingValueReplacingEstimator.ColumnOptions; |
| 6 | + |
namespace Microsoft.ML.Samples.Dynamic
{
    /// <summary>
    /// Sample showing how the ReplaceMissingValues estimator fills missing (NaN) slots of a
    /// vector column, first with <c>ReplacementMode.Mean</c> and then with
    /// <c>ReplacementMode.DefaultValue</c>.
    /// </summary>
    class ReplaceMissingValues
    {
        /// <summary>
        /// Builds a small in-memory dataset containing one NaN, runs it through both replacement
        /// modes and writes the original and the replaced vectors to the console.
        /// </summary>
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var mlContext = new MLContext();

            // The only missing value is slot 1 of the second row.
            var samples = new List<DataPoint>()
            {
                new DataPoint() { Label = 3, Features = new float[3] { 1, 1, 0 } },
                new DataPoint() { Label = 32, Features = new float[3] { 0, float.NaN, 1 } },
                new DataPoint() { Label = 5, Features = new float[3] { -1, 2, -3 } },
                new DataPoint() { Label = 9, Features = new float[3] { -1, 6, -3 } },
            };

            // Convert training data to IDataView, the general data type used in ML.NET.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // ReplaceMissingValues creates a column where missing values are replaced according to the ReplacementMode.
            // Here the replacement is the mean of the non-missing values found in the same slot.
            var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.Mean);

            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
            // This operation doesn't actually evaluate data until we read the data below.
            var meanTransformer = meanPipeline.Fit(data);
            var meanTransformedData = meanTransformer.Transform(data);

            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
            var meanRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(meanTransformedData, reuseRowObject: false);

            // Same estimator, but this time missing values are replaced by the default value of the slot type.
            var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.DefaultValue);

            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
            // This operation doesn't actually evaluate data until we read the data below.
            var defaultTransformer = defaultPipeline.Fit(data);
            var defaultTransformedData = defaultTransformer.Transform(data);

            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
            var defaultRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(defaultTransformedData, reuseRowObject: false);

            // Write out the rows produced with ReplacementMode.Mean, looking at the columns of interest.
            PrintRows(meanRowEnumerable);

            // Expected output:
            // Notice how the NaN in the second row is replaced by 3, the mean of the non-missing values (1, 2, 6)
            // that the other rows hold in the same slot.
            //
            // Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
            // Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1]
            // Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
            // Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]

            // Write out the rows produced with ReplacementMode.DefaultValue.
            PrintRows(defaultRowEnumerable);

            // Expected output:
            // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats.
            //
            // Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
            // Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1]
            // Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
            // Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]
        }

        /// <summary>
        /// Writes one console line per row showing the label, the original features and the
        /// features after missing-value replacement.
        /// </summary>
        /// <param name="rows">Rows read back from a transformed data view.</param>
        private static void PrintRows(IEnumerable<SampleDataTransformed> rows)
        {
            foreach (var row in rows)
            {
                Console.WriteLine($"Label: {row.Label} Features: {FormatVector(row.Features)} MissingReplaced: {FormatVector(row.MissingReplaced)}");
            }
        }

        /// <summary>
        /// Formats a vector as "[v0 v1 v2]": values separated by single spaces, with no trailing
        /// separator before the closing bracket.
        /// </summary>
        /// <param name="vector">The slot values to format.</param>
        /// <returns>The bracketed, space-separated representation of <paramref name="vector"/>.</returns>
        private static string FormatVector(float[] vector)
        {
            // The explicit type argument selects the IEnumerable<T> overload instead of the
            // params object[] one, which would print the array's type name.
            return "[" + string.Join<float>(" ", vector) + "]";
        }

        /// <summary>Input row: a label and a fixed-size vector of three features.</summary>
        private class DataPoint
        {
            public float Label { get; set; }

            [VectorType(3)]
            public float[] Features { get; set; }
        }

        /// <summary>
        /// Output row: the input columns plus the "MissingReplaced" column created by the estimator.
        /// </summary>
        private sealed class SampleDataTransformed : DataPoint
        {
            [VectorType(3)]
            public float[] MissingReplaced { get; set; }
        }
    }
}
0 commit comments