Skip to content

Commit 9cd9a8c

Browse files
authored
Checking in the samples generated during bug bash for MissingNa, Repl… (#2960)
* Checking in the samples generated during bug bash for MissingNa, ReplaceNA and OneHot
1 parent da655b8 commit 9cd9a8c

File tree

6 files changed

+280
-1
lines changed

6 files changed

+280
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using Microsoft.ML.Data;
5+
using static Microsoft.ML.Transforms.OneHotEncodingEstimator;
6+
7+
namespace Microsoft.ML.Samples.Dynamic
8+
{
9+
/// <summary>
/// Sample that one hot encodes a text column twice: once as a bag (indicator) vector
/// and once as a key value.
/// </summary>
public static class OneHotEncoding
{
    /// <summary>
    /// Builds a five-row in-memory dataset, fits both encoders on its Education column
    /// and writes the encoded values to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Get a small dataset as an IEnumerable.
        var samples = new List<DataPoint>()
        {
            new DataPoint(){ Label = 0, Education = "0-5yrs" },
            new DataPoint(){ Label = 1, Education = "0-5yrs" },
            new DataPoint(){ Label = 45, Education = "6-11yrs" },
            new DataPoint(){ Label = 50, Education = "6-11yrs" },
            new DataPoint(){ Label = 50, Education = "11-15yrs" },
        };

        // Convert training data to IDataView.
        var trainData = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for one hot encoding the Education column as a bag vector.
        var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag);
        // Fit to data.
        var bagTransformer = bagPipeline.Fit(trainData);

        // Get transformed data.
        var bagTransformedData = bagTransformer.Transform(trainData);
        // Getting the data of the newly created column, so we can preview it.
        var bagEncodedColumn = bagTransformedData.GetColumn<float[]>("EducationOneHotEncoded");

        // A pipeline for encoding the Education column as a key value.
        var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key);
        // Fit to data.
        var keyTransformer = keyPipeline.Fit(trainData);

        // Get transformed data.
        var keyTransformedData = keyTransformer.Transform(trainData);
        // Getting the data of the newly created column, so we can preview it.
        var keyEncodedColumn = keyTransformedData.GetColumn<uint>("EducationOneHotEncoded");

        Console.WriteLine("One Hot Encoding based on the bagging strategy.");
        foreach (var row in bagEncodedColumn)
        {
            for (var i = 0; i < row.Length; i++)
            {
                Console.Write($"{row[i]} ");
            }

            // Terminate the row so that every encoded vector is printed on its own line.
            Console.WriteLine();
        }

        // Expected output:
        // The Education column of trainData holds three distinct categories, so each output vector
        // has three slots, and every row sets the slot that belongs to its own category.
        //
        // 1 0 0
        // 1 0 0
        // 0 1 0
        // 0 1 0
        // 0 0 1

        Console.WriteLine("One Hot Encoding with key type output.");
        foreach (var element in keyEncodedColumn)
        {
            Console.WriteLine(element);
        }

        // Expected output:
        //
        // 1
        // 1
        // 2
        // 2
        // 3
    }

    /// <summary>
    /// Input row: a numeric label and the categorical Education value that gets encoded.
    /// </summary>
    private class DataPoint
    {
        public float Label { get; set; }

        public string Education { get; set; }
    }
}
85+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using Microsoft.ML.Data;
5+
6+
namespace Microsoft.ML.Samples.Dynamic
7+
{
8+
/// <summary>
/// Sample that flags the missing (NaN) slots of a float vector column.
/// </summary>
public static class IndicateMissingValues
{
    /// <summary>
    /// Builds a three-row in-memory dataset, applies the IndicateMissingValues transform to its
    /// Features column and writes the original and indicator vectors to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        var samples = new List<DataPoint>()
        {
            new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
            new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
            new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} },
        };
        // Convert training data to IDataView, the general data type used in ML.NET.
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // IndicateMissingValues is used to create a boolean vector containing
        // 'true' where the value in the input column is NaN. This value can be used
        // to replace missing values with other values.
        IEstimator<ITransformer> pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features");

        // Now we can transform the data and look at the output to confirm the behavior of the estimator.
        // This operation doesn't actually evaluate data until we read the data below.
        var transformer = pipeline.Fit(data);
        var transformedData = transformer.Transform(data);

        // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
        var rowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false);

        // A small printing utility: renders the slots separated by single spaces inside brackets,
        // without a trailing space before the closing bracket.
        Func<object[], string> vectorPrinter = (object[] vector) => "[" + string.Join(" ", vector) + "]";

        // And finally, we can write out the rows of the dataset, looking at the columns of interest.
        foreach (var row in rowEnumerable)
        {
            Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingIndicator: {vectorPrinter(row.MissingIndicator.Cast<object>().ToArray())}");
        }

        // Expected output:
        //
        // Label: 3 Features: [1 1 0] MissingIndicator: [False False False]
        // Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False]
        // Label: NaN Features: [-1 NaN -3] MissingIndicator: [False True False]
    }

    /// <summary>
    /// Input row: a label and a three-slot feature vector that may contain NaN values.
    /// </summary>
    private class DataPoint
    {
        public float Label { get; set; }

        [VectorType(3)]
        public float[] Features { get; set; }
    }

    /// <summary>
    /// Output row: the input columns plus the MissingIndicator column added by the transform.
    /// </summary>
    private sealed class SampleDataTransformed : DataPoint
    {
        public bool[] MissingIndicator { get; set; }
    }
}
74+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using Microsoft.ML.Data;
5+
using static Microsoft.ML.Transforms.MissingValueReplacingEstimator.ColumnOptions;
6+
7+
namespace Microsoft.ML.Samples.Dynamic
8+
{
9+
/// <summary>
/// Sample that replaces the missing (NaN) slots of a float vector column, once with the
/// mean replacement mode and once with the default-value replacement mode.
/// </summary>
public static class ReplaceMissingValues
{
    /// <summary>
    /// Builds a four-row in-memory dataset, applies the ReplaceMissingValues transform to its
    /// Features column with both replacement modes and writes the results to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        var samples = new List<DataPoint>()
        {
            new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
            new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
            new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} },
            new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} },
        };
        // Convert training data to IDataView, the general data type used in ML.NET.
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode.
        // Here the replacement mode is Mean.
        var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.Mean);

        // Fit the estimator and apply the resulting transformer, so we can look at the output
        // to confirm the behavior of the estimator.
        var meanTransformer = meanPipeline.Fit(data);
        var meanTransformedData = meanTransformer.Transform(data);

        // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
        var meanRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(meanTransformedData, reuseRowObject: false);

        // The same transform again, this time with the DefaultValue replacement mode.
        var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.DefaultValue);

        // Fit the estimator and apply the resulting transformer, so we can look at the output
        // to confirm the behavior of the estimator.
        var defaultTransformer = defaultPipeline.Fit(data);
        var defaultTransformedData = defaultTransformer.Transform(data);

        // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
        var defaultRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(defaultTransformedData, reuseRowObject: false);

        // A small printing utility: renders the slots separated by single spaces inside brackets,
        // without a trailing space before the closing bracket.
        Func<object[], string> vectorPrinter = (object[] vector) => "[" + string.Join(" ", vector) + "]";

        // First, write out the rows produced with the Mean replacement mode, looking at the columns of interest.
        foreach (var row in meanRowEnumerable)
        {
            Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast<object>().ToArray())}");
        }

        // Expected output:
        // Notice how the NaN in the second slot of the Features column is replaced by 3, the mean of (1, 2, 6),
        // which are the values the other rows hold in that slot.
        //
        // Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
        // Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1]
        // Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
        // Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]

        // Then write out the rows produced with the DefaultValue replacement mode.
        foreach (var row in defaultRowEnumerable)
        {
            Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast<object>().ToArray())}");
        }

        // Expected output:
        // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats.
        //
        // Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
        // Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1]
        // Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
        // Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]
    }

    /// <summary>
    /// Input row: a label and a three-slot feature vector that may contain NaN values.
    /// </summary>
    private class DataPoint
    {
        public float Label { get; set; }

        [VectorType(3)]
        public float[] Features { get; set; }
    }

    /// <summary>
    /// Output row: the input columns plus the MissingReplaced column added by the transform.
    /// </summary>
    private sealed class SampleDataTransformed : DataPoint
    {
        [VectorType(3)]
        public float[] MissingReplaced { get; set; }
    }
}
102+
}

docs/samples/Microsoft.ML.Samples/Program.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ internal static class Program
66
{
77
static void Main(string[] args)
88
{
9-
CustomMapping.Example();
9+
ReplaceMissingValues.Example();
1010
}
1111
}
1212
}

src/Microsoft.ML.Transforms/CategoricalCatalog.cs

+6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ public static class CategoricalCatalog
2020
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
2121
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
2222
/// <param name="outputKind">The conversion mode.</param>
23+
/// <example>
24+
/// <format type="text/markdown">
25+
/// <![CDATA[
26+
/// [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
27+
/// ]]></format>
28+
/// </example>
2329
public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
2430
string outputColumnName,
2531
string inputColumnName = null,

src/Microsoft.ML.Transforms/ExtensionsCatalog.cs

+12
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
2929
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
3030
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
3131
/// If left to <value>null</value> the <paramref name="inputColumnName"/> will get replaced.</param>
32+
/// <example>
33+
/// <format type="text/markdown">
34+
/// <![CDATA[
35+
/// [!code-csharp[IndicateMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs)]
36+
/// ]]></format>
37+
/// </example>
3238
public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog,
3339
string outputColumnName,
3440
string inputColumnName = null)
@@ -46,6 +52,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
4652
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
4753
/// If not provided, the <paramref name="inputColumnName"/> will be replaced with the results of the transforms.</param>
4854
/// <param name="replacementMode">The type of replacement to use as specified in <see cref="MissingValueReplacingEstimator.ColumnOptions.ReplacementMode"/></param>
55+
/// <example>
56+
/// <format type="text/markdown">
57+
/// <![CDATA[
58+
/// [!code-csharp[ReplaceMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs)]
59+
/// ]]></format>
60+
/// </example>
4961
public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog,
5062
string outputColumnName,
5163
string inputColumnName = null,

0 commit comments

Comments
 (0)