Skip to content

Commit d32f027

Browse files
committed
samples custom, missingindicator, missingreplace
1 parent 8130567 commit d32f027

File tree

9 files changed

+405
-51
lines changed

9 files changed

+405
-51
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Transforms;
4+
25
namespace Microsoft.ML.Samples.Dynamic
36
{
47
public static class CustomMapping
@@ -10,71 +13,88 @@ public static void Example()
1013
var mlContext = new MLContext();
1114

1215
// Get a small dataset as an IEnumerable and convert it to an IDataView.
13-
var data = SamplesUtils.DatasetUtils.GetInfertData();
14-
var trainData = mlContext.Data.LoadFromEnumerable(data);
16+
var rawData = GetData();
17+
18+
// Printing the input data.
19+
Console.WriteLine("Age\t Salary");
20+
foreach (var row in rawData)
21+
Console.WriteLine($"{row.Age}\t {row.Salary}");
22+
// Expected output:
23+
// Age Salary
24+
// 26 40000
25+
// 35 80000
26+
// 34 10000
27+
// 28 100000
1528

16-
// Preview of the data.
17-
// Age RowNum Education ...
18-
// 26 0 0-5yrs ...
19-
// 42 1 0-5yrs ...
20-
// 39 2 12+yrs ...
21-
// 34 3 0-5yrs ...
22-
// 35 4 6-11yrs ...
29+
var data = mlContext.Data.LoadFromEnumerable(rawData);
2330

2431
// We define the custom mapping between input and output rows that will be applied by the transformation.
25-
Action<SamplesUtils.DatasetUtils.SampleInfertData, OutputRow> mapping =
32+
Action<InputData, CustomMappingOutput > mapping =
2633
(input, output) => output.IsUnderThirty = input.Age < 30;
2734

28-
// Custom transformations can be used to transform data directly, or as part of a pipeline. Below we transform data directly.
29-
var estimator = mlContext.Transforms.CustomMapping(mapping, null);
30-
var transformedData = estimator.Fit(trainData).Transform(trainData);
35+
// Custom transformations can be used to transform data directly, or as part of a pipeline of estimators.
36+
// Note: If contractName is null in the CustomMapping estimator, any pipeline of estimators containing it cannot be saved and loaded back.
37+
// See the CustomMappingSaveAndLoad sample for how to save and load the CustomMapping estimator.
38+
var estimator = mlContext.Transforms.CustomMapping(mapping, contractName: null);
39+
var transformedData = estimator.Fit(data).Transform(data);
3140

32-
// Preview 5 lines of the transformed data.
33-
transformedData = mlContext.Data.TakeRows(transformedData, 5);
34-
var dataEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataTransformed>(transformedData, reuseRowObject: true);
35-
Console.WriteLine("IsUnderThirty\t Age\t RowNum\t Education\t ...");
41+
// Printing the output data.
42+
var dataEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: true);
43+
Console.WriteLine("Age\t Salary\t IsUnderThirty");
3644
foreach (var row in dataEnumerable)
37-
Console.WriteLine($"{row.IsUnderThirty}\t {row.Age}\t {row.RowNum}\t {row.Education}\t ...");
45+
Console.WriteLine($"{row.Age}\t {row.Salary}\t {row.IsUnderThirty}");
3846
// Expected output:
39-
// IsUnderThirty Age RowNum Education ...
40-
// True 26 0 0-5yrs ...
41-
// False 42 1 0-5yrs ...
42-
// False 39 2 12+yrs ...
43-
// False 34 3 0-5yrs ...
44-
// False 35 4 6-11yrs ...
45-
47+
// Age Salary IsUnderThirty
48+
// 26 40000 True
49+
// 35 80000 False
50+
// 34 10000 False
51+
// 28 100000 True
52+
}
4653

47-
// Here instead we use it as part of a pipeline of estimators.
48-
var pipeline = mlContext.Transforms.CustomMapping(mapping, null)
49-
.Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: new[] { "Parity", "Induced" }))
50-
// It is useful to add a caching checkpoint before a trainer that does several passes over the data.
51-
.AppendCacheCheckpoint(mlContext)
52-
// We use binary FastTree to predict the label column that was generated by the custom mapping at the first step of the pipeline.
53-
.Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "IsUnderThirty"));
54+
// Defines only the column to be generated by the custom mapping transformation in addition to the columns already present.
55+
public class CustomMappingOutput
56+
{
57+
public bool IsUnderThirty { get; set; }
58+
}
5459

55-
// We can train the pipeline and use it to transform data.
56-
transformedData = pipeline.Fit(trainData).Transform(trainData);
60+
// Defines the schema of the input data.
61+
public class InputData
62+
{
63+
public float Age { get; set; }
64+
public float Salary { get; set; }
5765
}
5866

59-
// This defines only the column to be generated by the transformation in addition to the columns already present.
60-
public class OutputRow
67+
// Defines the schema of the transformed data, which includes the new column IsUnderThirty.
68+
public class TransformedData
6169
{
70+
public float Age { get; set; }
71+
public float Salary { get; set; }
6272
public bool IsUnderThirty { get; set; }
73+
6374
}
6475

65-
// Represents the transformed infertility dataset.
66-
public class SampleInfertDataTransformed
76+
// Returns an enumerable of input rows.
77+
public static IEnumerable<InputData> GetData()
6778
{
68-
public bool IsUnderThirty { get; set; }
69-
public float Age { get; set; }
70-
public int RowNum { get; set; }
71-
public string Education { get; set; }
72-
public float Parity { get; set; }
73-
public float Induced { get; set; }
74-
public float Case { get; set; }
75-
public float Spontaneous { get; set; }
76-
public float Stratum { get; set; }
77-
public float PooledStratum { get; set; }
79+
return new List<InputData>
80+
{
81+
new InputData {
82+
Age = 26,
83+
Salary = 40000,
84+
},
85+
new InputData {
86+
Age = 35,
87+
Salary = 80000,
88+
},
89+
new InputData {
90+
Age = 34,
91+
Salary = 10000,
92+
},
93+
new InputData {
94+
Age = 28,
95+
Salary = 100000,
96+
},
97+
};
7898
}
7999
}
80100
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Transforms;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
public static class CustomMappingSaveAndLoad
8+
{
9+
public static void Example()
10+
{
11+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
12+
// as well as the source of randomness.
13+
var mlContext = new MLContext();
14+
15+
// Get a small dataset as an IEnumerable and convert it to an IDataView.
16+
var rawData = GetData();
17+
18+
// Printing the input data.
19+
Console.WriteLine("Age\t Salary");
20+
foreach (var row in rawData)
21+
Console.WriteLine($"{row.Age}\t {row.Salary}");
22+
// Expected output:
23+
// Age Salary
24+
// 26 40000
25+
// 35 80000
26+
// 34 10000
27+
// 28 100000
28+
29+
var data = mlContext.Data.LoadFromEnumerable(rawData);
30+
31+
// Custom transformations can be used to transform data directly, or as part of a pipeline. Below we transform data directly.
32+
var estimator = mlContext.Transforms.CustomMapping(new IsUnderThirtyCustomAction().GetMapping(), contractName: "IsUnderThirty");
33+
var transform = estimator.Fit(data);
34+
35+
// To save and load the CustomMapping estimator, the assembly in which the custom action is defined needs to be registered in the
36+
// environment. The following registers the assembly where IsUnderThirtyCustomAction is defined.
37+
mlContext.ComponentCatalog.RegisterAssembly(typeof(IsUnderThirtyCustomAction).Assembly);
38+
39+
// Now the transform pipeline can be saved and loaded through the usual MLContext methods.
40+
mlContext.Model.Save(transform, data.Schema, "customTransform.zip");
41+
var loadedTransform = mlContext.Model.Load("customTransform.zip", out var inputSchema);
42+
43+
// Transform the data using the CustomMapping transform that was saved and loaded.
44+
var transformedData = loadedTransform.Transform(data);
45+
46+
// Printing the output data.
47+
var dataEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: true);
48+
Console.WriteLine("Age\t Salary\t IsUnderThirty");
49+
foreach (var row in dataEnumerable)
50+
Console.WriteLine($"{row.Age}\t {row.Salary}\t {row.IsUnderThirty}");
51+
// Expected output:
52+
// Age Salary IsUnderThirty
53+
// 26 40000 True
54+
// 35 80000 False
55+
// 34 10000 False
56+
// 28 100000 True
57+
}
58+
59+
// The custom action needs to implement the abstract class CustomMappingFactory, and needs to have attribute
60+
// CustomMappingFactoryAttribute with argument equal to the contractName used to define the CustomMapping estimator
61+
// which uses the action.
62+
[CustomMappingFactoryAttribute("IsUnderThirty")]
63+
public class IsUnderThirtyCustomAction : CustomMappingFactory<InputData, CustomMappingOutput>
64+
{
65+
// We define the custom mapping between input and output rows that will be applied by the transformation.
66+
public static void CustomAction(InputData input, CustomMappingOutput output)
67+
=> output.IsUnderThirty = input.Age < 30;
68+
69+
public override Action<InputData, CustomMappingOutput> GetMapping()
70+
=> CustomAction;
71+
}
72+
73+
// Defines only the column to be generated by the custom mapping transformation in addition to the columns already present.
74+
public class CustomMappingOutput
75+
{
76+
public bool IsUnderThirty { get; set; }
77+
}
78+
79+
// Defines the schema of the input data.
80+
public class InputData
81+
{
82+
public float Age { get; set; }
83+
public float Salary { get; set; }
84+
}
85+
86+
// Defines the schema of the transformed data, which includes the new column IsUnderThirty.
87+
public class TransformedData
88+
{
89+
public float Age { get; set; }
90+
public float Salary { get; set; }
91+
public bool IsUnderThirty { get; set; }
92+
93+
}
94+
95+
// Returns an enumerable of input rows.
96+
public static IEnumerable<InputData> GetData()
97+
{
98+
return new List<InputData>
99+
{
100+
new InputData {
101+
Age = 26,
102+
Salary = 40000,
103+
},
104+
new InputData {
105+
Age = 35,
106+
Salary = 80000,
107+
},
108+
new InputData {
109+
Age = 34,
110+
Salary = 10000,
111+
},
112+
new InputData {
113+
Age = 28,
114+
Salary = 100000,
115+
},
116+
};
117+
}
118+
}
119+
}

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs

-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ namespace Microsoft.ML.Samples.Dynamic
77
{
88
public static class IndicateMissingValues
99
{
10-
1110
public static void Example()
1211
{
1312
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using Microsoft.ML.Data;
5+
6+
namespace Microsoft.ML.Samples.Dynamic
7+
{
8+
public static class IndicateMissingValuesMultiColumn
9+
{
10+
public static void Example()
11+
{
12+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
13+
// as well as the source of randomness.
14+
var mlContext = new MLContext();
15+
16+
var samples = new List<DataPoint>()
17+
{
18+
new DataPoint(){ Label = 3, Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} },
19+
new DataPoint(){ Label = 32, Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {float.NaN, 1} },
20+
new DataPoint(){ Label = float.NaN, Features1 = new float[3] {-1, float.NaN, -3}, Features2 = new float[2] {1, float.PositiveInfinity} },
21+
};
22+
// Convert training data to IDataView, the general data type used in ML.NET.
23+
var data = mlContext.Data.LoadFromEnumerable(samples);
24+
25+
// IndicateMissingValues is used to create a boolean containing
26+
// 'true' where the value in the input column is NaN. This value can be used
27+
// to replace missing values with other values. We can use an array of InputOutputColumnPair
28+
// to apply the MissingValueIndicatorEstimator to multiple columns in one pass over the data.
29+
IEstimator<ITransformer> pipeline = mlContext.Transforms.IndicateMissingValues(new[] {
30+
new InputOutputColumnPair("MissingIndicator1", "Features1"),
31+
new InputOutputColumnPair("MissingIndicator2", "Features2")
32+
});
33+
34+
// Now we can transform the data and look at the output to confirm the behavior of the estimator.
35+
// This operation doesn't actually evaluate data until we read the data below.
36+
var tansformer = pipeline.Fit(data);
37+
var transformedData = tansformer.Transform(data);
38+
39+
// We can extract the newly created columns as an IEnumerable of SampleDataTransformed, the class we define below.
40+
var rowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false);
41+
42+
// a small printing utility
43+
Func<object[], string> vectorPrinter = (object[] vector) =>
44+
{
45+
string preview = "[";
46+
foreach (var slot in vector)
47+
preview += $"{slot} ";
48+
return preview += "]";
49+
50+
};
51+
52+
// And finally, we can write out the rows of the dataset, looking at the columns of interest.
53+
foreach (var row in rowEnumerable)
54+
{
55+
Console.WriteLine($"Label: {row.Label} Features1: {vectorPrinter(row.Features1.Cast<object>().ToArray())} " +
56+
$"Features2: {vectorPrinter(row.Features2.Cast<object>().ToArray())} " +
57+
$"MissingIndicator1: {vectorPrinter(row.MissingIndicator1.Cast<object>().ToArray())} " +
58+
$"MissingIndicator2: {vectorPrinter(row.MissingIndicator2.Cast<object>().ToArray())}");
59+
}
60+
61+
// Expected output:
62+
// Label: 3 Features1: [1 1 0 ] Features2: [1 1 ] MissingIndicator1: [False False False ] MissingIndicator2: [False False ]
63+
// Label: 32 Features1: [0 NaN 1 ] Features2: [NaN 1 ] MissingIndicator1: [False True False ] MissingIndicator2: [True False ]
64+
// Label: NaN Features1: [-1 NaN -3 ] Features2: [1 ∞ ] MissingIndicator1: [False True False ] MissingIndicator2: [False False ]
65+
}
66+
67+
private class DataPoint
68+
{
69+
public float Label { get; set; }
70+
[VectorType(3)]
71+
public float[] Features1 { get; set; }
72+
[VectorType(2)]
73+
public float[] Features2 { get; set; }
74+
}
75+
76+
private sealed class SampleDataTransformed : DataPoint
77+
{
78+
public bool[] MissingIndicator1 { get; set; }
79+
public bool[] MissingIndicator2 { get; set; }
80+
81+
}
82+
}
83+
}

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ public static void Example()
1818
new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
1919
new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
2020
new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} },
21-
new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} },
21+
new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} },
2222
};
2323
// Convert training data to IDataView, the general data type used in ML.NET.
2424
var data = mlContext.Data.LoadFromEnumerable(samples);

0 commit comments

Comments
 (0)