Skip to content

Commit a100505

Browse files
authored
Add V1 Scenario tests for data transformation (#2803)
* Add tests for data transformation scenarios
1 parent 09f40d0 commit a100505

File tree

4 files changed

+239
-59
lines changed

4 files changed

+239
-59
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System;
6+
using Microsoft.ML.Functional.Tests.Datasets;
7+
using Microsoft.ML.RunTests;
8+
using Microsoft.ML.TestFramework;
9+
using Microsoft.ML.Trainers;
10+
using Microsoft.ML.Transforms;
11+
using Microsoft.ML.Transforms.Text;
12+
using Xunit;
13+
using Xunit.Abstractions;
14+
15+
namespace Microsoft.ML.Functional.Tests
16+
{
17+
public class DataTransformation : BaseTestClass
18+
{
19+
/// <summary>
/// Creates the test class, handing the supplied output helper to the base test class.
/// </summary>
/// <param name="output">Output helper forwarded unchanged to the base constructor.</param>
public DataTransformation(ITestOutputHelper output)
    : base(output)
{
}
22+
23+
/// <summary>
/// Extensibility: Add a new column that is a function of other columns.
/// </summary>
/// <remarks>
/// A custom mapping copies every Iris column and fills <c>Float1</c> with the cosine of the
/// angle between the (width, length) petal vector and the (width, length) sepal vector.
/// The test then recomputes that value from the transformed rows and checks it matches.
/// </remarks>
[Fact]
void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
{
    // Concurrency must be 1 to assure that the mapping is done sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset.
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Subsample it down to the first 10 rows.
    int numSamples = 10;
    data = mlContext.Data.TakeRows(data, numSamples);

    // Stand-alone function: cosine of the angle between the petal and sepal vectors,
    // i.e. their dot product divided by the product of their magnitudes.
    float angiospermCosine(float petalWidth, float petalLength, float sepalWidth, float sepalLength)
    {
        var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength);
        var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength);
        return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude));
    }

    // Create a function that copies the input columns and generates the extra column.
    // Arguments are passed in the order the local function declares them (width, then length).
    Action<Iris, IrisWithOneExtraColumn> addCosineColumn = (input, output) =>
    {
        output.Label = input.Label;
        output.Float1 = angiospermCosine(input.PetalWidth, input.PetalLength, input.SepalWidth, input.SepalLength);
        output.PetalLength = input.PetalLength;
        output.PetalWidth = input.PetalWidth;
        output.SepalLength = input.SepalLength;
        output.SepalWidth = input.SepalWidth;
    };

    // Create a pipeline to execute the custom function.
    var pipeline = mlContext.Transforms.CustomMapping(addCosineColumn, null);

    // Transform the data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Verify that the column has the correct data by recomputing it from the copied columns.
    var transformedRows = mlContext.Data.CreateEnumerable<IrisWithOneExtraColumn>(transformedData, reuseRowObject: true);
    foreach (var row in transformedRows)
    {
        var expectedCosine = angiospermCosine(row.PetalWidth, row.PetalLength, row.SepalWidth, row.SepalLength);
        Assert.Equal(expectedCosine, row.Float1);
    }
}
75+
76+
/// <summary>
/// Extensibility: Add multiple new columns.
/// </summary>
/// <remarks>
/// A custom mapping copies every Iris column and fills <c>Float1</c> and <c>Float2</c> with
/// values from <c>GetRandomNumber</c>, seeded from the row's own columns. The test recomputes
/// both values from the transformed rows and checks that they match.
/// </remarks>
[Fact]
void ExtensibilityAddingTwoColumns()
{
    // Concurrency must be 1 to assure that the mapping is done sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset, then keep only the first 10 rows.
    int numSamples = 10;
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);
    data = mlContext.Data.TakeRows(data, numSamples);

    // Shared seed formula: the salt distinguishes the two generated columns.
    // The terms are summed in the same left-to-right order for mapping and verification.
    float randomFor(int salt, float label, float petalLength, float petalWidth, float sepalLength, float sepalWidth)
        => GetRandomNumber(salt + label + petalLength + petalWidth + sepalLength + sepalWidth);

    // Mapping that copies the input columns and generates the two extra columns.
    Action<Iris, IrisWithTwoExtraColumns> addTwoColumns = (source, target) =>
    {
        target.Label = source.Label;
        target.Float1 = randomFor(1, source.Label, source.PetalLength, source.PetalWidth, source.SepalLength, source.SepalWidth);
        target.Float2 = randomFor(2, source.Label, source.PetalLength, source.PetalWidth, source.SepalLength, source.SepalWidth);
        target.PetalLength = source.PetalLength;
        target.PetalWidth = source.PetalWidth;
        target.SepalLength = source.SepalLength;
        target.SepalWidth = source.SepalWidth;
    };

    // Build the custom-mapping estimator, fit it, and apply it to the data.
    var estimator = mlContext.Transforms.CustomMapping(addTwoColumns, null);
    var mapped = estimator.Fit(data).Transform(data);

    // Recompute both values per row and compare them with what the mapping produced.
    foreach (var row in mlContext.Data.CreateEnumerable<IrisWithTwoExtraColumns>(mapped, reuseRowObject: true))
    {
        var expectedFloat1 = randomFor(1, row.Label, row.PetalLength, row.PetalWidth, row.SepalLength, row.SepalWidth);
        var expectedFloat2 = randomFor(2, row.Label, row.PetalLength, row.PetalWidth, row.SepalLength, row.SepalWidth);
        Assert.Equal(expectedFloat1, row.Float1);
        Assert.Equal(expectedFloat2, row.Float2);
    }
}
123+
124+
/// <summary>
/// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
/// </summary>
/// <remarks>
/// Featurizes the <c>SentimentText</c> column into <c>Features</c>, trains a binary SDCA
/// classifier on it, and passes the resulting evaluation metrics to <c>Common.AssertMetrics</c>.
/// </remarks>
[Fact]
void ExtensibilityModifyTextFeaturization()
{
    // Context with a fixed seed and a concurrency of 1.
    var mlContext = new MLContext(seed: 1, conc: 1);

    var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(
        GetDataPath(TestDatasets.Sentiment.trainFilename),
        hasHeader: TestDatasets.Sentiment.fileHasHeader,
        separatorChar: TestDatasets.Sentiment.fileSeparator);

    // Create a training pipeline, one step at a time.
    // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams.
    var featurizerOptions = new TextFeaturizingEstimator.Options
    {
        UseCharExtractor = true,
        UseWordExtractor = true,
        VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1
    };
    var featurizer = mlContext.Transforms.Text.FeaturizeText(
        "Features", new string[] { "SentimentText" }, featurizerOptions);
    var cachedFeaturizer = featurizer.AppendCacheCheckpoint(mlContext);

    // The trainer is created after the featurizer and checkpoint, as in a single fluent chain.
    var trainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
        new SdcaBinaryTrainer.Options { NumThreads = 1 });
    var pipeline = cachedFeaturizer.Append(trainer);

    // Train the model.
    var trainedModel = pipeline.Fit(data);

    // Score the training data and evaluate the predictions.
    var predictions = trainedModel.Transform(data);
    var evaluation = mlContext.BinaryClassification.Evaluate(predictions);

    // Check that the metrics returned are valid.
    Common.AssertMetrics(evaluation);
}
160+
161+
/// <summary>
/// Extensibility: Apply a normalizer to columns in the dataset.
/// </summary>
/// <remarks>
/// Concatenates <c>Iris.Features</c> into a <c>Features</c> column, applies a MinMax
/// normalizer to it, and asserts that every slot of every row lies within [-1, 1].
/// </remarks>
[Fact]
void ExtensibilityNormalizeColumns()
{
    // Concurrency must be 1 to assure that the mapping is done sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset.
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Compose the transformation: concatenate first, then normalize the combined column.
    var concatenate = mlContext.Transforms.Concatenate("Features", Iris.Features);
    var normalize = mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax);
    var pipeline = concatenate.Append(normalize);

    // Transform the data.
    var normalizedData = pipeline.Fit(data).Transform(data);

    // Validate that the data was normalized to between -1 and 1.
    var normalizedRows = mlContext.Data.CreateEnumerable<FeatureColumn>(normalizedData, reuseRowObject: true);
    foreach (var row in normalizedRows)
    {
        // Verify per-slot normalization.
        foreach (var slotValue in row.Features)
        {
            Assert.InRange(slotValue, -1, 1);
        }
    }
}
190+
191+
/// <summary>
/// Returns a repeatable pseudo-random value derived from <paramref name="number"/>.
/// </summary>
/// <param name="number">Value that is multiplied by 10 and truncated to an int to form the seed.</param>
/// <returns>The first <c>NextDouble()</c> of a <see cref="Random"/> built from that seed, cast to float.</returns>
private float GetRandomNumber(float number)
    => (float)new Random((int)(10 * number)).NextDouble();
197+
}
198+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
namespace Microsoft.ML.Functional.Tests.Datasets
6+
{
7+
/// <summary>
/// Row type exposing only a <c>Features</c> vector.
/// </summary>
internal sealed class FeatureColumn
{
    /// <summary>
    /// The values of the <c>Features</c> column for one row.
    /// </summary>
    public float[] Features { get; set; }
}
14+
}

test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs

+27
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,31 @@ internal sealed class IrisWithGroup
7575
public float PetalLength { get; set; }
7676
public float PetalWidth { get; set; }
7777
}
78+
79+
/// <summary>
/// Iris row type carrying the label, the four measurements, and one additional float column.
/// </summary>
internal sealed class IrisWithOneExtraColumn
{
    // Label followed by the four Iris measurements.
    public float Label { get; set; }
    public float SepalLength { get; set; }
    public float SepalWidth { get; set; }
    public float PetalLength { get; set; }
    public float PetalWidth { get; set; }

    // The additional float column.
    public float Float1 { get; set; }
}
91+
92+
/// <summary>
/// Iris row type carrying the label, the four measurements, and two additional float columns.
/// </summary>
internal sealed class IrisWithTwoExtraColumns
{
    // Label followed by the four Iris measurements.
    public float Label { get; set; }
    public float SepalLength { get; set; }
    public float SepalWidth { get; set; }
    public float PetalLength { get; set; }
    public float PetalWidth { get; set; }

    // The two additional float columns.
    public float Float1 { get; set; }
    public float Float2 { get; set; }
}
78105
}

test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs

-59
This file was deleted.

0 commit comments

Comments
 (0)