1
+ // Licensed to the .NET Foundation under one or more agreements.
2
+ // The .NET Foundation licenses this file to you under the MIT license.
3
+ // See the LICENSE file in the project root for more information.
4
+
5
+ using System ;
6
+ using Microsoft . ML . Functional . Tests . Datasets ;
7
+ using Microsoft . ML . RunTests ;
8
+ using Microsoft . ML . TestFramework ;
9
+ using Microsoft . ML . Trainers ;
10
+ using Microsoft . ML . Transforms ;
11
+ using Microsoft . ML . Transforms . Text ;
12
+ using Xunit ;
13
+ using Xunit . Abstractions ;
14
+
15
namespace Microsoft.ML.Functional.Tests
{
    /// <summary>
    /// Functional tests covering data-transformation extensibility scenarios:
    /// custom row mappings that add columns, customized text featurization,
    /// and column normalization.
    /// </summary>
    public class DataTransformation : BaseTestClass
    {
        /// <summary>
        /// Creates the test class, forwarding the xUnit output helper to the base class.
        /// </summary>
        /// <param name="output">The xUnit output helper passed to <see cref="BaseTestClass"/>.</param>
        public DataTransformation(ITestOutputHelper output) : base(output)
        {
        }

        /// <summary>
        /// Extensibility: Add a new column that is a function of other columns.
        /// </summary>
        [Fact]
        public void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1, conc: 1);

            // Load the Iris dataset.
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                GetDataPath(TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);

            // Subsample it down to the first 10 rows.
            const int numSamples = 10;
            data = mlContext.Data.TakeRows(data, numSamples);

            // Stand-alone function computing the cosine of the angle between the
            // (width, length) petal vector and the (width, length) sepal vector:
            // dot(petal, sepal) / (|petal| * |sepal|).
            float AngiospermCosine(float petalWidth, float petalLength, float sepalWidth, float sepalLength)
            {
                var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength);
                var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength);
                return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude));
            }

            // Mapping that copies every input column and fills Float1 with the cosine.
            // Arguments are passed in the order the local function declares them.
            Action<Iris, IrisWithOneExtraColumn> addCosineColumn = (input, output) =>
            {
                output.Label = input.Label;
                output.Float1 = AngiospermCosine(input.PetalWidth, input.PetalLength, input.SepalWidth, input.SepalLength);
                output.PetalLength = input.PetalLength;
                output.PetalWidth = input.PetalWidth;
                output.SepalLength = input.SepalLength;
                output.SepalWidth = input.SepalWidth;
            };

            // Create a pipeline to execute the custom function.
            var pipeline = mlContext.Transforms.CustomMapping(addCosineColumn, null);

            // Transform the data.
            var transformedData = pipeline.Fit(data).Transform(data);

            // Verify that the column has the correct data by recomputing the value
            // from the pass-through columns of each transformed row.
            var transformedRows = mlContext.Data.CreateEnumerable<IrisWithOneExtraColumn>(transformedData, reuseRowObject: true);
            foreach (var row in transformedRows)
            {
                var expectedCosine = AngiospermCosine(row.PetalWidth, row.PetalLength, row.SepalWidth, row.SepalLength);
                Assert.Equal(expectedCosine, row.Float1);
            }
        }

        /// <summary>
        /// Extensibility: Add multiple new columns.
        /// </summary>
        [Fact]
        public void ExtensibilityAddingTwoColumns()
        {
            // Concurrency must be 1 to assure that the mapping is done sequentially.
            var mlContext = new MLContext(seed: 1, conc: 1);

            // Load the Iris dataset.
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                GetDataPath(TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);

            // Subsample it down to the first 10 rows.
            const int numSamples = 10;
            data = mlContext.Data.TakeRows(data, numSamples);

            // Mapping that copies every input column and fills Float1 and Float2 with
            // pseudo-random values derived from the row contents (offsets 1 and 2 make
            // the two columns differ).
            Action<Iris, IrisWithTwoExtraColumns> addTwoRandomColumns = (input, output) =>
            {
                output.Label = input.Label;
                output.Float1 = GetRandomNumber(1 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
                output.Float2 = GetRandomNumber(2 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
                output.PetalLength = input.PetalLength;
                output.PetalWidth = input.PetalWidth;
                output.SepalLength = input.SepalLength;
                output.SepalWidth = input.SepalWidth;
            };

            // Create a pipeline to execute the custom function.
            var pipeline = mlContext.Transforms.CustomMapping(addTwoRandomColumns, null);

            // Transform the data.
            var transformedData = pipeline.Fit(data).Transform(data);

            // Verify that both columns have the correct data by recomputing the values
            // from the pass-through columns of each transformed row.
            var transformedRows = mlContext.Data.CreateEnumerable<IrisWithTwoExtraColumns>(transformedData, reuseRowObject: true);
            foreach (var row in transformedRows)
            {
                var randomNumber1 = GetRandomNumber(1 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
                var randomNumber2 = GetRandomNumber(2 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
                Assert.Equal(randomNumber1, row.Float1);
                Assert.Equal(randomNumber2, row.Float2);
            }
        }

        /// <summary>
        /// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
        /// </summary>
        [Fact]
        public void ExtensibilityModifyTextFeaturization()
        {
            // Same fixed-seed, concurrency-1 context as the other tests in this class.
            var mlContext = new MLContext(seed: 1, conc: 1);

            // Load the sentiment dataset.
            var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(
                GetDataPath(TestDatasets.Sentiment.trainFilename),
                hasHeader: TestDatasets.Sentiment.fileHasHeader,
                separatorChar: TestDatasets.Sentiment.fileSeparator);

            // Create a training pipeline.
            // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams.
            var featurizerOptions = new TextFeaturizingEstimator.Options
            {
                UseCharExtractor = true,
                UseWordExtractor = true,
                VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1
            };
            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", new string[] { "SentimentText" }, featurizerOptions)
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
                    new SdcaBinaryTrainer.Options { NumThreads = 1 }));

            // Train the model.
            var model = pipeline.Fit(data);

            // Evaluate the model.
            var scoredData = model.Transform(data);
            var metrics = mlContext.BinaryClassification.Evaluate(scoredData);

            // Check that the metrics returned are valid.
            Common.AssertMetrics(metrics);
        }

        /// <summary>
        /// Extensibility: Apply a normalizer to columns in the dataset.
        /// </summary>
        [Fact]
        public void ExtensibilityNormalizeColumns()
        {
            // Same fixed-seed, concurrency-1 context as the other tests in this class.
            var mlContext = new MLContext(seed: 1, conc: 1);

            // Load the Iris dataset.
            var data = mlContext.Data.LoadFromTextFile<Iris>(
                GetDataPath(TestDatasets.iris.trainFilename),
                hasHeader: TestDatasets.iris.fileHasHeader,
                separatorChar: TestDatasets.iris.fileSeparator);

            // Compose the transformation: concatenate the feature columns, then
            // apply a MinMax normalizer to the resulting "Features" column.
            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
                .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax));

            // Transform the data.
            var transformedData = pipeline.Fit(data).Transform(data);

            // Validate that the data was normalized to between -1 and 1.
            var dataEnumerator = mlContext.Data.CreateEnumerable<FeatureColumn>(transformedData, true);
            foreach (var row in dataEnumerator)
            {
                // Verify per-slot normalization.
                for (int i = 0; i < row.Features.Length; i++)
                {
                    Assert.InRange(row.Features[i], -1, 1);
                }
            }
        }

        /// <summary>
        /// Produces a repeatable pseudo-random value for a given input number.
        /// </summary>
        /// <remarks>
        /// The seed is <c>10 * number</c> truncated to an <see cref="int"/>, and a fresh
        /// <see cref="Random"/> is created on every call. This is deliberate: the tests
        /// call this once inside the mapping and once during verification and rely on
        /// getting the same value for the same input.
        /// </remarks>
        /// <param name="number">The value from which the seed is derived.</param>
        /// <returns>The first <see cref="Random.NextDouble"/> sample for that seed, cast to <see cref="float"/>.</returns>
        private static float GetRandomNumber(float number)
        {
            var seed = (int)(10 * number);
            var rng = new Random(seed);
            return (float)rng.NextDouble();
        }
    }
}
0 commit comments