1
+ using System ;
2
+ using Microsoft . ML . Data ;
3
+
4
+ namespace Microsoft . ML . Samples . Dynamic
5
+ {
6
/// <summary>
/// Demonstrates hashing of categorical string and integer columns, and how to
/// recover the original values of a hashed column from its annotations.
/// </summary>
public static class Hash
{
    /// <summary>
    /// Hashes the <c>Category</c> and <c>Age</c> columns of a small in-memory dataset,
    /// prints the transformed rows, and then prints the hash-to-original-value mapping
    /// that was preserved for the <c>CategoryHashed</c> column.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext(seed: 1);

        // Get a small dataset as an IEnumerable.
        var rawData = new[]
        {
            new DataPoint { Category = "MLB", Age = 18 },
            new DataPoint { Category = "NFL", Age = 14 },
            new DataPoint { Category = "NFL", Age = 15 },
            new DataPoint { Category = "MLB", Age = 18 },
            new DataPoint { Category = "MLS", Age = 14 },
        };

        var data = mlContext.Data.LoadFromEnumerable(rawData);

        // Construct the pipeline that would hash the two columns and store the results in new columns.
        // The first transform hashes the string column and the second transform hashes the integer column.
        //
        // Hashing is not a reversible operation, so there is no way to retrieve the original value from the hashed value.
        // Sometimes, for debugging, or model explainability, users will need to know what values in the original columns generated
        // the values in the hashed columns, since the algorithms will mostly use the hashed values for further computations.
        // The Hash method will preserve the mapping from the original values to the hashed values in the Annotations of the
        // newly created column (column populated with the hashed values).
        //
        // Setting the maximumNumberOfInverts parameter to -1 will preserve the full map.
        // If that parameter is left to the default 0 value, the mapping is not preserved.
        var pipeline = mlContext.Transforms.Conversion.Hash("CategoryHashed", "Category", numberOfBits: 16, maximumNumberOfInverts: -1)
            .Append(mlContext.Transforms.Conversion.Hash("AgeHashed", "Age", numberOfBits: 8));

        // Let's fit our pipeline, and then apply it to the same data.
        var transformer = pipeline.Fit(data);
        var transformedData = transformer.Transform(data);

        // Convert the post transformation from the IDataView format to an IEnumerable<TransformedDataPoint> for easy consumption.
        // The same row object is reused for every row, which is safe here because each row is printed
        // immediately and never stored.
        var convertedData = mlContext.Data.CreateEnumerable<TransformedDataPoint>(transformedData, reuseRowObject: true);

        Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
        foreach (var item in convertedData)
        {
            Console.WriteLine($"{item.Category} \t {item.CategoryHashed} \t \t {item.Age} \t {item.AgeHashed} ");
        }

        // Expected data after the transformation.
        //
        // Category CategoryHashed   Age   AgeHashed
        // MLB      36206            18    127
        // NFL      19015            14    62
        // NFL      19015            15    43
        // MLB      36206            18    127
        // MLS      6013             14    62

        // For the Category column, where we set the maximumNumberOfInverts parameter, the names of the original categories,
        // and their correspondence with the generated hash values is preserved in the Annotations in the format of indices and values.
        // The indices identify the hashed values, and the corresponding element, position-wise, in the values
        // contains the original value.
        //
        // Note that the indices are zero-based, while the hashed values printed above are one-based
        // (zero is reserved for a missing value), so each index below is one less than the matching
        // CategoryHashed value.
        //
        // See below for an example on how to retrieve the mapping.
        var slotNames = new VBuffer<ReadOnlyMemory<char>>();
        transformedData.Schema["CategoryHashed"].Annotations.GetValue("KeyValues", ref slotNames);

        // Items() yields (index, value) pairs for both sparse and dense buffers; GetIndices() alone
        // would return an empty span for a dense buffer and the mapping would silently not be printed.
        foreach (var entry in slotNames.Items())
        {
            Console.WriteLine($"The original value of the {entry.Key} category is {entry.Value} ");
        }

        // Output Data
        //
        // The original value of the 6012 category is MLS
        // The original value of the 19014 category is NFL
        // The original value of the 36205 category is MLB
    }

    // Input row: a categorical string column and an integer column to be hashed.
    private class DataPoint
    {
        public string Category;
        public uint Age;
    }

    // Output row: the input columns plus the two hashed columns produced by the pipeline.
    private class TransformedDataPoint : DataPoint
    {
        public uint CategoryHashed;
        public uint AgeHashed;
    }
}
95
+ }
0 commit comments