Skip to content

Commit 49403ab

Browse files
authored
Hash sample (#3042)
* Hash sample
1 parent ff62d40 commit 49403ab

File tree

3 files changed

+103
-0
lines changed

3 files changed

+103
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
using System;
2+
using Microsoft.ML.Data;
3+
4+
namespace Microsoft.ML.Samples.Dynamic
5+
{
6+
/// <summary>
/// Demonstrates hashing of categorical string and integer columns, and how to map the
/// hashed values of a column back to the original values that produced them.
/// </summary>
public static class Hash
{
    /// <summary>
    /// Builds a small in-memory dataset, hashes both of its columns, prints the transformed rows,
    /// and then prints the hashed-value-to-original-value mapping retained for the Category column.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext(seed: 1);

        // Get a small dataset as an IEnumerable.
        var rawData = new[] {
            new DataPoint() { Category = "MLB" , Age = 18 },
            new DataPoint() { Category = "NFL" , Age = 14 },
            new DataPoint() { Category = "NFL" , Age = 15 },
            new DataPoint() { Category = "MLB" , Age = 18 },
            new DataPoint() { Category = "MLS" , Age = 14 },
        };

        var data = mlContext.Data.LoadFromEnumerable(rawData);

        // Construct the pipeline that hashes the two columns and stores the results in new columns.
        // The first transform hashes the string column and the second transform hashes the integer column.
        //
        // Hashing is not a reversible operation, so there is no way to retrieve the original value from the hashed value.
        // Sometimes, for debugging or model explainability, users need to know which values in the original columns generated
        // the values in the hashed columns, since the algorithms will mostly use the hashed values for further computations.
        // The Hash method can preserve the mapping from the original values to the hashed values in the Annotations of the
        // newly created column (the column populated with the hashed values).
        //
        // Setting the maximumNumberOfInverts parameter to -1 preserves the full map.
        // If that parameter is left at its default value of 0, the mapping is not preserved.
        var pipeline = mlContext.Transforms.Conversion.Hash("CategoryHashed", "Category", numberOfBits: 16, maximumNumberOfInverts: -1)
            .Append(mlContext.Transforms.Conversion.Hash("AgeHashed", "Age", numberOfBits: 8));

        // Fit the pipeline, and then apply it to the same data.
        var transformer = pipeline.Fit(data);
        var transformedData = transformer.Transform(data);

        // Convert the transformed IDataView to an IEnumerable<TransformedDataPoint> for easy consumption.
        // Each row is printed as soon as it is produced, so the row object is allowed to be reused.
        var convertedData = mlContext.Data.CreateEnumerable<TransformedDataPoint>(transformedData, reuseRowObject: true);

        Console.WriteLine("Category CategoryHashed\t Age\t AgeHashed");
        foreach (var item in convertedData)
        {
            Console.WriteLine($"{item.Category}\t {item.CategoryHashed}\t\t {item.Age}\t {item.AgeHashed}");
        }

        // Expected data after the transformation.
        //
        // Category CategoryHashed Age AgeHashed
        // MLB 36206 18 127
        // NFL 19015 14 62
        // NFL 19015 15 43
        // MLB 36206 18 127
        // MLS 6013 14 62

        // For the Category column, where we set the maximumNumberOfInverts parameter, the original categories
        // and their correspondence with the generated hash values are preserved in the "KeyValues" annotation,
        // in the form of parallel indices and values arrays: the element at a given position in the values array
        // is the original value for the index at the same position in the indices array.
        //
        // The indices are zero-based, while the values stored in the CategoryHashed column are one greater
        // (ML.NET key types reserve 0 for missing values; see the KeyDataViewType documentation).
        // We therefore add one to each index so that the numbers printed below match the CategoryHashed column above.
        var keyValues = new VBuffer<ReadOnlyMemory<char>>();
        transformedData.Schema["CategoryHashed"].Annotations.GetValue("KeyValues", ref keyValues);

        var indices = keyValues.GetIndices();
        var categoryNames = keyValues.GetValues();

        for (int i = 0; i < indices.Length; i++)
        {
            // Shift the zero-based annotation index to the one-based value found in the hashed column.
            var hashedValue = indices[i] + 1;
            Console.WriteLine($"The original value of the {hashedValue} category is {categoryNames[i]}");
        }

        // Output Data
        //
        // The original value of the 6013 category is MLS
        // The original value of the 19015 category is NFL
        // The original value of the 36206 category is MLB
    }

    /// <summary>One input row: a categorical string and an integer to be hashed.</summary>
    private class DataPoint
    {
        public string Category;
        public uint Age;
    }

    /// <summary>An input row extended with the two hashed columns produced by the pipeline.</summary>
    private class TransformedDataPoint : DataPoint
    {
        public uint CategoryHashed;
        public uint AgeHashed;
    }

}
95+
}

docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
<PropertyGroup>
44
<TargetFramework>netcoreapp2.1</TargetFramework>
55
<OutputType>Exe</OutputType>
6+
<WarningsNotAsErrors>649</WarningsNotAsErrors>
67
</PropertyGroup>
78

89
<ItemGroup>

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+7
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ public static class ConversionsExtensionsCatalog
2828
/// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
2929
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
3030
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
31+
/// <example>
32+
/// <format type="text/markdown">
33+
/// <![CDATA[
34+
/// [!code-csharp[Hash](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Conversion/Hash.cs)]
35+
/// ]]></format>
36+
/// </example>
37+
3138
public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = null,
    int numberOfBits = HashDefaults.NumberOfBits, int maximumNumberOfInverts = HashDefaults.MaximumNumberOfInverts)
{
    // Obtain the environment from the catalog, then pass every argument through to the estimator unchanged.
    var env = CatalogUtils.GetEnvironment(catalog);
    return new HashingEstimator(env, outputColumnName, inputColumnName, numberOfBits, maximumNumberOfInverts);
}

0 commit comments

Comments
 (0)