Skip to content

Reformat categorical transform samples. #3588

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,56 +1,67 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.Transforms.OneHotEncodingEstimator;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
namespace Samples.Dynamic.Transforms.Categorical
{
public static class OneHotEncoding
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
// Create a new ML context for ML.NET operations. It can be used for
// exception tracking and logging as well as the source of randomness.
var mlContext = new MLContext();

// Get a small dataset as an IEnumerable.
var samples = new List<DataPoint>()
// Create a small dataset as an IEnumerable.
var samples = new[]
{
new DataPoint(){ Education = "0-5yrs" },
new DataPoint(){ Education = "0-5yrs" },
new DataPoint(){ Education = "6-11yrs" },
new DataPoint(){ Education = "6-11yrs" },
new DataPoint(){ Education = "11-15yrs" },
new DataPoint {Education = "0-5yrs"},
new DataPoint {Education = "0-5yrs"},
new DataPoint {Education = "6-11yrs"},
new DataPoint {Education = "6-11yrs"},
new DataPoint {Education = "11-15yrs"}
};

// Convert training data to IDataView.
var data = mlContext.Data.LoadFromEnumerable(samples);
IDataView data = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for one hot encoding the Education column.
var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education");
var pipeline = mlContext.Transforms.Categorical.OneHotEncoding(
"EducationOneHotEncoded", "Education");

// Fit and transform the data.
var oneHotEncodedData = pipeline.Fit(data).Transform(data);
IDataView oneHotEncodedData = pipeline.Fit(data).Transform(data);

PrintDataColumn(oneHotEncodedData, "EducationOneHotEncoded");
// We have 3 slots, because there are three categories in the 'Education' column.

// We have 3 slots because there are three categories in the
// 'Education' column.

// 1 0 0
// 1 0 0
// 0 1 0
// 0 1 0
// 0 0 1

// A pipeline for one hot encoding the Education column (using keying).
var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key);
var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding(
"EducationOneHotEncoded", "Education",
OneHotEncodingEstimator.OutputKind.Key);

// Fit and Transform data.
oneHotEncodedData = keyPipeline.Fit(data).Transform(data);

var keyEncodedColumn = oneHotEncodedData.GetColumn<uint>("EducationOneHotEncoded");
var keyEncodedColumn =
oneHotEncodedData.GetColumn<uint>("EducationOneHotEncoded");

Console.WriteLine(
"One Hot Encoding of single column 'Education', with key type " +
"output.");

// One Hot Encoding of single column 'Education', with key type output.

Console.WriteLine("One Hot Encoding of single column 'Education', with key type output.");
foreach (var element in keyEncodedColumn)
foreach (uint element in keyEncodedColumn)
Console.WriteLine(element);

// 1
Expand All @@ -59,17 +70,22 @@ public static void Example()
// 2
// 3
}
private static void PrintDataColumn(IDataView transformedData, string columnName)

private static void PrintDataColumn(IDataView transformedData,
string columnName)
{
var countSelectColumn = transformedData.GetColumn<float[]>(transformedData.Schema[columnName]);
var countSelectColumn = transformedData.GetColumn<float[]>(
transformedData.Schema[columnName]);

foreach (var row in countSelectColumn)
{
for (var i = 0; i < row.Length; i++)
Console.Write($"{row[i]}\t");

Console.WriteLine();
}
}

private class DataPoint
{
public string Education { get; set; }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,45 +1,55 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;

namespace Samples.Dynamic
namespace Samples.Dynamic.Transforms.Categorical
{
public static class OneHotEncodingMultiColumn
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
// Create a new ML context for ML.NET operations. It can be used for
// exception tracking and logging as well as the source of randomness.
var mlContext = new MLContext();

// Get a small dataset as an IEnumerable.
var samples = new List<DataPoint>()
// Create a small dataset as an IEnumerable.
var samples = new[]
{
new DataPoint(){ Education = "0-5yrs", ZipCode = "98005" },
new DataPoint(){ Education = "0-5yrs", ZipCode = "98052" },
new DataPoint(){ Education = "6-11yrs", ZipCode = "98005" },
new DataPoint(){ Education = "6-11yrs", ZipCode = "98052" },
new DataPoint(){ Education = "11-15yrs", ZipCode = "98005" },
new DataPoint {Education = "0-5yrs", ZipCode = "98005"},
new DataPoint {Education = "0-5yrs", ZipCode = "98052"},
new DataPoint {Education = "6-11yrs", ZipCode = "98005"},
new DataPoint {Education = "6-11yrs", ZipCode = "98052"},
new DataPoint {Education = "11-15yrs", ZipCode = "98005"}
};

// Convert training data to IDataView.
var data = mlContext.Data.LoadFromEnumerable(samples);
IDataView data = mlContext.Data.LoadFromEnumerable(samples);

// Multi column example : A pipeline for one hot encoding two columns 'Education' and 'ZipCode'
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotEncoding(
new InputOutputColumnPair[] {
new InputOutputColumnPair("Education"),
new InputOutputColumnPair("ZipCode"),
});
// Multi column example: A pipeline for one hot encoding two columns
// 'Education' and 'ZipCode'.
var multiColumnKeyPipeline =
mlContext.Transforms.Categorical.OneHotEncoding(
new[]
{
new InputOutputColumnPair("Education"),
new InputOutputColumnPair("ZipCode")
});

// Fit and Transform data.
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
IDataView transformedData =
multiColumnKeyPipeline.Fit(data).Transform(data);

var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
var convertedData =
mlContext.Data.CreateEnumerable<TransformedData>(transformedData,
true);

Console.WriteLine("One Hot Encoding of two columns 'Education' and 'ZipCode'.");
foreach (var item in convertedData)
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));
Console.WriteLine(
"One Hot Encoding of two columns 'Education' and 'ZipCode'.");

// One Hot Encoding of two columns 'Education' and 'ZipCode'.

foreach (TransformedData item in convertedData)
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education),
string.Join(" ", item.ZipCode));

// 1 0 0 1 0
// 1 0 0 0 1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
namespace Samples.Dynamic.Transforms.Categorical
{
public static class OneHotHashEncoding
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
// Create a new ML context for ML.NET operations. It can be used for
// exception tracking and logging as well as the source of randomness.
var mlContext = new MLContext();

// Get a small dataset as an IEnumerable.
var samples = new List<DataPoint>()
// Create a small dataset as an IEnumerable.
var samples = new[]
{
new DataPoint(){ Education = "0-5yrs" },
new DataPoint(){ Education = "0-5yrs" },
new DataPoint(){ Education = "6-11yrs" },
new DataPoint(){ Education = "6-11yrs" },
new DataPoint(){ Education = "11-15yrs" },
new DataPoint {Education = "0-5yrs"},
new DataPoint {Education = "0-5yrs"},
new DataPoint {Education = "6-11yrs"},
new DataPoint {Education = "6-11yrs"},
new DataPoint {Education = "11-15yrs"}
};

// Convert training data to IDataView.
var data = mlContext.Data.LoadFromEnumerable(samples);
// Convert training data to an IDataView.
IDataView data = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for one hot hash encoding the 'Education' column.
var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("EducationOneHotHashEncoded", "Education", numberOfBits: 3);
var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding(
"EducationOneHotHashEncoded", "Education", numberOfBits: 3);

// Fit and transform the data.
var hashEncodedData = pipeline.Fit(data).Transform(data);
IDataView hashEncodedData = pipeline.Fit(data).Transform(data);

PrintDataColumn(hashEncodedData, "EducationOneHotHashEncoded");
// We have 8 slots, because we used numberOfBits = 3.
Expand All @@ -42,19 +42,26 @@ public static void Example()
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 0 0 1

// A pipeline for one hot hash encoding the 'Education' column (using keying strategy).
var keyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("EducationOneHotHashEncoded", "Education",
outputKind: OneHotEncodingEstimator.OutputKind.Key,
numberOfBits: 3);
// A pipeline for one hot hash encoding the 'Education' column
// (using keying strategy).
var keyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding(
"EducationOneHotHashEncoded", "Education",
OneHotEncodingEstimator.OutputKind.Key, 3);

// Fit and transform the data.
var hashKeyEncodedData = keyPipeline.Fit(data).Transform(data);
IDataView hashKeyEncodedData = keyPipeline.Fit(data).Transform(data);

// Getting the data of the newly created column, so we can preview it.
var keyEncodedColumn = hashKeyEncodedData.GetColumn<uint>("EducationOneHotHashEncoded");
// Get the data of the newly created column for inspecting.
var keyEncodedColumn =
hashKeyEncodedData.GetColumn<uint>("EducationOneHotHashEncoded");

Console.WriteLine("One Hot Hash Encoding of single column 'Education', with key type output.");
foreach (var element in keyEncodedColumn)
Console.WriteLine(
"One Hot Hash Encoding of single column 'Education', with key " +
"type output.");

// One Hot Hash Encoding of single column 'Education', with key type output.

foreach (uint element in keyEncodedColumn)
Console.WriteLine(element);

// 4
Expand All @@ -64,9 +71,11 @@ public static void Example()
// 8
}

private static void PrintDataColumn(IDataView transformedData, string columnName)
private static void PrintDataColumn(IDataView transformedData,
string columnName)
{
var countSelectColumn = transformedData.GetColumn<float[]>(transformedData.Schema[columnName]);
var countSelectColumn = transformedData.GetColumn<float[]>(
transformedData.Schema[columnName]);

foreach (var row in countSelectColumn)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,44 +1,57 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;

namespace Samples.Dynamic
namespace Samples.Dynamic.Transforms.Categorical
{
public static class OneHotHashEncodingMultiColumn
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
// Create a new ML context for ML.NET operations. It can be used for
// exception tracking and logging as well as the source of randomness.
var mlContext = new MLContext();

// Get a small dataset as an IEnumerable.
var samples = new List<DataPoint>()
var samples = new[]
{
new DataPoint(){ Education = "0-5yrs", ZipCode = "98005" },
new DataPoint(){ Education = "0-5yrs", ZipCode = "98052" },
new DataPoint(){ Education = "6-11yrs", ZipCode = "98005" },
new DataPoint(){ Education = "6-11yrs", ZipCode = "98052" },
new DataPoint(){ Education = "11-15yrs", ZipCode = "98005" },
new DataPoint {Education = "0-5yrs", ZipCode = "98005"},
new DataPoint {Education = "0-5yrs", ZipCode = "98052"},
new DataPoint {Education = "6-11yrs", ZipCode = "98005"},
new DataPoint {Education = "6-11yrs", ZipCode = "98052"},
new DataPoint {Education = "11-15yrs", ZipCode = "98005"}
};

// Convert training data to IDataView.
var data = mlContext.Data.LoadFromEnumerable(samples);
IDataView data = mlContext.Data.LoadFromEnumerable(samples);

// Multi column example : A pipeline for one hot hash encoding two columns 'Education' and 'ZipCode'
var multiColumnKeyPipeline = mlContext.Transforms.Categorical.OneHotHashEncoding(
new InputOutputColumnPair[] { new InputOutputColumnPair("Education"), new InputOutputColumnPair("ZipCode") },
numberOfBits: 3);
// Multi column example: A pipeline for one hot hash encoding two
// columns 'Education' and 'ZipCode'.
var multiColumnKeyPipeline =
mlContext.Transforms.Categorical.OneHotHashEncoding(
new[]
{
new InputOutputColumnPair("Education"),
new InputOutputColumnPair("ZipCode")
},
numberOfBits: 3);

// Fit and Transform the data.
var transformedData = multiColumnKeyPipeline.Fit(data).Transform(data);
IDataView transformedData =
multiColumnKeyPipeline.Fit(data).Transform(data);

var convertedData = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, true);
var convertedData =
mlContext.Data.CreateEnumerable<TransformedData>(transformedData,
true);

Console.WriteLine(
"One Hot Hash Encoding of two columns 'Education' and 'ZipCode'.");

// One Hot Hash Encoding of two columns 'Education' and 'ZipCode'.

foreach (TransformedData item in convertedData)
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education),
string.Join(" ", item.ZipCode));

Console.WriteLine("One Hot Hash Encoding of two columns 'Education' and 'ZipCode'.");
foreach (var item in convertedData)
Console.WriteLine("{0}\t\t\t{1}", string.Join(" ", item.Education), string.Join(" ", item.ZipCode));

// We have 8 slots, because we used numberOfBits = 3.

// 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
Expand Down