-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Transforms components docs #1321
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
95b34ef
121c2e5
f6578a6
161c0d3
f8664d9
8c8483d
c84f17d
3a5490c
5274eed
909a46a
9f84319
80aa06d
b7d0a46
108100d
e60c1e0
1ec6489
01a327a
0317a2e
eff8504
68a43c1
309a13e
28b698a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.Api; | ||
using Microsoft.ML.Data; | ||
using System; | ||
using System.Linq; | ||
using System.Collections.Generic; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic
{
    public partial class TransformSamples
    {
        /// <summary>
        /// Row shape used to read back the column produced by the concatenation pipeline.
        /// </summary>
        class SampleInfertDataWithFeatures
        {
            // The concatenated inputs are floating-point values (see the data preview below),
            // so the resulting vector column has to be read as a vector of floats.
            public VBuffer<float> Features { get; set; }
        }

        /// <summary>
        /// Demonstrates concatenating the Age, Parity and Induced columns of the infert sample
        /// dataset into a single vector column, and prints the resulting column to the console.
        /// </summary>
        public static void ConcatTransform()
        {
            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var ml = new MLContext(seed: 1, conc: 1);

            // Get a small dataset as an IEnumerable.
            IEnumerable<SamplesUtils.DatasetUtils.SampleInfertData> data = SamplesUtils.DatasetUtils.GetInfertData();
            var trainData = ml.CreateStreamingDataView(data);

            // Preview of the data.
            // Age    Case  Education  induced     parity  pooled.stratum  row_num  ...
            // 26.0   1.0   0-5yrs      1.0         6.0       3.0              1.0  ...
            // 42.0   1.0   0-5yrs      1.0         1.0       1.0              2.0  ...
            // 39.0   1.0   0-5yrs      2.0         6.0       4.0              3.0  ...
            // 34.0   1.0   0-5yrs      2.0         4.0       2.0              4.0  ...
            // 35.0   1.0   6-11yrs     1.0         3.0       32.0             5.0  ...

            // A pipeline for concatenating the Age, Parity and Induced columns together in the Features column.
            string outputColumnName = "Features";
            var pipeline = new ConcatEstimator(ml, outputColumnName, new[] { "Age", "Parity", "Induced" });

            // The transformed data.
            var transformedData = pipeline.Fit(trainData).Transform(trainData);

            // Getting the data of the newly created column, so it can be inspected.
            // reuseRowObject: false requests a separate row object for every row.
            var featuresColumn = transformedData.AsEnumerable<SampleInfertDataWithFeatures>(ml, reuseRowObject: false);

            Console.WriteLine($"{outputColumnName} column obtained post-transformation.");
            foreach (var featureRow in featuresColumn)
            {
                foreach (var value in featureRow.Features.Values)
                {
                    Console.Write($"{value} ");
                }
                Console.WriteLine("");
            }

            // Features column obtained post-transformation.
            // 26 6 1
            // 42 1 1
            // 39 6 2
            // 34 4 2
            // 35 3 1
        }
    }
}
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,107 @@ | ||||||||
// Licensed to the .NET Foundation under one or more agreements. | ||||||||
// The .NET Foundation licenses this file to you under the MIT license. | ||||||||
// See the LICENSE file in the project root for more information. | ||||||||
|
||||||||
// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. | ||||||||
using Microsoft.ML.Data; | ||||||||
using Microsoft.ML.Runtime.Api; | ||||||||
using Microsoft.ML.Runtime.Data; | ||||||||
using Microsoft.ML.Transforms; | ||||||||
using System; | ||||||||
using System.Collections.Generic; | ||||||||
using System.Linq; | ||||||||
|
||||||||
namespace Microsoft.ML.Samples.Dynamic | ||||||||
{ | ||||||||
public partial class TransformSamples | ||||||||
{ | ||||||||
public static void KeyToValue_Term() | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This is standing out, what this "_" mean, and why it cannot be KeyToValueAndTerm or KeyToValueThenTerm? |
||||||||
{ | ||||||||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||||||||
// as well as the source of randomness. | ||||||||
var ml = new MLContext(seed: 1, conc: 1); | ||||||||
|
||||||||
// Get a small dataset as an IEnumerable. | ||||||||
IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData(); | ||||||||
var trainData = ml.CreateStreamingDataView(data); | ||||||||
|
||||||||
// Preview of the data. | ||||||||
// Review ReviewReverse, Label | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
// "animals birds cats dogs fish horse", "radiation galaxy universe duck", 1 | ||||||||
// "horse birds house fish duck cats", "space galaxy universe radiation", 0 | ||||||||
// "car truck driver bus pickup", "bus pickup", 1 | ||||||||
// "car truck driver bus pickup horse", "car truck", 0 | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May want to say the goal of the dataset. Eg: "The goal of the dataset to classify if the review matches ..." I ask this, mainly as I'm reading the example, I have no idea what the labels represent vs. the data. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||
|
||||||||
// A pipeline to convert the terms of the 'ReviewReverse' column into keys, | ||||||||
// making use of default settings. | ||||||||
string defaultColumnName = "DefaultKeys"; | ||||||||
// REVIEW create through the catalog extension | ||||||||
var default_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why use WordTokenizer+Term instead of TextTransform? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||
.Append(new TermEstimator(ml, "ReviewReverse" , defaultColumnName)); | ||||||||
|
||||||||
// Another pipeline, that customizes the advanced settings of the TermEstimator (maximum number of terms and sort order). | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@justinormont, @TomFinley you guys think I should add more explanations about SortOrder here in the comments? #Resolved |
||||||||
string customizedColumnName = "CustomizedKeys"; | ||||||||
var customized_pipeline = new WordTokenizer(ml, "ReviewReverse", "ReviewReverse") | ||||||||
.Append(new TermEstimator(ml, "ReviewReverse", customizedColumnName, maxNumTerms: 10, sort:TermTransform.SortOrder.Value)); | ||||||||
|
||||||||
// The transformed data. | ||||||||
var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); | ||||||||
var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); | ||||||||
|
||||||||
// small helper to print the text inside the columns, in the console. | ||||||||
Action<string, IEnumerable<VBuffer<uint>>> printHelper = (columnName, column) => | ||||||||
{ | ||||||||
Console.WriteLine($"{columnName} column obtained post-transformation."); | ||||||||
foreach (var row in column) | ||||||||
{ | ||||||||
foreach (var value in row.Values) | ||||||||
Console.Write($"{value} "); | ||||||||
Console.WriteLine(""); | ||||||||
} | ||||||||
|
||||||||
Console.WriteLine("==================================================="); | ||||||||
}; | ||||||||
|
||||||||
// Preview of the DefaultKeys column obtained after processing the input. | ||||||||
var defaultColumn = transformedData_default.GetColumn<VBuffer<uint>>(ml, defaultColumnName); | ||||||||
printHelper(defaultColumnName, defaultColumn); | ||||||||
|
||||||||
// DefaultKeys column obtained post-transformation. | ||||||||
// 1 2 3 4 | ||||||||
// 5 2 3 1 | ||||||||
// 6 7 3 1 | ||||||||
// 8 9 3 1 | ||||||||
|
||||||||
// Previewing the CustomizedKeys column obtained after processing the input. | ||||||||
var customizedColumn = transformedData_customized.GetColumn<VBuffer<uint>>(ml, customizedColumnName); | ||||||||
printHelper(customizedColumnName, customizedColumn); | ||||||||
|
||||||||
// CustomizedKeys column obtained post-transformation. | ||||||||
// 6 4 9 3 | ||||||||
// 7 4 9 6 | ||||||||
// 1 5 9 6 | ||||||||
// 2 8 9 6 | ||||||||
|
||||||||
// retrieve the original values, by appending the KeyToValue estimator to the existing pipelines | ||||||||
// to convert the keys back to the strings | ||||||||
var pipeline = default_pipeline.Append(new KeyToValueEstimator(ml, defaultColumnName)); | ||||||||
transformedData_default = pipeline.Fit(trainData).Transform(trainData); | ||||||||
|
||||||||
// Preview of the DefaultColumnName column obtained | ||||||||
var originalColumnBack = transformedData_default.GetColumn<VBuffer<ReadOnlyMemory<char>>>(ml, defaultColumnName); | ||||||||
|
||||||||
foreach (var row in originalColumnBack) | ||||||||
{ | ||||||||
foreach (var value in row.Values) | ||||||||
Console.Write($"{value} "); | ||||||||
Console.WriteLine(""); | ||||||||
} | ||||||||
|
||||||||
// DefaultColumnName column obtained post-transformation. | ||||||||
// radiation galaxy universe duck | ||||||||
// space galaxy universe radiation | ||||||||
// bus pickup universe radiation | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
why is this here, log an issue post merge. |
||||||||
// car truck universe radiation | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
this seems to be a bug too. #Closed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||
} | ||||||||
} | ||||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.Api; | ||
using Microsoft.ML.Data; | ||
using System; | ||
using System.Collections.Generic; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
public partial class TransformSamples | ||
{ | ||
public static void MinMaxNormalizer() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var ml = new MLContext(seed: 1, conc: 1); | ||
|
||
// Get a small dataset as an IEnumerable. | ||
IEnumerable<SamplesUtils.DatasetUtils.SampleInfertData> data = SamplesUtils.DatasetUtils.GetInfertData(); | ||
var trainData = ml.CreateStreamingDataView(data); | ||
|
||
// Preview of the data. | ||
// Age Case Education Induced Parity PooledStratum RowNum ... | ||
// 26 1 0-5yrs 1 6 3 1 ... | ||
// 42 1 0-5yrs 1 1 1 2 ... | ||
// 39 1 0-5yrs 2 6 4 3 ... | ||
// 34 1 0-5yrs 2 4 2 4 ... | ||
// 35 1 6-11yrs 1 3 32 5 ... | ||
|
||
// A pipeline for normalizing the Induced column. | ||
var pipeline = ml.Transforms.Normalizer("Induced"); | ||
// The transformed data. | ||
var transformedData = pipeline.Fit(trainData).Transform(trainData); | ||
// Getting the data of the normalized column, so it can be inspected. | ||
var normalizedColumn = transformedData.GetColumn<float>(ml, "Induced"); | ||
|
||
// A small printing utility | ||
Action<string, IEnumerable<float>> printHelper = (colName, column) => | ||
{ | ||
Console.WriteLine($"{colName} column obtained post-transformation."); | ||
foreach (var row in column) | ||
Console.WriteLine($"{row} "); | ||
}; | ||
|
||
printHelper("Induced", normalizedColumn); | ||
// Induced | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
nit: this doesn't match $"{colName} column obtained post-transformation." #Closed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I meant if these comment are supposed to be the output from printHelper. If they are, then the headers don't match: vs Console.WriteLine($"{colName} column obtained post-transformation."); Same thing applies to other comments showing data preview. In reply to: 227982369 [](ancestors = 227982369,227974775) |
||
// 0.5 | ||
// 0.5 | ||
// 1 | ||
// 1 | ||
// 0.5 | ||
|
||
// Composing a different pipeline if we wanted to normalize more than one column at a time. | ||
// A pipeline for normalizing the Induced and Spontaneous columns into the new LogInduced and LogSpontaneous columns, | ||
// using the LogMeanVariance normalization mode. | ||
var multiColPipeline = ml.Transforms.Normalizer(Normalizer.NormalizerMode.LogMeanVariance, new[] { ("Induced", "LogInduced"), ("Spontaneous", "LogSpontaneous") }); | ||
// The transformed data. | ||
var multiColtransformedData = multiColPipeline.Fit(trainData).Transform(trainData); | ||
// Getting the data of the newly created columns, so they can be inspected. | ||
var normalizedInduced = multiColtransformedData.GetColumn<float>(ml, "LogInduced"); | ||
var normalizedSpont = multiColtransformedData.GetColumn<float>(ml, "LogSpontaneous"); | ||
|
||
printHelper("LogInduced", normalizedInduced); | ||
|
||
// LogInduced | ||
// 0.2071445 | ||
// 0.2071445 | ||
// 0.889631 | ||
// 0.889631 | ||
// 0.2071445 | ||
|
||
printHelper("LogSpontaneous", normalizedSpont); | ||
|
||
// LogSpontaneous | ||
// 0.8413026 | ||
// 0 | ||
// 0 | ||
// 0 | ||
// 0.1586974 | ||
|
||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.Api; | ||
using Microsoft.ML.Data; | ||
using System; | ||
using System.Collections.Generic; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic
{
    public partial class TransformSamples
    {
        /// <summary>
        /// Demonstrates text featurization of the "SentimentText" column, once with the default
        /// settings and once with customized settings, printing both resulting feature columns.
        /// </summary>
        public static void TextTransform()
        {
            // The ML context is the entry point for ML.NET operations; it can be used for exception
            // tracking and logging, and it is the source of randomness.
            var ml = new MLContext(seed: 1, conc: 1);

            // Load a small in-memory dataset and wrap it in a data view.
            var sentimentRows = SamplesUtils.DatasetUtils.GetSentimentData();
            var trainData = ml.CreateStreamingDataView(sentimentRows);

            // Preview of the data.
            // Sentiment    SentimentText
            // true         Best game I've ever played.
            // false        ==RUDE== Dude, 2.
            // true         Until the next game, this is the best Xbox game!

            // First pipeline: featurize "SentimentText" into "DefaultTextFeatures" with the default settings.
            string defaultColumnName = "DefaultTextFeatures";
            var defaultEstimator = ml.Transforms.Text.FeaturizeText("SentimentText", defaultColumnName);

            // Second pipeline: featurize "SentimentText" into "CustomizedTextFeatures" with customized advanced settings.
            string customizedColumnName = "CustomizedTextFeatures";
            var customizedEstimator = ml.Transforms.Text.FeaturizeText("SentimentText", customizedColumnName, settings =>
            {
                settings.KeepPunctuations = false;
                settings.KeepNumbers = false;
                settings.OutputTokens = true;
                settings.TextLanguage = Runtime.Data.TextTransform.Language.English; // supports English, French, German, Dutch, Italian, Spanish, Japanese
            });

            // Fit each pipeline on the data, then apply it to the same data.
            var defaultModel = defaultEstimator.Fit(trainData);
            var defaultTransformed = defaultModel.Transform(trainData);
            var customizedModel = customizedEstimator.Fit(trainData);
            var customizedTransformed = customizedModel.Transform(trainData);

            // Writes a header, then one console line per row with the row's values separated by spaces,
            // followed by a separator line.
            void PrintColumn(string name, IEnumerable<VBuffer<float>> rows)
            {
                Console.WriteLine($"{name} column obtained post-transformation.");
                foreach (var row in rows)
                {
                    foreach (var entry in row.Values)
                    {
                        Console.Write($"{entry} ");
                    }
                    Console.WriteLine("");
                }

                Console.WriteLine("===================================================");
            }

            // Read back and print the column produced by the default pipeline.
            var defaultFeatures = defaultTransformed.GetColumn<VBuffer<float>>(ml, defaultColumnName);
            PrintColumn(defaultColumnName, defaultFeatures);

            // DefaultTextFeatures column obtained post-transformation.
            // 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136
            // 0.2357023 0.2357023 0.2357023 0.2357023 0.4714046 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.5773503 0.5773503 0.5773503 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136
            // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751

            // Read back and print the column produced by the customized pipeline.
            var customizedFeatures = customizedTransformed.GetColumn<VBuffer<float>>(ml, customizedColumnName);
            PrintColumn(customizedColumnName, customizedFeatures);

            // CustomizedTextFeatures column obtained post-transformation.
            // 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136
            // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136
            // 0 0.125 0.125 0.125 0.125 0.25 0.25 0.25 0.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125 0.125 0.125 0.125 0.125 0.125 0.375 0.25 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.2672612 0.5345225 0 0 0 0 0 0.2672612 0.5345225 0.2672612 0.2672612 0.2672612 0.2672612
        }
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does
reuseRowObject
do? Would defaults be fine for these samples? #ResolvedThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for the enumerable, it determines whether to return the same object on every row, or allocate a new one per row. It is a required param; doesn't have a default.
For the settings of the transforms, i am using both defaults and non-defaults; since the purpose of this snippet is to educate about usage.
In reply to: 227591855 [](ancestors = 227591855)