-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Adding Shuffle to the catalog #2427
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
83527ee
e11c16c
9dcd81f
0369b1b
bed219d
2d243a3
ec174c4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
using System; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.SamplesUtils; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic
{
    /// <summary>
    /// Sample class showing how to use Shuffle.
    /// </summary>
    public static class Shuffle
    {
        /// <summary>
        /// Loads a small temperature dataset, prints it in its original order, shuffles it
        /// with a fixed seed, and prints the shuffled rows.
        /// </summary>
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext();

            // Get a small dataset as an IEnumerable.
            var enumerableOfData = DatasetUtils.GetSampleTemperatureData(5);
            var data = mlContext.Data.ReadFromEnumerable(enumerableOfData);

            // Before we shuffle, examine all the records in the dataset in their original order.
            Console.WriteLine("Date\tTemperature");
            foreach (var row in enumerableOfData)
            {
                Console.WriteLine($"{row.Date.ToString("d")}\t{row.Temperature}");
            }
            Console.WriteLine();
            // Expected output:
            //  Date    Temperature
            //  1/2/2012        36
            //  1/3/2012        36
            //  1/4/2012        34
            //  1/5/2012        35
            //  1/6/2012        35

            // Shuffle the dataset. A fixed seed is passed so the shuffled order shown below is reproducible.
            var shuffledData = mlContext.Data.Shuffle(data, seed: 123);

            // Look at the shuffled data and observe that the rows are in a randomized order.
            var enumerable = mlContext.CreateEnumerable<DatasetUtils.SampleTemperatureData>(shuffledData, reuseRowObject: true);
            Console.WriteLine("Date\tTemperature");
            foreach (var row in enumerable)
            {
                Console.WriteLine($"{row.Date.ToString("d")}\t{row.Temperature}");
            }
            // Expected output:
            //  Date    Temperature
            //  1/4/2012        34
            //  1/2/2012        36
            //  1/5/2012        35
            //  1/3/2012        36
            //  1/6/2012        35
        }
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,7 @@ internal DataOperationsCatalog(IHostEnvironment env) | |
/// and complementary out-of-bag sample by using the same <paramref name="seed"/>. | ||
/// </remarks> | ||
/// <param name="input">The input data.</param> | ||
/// <param name="seed">The random seed. If unspecified random state will be instead derived from the <see cref="MLContext"/>.</param> | ||
/// <param name="seed">The random seed. If unspecified, the random state will be instead derived from the <see cref="MLContext"/>.</param> | ||
/// <param name="complement">Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform. | ||
/// Can be used to create a complementary pair of samples by using the same seed.</param> | ||
/// <example> | ||
|
@@ -46,15 +46,15 @@ internal DataOperationsCatalog(IHostEnvironment env) | |
/// </format> | ||
/// </example> | ||
public IDataView BootstrapSample(IDataView input, | ||
uint? seed = null, | ||
int? seed = null, | ||
bool complement = BootstrapSamplingTransformer.Defaults.Complement) | ||
{ | ||
Environment.CheckValue(input, nameof(input)); | ||
return new BootstrapSamplingTransformer( | ||
Environment, | ||
input, | ||
complement: complement, | ||
seed: seed, | ||
seed: (uint?) seed, | ||
shuffleInput: false, | ||
poolSize: 0); | ||
} | ||
|
@@ -168,5 +168,51 @@ public IDataView FilterByMissingValues(IDataView input, params string[] columns) | |
|
||
return new NAFilter(Environment, input, complement: false, columns); | ||
} | ||
|
||
/// <summary> | ||
/// Shuffle the rows of <paramref name="input"/>. | ||
/// </summary> | ||
/// <remarks> | ||
/// <see cref="Shuffle"/> will shuffle the rows of any input <see cref="IDataView"/> using a streaming approach. | ||
/// In order to not load the entire dataset in memory, a pool of <paramref name="shufflePoolSize"/> rows will be used | ||
/// to randomly select rows to output. The pool is constructed from the first <paramref name="shufflePoolSize"/> rows | ||
/// in <paramref name="input"/>. Rows will then be randomly yielded from the pool and replaced with the next row from <paramref name="input"/> | ||
/// until all the rows have been yielded, resulting in a new <see cref="IDataView"/> of the same size as <paramref name="input"/> | ||
/// but with the rows in a randomized order. | ||
/// If the <see cref="IDataView.CanShuffle"/> property of <paramref name="input"/> is true, then it will also be read into the | ||
/// pool in a random order, offering two sources of randomness. | ||
/// </remarks> | ||
/// <param name="input">The input data.</param> | ||
/// <param name="seed">The random seed. If unspecified, the random state will be instead derived from the <see cref="MLContext"/>.</param> | ||
/// <param name="shufflePoolSize">The number of rows to hold in the pool. Setting this to 1 will turn off pool shuffling and | ||
/// <see cref="Shuffle"/> will only perform a shuffle by reading <paramref name="input"/> in a random order.</param> | ||
/// <param name="shuffleSource">If <see langword="false"/>, the transform will not attempt to read <paramref name="input"/> in a random order and only use | ||
/// pooling to shuffle. This parameter has no effect if the <see cref="IDataView.CanShuffle"/> property of <paramref name="input"/> is <see langword="false"/>. | ||
/// </param> | ||
/// <example> | ||
/// <format type="text/markdown"> | ||
/// <![CDATA[
/// [!code-csharp[Shuffle](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Shuffle.cs)]
/// ]]>
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This actually works? That's fantastic if so. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it pulls the examples into the docs.microsoft.com pages. Pretty cool. In reply to: 254346894 [](ancestors = 254346894) |
||
/// </format> | ||
/// </example> | ||
public IDataView Shuffle(IDataView input,
    int? seed = null,
    int shufflePoolSize = RowShufflingTransformer.Defaults.PoolRows,
    bool shuffleSource = !RowShufflingTransformer.Defaults.PoolOnly)
{
    // Reject a missing input and a non-positive pool size before building the transform.
    Environment.CheckValue(input, nameof(input));
    Environment.CheckUserArg(shufflePoolSize > 0, nameof(shufflePoolSize), "Must be positive");

    var shuffleOptions = new RowShufflingTransformer.Options();
    shuffleOptions.PoolRows = shufflePoolSize;
    // The public flag is phrased positively ("shuffle the source"); the transformer option is its negation.
    shuffleOptions.PoolOnly = !shuffleSource;
    // ForceShuffle is always on and deliberately not exposed as a parameter: per the review discussion,
    // it is an internal detail of the row-shuffling transform, and the same behaviors can be reached by
    // setting it to true and tuning the other options.
    shuffleOptions.ForceShuffle = true;
    shuffleOptions.ForceShuffleSeed = seed;

    return new RowShufflingTransformer(Environment, shuffleOptions, input);
}
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From the below comment I don't understand if the output IDataView will be of size poolRows or if it will be of the same size as the input IDataView. #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about now? I rewrote the remarks to be a bit more explicit on what was happening.
In reply to: 254190313 [](ancestors = 254190313)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's very clear now thanks for explaining further!
In reply to: 254337002 [](ancestors = 254337002,254190313)