-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Adding Shuffle to the catalog #2427
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
83527ee
e11c16c
9dcd81f
0369b1b
bed219d
2d243a3
ec174c4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
using System; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.SamplesUtils; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary> | ||
/// Sample class showing how to use Shuffle. | ||
/// </summary> | ||
public static class Shuffle | ||
{ | ||
public static void Example() | ||
{ | ||
// Create a new context for ML.NET operations. It can be used for exception tracking and logging, | ||
// as a catalog of available operations and as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
// Get a small dataset as an IEnumerable. | ||
var enumerableOfData = DatasetUtils.GetSampleTemperatureData(5); | ||
var data = mlContext.Data.ReadFromEnumerable(enumerableOfData); | ||
|
||
// Before we apply a filter, examine all the records in the dataset. | ||
Console.WriteLine($"Date\tTemperature"); | ||
foreach (var row in enumerableOfData) | ||
{ | ||
Console.WriteLine($"{row.Date.ToString("d")}\t{row.Temperature}"); | ||
} | ||
Console.WriteLine(); | ||
// Expected output: | ||
// Date Temperature | ||
// 1/2/2012 36 | ||
// 1/3/2012 36 | ||
// 1/4/2012 34 | ||
// 1/5/2012 35 | ||
// 1/6/2012 35 | ||
|
||
// Shuffle the dataset. | ||
var shuffledData = mlContext.Data.Shuffle(data, seed: 123); | ||
|
||
// Look at the shuffled data and observe that the rows are in a randomized order. | ||
var enumerable = mlContext.CreateEnumerable<DatasetUtils.SampleTemperatureData>(shuffledData, reuseRowObject: true); | ||
Console.WriteLine($"Date\tTemperature"); | ||
foreach (var row in enumerable) | ||
{ | ||
Console.WriteLine($"{row.Date.ToString("d")}\t{row.Temperature}"); | ||
} | ||
// Expected output: | ||
// Date Temperature | ||
// 1/4/2012 34 | ||
// 1/2/2012 36 | ||
// 1/5/2012 35 | ||
// 1/3/2012 36 | ||
// 1/6/2012 35 | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,7 @@ internal DataOperationsCatalog(IHostEnvironment env) | |
/// and complementary out-of-bag sample by using the same <paramref name="seed"/>. | ||
/// </remarks> | ||
/// <param name="input">The input data.</param> | ||
/// <param name="seed">The random seed. If unspecified random state will be instead derived from the <see cref="MLContext"/>.</param> | ||
/// <param name="seed">The random seed. If unspecified, the random state will be instead derived from the <see cref="MLContext"/>.</param> | ||
/// <param name="complement">Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform. | ||
/// Can be used to create a complementary pair of samples by using the same seed.</param> | ||
/// <example> | ||
|
@@ -147,5 +147,51 @@ public IDataView FilterByKeyColumnFraction(IDataView input, string columnName, d | |
throw Environment.ExceptSchemaMismatch(nameof(columnName), "filter", columnName, "KeyType", type.ToString()); | ||
return new RangeFilter(Environment, input, columnName, lowerBound, upperBound, false); | ||
} | ||
|
||
/// <summary> | ||
/// Shuffle the rows of <paramref name="input"/>. | ||
/// </summary> | ||
/// <remarks> | ||
/// <see cref="Shuffle"/> will shuffle the rows of any input <see cref="IDataView"/> using a streaming approach. | ||
/// In order to not load the entire dataset in memory, a pool of <paramref name="shufflePoolSize"/> rows will be used | ||
/// to randomly select rows to output. The pool is constructed from the first <paramref name="shufflePoolSize"/> rows | ||
/// in <paramref name="input"/>. Rows will then be randomly yielded from the pool and replaced with the next row from <paramref name="input"/> | ||
/// until all the rows have been yielded, resulting in a new <see cref="IDataView"/> of the same size as <paramref name="input"/> | ||
/// but with the rows in a randomized order. | ||
/// If the <see cref="IDataView.CanShuffle"/> property of <paramref name="input"/> is true, then it will also be read into the | ||
/// pool in a random order, offering two sources of randomness. | ||
/// </remarks> | ||
/// <param name="input">The input data.</param> | ||
/// <param name="seed">The random seed. If unspecified, the random state will be instead derived from the <see cref="MLContext"/>.</param> | ||
/// <param name="shufflePoolSize">The number of rows to hold in the pool. Setting this to 1 will turn off pool shuffling and | ||
/// <see cref="Shuffle"/> will only perform a shuffle by reading <paramref name="input"/> in a random order.</param> | ||
/// <param name="shuffleSource">If false, the transform will not attempt to read <paramref name="input"/> in a random order and only use | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think we've received feedback from .NET docs team that things like this ought to be |
||
/// pooling to shuffle. This parameter has no effect if the <see cref="IDataView.CanShuffle"/> property of <paramref name="input"/> is false. | ||
/// </param> | ||
/// <example> | ||
/// <format type="text/markdown"> | ||
/// <] | ||
/// ]]> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This actually works? That's fantastic if so. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it pulls the examples into the docs.microsoft.com pages. Pretty cool. In reply to: 254346894 [](ancestors = 254346894) |
||
/// </format> | ||
/// </example> | ||
public IDataView Shuffle(IDataView input, | ||
uint? seed = null, | ||
int shufflePoolSize = RowShufflingTransformer.Defaults.PoolRows, | ||
bool shuffleSource = !RowShufflingTransformer.Defaults.PoolOnly) | ||
{ | ||
Environment.CheckValue(input, nameof(input)); | ||
Environment.CheckUserArg(shufflePoolSize > 0, nameof(shufflePoolSize), "Pool size must be positive"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Prefer the simpler message |
||
|
||
var options = new RowShufflingTransformer.Options | ||
{ | ||
PoolRows = shufflePoolSize, | ||
PoolOnly = !shuffleSource, | ||
ForceShuffle = true, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
shall this be a argument of Shuffle? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are generally trying to remove parameters from the input, and this is a confusing one to explain, as it really gets into the internals of RowShufflingTransform. I can get all the same behavior out of the RowShufflingTransform by setting this to true and tuning the others, so I'd prefer to leave it out. In reply to: 254150113 [](ancestors = 254150113) |
||
ForceShuffleSeed = (int)seed | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If this is left There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also changed the parameter to BootstrapSample to be a In reply to: 254346714 [](ancestors = 254346714) |
||
}; | ||
|
||
return new RowShufflingTransformer(Environment, options, input); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,14 +33,16 @@ namespace Microsoft.ML.Transforms | |
[BestFriend] | ||
internal sealed class RowShufflingTransformer : RowToRowTransformBase | ||
{ | ||
private static class Defaults | ||
[BestFriend] | ||
internal static class Defaults | ||
{ | ||
public const int PoolRows = 1000; | ||
public const bool PoolOnly = false; | ||
public const bool ForceShuffle = false; | ||
} | ||
|
||
public sealed class Options | ||
[BestFriend] | ||
internal sealed class Options | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we revert this to public? This is already in an internal class, so this provides no additional information. #Resolved |
||
{ | ||
// REVIEW: A more intelligent heuristic, based on the expected size of the inputs, perhaps? | ||
[Argument(ArgumentType.LastOccurenceWins, HelpText = "The pool will have this many rows", ShortName = "rows")] | ||
|
@@ -95,7 +97,8 @@ private static VersionInfo GetVersionInfo() | |
/// <param name="poolRows">The pool will have this many rows</param> | ||
/// <param name="poolOnly">If true, the transform will not attempt to shuffle the input cursor but only shuffle based on the pool. This parameter has no effect if the input data was not itself shufflable.</param> | ||
/// <param name="forceShuffle">If true, the transform will always provide a shuffled view.</param> | ||
public RowShufflingTransformer(IHostEnvironment env, | ||
[BestFriend] | ||
internal RowShufflingTransformer(IHostEnvironment env, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Similar notes. there is no point in internalizing these. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that we had a drive to internalize constructors of In reply to: 254349247 [](ancestors = 254349247) |
||
IDataView input, | ||
int poolRows = Defaults.PoolRows, | ||
bool poolOnly = Defaults.PoolOnly, | ||
|
@@ -105,9 +108,10 @@ public RowShufflingTransformer(IHostEnvironment env, | |
} | ||
|
||
/// <summary> | ||
/// Public constructor corresponding to SignatureDataTransform. | ||
/// Constructor corresponding to SignatureDataTransform. | ||
/// </summary> | ||
public RowShufflingTransformer(IHostEnvironment env, Options args, IDataView input) | ||
[BestFriend] | ||
internal RowShufflingTransformer(IHostEnvironment env, Options args, IDataView input) | ||
: base(env, RegistrationName, input) | ||
{ | ||
Host.CheckValue(args, nameof(args)); | ||
|
@@ -149,7 +153,8 @@ private RowShufflingTransformer(IHost host, ModelLoadContext ctx, IDataView inpu | |
_subsetInput = SelectCachableColumns(input, host); | ||
} | ||
|
||
public static RowShufflingTransformer Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) | ||
[BestFriend] | ||
internal static RowShufflingTransformer Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
similar #Resolved |
||
{ | ||
Contracts.CheckValue(env, nameof(env)); | ||
var h = env.Register(RegistrationName); | ||
|
@@ -222,7 +227,8 @@ internal static bool CanShuffleAll(Schema schema) | |
/// <summary> | ||
/// Utility to take a cursor, and get a shuffled version of this cursor. | ||
/// </summary> | ||
public static RowCursor GetShuffledCursor(IChannelProvider provider, int poolRows, RowCursor cursor, Random rand) | ||
[BestFriend] | ||
internal static RowCursor GetShuffledCursor(IChannelProvider provider, int poolRows, RowCursor cursor, Random rand) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
similar #Resolved |
||
{ | ||
Contracts.CheckValue(provider, nameof(provider)); | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From the below comment I don't understand if the output
IDataView
will be of sizepoolRows
or if it will be of the same size as the inputIDataView
. #ResolvedThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about now? I rewrote the remarks to be a bit more explicit on what was happening.
In reply to: 254190313 [](ancestors = 254190313)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's very clear now thanks for explaining further!
In reply to: 254337002 [](ancestors = 254337002,254190313)