Skip to content

Commit 7d592b7

Browse files
authored
Adding a catalog entry for the bootstrap sample (#2387)
Adding a catalog entry and sample for the bootstrap sample.
1 parent 410a296 commit 7d592b7

File tree

3 files changed

+123
-2
lines changed

3 files changed

+123
-2
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Data;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
public static class BootstrapSample
8+
{
9+
public static void Example()
10+
{
11+
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
12+
// as a catalog of available operations and as the source of randomness.
13+
var mlContext = new MLContext();
14+
15+
// Get a small dataset as an IEnumerable and then read it as ML.NET's data type.
16+
IEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample> enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(5);
17+
var data = mlContext.Data.ReadFromEnumerable(enumerableOfData);
18+
19+
// Look at the original dataset
20+
Console.WriteLine($"Label\tFeatures[0]");
21+
foreach (var row in enumerableOfData)
22+
{
23+
Console.WriteLine($"{row.Label}\t{row.Features[0]}");
24+
}
25+
Console.WriteLine();
26+
// Expected output:
27+
// Label Features[0]
28+
// True 1.017325
29+
// False 0.6326591
30+
// False 0.0326252
31+
// True 0.8426974
32+
// True 0.9947656
33+
34+
// Now take a bootstrap sample of this dataset to create a new dataset. The bootstrap is a resampling technique that
35+
// creates a training set of the same size by picking with replacement from the original dataset. With the bootstrap,
36+
// we expect that the resampled dataset will have about 63% of the rows of the original dataset (i.e. 1-e^-1), with some
37+
// rows represented more than once.
38+
// BootstrapSample is a streaming implementation of the bootstrap that enables sampling from a dataset too large to hold in memory.
39+
// To enable streaming, BootstrapSample approximates the bootstrap by sampling each row according to a Poisson(1) distribution.
40+
// Note that this streaming approximation treats each row independently, thus the resampled dataset is not guaranteed to be the
41+
// same length as the input dataset.
42+
// Let's take a look at the behavior of the BootstrapSample by examining a few draws:
43+
for (int i = 0; i < 3; i++)
44+
{
45+
var resample = mlContext.Data.BootstrapSample(data, seed: (uint) i);
46+
47+
var enumerable = mlContext.CreateEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample>(resample, reuseRowObject: false);
48+
Console.WriteLine($"Label\tFeatures[0]");
49+
foreach (var row in enumerable)
50+
{
51+
Console.WriteLine($"{row.Label}\t{row.Features[0]}");
52+
}
53+
Console.WriteLine();
54+
}
55+
// Expected output:
56+
// Label Features[0]
57+
// True 1.017325
58+
// False 0.6326591
59+
// False 0.6326591
60+
// False 0.6326591
61+
// False 0.0326252
62+
// False 0.0326252
63+
// True 0.8426974
64+
// True 0.8426974
65+
66+
// Label Features[0]
67+
// True 1.017325
68+
// True 1.017325
69+
// False 0.6326591
70+
// False 0.6326591
71+
// False 0.0326252
72+
// False 0.0326252
73+
// False 0.0326252
74+
// True 0.9947656
75+
76+
// Label Features[0]
77+
// False 0.6326591
78+
// False 0.0326252
79+
// True 0.8426974
80+
// True 0.8426974
81+
// True 0.8426974
82+
}
83+
}
84+
}

src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs

+38-1
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,49 @@ internal DataOperationsCatalog(IHostEnvironment env)
2424
Environment = env;
2525
}
2626

27+
/// <summary>
28+
/// Take an approximate bootstrap sample of <paramref name="input"/>.
29+
/// </summary>
30+
/// <remarks>
31+
/// This sampler is a streaming version of <a href="https://en.wikipedia.org/wiki/Bootstrapping_(statistics)">bootstrap resampling</a>.
32+
/// Instead of taking the whole dataset into memory and resampling, <see cref="BootstrapSample"/> streams through the dataset and
33+
/// uses a <a href="https://en.wikipedia.org/wiki/Poisson_distribution">Poisson</a>(1) distribution to select the number of times a
34+
/// given row will be added to the sample. The <paramref name="complement"/> parameter allows for the creation of a bootstrap sample
35+
/// and complementary out-of-bag sample by using the same <paramref name="seed"/>.
36+
/// </remarks>
37+
/// <param name="input">The input data.</param>
38+
/// <param name="seed">The random seed. If unspecified, the random state will instead be derived from the <see cref="MLContext"/>.</param>
39+
/// <param name="complement">Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.
40+
/// Can be used to create a complementary pair of samples by using the same seed.</param>
41+
/// <example>
42+
/// <format type="text/markdown">
43+
/// <![CDATA[
44+
/// [!code-csharp[BootstrapSample](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs)]
45+
/// ]]>
46+
/// </format>
47+
/// </example>
48+
public IDataView BootstrapSample(IDataView input,
49+
uint? seed = null,
50+
bool complement = BootstrapSamplingTransformer.Defaults.Complement)
51+
{
52+
Environment.CheckValue(input, nameof(input));
53+
return new BootstrapSamplingTransformer(
54+
Environment,
55+
input,
56+
complement: complement,
57+
seed: seed,
58+
shuffleInput: false,
59+
poolSize: 0);
60+
}
61+
2762
/// <summary>
2863
/// Creates a lazy in-memory cache of <paramref name="input"/>.
64+
/// </summary>
65+
/// <remarks>
2966
/// Caching happens per-column. A column is only cached when it is first accessed.
3067
/// In addition, <paramref name="columnsToPrefetch"/> are considered 'always needed', so all of them
3168
/// will be cached whenever any data is requested.
32-
/// </summary>
69+
/// </remarks>
3370
/// <param name="input">The data view to cache.</param>
3471
/// <param name="columnsToPrefetch">The columns that must be cached whenever anything is cached. Empty array or null
3572
/// is acceptable, it means that all columns are only cached at the first access.</param>

src/Microsoft.ML.Transforms/BootstrapSamplingTransformer.cs renamed to src/Microsoft.ML.Data/Transforms/BootstrapSamplingTransformer.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Transforms
2929
[BestFriend]
3030
internal sealed class BootstrapSamplingTransformer : FilterBase
3131
{
32-
private static class Defaults
32+
internal static class Defaults
3333
{
3434
public const bool Complement = false;
3535
public const bool ShuffleInput = true;

0 commit comments

Comments
 (0)