Adding a catalog entry for the bootstrap sample (#2387)

rogancarr · web-flow · commit 7d592b757389 · 2019-02-05T10:37:11.000-08:00
Adding a catalog entry and sample for the bootstrap sample.
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs
@@ -0,0 +1,84 @@
+﻿using System;
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class BootstrapSample
+    {
+        public static void Example()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging, 
+            // as a catalog of available operations and as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Get a small dataset as an IEnumerable and then read it as ML.NET's data type.
+            IEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample> enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(5);
+            var data = mlContext.Data.ReadFromEnumerable(enumerableOfData);
+
+            // Look at the original dataset
+            Console.WriteLine($"Label\tFeatures[0]");
+            foreach (var row in enumerableOfData)
+            {
+                Console.WriteLine($"{row.Label}\t{row.Features[0]}");
+            }
+            Console.WriteLine();
+            // Expected output:
+            //  Label Features[0]
+            //  True    1.017325
+            //  False   0.6326591
+            //  False   0.0326252
+            //  True    0.8426974
+            //  True    0.9947656
+
+            // Now take a bootstrap sample of this dataset to create a new dataset. The bootstrap is a resampling technique that
+            // creates a training set of the same size by picking with replacement from the original dataset. With the bootstrap, 
+            // we expect that the resampled dataset will contain about 63% of the distinct rows of the original dataset (i.e. 1-e^-1), with some
+            // rows represented more than once.
+            // BootstrapSample is a streaming implementation of the bootstrap that enables sampling from a dataset too large to hold in memory.
+            // To enable streaming, BootstrapSample approximates the bootstrap by sampling each row according to a Poisson(1) distribution.
+            // Note that this streaming approximation treats each row independently, thus the resampled dataset is not guaranteed to be the 
+            // same length as the input dataset.
+            // Let's take a look at the behavior of the BootstrapSample by examining a few draws:
+            for (int i = 0; i < 3; i++)
+            {
+                var resample = mlContext.Data.BootstrapSample(data, seed: (uint) i);
+
+                var enumerable = mlContext.CreateEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample>(resample, reuseRowObject: false);
+                Console.WriteLine($"Label\tFeatures[0]");
+                foreach (var row in enumerable)
+                {
+                    Console.WriteLine($"{row.Label}\t{row.Features[0]}");
+                }
+                Console.WriteLine();
+            }
+            // Expected output:
+            //  Label Features[0]
+            //  True    1.017325
+            //  False   0.6326591
+            //  False   0.6326591
+            //  False   0.6326591
+            //  False   0.0326252
+            //  False   0.0326252
+            //  True    0.8426974
+            //  True    0.8426974
+
+            //  Label Features[0]
+            //  True    1.017325
+            //  True    1.017325
+            //  False   0.6326591
+            //  False   0.6326591
+            //  False   0.0326252
+            //  False   0.0326252
+            //  False   0.0326252
+            //  True    0.9947656
+
+            //  Label Features[0]
+            //  False   0.6326591
+            //  False   0.0326252
+            //  True    0.8426974
+            //  True    0.8426974
+            //  True    0.8426974
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
@@ -24,12 +24,49 @@ internal DataOperationsCatalog(IHostEnvironment env)
             Environment = env;
         }
 
+        /// <summary>
+        /// Take an approximate bootstrap sample of <paramref name="input"/>.
+        /// </summary>
+        /// <remarks>
+        /// This sampler is a streaming version of <a href="https://en.wikipedia.org/wiki/Bootstrapping_(statistics)">bootstrap resampling</a>.
+        /// Instead of taking the whole dataset into memory and resampling, <see cref="BootstrapSample"/> streams through the dataset and
+        /// uses a <a href="https://en.wikipedia.org/wiki/Poisson_distribution">Poisson</a>(1) distribution to select the number of times a
+        /// given row will be added to the sample. The <paramref name="complement"/> parameter allows for the creation of a bootstrap sample
+        /// and complementary out-of-bag sample by using the same <paramref name="seed"/>.
+        /// </remarks>
+        /// <param name="input">The input data.</param>
+        /// <param name="seed">The random seed. If unspecified, the random state will instead be derived from the <see cref="MLContext"/>.</param>
+        /// <param name="complement">Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.
+        /// Can be used to create a complementary pair of samples by using the same seed.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[BootstrapSample](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs)]
+        /// ]]>
+        /// </format>
+        /// </example>
+        public IDataView BootstrapSample(IDataView input,
+            uint? seed = null,
+            bool complement = BootstrapSamplingTransformer.Defaults.Complement)
+        {
+            Environment.CheckValue(input, nameof(input));
+            return new BootstrapSamplingTransformer(
+                Environment,
+                input,
+                complement: complement,
+                seed: seed,
+                shuffleInput: false,
+                poolSize: 0);
+        }
+
         /// <summary>
         /// Creates a lazy in-memory cache of <paramref name="input"/>.
+        /// </summary>
+        /// <remarks>
         /// Caching happens per-column. A column is only cached when it is first accessed.
         /// In addition, <paramref name="columnsToPrefetch"/> are considered 'always needed', so all of them
         /// will be cached whenever any data is requested.
-        /// </summary>
+        /// </remarks>
         /// <param name="input">The data view to cache.</param>
         /// <param name="columnsToPrefetch">The columns that must be cached whenever anything is cached. Empty array or null
         /// is acceptable, it means that all columns are only cached at the first access.</param>
diff --git a/src/Microsoft.ML.Data/Transforms/BootstrapSamplingTransformer.cs b/src/Microsoft.ML.Data/Transforms/BootstrapSamplingTransformer.cs
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Transforms
     [BestFriend]
     internal sealed class BootstrapSamplingTransformer : FilterBase
     {
-        private static class Defaults
+        internal static class Defaults
         {
             public const bool Complement = false;
             public const bool ShuffleInput = true;

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ namespace Microsoft.ML.Transforms`
`29`	`29`	`[BestFriend]`
`30`	`30`	`internal sealed class BootstrapSamplingTransformer : FilterBase`
`31`	`31`	`{`
`32`		`- private static class Defaults`
	`32`	`+ internal static class Defaults`
`33`	`33`	`{`
`34`	`34`	`public const bool Complement = false;`
`35`	`35`	`public const bool ShuffleInput = true;`