Remove auto-cache mechanism #1780

Merged
merged 16 commits into from
Dec 6, 2018
67 changes: 66 additions & 1 deletion docs/code/MlNetCookBook.md
@@ -443,10 +443,24 @@ var reader = mlContext.Data.TextReader(ctx => (
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var trainData = reader.Read(trainDataPath);

// Sometimes, caching the data in memory after its first access can save loading time when the data is
// used several times. The caching mechanism is also lazy: it only caches data once it has been accessed.
// You could replace all subsequent uses of "trainData" with "cachedTrainData". We keep "trainData" here
// because a caching step, which provides the same functionality, will be inserted into the
// "learningPipeline" below.
var cachedTrainData = trainData.Cache();

// Step two: define the learning pipeline.

// We 'start' the pipeline with the output of the reader.
var learningPipeline = reader.MakeNewEstimator()
// We add a step for caching data in memory so that the downstream iterative training
// algorithm can efficiently scan through the data multiple times. Otherwise, the following
// trainer would read the data from disk on every pass. The caching mechanism uses an on-demand
// strategy: data accessed by any downstream step is cached from its first use onward. In general,
// you only need to add a caching step before a trainable step, because caching is not helpful if
// the data is scanned only once. This step can be removed if you don't have enough memory to hold
// the whole data set.
.AppendCacheCheckpoint()
// Now we can add any 'training steps' to it. In our case we want to 'normalize' the data (rescale to be
// between -1 and 1 for all examples)
.Append(r => (
@@ -486,13 +500,28 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var trainData = reader.Read(trainDataPath);

// Sometimes, caching the data in memory after its first access can save loading time when the data is
// used several times. The caching mechanism is also lazy: it only caches data once it has been accessed.
// You could replace all subsequent uses of "trainData" with "cachedTrainData". We keep "trainData" here
// because a caching step, which provides the same functionality, will be inserted into the
// "dynamicPipeline" below.
var cachedTrainData = mlContext.Data.Cache(trainData);

// Step two: define the learning pipeline.

// We 'start' the pipeline with the output of the reader.
var dynamicPipeline =
// First 'normalize' the data (rescale to be
// between -1 and 1 for all examples)
mlContext.Transforms.Normalize("FeatureVector")
// We add a step for caching data in memory so that the downstream iterative training
// algorithm can efficiently scan through the data multiple times. Otherwise, the following
// trainer would read the data from disk on every pass. The caching mechanism uses an on-demand
// strategy: data accessed by any downstream step is cached from its first use onward. In general,
// you only need to add a caching step before a trainable step, because caching is not helpful if
// the data is scanned only once. This step can be removed if you don't have enough memory to hold
// the whole data set. Notice that the upstream Transforms.Normalize step scans through the data
// only once, so adding a caching step before it would not help.
.AppendCacheCheckpoint(mlContext)
// Add the SDCA regression trainer.
.Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector"));

@@ -595,6 +624,13 @@ var learningPipeline = reader.MakeNewEstimator()
r.Label,
// Concatenate all the features together into one column 'Features'.
Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth)))
// We add a step for caching data in memory so that the downstream iterative training
// algorithm can efficiently scan through the data multiple times. Otherwise, the following
// trainer would read the data from disk on every pass. The caching mechanism uses an on-demand
// strategy: data accessed by any downstream step is cached from its first use onward. In general,
// you only need to add a caching step before a trainable step, because caching is not helpful if
// the data is scanned only once.
.AppendCacheCheckpoint()
.Append(r => (
r.Label,
// Train the multi-class SDCA model to predict the label using features.
@@ -640,6 +676,8 @@ var dynamicPipeline =
mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
// Note that the label is text, so it needs to be converted to key.
.Append(mlContext.Transforms.Categorical.MapValueToKey("Label"), TransformerScope.TrainTest)
// Cache data in memory for the steps that follow this caching checkpoint.
.AppendCacheCheckpoint(mlContext)
// Use the multi-class SDCA model to predict the label using features.
.Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent())
// Apply the inverse conversion from 'PredictedLabel' column back to string value.
@@ -741,6 +779,7 @@ var trainData = mlContext.CreateStreamingDataView(churnData);

var dynamicLearningPipeline = mlContext.Transforms.Categorical.OneHotEncoding("DemographicCategory")
.Append(mlContext.Transforms.Concatenate("Features", "DemographicCategory", "LastVisits"))
.AppendCacheCheckpoint(mlContext) // FastTree will benefit from caching data in memory.
.Append(mlContext.BinaryClassification.Trainers.FastTree("HasChurned", "Features", numTrees: 20));

var dynamicModel = dynamicLearningPipeline.Fit(trainData);
@@ -757,6 +796,7 @@ var staticLearningPipeline = staticData.MakeNewEstimator()
.Append(r => (
r.HasChurned,
Features: r.DemographicCategory.OneHotEncoding().ConcatWith(r.LastVisits)))
.AppendCacheCheckpoint() // FastTree will benefit from caching data in memory.
.Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.HasChurned, r.Features, numTrees: 20));

var staticModel = staticLearningPipeline.Fit(staticData);
@@ -813,6 +853,8 @@ var learningPipeline = reader.MakeNewEstimator()
// When the normalizer is trained, the below delegate is going to be called.
// We use it to memorize the scales.
onFit: (scales, offsets) => normScales = scales)))
// Cache the data in memory because the subsequent trainer needs to access it multiple times.
.AppendCacheCheckpoint()
.Append(r => (
r.Label,
// Train the multi-class SDCA model to predict the label using features.
@@ -987,6 +1029,10 @@ var catColumns = data.GetColumn(r => r.CategoricalFeatures).Take(10).ToArray();

// Build several alternative featurization pipelines.
var learningPipeline = reader.MakeNewEstimator()
// Cache data in memory in an on-demand manner. Columns used in any downstream step are
// cached in memory at their first use. This step can be removed if your machine doesn't
// have enough memory.
.AppendCacheCheckpoint()
.Append(r => (
r.Label,
r.NumericalFeatures,
@@ -1070,6 +1116,9 @@ var workclasses = transformedData.GetColumn<float[]>(mlContext, "WorkclassOneHot
var fullLearningPipeline = dynamicPipeline
// Concatenate two of the 3 categorical pipelines, and the numeric features.
.Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoricalBag", "WorkclassOneHotTrimmed"))
// Cache data in memory so that the following trainer will be able to access training examples without
// reading them from disk multiple times.
.AppendCacheCheckpoint(mlContext)
// Now we're ready to train. We chose our FastTree trainer for this classification task.
.Append(mlContext.BinaryClassification.Trainers.FastTree(numTrees: 50));

@@ -1121,6 +1170,10 @@ var messageTexts = data.GetColumn(x => x.Message).Take(20).ToArray();

// Apply various kinds of text operations supported by ML.NET.
var learningPipeline = reader.MakeNewEstimator()
// Cache data in memory in an on-demand manner. Columns used in any downstream step are
// cached in memory at their first use. This step can be removed if your machine doesn't
// have enough memory.
.AppendCacheCheckpoint()
.Append(r => (
// One-stop shop to run the full text featurization.
TextFeatures: r.Message.FeaturizeText(),
@@ -1243,6 +1296,9 @@ var learningPipeline = reader.MakeNewEstimator()
Label: r.Label.ToKey(),
// Concatenate all the features together into one column 'Features'.
Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth)))
// Add a step for caching data in memory so that the downstream iterative training
// algorithm can efficiently scan through the data multiple times.
.AppendCacheCheckpoint()
.Append(r => (
r.Label,
// Train the multi-class SDCA model to predict the label using features.
@@ -1298,6 +1354,10 @@ var dynamicPipeline =
mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
// Note that the label is text, so it needs to be converted to key.
.Append(mlContext.Transforms.Conversions.MapValueToKey("Label"), TransformerScope.TrainTest)
// Cache data in memory so that the SDCA trainer can randomly access training examples without
// reading them from disk multiple times. Data is cached at its first use in any downstream step.
// Notice that unused parts of the data may not be cached.
.AppendCacheCheckpoint(mlContext)
// Use the multi-class SDCA model to predict the label using features.
.Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent());

@@ -1439,6 +1499,7 @@ public static ITransformer TrainModel(MLContext mlContext, IDataView trainData)
Action<InputRow, OutputRow> mapping = (input, output) => output.Label = input.Income > 50000;
// Construct the learning pipeline.
var estimator = mlContext.Transforms.CustomMapping(mapping, null)
.AppendCacheCheckpoint(mlContext)
.Append(mlContext.BinaryClassification.Trainers.FastTree(label: "Label"));

return estimator.Fit(trainData);
@@ -1480,8 +1541,12 @@ public class CustomMappings
var estimator = mlContext.Transforms.CustomMapping<InputRow, OutputRow>(CustomMappings.IncomeMapping, nameof(CustomMappings.IncomeMapping))
.Append(mlContext.BinaryClassification.Trainers.FastTree(label: "Label"));

// If memory allows, we can cache the data in memory to avoid reading it from the file
// each time it is accessed.
var cachedTrainData = mlContext.Data.Cache(trainData);

// Train the model.
-var model = estimator.Fit(trainData);
+var model = estimator.Fit(cachedTrainData);

// Save the model.
using (var fs = File.Create(modelPath))
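For reference, here is a minimal end-to-end sketch combining the two caching idioms documented above. It is a sketch only: the data path, separator, and column layout are invented, and in practice you would typically pick just one of the two idioms.

var mlContext = new MLContext();

// Hypothetical schema: ten numeric feature columns followed by the regression target.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
    Separator = ",",
    Column = new[]
    {
        new TextLoader.Column("FeatureVector", DataKind.R4, new[] { new TextLoader.Range(0, 9) }),
        new TextLoader.Column("Target", DataKind.R4, 10)
    }
});
var trainData = reader.Read("regression-train.csv"); // lazy: nothing is read yet

// Idiom 1: eagerly wrap the loaded data view in an in-memory cache.
var cachedTrainData = mlContext.Data.Cache(trainData);

// Idiom 2: insert a caching checkpoint into the pipeline, after the one-pass
// normalizer and before the multi-pass SDCA trainer.
var pipeline = mlContext.Transforms.Normalize("FeatureVector")
    .AppendCacheCheckpoint(mlContext)
    .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(label: "Target", features: "FeatureVector"));

var model = pipeline.Fit(cachedTrainData);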
9 changes: 8 additions & 1 deletion docs/samples/Microsoft.ML.Samples/Dynamic/SDCA.cs
@@ -38,11 +38,18 @@ public static void SDCA_BinaryClassification()
// Read the data
var data = reader.Read(dataFile);

// ML.NET doesn't cache the data set by default. Therefore, if one reads a data set from a file and accesses it many times,
// it can be slow due to expensive featurization and disk operations. When the data fits into memory, a solution is to cache
// it in memory. Caching is especially helpful for iterative algorithms that need many passes over the data; since SDCA is
// such an algorithm, we cache here. Inserting a cache step directly into a pipeline is also possible; see the construction
// of the pipeline below.
data = mlContext.Data.Cache(data);

// Step 2: Pipeline
// Featurize the text column through the FeaturizeText API.
// Then append a binary classifier, setting the "Label" column as the label of the dataset, and
// the "Features" column produced by FeaturizeText as the features column.
// the "Features" column produced by FeaturizeText as the features column.
var pipeline = mlContext.Transforms.Text.FeaturizeText("SentimentText", "Features")
.AppendCacheCheckpoint(mlContext) // Add a data-cache step within a pipeline.
.Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(labelColumn: "Sentiment", featureColumn: "Features", l2Const: 0.001f));

// Step 3: Run Cross-Validation on this pipeline.
14 changes: 14 additions & 0 deletions src/Microsoft.ML.Data/StaticPipe/DataView.cs
@@ -8,6 +8,7 @@
using Microsoft.ML.StaticPipe.Runtime;
using System.Collections.Generic;
using System;
using System.Linq;

namespace Microsoft.ML.StaticPipe
{
@@ -23,6 +24,19 @@ internal DataView(IHostEnvironment env, IDataView view, StaticSchemaShape shape)
AsDynamic = view;
Shape.Check(Env, AsDynamic.Schema);
}

/// <summary>
/// This function returns a <see cref="DataView{TShape}"/> whose columns are all cached in memory.
/// The returned <see cref="DataView{TShape}"/> is almost identical to the source <see cref="DataView{TShape}"/>;
/// the only differences are cache-related properties.
/// </summary>
public DataView<TShape> Cache()
{
// Generate all column indexes in the source data.
var prefetched = Enumerable.Range(0, AsDynamic.Schema.ColumnCount).ToArray();
// Create a cached version of the source data by caching all columns.
return new DataView<TShape>(Env, new CacheDataView(Env, AsDynamic, prefetched), Shape);
}
}

public static class DataViewExtensions
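A hypothetical usage sketch for the new statically-typed Cache() method above (the file name and column layout are invented for illustration):

var reader = mlContext.Data.TextReader(ctx => (
    Label: ctx.LoadBool(0),
    Features: ctx.LoadFloat(1, 9)));
var data = reader.Read("train.tsv"); // lazy: nothing is read yet

// All columns of 'cachedData' are materialized in memory on first access;
// subsequent passes over the data are served from memory instead of disk.
var cachedData = data.Cache();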
9 changes: 9 additions & 0 deletions src/Microsoft.ML.Data/StaticPipe/Estimator.cs
@@ -77,5 +77,14 @@ string NameMap(PipelineColumn col)
return new Estimator<TInShape, TNewOutShape, ITransformer>(Env, est, _inShape, newOut);
}
}

/// <summary>
/// Cache the data produced by this estimator in memory. This appends an extra caching
/// estimator to this estimator chain and returns the extended estimator.
/// </summary>
public Estimator<TInShape, TOutShape, ITransformer> AppendCacheCheckpoint()
{
return new Estimator<TInShape, TOutShape, ITransformer>(Env, AsDynamic.AppendCacheCheckpoint(Env), _inShape, Shape);
}
}
}
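A sketch of how the new AppendCacheCheckpoint() slots into a statically-typed pipeline, mirroring the churn example from the cookbook changes above (staticData, HasChurned, DemographicCategory, and LastVisits are the names used in that example):

var staticLearningPipeline = staticData.MakeNewEstimator()
    .Append(r => (
        r.HasChurned,
        Features: r.DemographicCategory.OneHotEncoding().ConcatWith(r.LastVisits)))
    // Everything upstream of the checkpoint runs once; the iterative FastTree
    // trainer below then scans the cached rows instead of recomputing them.
    .AppendCacheCheckpoint()
    .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.HasChurned, r.Features, numTrees: 20));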
7 changes: 2 additions & 5 deletions src/Microsoft.ML.Data/Training/TrainerEstimatorBase.cs
@@ -130,11 +130,8 @@ protected virtual void CheckLabelCompatible(SchemaShape.Column labelCol)
protected TTransformer TrainTransformer(IDataView trainSet,
IDataView validationSet = null, IPredictor initPredictor = null)
{
-var cachedTrain = Info.WantCaching ? new CacheDataView(Host, trainSet, prefetch: null) : trainSet;
Contributor:
As requested by @GalOshri in the issue, can we add documentation? Currently the user has no way of knowing whether a specific learner already does its own form of caching, or won't benefit from caching. In line with @GalOshri's request, I think this documentation should be required before making this change.

Contributor:
Let's change the appropriate cookbook samples to illustrate the new pattern with this little caching checkpoint.

In reply to: 237951473 [](ancestors = 237951473)

Contributor @justinormont (Nov 30, 2018):
Yes, updating the example code is a good first step. We should also create a direct list of the components that benefit from caching, along with when they benefit; for instance, "a LinearSVM when the number of iterations is greater than 1".

Another route is perhaps a VS checker that looks at Info.WantCaching and makes recommendations from there? #WontFix

Member Author @wschin (Dec 4, 2018):
A sample and some tests have been modified to use these caching functions. Every caching function now has at least one test. #Resolved

Member Author @wschin (Dec 5, 2018):
I don't think producing such a list is a small task. We need another PR and issue.

In reply to: 237953879 [](ancestors = 237953879)

Member Author @wschin (Dec 5, 2018):
OK, I will do it in the next iteration.

[Update] Done. Please take a look again. Thank you.

In reply to: 237951764 [](ancestors = 237951764,237951473)

Contributor:
Generally, I like to see documentation in the PR. This is all the more true when the user can be surprised by the change and not understand what's different.

Member Author @wschin:
Many usages have been added to the Cookbook.

In reply to: 238949665 [](ancestors = 238949665)

-var cachedValid = Info.WantCaching && validationSet != null ? new CacheDataView(Host, validationSet, prefetch: null) : validationSet;

-var trainRoleMapped = MakeRoles(cachedTrain);
-var validRoleMapped = validationSet == null ? null : MakeRoles(cachedValid);
+var trainRoleMapped = MakeRoles(trainSet);
+var validRoleMapped = validationSet == null ? null : MakeRoles(validationSet);

var pred = TrainModelCore(new TrainContext(trainRoleMapped, validRoleMapped, null, initPredictor));
return MakeTransformer(pred, trainSet.Schema);
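Following the review discussion above about knowing which learners benefit from caching, here is a hedged sketch of how a user might consult the trainer's own hint. It assumes the trainer publicly exposes the TrainerInfo referenced in this diff; the exact surface may differ between versions.

var trainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent();

// TrainerInfo.WantCaching is the same flag the removed auto-cache logic consulted.
if (trainer.Info.WantCaching)
{
    // Iterative learner: cache the data view or add AppendCacheCheckpoint before fitting.
}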
32 changes: 16 additions & 16 deletions test/BaselineOutput/Common/OVA/OVA-CV-iris-out.txt
@@ -21,35 +21,35 @@ Confusion table
PREDICTED || 0 | 1 | 2 | Recall
TRUTH ||========================
0 || 21 | 0 | 0 | 1.0000
-1 || 0 | 22 | 8 | 0.7333
+1 || 0 | 20 | 10 | 0.6667
2 || 0 | 0 | 28 | 1.0000
||========================
-Precision ||1.0000 |1.0000 |0.7778 |
-Accuracy(micro-avg): 0.898734
-Accuracy(macro-avg): 0.911111
-Log-loss: 0.372620
-Log-loss reduction: 65.736556
+Precision ||1.0000 |1.0000 |0.7368 |
+Accuracy(micro-avg): 0.873418
+Accuracy(macro-avg): 0.888889
+Log-loss: 0.393949
+Log-loss reduction: 63.775293

Confusion table
||========================
PREDICTED || 0 | 1 | 2 | Recall
TRUTH ||========================
0 || 29 | 0 | 0 | 1.0000
-1 || 0 | 18 | 2 | 0.9000
+1 || 0 | 19 | 1 | 0.9500
2 || 0 | 0 | 22 | 1.0000
||========================
-Precision ||1.0000 |1.0000 |0.9167 |
-Accuracy(micro-avg): 0.971831
-Accuracy(macro-avg): 0.966667
-Log-loss: 0.357704
-Log-loss reduction: 67.051654
+Precision ||1.0000 |1.0000 |0.9565 |
+Accuracy(micro-avg): 0.985915
+Accuracy(macro-avg): 0.983333
+Log-loss: 0.299620
+Log-loss reduction: 72.401815

OVERALL RESULTS
---------------------------------------
-Accuracy(micro-avg): 0.935283 (0.0365)
-Accuracy(macro-avg): 0.938889 (0.0278)
-Log-loss: 0.365162 (0.0075)
-Log-loss reduction: 66.394105 (0.6575)
+Accuracy(micro-avg): 0.929667 (0.0562)
+Accuracy(macro-avg): 0.936111 (0.0472)
+Log-loss: 0.346785 (0.0472)
+Log-loss reduction: 68.088554 (4.3133)

---------------------------------------
Physical memory usage(MB): %Number%
2 changes: 1 addition & 1 deletion test/BaselineOutput/Common/OVA/OVA-CV-iris-rp.txt
@@ -1,4 +1,4 @@
OVA
Accuracy(micro-avg) Accuracy(macro-avg) Log-loss Log-loss reduction /p Learner Name Train Dataset Test Dataset Results File Run Time Physical Memory Virtual Memory Command Line Settings
-0.935283 0.938889 0.365162 66.3941 AvgPer{lr=0.8} OVA %Data% %Output% 99 0 0 maml.exe CV tr=OVA{p=AvgPer{ lr=0.8 }} threads=- norm=No dout=%Output% data=%Data% seed=1 /p:AvgPer{lr=0.8}
+0.929667 0.936111 0.346785 68.08855 AvgPer{lr=0.8} OVA %Data% %Output% 99 0 0 maml.exe CV tr=OVA{p=AvgPer{ lr=0.8 }} threads=- norm=No dout=%Output% data=%Data% seed=1 /p:AvgPer{lr=0.8}
