Remove auto-cache mechanism #1780


Merged: 16 commits, Dec 6, 2018. Changes from 14 commits.
9 changes: 8 additions & 1 deletion docs/samples/Microsoft.ML.Samples/Dynamic/SDCA.cs
@@ -38,11 +38,18 @@ public static void SDCA_BinaryClassification()
// Read the data
var data = reader.Read(dataFile);

// ML.NET doesn't cache the data set by default. Therefore, if one reads a data set from a file and accesses it many times, it can be slow due to
// expensive featurization and disk operations. When the data can fit into memory, a solution is to cache it in memory. Caching is especially
// helpful when working with iterative algorithms which need many data passes. Since SDCA is such an algorithm, we cache here. Inserting a
// cache step in a pipeline is also possible; please see the construction of the pipeline below.
data = mlContext.Data.Cache(data);

// Step 2: Pipeline
// Featurize the text column through the FeaturizeText API.
// Then append a binary classifier, setting the "Label" column as the label of the dataset, and
// the "Features" column produced by FeaturizeText as the features column.
var pipeline = mlContext.Transforms.Text.FeaturizeText("SentimentText", "Features")
.AppendCacheCheckpoint(mlContext) // Add a data-cache step within a pipeline.
.Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(labelColumn: "Sentiment", featureColumn: "Features", l2Const: 0.001f));

// Step 3: Run Cross-Validation on this pipeline.
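As a companion to the sample above, a hedged sketch of how the cached data and the checkpointed pipeline might feed the cross-validation mentioned in Step 3. The fold count, label column argument, and the exact `CrossValidate` signature are assumptions, not taken from this diff:

```csharp
// Sketch only; assumes the v0.x-era MLContext API shown in this PR.
// "data" is the cached IDataView and "pipeline" the estimator chain from the
// sample above; numFolds and the CrossValidate signature are guesses.
var cvResults = mlContext.BinaryClassification.CrossValidate(
    data, pipeline, numFolds: 5, labelColumn: "Sentiment");

// Because "data" is cached, each fold re-reads featurized rows from memory
// instead of re-parsing and re-featurizing the text file on every pass.
```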
14 changes: 14 additions & 0 deletions src/Microsoft.ML.Data/StaticPipe/DataView.cs
@@ -8,6 +8,7 @@
using Microsoft.ML.StaticPipe.Runtime;
using System.Collections.Generic;
using System;
using System.Linq;

namespace Microsoft.ML.StaticPipe
{
@@ -23,6 +24,19 @@ internal DataView(IHostEnvironment env, IDataView view, StaticSchemaShape shape)
AsDynamic = view;
Shape.Check(Env, AsDynamic.Schema);
}

/// <summary>
/// This function returns a <see cref="DataView{TShape}"/> whose columns are all cached in memory.
/// The returned <see cref="DataView{TShape}"/> is almost identical to the source <see cref="DataView{TShape}"/>;
/// the only differences are cache-related properties.
/// </summary>
public DataView<TShape> Cache()
{
// Generate all column indexes in the source data.
var prefetched = Enumerable.Range(0, AsDynamic.Schema.ColumnCount).ToArray();
// Create a cached version of the source data by caching all columns.
return new DataView<TShape>(Env, new CacheDataView(Env, AsDynamic, prefetched), Shape);
}
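A possible usage of the new statically-typed `Cache()`. This is a hedged sketch: the reader construction and column shape are hypothetical, and only `Cache()` itself comes from this diff:

```csharp
// Hedged sketch of the static-pipe API; the loader schema and file name
// are placeholders.
var mlContext = new MLContext();
var reader = TextLoader.CreateReader(mlContext,
    c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));
var data = reader.Read("data.tsv");

// Materialize all columns into an in-memory cache; downstream consumers that
// make many passes over the data (e.g. SDCA) then avoid repeated disk reads
// and re-featurization.
var cachedData = data.Cache();
```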
}

public static class DataViewExtensions
9 changes: 9 additions & 0 deletions src/Microsoft.ML.Data/StaticPipe/Estimator.cs
@@ -77,5 +77,14 @@ string NameMap(PipelineColumn col)
return new Estimator<TInShape, TNewOutShape, ITransformer>(Env, est, _inShape, newOut);
}
}

/// <summary>
/// Cache the data produced by this estimator in memory. This appends an extra caching
/// estimator to this estimator; the resulting estimator chain is returned.
/// </summary>
public Estimator<TInShape, TOutShape, ITransformer> AppendCacheCheckpoint()
{
return new Estimator<TInShape, TOutShape, ITransformer>(Env, AsDynamic.AppendCacheCheckpoint(Env), _inShape, Shape);
}
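A sketch of the matching static-pipe pattern. The surrounding estimator chain is hypothetical; `AppendCacheCheckpoint()` is the parameterless overload added in this diff:

```csharp
// Hedged sketch; "featurization" stands for any static-pipe estimator chain.
// Everything before the checkpoint is computed once and cached in memory.
var cachedPipeline = featurization.AppendCacheCheckpoint();
// An iterative trainer appended after this point reads each of its many
// passes from the in-memory cache rather than recomputing upstream steps.
```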
}
}
15 changes: 5 additions & 10 deletions src/Microsoft.ML.Data/Training/TrainerEstimatorBase.cs
@@ -132,21 +132,16 @@ protected virtual void CheckLabelCompatible(SchemaShape.Column labelCol)
protected TTransformer TrainTransformer(IDataView trainSet,
IDataView validationSet = null, IPredictor initPredictor = null)
{
var cachedTrain = Info.WantCaching ? new CacheDataView(Host, trainSet, prefetch: null) : trainSet;
Contributor: As requested by @GalOshri in the issue, can we add documentation? Currently the user has no way of knowing whether a specific learner already does its own form of caching, or won't benefit from caching. In line with @GalOshri's request, I think this documentation should be required before making this change.

Contributor: Let's change the appropriate cookbook samples to illustrate the new pattern with this little caching checkpoint thing.

In reply to: 237951473 [](ancestors = 237951473)

@justinormont (Contributor, Nov 30, 2018): Yes, updating the example code is a good first step. And we should create a direct list of the components which benefit from caching, along with when they benefit, for instance: "a LinearSVM when the number of iterations is greater than 1".

Another route is perhaps a VS checker which looks at Info.WantCaching and recommends from there? #WontFix

@wschin (Member, Author, Dec 4, 2018): A sample and some tests were modified to use those caching functions. Every caching function now has at least one test. #Resolved

@wschin (Member, Author, Dec 5, 2018): I don't think having a list is a small task. We need another PR and issue.

In reply to: 237953879 [](ancestors = 237953879)

@wschin (Member, Author, Dec 5, 2018): OK, I will do it in the next iteration.

[Update] Done. Please take a look again. Thank you.

In reply to: 237951764 [](ancestors = 237951764,237951473)

Contributor: Generally, I like to see documentation in the PR. This is especially true when the user can be surprised by the change and not understand what's different.

@wschin (Member, Author): Many uses are added to the Cookbook.

In reply to: 238949665 [](ancestors = 238949665)

var trainRoleMapped = MakeRoles(trainSet);

var trainRoles = MakeRoles(cachedTrain);

RoleMappedData validRoles;
RoleMappedData validRoleMapped;

if (validationSet == null)
validRoles = null;
validRoleMapped = null;
else
{
var cachedValid = Info.WantCaching ? new CacheDataView(Host, validationSet, prefetch: null) : validationSet;
validRoles = MakeRoles(cachedValid);
}
validRoleMapped = MakeRoles(validationSet);
@Ivanidzo4ka (Contributor, Dec 4, 2018): Just set it to null as default and change it only if the validation set is != null. #Resolved


var pred = TrainModelCore(new TrainContext(trainRoles, validRoles, null, initPredictor));
var pred = TrainModelCore(new TrainContext(trainRoleMapped, validRoleMapped, null, initPredictor));
return MakeTransformer(pred, trainSet.Schema);
}
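The behavioral change above, removing the implicit `Info.WantCaching` wrapping, means callers now opt in to caching explicitly. A hedged before/after sketch; variable names are placeholders:

```csharp
// Before this PR, TrainTransformer cached implicitly (simplified):
//   var cachedTrain = Info.WantCaching
//       ? new CacheDataView(Host, trainSet, prefetch: null)
//       : trainSet;
// i.e. caching happened silently whenever the trainer asked for it.

// After this PR, the user decides. "trainSet" and "trainer" are placeholders
// for any IDataView and any trainer estimator:
var cachedTrain = mlContext.Data.Cache(trainSet);
var model = trainer.Fit(cachedTrain);
```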

32 changes: 16 additions & 16 deletions test/BaselineOutput/Common/OVA/OVA-CV-iris-out.txt
@@ -21,35 +21,35 @@ Confusion table
PREDICTED || 0 | 1 | 2 | Recall
TRUTH ||========================
0 || 21 | 0 | 0 | 1.0000
1 || 0 | 22 | 8 | 0.7333
1 || 0 | 20 | 10 | 0.6667
2 || 0 | 0 | 28 | 1.0000
||========================
Precision ||1.0000 |1.0000 |0.7778 |
Accuracy(micro-avg): 0.898734
Accuracy(macro-avg): 0.911111
Log-loss: 0.372620
Log-loss reduction: 65.736556
Precision ||1.0000 |1.0000 |0.7368 |
Accuracy(micro-avg): 0.873418
Accuracy(macro-avg): 0.888889
Log-loss: 0.393949
Log-loss reduction: 63.775293

Confusion table
||========================
PREDICTED || 0 | 1 | 2 | Recall
TRUTH ||========================
0 || 29 | 0 | 0 | 1.0000
1 || 0 | 18 | 2 | 0.9000
1 || 0 | 19 | 1 | 0.9500
2 || 0 | 0 | 22 | 1.0000
||========================
Precision ||1.0000 |1.0000 |0.9167 |
Accuracy(micro-avg): 0.971831
Accuracy(macro-avg): 0.966667
Log-loss: 0.357704
Log-loss reduction: 67.051654
Precision ||1.0000 |1.0000 |0.9565 |
Accuracy(micro-avg): 0.985915
Accuracy(macro-avg): 0.983333
Log-loss: 0.299620
Log-loss reduction: 72.401815

OVERALL RESULTS
---------------------------------------
Accuracy(micro-avg): 0.935283 (0.0365)
Accuracy(macro-avg): 0.938889 (0.0278)
Log-loss: 0.365162 (0.0075)
Log-loss reduction: 66.394105 (0.6575)
Accuracy(micro-avg): 0.929667 (0.0562)
Accuracy(macro-avg): 0.936111 (0.0472)
Log-loss: 0.346785 (0.0472)
Log-loss reduction: 68.088554 (4.3133)

---------------------------------------
Physical memory usage(MB): %Number%
2 changes: 1 addition & 1 deletion test/BaselineOutput/Common/OVA/OVA-CV-iris-rp.txt
@@ -1,4 +1,4 @@
OVA
Accuracy(micro-avg) Accuracy(macro-avg) Log-loss Log-loss reduction /p Learner Name Train Dataset Test Dataset Results File Run Time Physical Memory Virtual Memory Command Line Settings
0.935283 0.938889 0.365162 66.3941 AvgPer{lr=0.8} OVA %Data% %Output% 99 0 0 maml.exe CV tr=OVA{p=AvgPer{ lr=0.8 }} threads=- norm=No dout=%Output% data=%Data% seed=1 /p:AvgPer{lr=0.8}
0.929667 0.936111 0.346785 68.08855 AvgPer{lr=0.8} OVA %Data% %Output% 99 0 0 maml.exe CV tr=OVA{p=AvgPer{ lr=0.8 }} threads=- norm=No dout=%Output% data=%Data% seed=1 /p:AvgPer{lr=0.8}
