dotnet#202: migrate Clustering_Iris F# sample to v0.9

mariuszwojcik · mariuszwojcik · commit 44d470f04b60 · 2019-01-20T12:31:41.000Z
diff --git a/samples/fsharp/getting-started/Clustering_Iris/IrisClustering/IrisClusteringConsoleApp/Clustering_Iris.fsproj b/samples/fsharp/getting-started/Clustering_Iris/IrisClustering/IrisClusteringConsoleApp/Clustering_Iris.fsproj
@@ -6,10 +6,10 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <Compile Include="..\..\..\..\common\ConsoleHelper.fs" Link="Common\ConsoleHelper.fs" />
-    <Compile Include="..\..\..\..\common\ModelBuilder.fs" Link="Common\ModelBuilder.fs" />
-    <Compile Include="..\..\..\..\common\ModelScorer.fs" Link="Common\ModelScorer.fs" />
-    <Compile Include="..\..\..\..\common\Pipeline.fs" Link="Common\Pipeline.fs" />
+    <Compile Include="..\..\..\..\common_v0.9\ConsoleHelper.fs" Link="Common\ConsoleHelper.fs" />
+    <Compile Include="..\..\..\..\common_v0.9\ModelBuilder.fs" Link="Common\ModelBuilder.fs" />
+    <Compile Include="..\..\..\..\common_v0.9\ModelScorer.fs" Link="Common\ModelScorer.fs" />
+    <Compile Include="..\..\..\..\common_v0.9\Pipeline.fs" Link="Common\Pipeline.fs" />
   </ItemGroup>
 
   <ItemGroup>
@@ -18,7 +18,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.ML" Version="0.7.0" />
+    <PackageReference Include="Microsoft.ML" Version="$(MicrosoftMLVersion)" />
   </ItemGroup>
 
   <ItemGroup>
diff --git a/samples/fsharp/getting-started/Clustering_Iris/IrisClustering/IrisClusteringConsoleApp/DataStructures/DataStructures.fs b/samples/fsharp/getting-started/Clustering_Iris/IrisClustering/IrisClusteringConsoleApp/DataStructures/DataStructures.fs
@@ -1,7 +1,7 @@
 ﻿namespace Clustering_Iris.DataStructures
 
 module DataStructures =
-    open Microsoft.ML.Runtime.Api
+    open Microsoft.ML.Data
 
     /// Describes Iris flower. Used as an input to prediction function.
     [<CLIMutable>]
diff --git a/samples/fsharp/getting-started/Clustering_Iris/IrisClustering/IrisClusteringConsoleApp/Program.fs b/samples/fsharp/getting-started/Clustering_Iris/IrisClustering/IrisClusteringConsoleApp/Program.fs
@@ -3,7 +3,7 @@
 open System
 open System.IO
 open Microsoft.ML
-open Microsoft.ML.Runtime.Data
+open Microsoft.ML.Data
 open Clustering_Iris.DataStructures
 open DataStructures
 
@@ -14,12 +14,19 @@ let dataPath = sprintf @"%s/iris-full.txt" baseDatasetsLocation
 let baseModelsPath = @"../../../../MLModels"
 let modelPath = sprintf @"%s/IrisModel.zip" baseModelsPath
 
-let dataLoader (mlContext : MLContext) =
-    mlContext.Data.TextReader(
-        TextLoader.Arguments(
-            Separator = "\t",
-            HasHeader = true,
-            Column = 
+
+[<EntryPoint>]
+let main argv =
+
+    //Create the MLContext to share across components for deterministic results
+    let mlContext = MLContext(seed = Nullable 1)    //Seed set to any number so you have a deterministic environment
+
+    // STEP 1: Common data loading configuration
+    let textLoader = 
+        mlContext.Data.CreateTextReader(
+            hasHeader = true,
+            separatorChar = '\t',
+            columns =
                 [|
                     TextLoader.Column("Label", Nullable DataKind.R4, 0)
                     TextLoader.Column("SepalLength", Nullable DataKind.R4, 1)
@@ -28,35 +35,22 @@ let dataLoader (mlContext : MLContext) =
                     TextLoader.Column("PetalWidth", Nullable DataKind.R4, 4)
                 |]
         )
-    )
-
-let read (dataPath : string) (dataLoader : TextLoader) =
-    dataLoader.Read dataPath
-
-
-[<EntryPoint>]
-let main argv =
-
-    let mlContext = MLContext(seed = Nullable 1)
-
-    //STEP 1: Common data loading
-    let fullData = 
-        dataLoader mlContext
-        |> read dataPath
 
-    let struct (trainingDataView, testingDataView) = mlContext.Clustering.TrainTestSplit(fullData, testFraction = 0.2)
+    let fullData = textLoader.Read dataPath
+    
+    //Split dataset in two parts: TrainingDataset (80%) and TestDataset (20%)
+    let struct(trainingDataView, testingDataView) = mlContext.Clustering.TrainTestSplit(fullData, testFraction = 0.2)
 
     //STEP 2: Process data transformations in pipeline
-    let dataProcessPipeline =
-        mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
+    let dataProcessPipeline = mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
 
     // (Optional) Peek data in training DataView after applying the ProcessPipeline's transformations  
     Common.ConsoleHelper.peekDataViewInConsole<IrisData> mlContext trainingDataView dataProcessPipeline 10 |> ignore
     Common.ConsoleHelper.peekVectorColumnDataInConsole mlContext "Features" trainingDataView dataProcessPipeline 10 |> ignore
 
+    // STEP 3: Create and train the model     
     let trainer = mlContext.Clustering.Trainers.KMeans(features = "Features", clustersCount = 3)
 
-    // STEP 3: Create and train the model                
     let modelBuilder = 
         Common.ModelBuilder.create mlContext dataProcessPipeline
         |> Common.ModelBuilder.addTrainer trainer
diff --git a/samples/fsharp/getting-started/Clustering_Iris/README.md b/samples/fsharp/getting-started/Clustering_Iris/README.md
@@ -1,8 +1,8 @@
-# Clustering Iris flowers (F#)
+# Clustering Iris Data
 
 | ML.NET version | API type          | Status                        | App Type    | Data type | Scenario            | ML Task                   | Algorithms                  |
 |----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------|
-| v0.7           | Dynamic API | README.md needs update | Console app | .txt file | Clustering Iris flowers | Clustering | K-means++ |
+| v0.9           | Dynamic API | Up-to-date | Console app | .txt file | Clustering Iris flowers | Clustering | K-means++ |
 
 In this introductory sample, you'll see how to use [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) to divide iris flowers into different groups that correspond to different types of iris. In the world of machine learning, this task is known as **clustering**.
 
@@ -30,59 +30,73 @@ To solve this problem, first we will build and train an ML model. Then we will u
 
 ### 1. Build model
 
-Building a model includes: uploading data (`iris-full.txt` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `ConcatEstimator`), and choosing a learning algorithm (`KMeansPlusPlusTrainer`). All of those steps are stored in a `EstimatorChain`:
+Building a model includes: loading data (`iris-full.txt` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `Concatenate`), and choosing a learning algorithm (`KMeans`). The transformation and the trainer are then combined in `modelBuilder`:
+
 ```fsharp
-	// LearningPipeline holds all steps of the learning process: data, transforms, learners.
+    // STEP 1: Common data loading configuration
+    let textLoader = 
+        mlContext.Data.CreateTextReader(
+            hasHeader = true,
+            separatorChar = '\t',
+            columns =
+                [|
+                    TextLoader.Column("Label", Nullable DataKind.R4, 0)
+                    TextLoader.Column("SepalLength", Nullable DataKind.R4, 1)
+                    TextLoader.Column("SepalWidth", Nullable DataKind.R4, 2)
+                    TextLoader.Column("PetalLength", Nullable DataKind.R4, 3)
+                    TextLoader.Column("PetalWidth", Nullable DataKind.R4, 4)
+                |]
+        )
+
+    let fullData = textLoader.Read dataPath
     
-	//1. Create ML.NET context/environment
-    use env = new LocalEnvironment()
-
-    //2. Create DataReader with data schema mapped to file's columns
-    let reader = 
-        TextLoader(
-            env, 
-            TextLoader.Arguments(
-                Separator = "tab", 
-                HasHeader = true, 
-                Column = 
-                    [|
-                        TextLoader.Column("Label", Nullable DataKind.R4, 0)
-                        TextLoader.Column("SepalLength", Nullable DataKind.R4, 1)
-                        TextLoader.Column("SepalWidth", Nullable DataKind.R4, 2)
-                        TextLoader.Column("PetalLength", Nullable DataKind.R4, 3)
-                        TextLoader.Column("PetalWidth", Nullable DataKind.R4, 4)
-                    |]
-                )
-            )
-
-    //Load training data
-    let trainingDataView = MultiFileSource(DataPath) |> reader.Read
+    //Split dataset in two parts: TrainingDataset (80%) and TestDataset (20%)
+    let struct(trainingDataView, testingDataView) = mlContext.Clustering.TrainTestSplit(fullData, testFraction = 0.2)
+
+    //STEP 2: Process data transformations in pipeline
+    let dataProcessPipeline = mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
+
+    // (Optional) Peek data in training DataView after applying the ProcessPipeline's transformations  
+    Common.ConsoleHelper.peekDataViewInConsole<IrisData> mlContext trainingDataView dataProcessPipeline 10 |> ignore
+    Common.ConsoleHelper.peekVectorColumnDataInConsole mlContext "Features" trainingDataView dataProcessPipeline 10 |> ignore
+
+    // STEP 3: Create and train the model     
+    let trainer = mlContext.Clustering.Trainers.KMeans(features = "Features", clustersCount = 3)
+
+    let modelBuilder = 
+        Common.ModelBuilder.create mlContext dataProcessPipeline
+        |> Common.ModelBuilder.addTrainer trainer
+
+    let trainedModel = 
+        modelBuilder
+        |> Common.ModelBuilder.train trainingDataView
 ```
+
 ### 2. Train model
-Training the model is a process of running the chosen algorithm on the given data. It is implemented in the `Fit()` method from the Estimator object. To perform training we just call the method and provide our data.
-```fsharp
-    let model = 
-        env
-        |> Pipeline.concatEstimator "Features" [| "SepalLength"; "SepalWidth"; "PetalLength"; "PetalWidth" |]
-        |> Pipeline.append (KMeansPlusPlusTrainer(env, "Features", clustersCount = 3))
-        |> Pipeline.fit trainingDataView
+Training the model is a process of running the chosen algorithm on the given data. To perform training you need to call the `Fit()` method; this sample starts training with the `Common.ModelBuilder.train` helper:
 
+```fsharp
+    let trainedModel = 
+        modelBuilder
+        |> Common.ModelBuilder.train trainingDataView
 ```
 ### 3. Consume model
 After the model is build and trained, we can use the `Predict()` API to predict the cluster for an iris flower and calculate the distance from given flower parameters to each cluster (each centroid of a cluster).
 
 ```fsharp
-    let sampleIrisData = 
-        { 
+    let sampleIrisData =
+        {
             SepalLength = 3.3f
             SepalWidth = 1.6f
             PetalLength = 0.2f
-            PetalWidth = 5.1f 
+            PetalWidth = 5.1f
         }
 
-    let predictionFunc = loadedModel.MakePredictionFunction<IrisData, IrisPrediction> env
-    let prediction = predictionFunc.Predict sampleIrisData
+    //Load the saved model from the zip file and predict the cluster for the sample flower
+    let prediction = 
+        Common.ModelScorer.create mlContext
+        |> Common.ModelScorer.loadModelFromZipFile modelPath
+        |> Common.ModelScorer.predictSingle sampleIrisData
 
-    printfn "Clusters assigned for setosa flowers: %d" prediction.SelectedClusterId
-```
+    printfn "Cluster assigned for setosa flowers: %d" prediction.SelectedClusterId
 ```
diff --git a/samples/fsharp/v0.9-All-Samples.sln b/samples/fsharp/v0.9-All-Samples.sln
@@ -19,6 +19,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TaxiFarePrediction.Solution
 EndProject
 Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "TaxiFarePrediction", "getting-started\Regression_TaxiFarePrediction\TaxiFarePrediction\TaxiFarePredictionConsoleApp\TaxiFarePrediction.fsproj", "{2865D3B8-753F-4B61-B452-D5A3D032F9BB}"
 EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "IrisClustering.Solution", "IrisClustering.Solution", "{68E5A791-705C-464D-A2AC-A30F0C452A54}"
+EndProject
+Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "Clustering_Iris", "getting-started\Clustering_Iris\IrisClustering\IrisClusteringConsoleApp\Clustering_Iris.fsproj", "{23F5ADD0-DCF0-494B-9AE7-4754EC9A23C9}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -41,6 +45,10 @@ Global
 		{2865D3B8-753F-4B61-B452-D5A3D032F9BB}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{2865D3B8-753F-4B61-B452-D5A3D032F9BB}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{2865D3B8-753F-4B61-B452-D5A3D032F9BB}.Release|Any CPU.Build.0 = Release|Any CPU
+		{23F5ADD0-DCF0-494B-9AE7-4754EC9A23C9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{23F5ADD0-DCF0-494B-9AE7-4754EC9A23C9}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{23F5ADD0-DCF0-494B-9AE7-4754EC9A23C9}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{23F5ADD0-DCF0-494B-9AE7-4754EC9A23C9}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -50,6 +58,7 @@ Global
 		{AC2541F6-ADDF-4B9A-8216-04900767881E} = {D04EC2CC-4F1B-41B2-AFC6-E406FEA0412E}
 		{CF0F35E4-4A8B-4E9A-A284-B791E404D334} = {ABE9B87E-D778-4F97-B38D-62B4A186A6E5}
 		{2865D3B8-753F-4B61-B452-D5A3D032F9BB} = {740E8A4B-A0C2-4FE2-8561-6F46EA10BDCA}
+		{23F5ADD0-DCF0-494B-9AE7-4754EC9A23C9} = {68E5A791-705C-464D-A2AC-A30F0C452A54}
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {47D66D3A-D6C7-45A5-8C11-8723039BC142}