From 7557a835d19a1465c47f4f03d380c980b1d31825 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 6 Feb 2019 14:57:34 -0800 Subject: [PATCH 1/7] Updating docstrings --- .../Dynamic/Transforms/SelectColumns.cs | 41 +++++++++++-------- .../Transforms/ExtensionsCatalog.cs | 33 ++++++++++++--- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs index 2567f036f9..1309b73739 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs @@ -1,5 +1,4 @@ using System; -using System.Collections.Generic; using Microsoft.ML.Data; namespace Microsoft.ML.Samples.Dynamic @@ -13,25 +12,31 @@ public static void Example() var mlContext = new MLContext(); // Get a small dataset as an IEnumerable and them read it as ML.NET's data type. - IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); - var trainData = mlContext.Data.ReadFromEnumerable(data); + var enumerableData = SamplesUtils.DatasetUtils.GetInfertData(); + var data = mlContext.Data.ReadFromEnumerable(enumerableData); - // Preview of the data. - // - // Age Case Education induced parity pooled.stratum row_num ... - // 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ... - // 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ... - // 39.0 1.0 0-5yrs 2.0 6.0 4.0 3.0 ... - // 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ... - // 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ... + // Before transformation, take a look at the dataset + Console.WriteLine($"Age\tCase\tEducation\tInduced\tParity\tPooledStratum"); + foreach (var row in enumerableData) + { + Console.WriteLine($"{row.Age}\t{row.Case}\t{row.Education}\t{row.Induced}\t{row.Parity}\t{row.PooledStratum}"); + } + Console.WriteLine(); + // Expected output: + // Age Case Education Induced Parity PooledStratum + // 26 1 0 - 5yrs 1 6 3 + // 42 1 0 - 5yrs 1 1 1 + // 39 1 12 + yrs 2 6 4 + // 34 1 0 - 5yrs 2 4 2 + // 35 1 6 - 11yrs 1 3 32 // Select a subset of columns to keep. - var pipeline = mlContext.Transforms.SelectColumns(new string[] { "Age", "Education" }); + var pipeline = mlContext.Transforms.SelectColumns("Age", "Education"); // Now we can transform the data and look at the output to confirm the behavior of CopyColumns. // Don't forget that this operation doesn't actually evaluate data until we read the data below, // as transformations are lazy in ML.NET. - var transformedData = pipeline.Fit(trainData).Transform(trainData); + var transformedData = pipeline.Fit(data).Transform(data); // Print the number of columns in the schema Console.WriteLine($"There are {transformedData.Schema.Count} columns in the dataset."); @@ -51,11 +56,11 @@ public static void Example() // Expected output: // Age and Education columns obtained post-transformation. - // Age: 26 Education: 0 - 5yrs - // Age: 42 Education: 0 - 5yrs - // Age: 39 Education: 0 - 5yrs - // Age: 34 Education: 0 - 5yrs - // Age: 35 Education: 6 - 11yrs + // Age: 26 Education: 0-5yrs + // Age: 42 Education: 0-5yrs + // Age: 39 Education: 12+yrs + // Age: 34 Education: 0-5yrs + // Age: 35 Education: 6-11yrs } private class SampleInfertDataTransformed diff --git a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs index 80f9a67f10..618cb1c7ef 100644 --- a/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ExtensionsCatalog.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.Data.DataView; using Microsoft.ML.Data; using Microsoft.ML.Transforms; @@ -67,9 +68,9 @@ public static ColumnConcatenatingEstimator Concatenate(this TransformsCatalog ca /// /// is commonly used to remove unwanted columns from the schema if the dataset is going to be serialized or /// written out to a file. It is not actually necessary to drop unused columns before training or - /// performing transforms, as IDataView's lazy evaluation won't actually materialize those columns. + /// performing transforms, as 's lazy evaluation won't actually materialize those columns. /// In the case of serialization, every column in the schema will be written out. If you have columns - /// that you don't want to save, you can use DropColumns to remove them from the schema. + /// that you don't want to save, you can use to remove them from the schema. /// /// The transform's catalog. /// The array of column names to drop. @@ -84,11 +85,11 @@ public static ColumnSelectingEstimator DropColumns(this TransformsCatalog catalo => ColumnSelectingEstimator.DropColumns(CatalogUtils.GetEnvironment(catalog), columnsToDrop); /// - /// ColumnSelectingEstimator is used to select a list of columns that user wants to keep from a given input. + /// Select a list of columns to keep in a given . /// /// /// - /// operates on the schema of an input IDataView, + /// operates on the schema of an input , /// either dropping unselected columns from the schema or keeping them but marking them as hidden in the schema. Keeping columns hidden /// is recommended when it is necessary to understand how the inputs of a pipeline map to outputs of the pipeline. This feature /// is useful, for example, in debugging a pipeline of transforms by allowing you to print out results from the middle of the pipeline. @@ -97,7 +98,7 @@ public static ColumnSelectingEstimator DropColumns(this TransformsCatalog catalo /// /// The transform's catalog. /// The array of column names to keep. - /// If true will keep hidden columns and false will remove hidden columns. + /// If will keep hidden columns and will remove hidden columns. /// /// /// public static ColumnSelectingEstimator SelectColumns(this TransformsCatalog catalog, string[] keepColumns, - bool keepHidden = ColumnSelectingTransformer.Defaults.KeepHidden) + bool keepHidden) => new ColumnSelectingEstimator(CatalogUtils.GetEnvironment(catalog), keepColumns, null, keepHidden, ColumnSelectingTransformer.Defaults.IgnoreMissing); + + /// + /// Select a list of columns to keep in a given . + /// + /// + /// + /// operates on the schema of an input , dropping unselected columns from the schema. + /// + /// + /// The transform's catalog. + /// The array of column names to keep. + /// + /// + /// + /// + /// + public static ColumnSelectingEstimator SelectColumns(this TransformsCatalog catalog, + params string[] keepColumns) => catalog.SelectColumns(keepColumns, false); } } From 363b083e28b325bd7d8f0666f182783c6ad26ed9 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Thu, 7 Feb 2019 15:00:00 -0800 Subject: [PATCH 2/7] Adding a project, functional tests, without internal access to the ML.NET Library. --- Microsoft.ML.sln | 15 ++++++ .../Microsoft.ML.Functional.Tests.csproj | 53 +++++++++++++++++++ .../Prediction.cs | 49 +++++++++++++++++ .../Validation.cs | 48 +++++++++++++++++ .../Api/Estimators/CrossValidation.cs | 36 ------------- .../Estimators/ReconfigurablePrediction.cs | 47 ---------------- 6 files changed, 165 insertions(+), 83 deletions(-) create mode 100644 test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj create mode 100644 test/Microsoft.ML.Functional.Tests/Prediction.cs create mode 100644 test/Microsoft.ML.Functional.Tests/Validation.cs delete mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs delete mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 9d969b6be2..8dd91a55a3 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -33,6 +33,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TestFramework" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Predictor.Tests", "test\Microsoft.ML.Predictor.Tests\Microsoft.ML.Predictor.Tests.csproj", "{6B047E09-39C9-4583-96F3-685D84CA4117}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Functional.Tests", "test\Microsoft.ML.Functional.Tests\Microsoft.ML.Functional.Tests.csproj", "{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}" +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.ResultProcessor", "src\Microsoft.ML.ResultProcessor\Microsoft.ML.ResultProcessor.csproj", "{3769FCC3-9AFF-4C37-97E9-6854324681DF}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.FastTree", "src\Microsoft.ML.FastTree\Microsoft.ML.FastTree.csproj", "{B7B593C5-FB8C-4ADA-A638-5B53B47D087E}" @@ -928,6 +930,18 @@ Global {5E920CAC-5A28-42FB-936E-49C472130953}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU {5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU {5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.ActiveCfg = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.Build.0 = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.Build.0 = Release|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.ActiveCfg = Release|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1011,6 +1025,7 @@ Global {85D0CAFD-2FE8-496A-88C7-585D35B94243} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {31D38B21-102B-41C0-9E0A-2FE0BF68D123} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {5E920CAC-5A28-42FB-936E-49C472130953} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj new file mode 100644 index 0000000000..220faf46d2 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj @@ -0,0 +1,53 @@ + + + + Microsoft.ML.Functional.Tests + true + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs new file mode 100644 index 0000000000..2ed3d46ca4 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -0,0 +1,49 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.SamplesUtils; +using Xunit; + +namespace Microsoft.ML.Functional.Tests +{ + public partial class PredictionScenarios + { + /// + /// Reconfigurable predictions: The following should be possible: A user trains a binary classifier, + /// and through the test evaluator gets a PR curve, the based on the PR curve picks a new threshold + /// and configures the scorer (or more precisely instantiates a new scorer over the same predictor) + /// with some threshold derived from that. + /// + [Fact(Skip = "Blocked by issue #2465")] + public void ReconfigurablePrediction() + { + var mlContext = new MLContext(seed: 789); + + // Get the dataset, create a train and test + var dataset = DatasetUtils.LoadHousingRegressionDataset(mlContext); + (var train, var test) = mlContext.BinaryClassification.TrainTestSplit(dataset, testFraction: 0.2); + + // Create a pipeline to train on the housing data + var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { + "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", + "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) + .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares()); + + var model = pipeline.Fit(train); + + var scoredTest = model.Transform(test); + var metrics = mlContext.Regression.Evaluate(scoredTest); + + // This is no longer possible in the API + //var newModel = new BinaryPredictionTransformer>(ml, model.Model, trainData.Schema, model.FeatureColumn, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability); + //var newScoredTest = newModel.Transform(pipeline.Transform(testData)); + //var newMetrics = mlContext.BinaryClassification.Evaluate(scoredTest); + + // And the Threshold and ThresholdColumn properties are not settable. + // var predictor = model.LastTransformer; + // predictor.Threshold = 0.01; // Not possible + } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs new file mode 100644 index 0000000000..3cf8b005e3 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -0,0 +1,48 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.SamplesUtils; +using Microsoft.ML.Trainers.HalLearners; +using Xunit; + +namespace Microsoft.ML.Functional.Tests +{ + public partial class ValidationScenarios + { + /// + /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with + /// a data source (optionally with stratification column), come up with an instantiable transform + /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate + /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of + /// evaluations and optionally trained pipes. (People always want metrics out of xfold, + /// they sometimes want the actual models too.) + /// + [Fact] + void CrossValidation() + { + var mlContext = new MLContext(seed: 789); + + // Get the dataset, create a train and test + var data = DatasetUtils.LoadHousingRegressionDataset(mlContext); + + // Create a pipeline to train on the sentiment data + var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { + "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", + "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) + .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares()); + + // Compute the CV result + var cvResult = mlContext.Regression.CrossValidate(data, pipeline, numFolds: 5); + + // Check that the results are valid + Assert.IsType(cvResult[0].metrics); + Assert.IsType>>(cvResult[0].model); + Assert.True(cvResult[0].scoredTestData is IDataView); + Assert.Equal(5, cvResult.Length); + } + } +} diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs deleted file mode 100644 index a4e3afc2cc..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with - /// a data source (optionally with stratification column), come up with an instantiable transform - /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate - /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of - /// evaluations and optionally trained pipes. (People always want metrics out of xfold, - /// they sometimes want the actual models too.) - /// - [Fact] - void CrossValidation() - { - var ml = new MLContext(seed: 1, conc: 1); - - var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - - // Pipeline. - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent( - new SdcaBinaryTrainer.Options { ConvergenceTolerance = 1f, NumThreads = 1, })); - - var cvResult = ml.BinaryClassification.CrossValidate(data, pipeline); - } - } -} diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs deleted file mode 100644 index 254dd73e45..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs +++ /dev/null @@ -1,47 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// Reconfigurable predictions: The following should be possible: A user trains a binary classifier, - /// and through the test evaluator gets a PR curve, the based on the PR curve picks a new threshold - /// and configures the scorer (or more precisely instantiates a new scorer over the same predictor) - /// with some threshold derived from that. - /// - [Fact] - public void ReconfigurablePrediction() - { - var ml = new MLContext(seed: 1, conc: 1); - var dataReader = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - - var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true); - - // Pipeline. - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .Fit(data); - - var trainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent( - new SdcaBinaryTrainer.Options { NumThreads = 1 }); - - var trainData = ml.Data.Cache(pipeline.Transform(data)); // Cache the data right before the trainer to boost the training speed. - var model = trainer.Fit(trainData); - - var scoredTest = model.Transform(pipeline.Transform(testData)); - var metrics = ml.BinaryClassification.Evaluate(scoredTest); - - var newModel = new BinaryPredictionTransformer>(ml, model.Model, trainData.Schema, model.FeatureColumn, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability); - var newScoredTest = newModel.Transform(pipeline.Transform(testData)); - var newMetrics = ml.BinaryClassification.Evaluate(scoredTest); - } - } -} From 2843389c761e083704e79e17a6729563319feacf Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 8 Feb 2019 12:40:00 -0800 Subject: [PATCH 3/7] Addressing PR Comments. --- build/Dependencies.props | 2 ++ src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs | 7 ++++++- .../Microsoft.ML.Functional.Tests.csproj | 8 ++++---- test/Microsoft.ML.Functional.Tests/Prediction.cs | 10 ++++++---- test/Microsoft.ML.Functional.Tests/Validation.cs | 4 ++++ .../Microsoft.ML.OnnxTransformTest.csproj | 2 +- test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj | 4 ++-- 7 files changed, 25 insertions(+), 12 deletions(-) diff --git a/build/Dependencies.props b/build/Dependencies.props index 896ca68978..31c95a5fee 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -43,6 +43,8 @@ 0.11.3 0.0.3-test + 0.0.7-test + 0.0.2-test diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index edaf2d55c5..d95d047faf 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -17,7 +17,12 @@ public static class DatasetUtils /// Downloads the housing dataset from the ML.NET repo. /// public static string DownloadHousingRegressionDataset() - => Download("https://raw.githubusercontent.com/dotnet/machinelearning/024bd4452e1d3660214c757237a19d6123f951ca/test/data/housing.txt", "housing.txt"); + { + var fileName = "housing.txt"; + if (!File.Exists(fileName)) + Download("https://raw.githubusercontent.com/dotnet/machinelearning/024bd4452e1d3660214c757237a19d6123f951ca/test/data/housing.txt", fileName); + return fileName; + } public static IDataView LoadHousingRegressionDataset(MLContext mlContext) { diff --git a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj index 220faf46d2..968c8f2321 100644 --- a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj +++ b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj @@ -1,10 +1,10 @@  - Microsoft.ML.Functional.Tests - true + false false + false @@ -47,7 +47,7 @@ - - + + diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs index 2ed3d46ca4..37fd76d496 100644 --- a/test/Microsoft.ML.Functional.Tests/Prediction.cs +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -15,7 +15,7 @@ public partial class PredictionScenarios /// and configures the scorer (or more precisely instantiates a new scorer over the same predictor) /// with some threshold derived from that. /// - [Fact(Skip = "Blocked by issue #2465")] + [Fact] public void ReconfigurablePrediction() { var mlContext = new MLContext(seed: 789); @@ -36,14 +36,16 @@ public void ReconfigurablePrediction() var scoredTest = model.Transform(test); var metrics = mlContext.Regression.Evaluate(scoredTest); + Common.CheckMetrics(metrics); + + // Todo #2465: Allow the setting of threshold and thresholdColumn for scoring. // This is no longer possible in the API //var newModel = new BinaryPredictionTransformer>(ml, model.Model, trainData.Schema, model.FeatureColumn, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability); //var newScoredTest = newModel.Transform(pipeline.Transform(testData)); //var newMetrics = mlContext.BinaryClassification.Evaluate(scoredTest); - // And the Threshold and ThresholdColumn properties are not settable. - // var predictor = model.LastTransformer; - // predictor.Threshold = 0.01; // Not possible + //var predictor = model.LastTransformer; + //predictor.Threshold = 0.01; // Not possible } } } diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index 3cf8b005e3..86dbe3786a 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -43,6 +43,10 @@ void CrossValidation() Assert.IsType>>(cvResult[0].model); Assert.True(cvResult[0].scoredTestData is IDataView); Assert.Equal(5, cvResult.Length); + + // And validate the metrics + foreach (var result in cvResult) + Common.CheckMetrics(result.metrics); } } } diff --git a/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj b/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj index a153655b27..a5dbaca9f8 100644 --- a/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj +++ b/test/Microsoft.ML.OnnxTransformTest/Microsoft.ML.OnnxTransformTest.csproj @@ -11,7 +11,7 @@ - + diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index 2d56666b7f..37f4b25c1e 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -46,7 +46,7 @@ - - + + From f9f03c837b939337223817a51578601354f4215d Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 8 Feb 2019 14:20:47 -0800 Subject: [PATCH 4/7] Addressing PR comments. --- test/Microsoft.ML.Functional.Tests/Common.cs | 24 +++++++++++++++++++ .../Microsoft.ML.Functional.Tests.csproj | 1 - 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 test/Microsoft.ML.Functional.Tests/Common.cs diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs new file mode 100644 index 0000000000..29088298d3 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -0,0 +1,24 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.SamplesUtils; +using Microsoft.ML.Trainers.HalLearners; +using Xunit; + +namespace Microsoft.ML.Functional.Tests +{ + internal static class Common + { + public static void CheckMetrics(RegressionMetrics metrics) + { + // Perform sanity checks on the metrics + Assert.True(metrics.Rms >= 0); + Assert.True(metrics.L1 >= 0); + Assert.True(metrics.L2 >= 0); + Assert.True(metrics.RSquared <= 1); + } + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj index 968c8f2321..106db8f36c 100644 --- a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj +++ b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj @@ -1,7 +1,6 @@  - false false false From c1b7ffc6a99bd90ed0ee78e094ae31b1da7fdbca Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 8 Feb 2019 14:40:31 -0800 Subject: [PATCH 5/7] Tests use local files. --- src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs | 4 ++-- test/Microsoft.ML.Functional.Tests/Prediction.cs | 5 +++-- test/Microsoft.ML.Functional.Tests/Validation.cs | 7 +++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index d95d047faf..7f3d205e3b 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -24,10 +24,10 @@ public static string DownloadHousingRegressionDataset() return fileName; } - public static IDataView LoadHousingRegressionDataset(MLContext mlContext) + public static IDataView LoadHousingRegressionDataset(MLContext mlContext, string pathToDataset=null) { // Download the file - string dataFile = DownloadHousingRegressionDataset(); + string dataFile = pathToDataset ?? DownloadHousingRegressionDataset(); // Define the columns to read var reader = mlContext.Data.CreateTextLoader( diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs index 37fd76d496..fad7c62fc1 100644 --- a/test/Microsoft.ML.Functional.Tests/Prediction.cs +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -3,11 +3,12 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.SamplesUtils; +using Microsoft.ML.TestFramework; using Xunit; namespace Microsoft.ML.Functional.Tests { - public partial class PredictionScenarios + public class PredictionScenarios { /// /// Reconfigurable predictions: The following should be possible: A user trains a binary classifier, @@ -21,7 +22,7 @@ public void ReconfigurablePrediction() var mlContext = new MLContext(seed: 789); // Get the dataset, create a train and test - var dataset = DatasetUtils.LoadHousingRegressionDataset(mlContext); + var dataset = DatasetUtils.LoadHousingRegressionDataset(mlContext, BaseTestClass.GetDataPath("housing.txt")); (var train, var test) = mlContext.BinaryClassification.TrainTestSplit(dataset, testFraction: 0.2); // Create a pipeline to train on the housing data diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index 86dbe3786a..f39ec792a7 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -2,15 +2,18 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; +using System.IO; using Microsoft.Data.DataView; using Microsoft.ML.Data; using Microsoft.ML.SamplesUtils; +using Microsoft.ML.TestFramework; using Microsoft.ML.Trainers.HalLearners; using Xunit; namespace Microsoft.ML.Functional.Tests { - public partial class ValidationScenarios + public class ValidationScenarios { /// /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with @@ -26,7 +29,7 @@ void CrossValidation() var mlContext = new MLContext(seed: 789); // Get the dataset, create a train and test - var data = DatasetUtils.LoadHousingRegressionDataset(mlContext); + var data = DatasetUtils.LoadHousingRegressionDataset(mlContext, BaseTestClass.GetDataPath("housing.txt")); // Create a pipeline to train on the sentiment data var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { From 56ae524d9484ce5ac8d599fc1b800dc0b85a6bfe Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 8 Feb 2019 15:36:48 -0800 Subject: [PATCH 6/7] Updating solution files. --- Microsoft.ML.sln | 16 ++++++++-------- build/Dependencies.props | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 8dd91a55a3..2b9bcb6cf9 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -932,16 +932,16 @@ Global {5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.Build.0 = Debug|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.Build.0 = Debug|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.ActiveCfg = Debug|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.Build.0 = Debug|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.ActiveCfg = Release|Any CPU {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.Build.0 = Release|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.ActiveCfg = Release|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.Build.0 = Release|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.ActiveCfg = Release|Any CPU - {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.Build.0 = Release|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU + {CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/build/Dependencies.props b/build/Dependencies.props index 31c95a5fee..9d2174267b 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -44,7 +44,7 @@ 0.11.3 0.0.3-test 0.0.7-test - 0.0.2-test + 0.0.4-test From 8dd776e632c7be96dadbab1da8bc9a879d02c6f1 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 8 Feb 2019 21:31:57 -0800 Subject: [PATCH 7/7] Addressing PR comments --- .../SamplesDatasetUtils.cs | 4 ++-- .../Prediction.cs | 9 +++++---- .../Validation.cs | 12 +++++------- test/Microsoft.ML.TestFramework/Datasets.cs | 19 ++++++++++++++++++- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 7f3d205e3b..d95d047faf 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -24,10 +24,10 @@ public static string DownloadHousingRegressionDataset() return fileName; } - public static IDataView LoadHousingRegressionDataset(MLContext mlContext, string pathToDataset=null) + public static IDataView LoadHousingRegressionDataset(MLContext mlContext) { // Download the file - string dataFile = pathToDataset ?? DownloadHousingRegressionDataset(); + string dataFile = DownloadHousingRegressionDataset(); // Define the columns to read var reader = mlContext.Data.CreateTextLoader( diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs index fad7c62fc1..7e0ff2eb44 100644 --- a/test/Microsoft.ML.Functional.Tests/Prediction.cs +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -2,7 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using Microsoft.ML.SamplesUtils; +using Microsoft.ML.RunTests; using Microsoft.ML.TestFramework; using Xunit; @@ -13,7 +13,7 @@ public class PredictionScenarios /// /// Reconfigurable predictions: The following should be possible: A user trains a binary classifier, /// and through the test evaluator gets a PR curve, the based on the PR curve picks a new threshold - /// and configures the scorer (or more precisely instantiates a new scorer over the same predictor) + /// and configures the scorer (or more precisely instantiates a new scorer over the same model parameters) /// with some threshold derived from that. /// [Fact] @@ -22,8 +22,9 @@ public void ReconfigurablePrediction() var mlContext = new MLContext(seed: 789); // Get the dataset, create a train and test - var dataset = DatasetUtils.LoadHousingRegressionDataset(mlContext, BaseTestClass.GetDataPath("housing.txt")); - (var train, var test) = mlContext.BinaryClassification.TrainTestSplit(dataset, testFraction: 0.2); + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true) + .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); + (var train, var test) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2); // Create a pipeline to train on the housing data var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index f39ec792a7..b9bb617285 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -2,11 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; -using System.IO; using Microsoft.Data.DataView; using Microsoft.ML.Data; -using Microsoft.ML.SamplesUtils; +using Microsoft.ML.RunTests; using Microsoft.ML.TestFramework; using Microsoft.ML.Trainers.HalLearners; using Xunit; @@ -20,16 +18,16 @@ public class ValidationScenarios /// a data source (optionally with stratification column), come up with an instantiable transform /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of - /// evaluations and optionally trained pipes. (People always want metrics out of xfold, - /// they sometimes want the actual models too.) + /// metrics, trained pipelines, and scored test data for each fold. /// [Fact] void CrossValidation() { var mlContext = new MLContext(seed: 789); - // Get the dataset, create a train and test - var data = DatasetUtils.LoadHousingRegressionDataset(mlContext, BaseTestClass.GetDataPath("housing.txt")); + // Get the dataset + var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true) + .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); // Create a pipeline to train on the sentiment data var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 6d6ba61191..1bdfa5048b 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -158,7 +158,24 @@ public static class TestDatasets name = "housing", trainFilename = "housing.txt", testFilename = "housing.txt", - loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}" + loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}", + GetLoaderColumns = () => + { + return new[] { + new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), + new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1), + new TextLoader.Column("PercentResidental", DataKind.R4, 2), + new TextLoader.Column("PercentNonRetail", DataKind.R4, 3), + new TextLoader.Column("CharlesRiver", DataKind.R4, 4), + new TextLoader.Column("NitricOxides", DataKind.R4, 5), + new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6), + new TextLoader.Column("PercentPre40s", DataKind.R4, 7), + new TextLoader.Column("EmploymentDistance", DataKind.R4, 8), + new TextLoader.Column("HighwayDistance", DataKind.R4, 9), + new TextLoader.Column("TaxRate", DataKind.R4, 10), + new TextLoader.Column("TeacherRatio", DataKind.R4, 11), + }; + } }; public static TestDataset generatedRegressionDatasetmacro = new TestDataset