Skip to content

Add a project for functional tests without visibility into internals of ML.NET #2470

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Feb 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Microsoft.ML.sln
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TestFramework"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Predictor.Tests", "test\Microsoft.ML.Predictor.Tests\Microsoft.ML.Predictor.Tests.csproj", "{6B047E09-39C9-4583-96F3-685D84CA4117}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Functional.Tests", "test\Microsoft.ML.Functional.Tests\Microsoft.ML.Functional.Tests.csproj", "{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.ResultProcessor", "src\Microsoft.ML.ResultProcessor\Microsoft.ML.ResultProcessor.csproj", "{3769FCC3-9AFF-4C37-97E9-6854324681DF}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.FastTree", "src\Microsoft.ML.FastTree\Microsoft.ML.FastTree.csproj", "{B7B593C5-FB8C-4ADA-A638-5B53B47D087E}"
Expand Down Expand Up @@ -928,6 +930,18 @@ Global
{5E920CAC-5A28-42FB-936E-49C472130953}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU
{5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU
{5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug|Any CPU.Build.0 = Debug|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.ActiveCfg = Release|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release|Any CPU.Build.0 = Release|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -1011,6 +1025,7 @@ Global
{85D0CAFD-2FE8-496A-88C7-585D35B94243} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{31D38B21-102B-41C0-9E0A-2FE0BF68D123} = {D3D38B03-B557-484D-8348-8BADEE4DF592}
{5E920CAC-5A28-42FB-936E-49C472130953} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{CFED9F0C-FF81-4C96-8D5E-0436264CA7B5} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
Expand Down
2 changes: 2 additions & 0 deletions build/Dependencies.props
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
<PropertyGroup>
<!-- Centralized package versions. The TensorFlow and ONNX test-model versions are
     consumed by test projects through $(MicrosoftMLTensorFlowTestModelsVersion) and
     $(MicrosoftMLOnnxTestModelsVersion), so every test project picks up the same version. -->
<BenchmarkDotNetVersion>0.11.3</BenchmarkDotNetVersion>
<MicrosoftMLTestModelsPackageVersion>0.0.3-test</MicrosoftMLTestModelsPackageVersion>
<MicrosoftMLTensorFlowTestModelsVersion>0.0.7-test</MicrosoftMLTensorFlowTestModelsVersion>
<MicrosoftMLOnnxTestModelsVersion>0.0.4-test</MicrosoftMLOnnxTestModelsVersion>
</PropertyGroup>

</Project>
7 changes: 6 additions & 1 deletion src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@ public static class DatasetUtils
/// Downloads the housing dataset from the ML.NET repo.
/// </summary>
public static string DownloadHousingRegressionDataset()
=> Download("https://raw.githubusercontent.com/dotnet/machinelearning/024bd4452e1d3660214c757237a19d6123f951ca/test/data/housing.txt", "housing.txt");
{
Copy link
Member

@eerhardt eerhardt Feb 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change necessary in this PR?

I don't think our tests should be calling this method - BTW #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is necessary if we want to use LoadHousingRegressionDataset in our tests because there is a race condition on the file lock, so tests will sometimes fail.

Can you explain a bit more why you don't want to use this in tests? Is it that we don't want to use the SamplesUtils project in Tests, or that we shouldn't be downloading data for tests?

If it's the former, check out issue #2420 . We're going to make this a standalone Datasets/ (or some such name) outside of the NuGet project to use in Samples and Tests.

If it's the latter, we are already downloading datasets for tests. But now that I mention it, we can actually add an optional input to LoadHousingRegressionDataset and friends that can load the file from the tests/data/ directory. I'll add this capability now.


In reply to: 255236393 [](ancestors = 255236393)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.


In reply to: 255256550 [](ancestors = 255256550,255236393)

Copy link
Member

@eerhardt eerhardt Feb 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are 2 datasets our tests should use.

  1. Datasets checked into test\data.
  2. Datasets that are downloaded into test\data\external through the DownloadExternalTestFiles build step.

We shouldn't have the test code be downloading random things. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. This has been updated to use the local dataset in the test\data folder. I'll chase down any other tests using these Download commands as I migrate API-Scenario tests to Functional.Tests/


In reply to: 255264819 [](ancestors = 255264819)

var fileName = "housing.txt";
if (!File.Exists(fileName))
Download("https://raw.githubusercontent.com/dotnet/machinelearning/024bd4452e1d3660214c757237a19d6123f951ca/test/data/housing.txt", fileName);
return fileName;
}

public static IDataView LoadHousingRegressionDataset(MLContext mlContext)
{
Expand Down
24 changes: 24 additions & 0 deletions test/Microsoft.ML.Functional.Tests/Common.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.SamplesUtils;
using Microsoft.ML.Trainers.HalLearners;
using Xunit;

namespace Microsoft.ML.Functional.Tests
{
internal static class Common
{
    /// <summary>
    /// Runs basic sanity assertions over a set of regression metrics.
    /// </summary>
    /// <param name="metrics">The regression metrics to check.</param>
    public static void CheckMetrics(RegressionMetrics metrics)
    {
        // Each error metric is asserted to be non-negative.
        Assert.True(0 <= metrics.Rms);
        Assert.True(0 <= metrics.L1);
        Assert.True(0 <= metrics.L2);

        // R-squared is asserted to be at most one.
        Assert.True(1 >= metrics.RSquared);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- We are turning off strong naming to ensure we never add `InternalsVisibleTo` for these tests -->
<SignAssembly>false</SignAssembly>
Copy link
Member

@eerhardt eerhardt Feb 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably need to set PublicSign to false as well, to unblock the Mac and Linux builds. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the tip! I couldn't figure out why those builds broke!


In reply to: 255168732 [](ancestors = 255168732)

<PublicSign>false</PublicSign>
</PropertyGroup>

<ItemGroup>
  <!-- Product and test-framework projects exercised by the functional tests.
       Each project is listed exactly once. -->
  <ProjectReference Include="..\..\src\Microsoft.ML.Data\Microsoft.ML.Data.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.EntryPoints\Microsoft.ML.EntryPoints.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.HalLearners\Microsoft.ML.HalLearners.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.ImageAnalytics\Microsoft.ML.ImageAnalytics.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.KMeansClustering\Microsoft.ML.KMeansClustering.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.SamplesUtils\Microsoft.ML.SamplesUtils.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.LightGBM\Microsoft.ML.LightGBM.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.Maml\Microsoft.ML.Maml.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.OnnxTransform\Microsoft.ML.OnnxTransform.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.PCA\Microsoft.ML.PCA.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.Recommender\Microsoft.ML.Recommender.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.StandardLearners\Microsoft.ML.StandardLearners.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.Onnx\Microsoft.ML.Onnx.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.StaticPipe\Microsoft.ML.StaticPipe.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.TensorFlow.StaticPipe\Microsoft.ML.TensorFlow.StaticPipe.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.TensorFlow\Microsoft.ML.TensorFlow.csproj" />
  <ProjectReference Include="..\..\src\Microsoft.ML.TimeSeries\Microsoft.ML.TimeSeries.csproj" />
  <ProjectReference Include="..\Microsoft.ML.TestFramework\Microsoft.ML.TestFramework.csproj" />
</ItemGroup>

<ItemGroup>
<NativeAssemblyReference Include="CpuMathNative" />
<NativeAssemblyReference Include="FastTreeNative" />
<NativeAssemblyReference Include="FactorizationMachineNative" />
<NativeAssemblyReference Include="MatrixFactorizationNative" />
<NativeAssemblyReference Include="LdaNative" />
<NativeAssemblyReference Include="SymSgdNative" />
<NativeAssemblyReference Include="MklProxyNative" />
<NativeAssemblyReference Include="MklImports" />
Copy link
Member

@sfilipi sfilipi Feb 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are those all needed at this point in time? #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think they will be when we're all done.


In reply to: 255278694 [](ancestors = 255278694)

</ItemGroup>

<!-- TensorFlow is 64-bit only -->
<ItemGroup Condition="'$(NativeTargetArchitecture)' == 'x64'">
<NativeAssemblyReference Include="tensorflow" />
<NativeAssemblyReference Condition="'$(OS)' != 'Windows_NT'" Include="tensorflow_framework" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.ML.TensorFlow.TestModels" Version="$(MicrosoftMLTensorFlowTestModelsVersion)" />
<PackageReference Include="Microsoft.ML.Onnx.TestModels" Version="$(MicrosoftMLOnnxTestModelsVersion)" />
</ItemGroup>
</Project>
53 changes: 53 additions & 0 deletions test/Microsoft.ML.Functional.Tests/Prediction.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.RunTests;
using Microsoft.ML.TestFramework;
using Xunit;

namespace Microsoft.ML.Functional.Tests
{
public class PredictionScenarios
{
/// <summary>
/// Reconfigurable predictions: The following should be possible: A user trains a binary classifier,
/// and through the test evaluator gets a PR curve, then based on the PR curve picks a new threshold
/// and configures the scorer (or more precisely instantiates a new scorer over the same model parameters)
/// with some threshold derived from that.
/// </summary>
[Fact]
public void ReconfigurablePrediction()
{
var mlContext = new MLContext(seed: 789);
Copy link
Member

@sfilipi sfilipi Feb 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seed: 78 [](start = 42, length = 8)

this intentional? #Resolved


// Get the dataset, create a train and test
var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true)
.Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
(var train, var test) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2);

// Create a pipeline to train on the housing data
var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
"CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
"PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"})
.Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
.Append(mlContext.Regression.Trainers.OrdinaryLeastSquares());

var model = pipeline.Fit(train);

var scoredTest = model.Transform(test);
var metrics = mlContext.Regression.Evaluate(scoredTest);
Copy link
Member

@eerhardt eerhardt Feb 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we be asserting the metrics are in a certain range? #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call on checking valid ranges. I added a Common library to add those sorts of checks to.


In reply to: 255167478 [](ancestors = 255167478)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you missed checking that new function in.


In reply to: 255228907 [](ancestors = 255228907,255167478)


Common.CheckMetrics(metrics);

// Todo #2465: Allow the setting of threshold and thresholdColumn for scoring.
// This is no longer possible in the API
//var newModel = new BinaryPredictionTransformer<IPredictorProducing<float>>(ml, model.Model, trainData.Schema, model.FeatureColumn, threshold: 0.01f, thresholdColumn: DefaultColumnNames.Probability);
//var newScoredTest = newModel.Transform(pipeline.Transform(testData));
//var newMetrics = mlContext.BinaryClassification.Evaluate(scoredTest);
// And the Threshold and ThresholdColumn properties are not settable.
//var predictor = model.LastTransformer;
//predictor.Threshold = 0.01; // Not possible
}
}
}
53 changes: 53 additions & 0 deletions test/Microsoft.ML.Functional.Tests/Validation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.RunTests;
using Microsoft.ML.TestFramework;
using Microsoft.ML.Trainers.HalLearners;
using Xunit;

namespace Microsoft.ML.Functional.Tests
{
public class ValidationScenarios
{
    /// <summary>
    /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with
    /// a data source (optionally with stratification column), come up with an instantiable transform
    /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate
    /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of
    /// metrics, trained pipelines, and scored test data for each fold.
    /// </summary>
    [Fact]
    public void CrossValidation()
    {
        var mlContext = new MLContext(seed: 789);

        // Get the dataset
        var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true)
            .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));

        // Create a pipeline to train on the housing data
        var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
            "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
            "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"})
            .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
            .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares());

        // Compute the CV result
        var cvResult = mlContext.Regression.CrossValidate(data, pipeline, numFolds: 5);

        // Check the fold count first, so that indexing into the results below reports a
        // clear assertion failure rather than an index-out-of-range error.
        Assert.Equal(5, cvResult.Length);

        // Check that the results are valid
        Assert.IsType<RegressionMetrics>(cvResult[0].metrics);
        Assert.IsType<TransformerChain<RegressionPredictionTransformer<OlsLinearRegressionModelParameters>>>(cvResult[0].model);
        Assert.IsAssignableFrom<IDataView>(cvResult[0].scoredTestData);

        // And validate the metrics
        foreach (var result in cvResult)
            Common.CheckMetrics(result.metrics);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<ProjectReference Include="..\Microsoft.ML.TestFramework\Microsoft.ML.TestFramework.csproj" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.ML.Onnx.TestModels" Version="0.0.4-test" />
<PackageReference Include="Microsoft.ML.Onnx.TestModels" Version="$(MicrosoftMLOnnxTestModelsVersion)" />
</ItemGroup>

<ItemGroup>
Expand Down
19 changes: 18 additions & 1 deletion test/Microsoft.ML.TestFramework/Datasets.cs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,24 @@ public static class TestDatasets
name = "housing",
trainFilename = "housing.txt",
testFilename = "housing.txt",
loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}"
loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}",
GetLoaderColumns = () =>
{
return new[] {
new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
new TextLoader.Column("PercentResidental", DataKind.R4, 2),
new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
new TextLoader.Column("NitricOxides", DataKind.R4, 5),
new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
new TextLoader.Column("TaxRate", DataKind.R4, 10),
new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
};
}
};

public static TestDataset generatedRegressionDatasetmacro = new TestDataset
Expand Down
4 changes: 2 additions & 2 deletions test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
<NativeAssemblyReference Condition="'$(OS)' != 'Windows_NT'" Include="tensorflow_framework" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.ML.TensorFlow.TestModels" Version="0.0.7-test" />
<PackageReference Include="Microsoft.ML.Onnx.TestModels" Version="0.0.2-test" />
<PackageReference Include="Microsoft.ML.TensorFlow.TestModels" Version="$(MicrosoftMLTensorFlowTestModelsVersion)" />
<PackageReference Include="Microsoft.ML.Onnx.TestModels" Version="$(MicrosoftMLOnnxTestModelsVersion)" />
</ItemGroup>
</Project>

This file was deleted.

This file was deleted.