diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs
index 0e3e2e5c68..bcba3a8e27 100644
--- a/test/Microsoft.ML.Functional.Tests/Common.cs
+++ b/test/Microsoft.ML.Functional.Tests/Common.cs
@@ -7,6 +7,7 @@ using System.Linq;
 using Microsoft.Data.DataView;
 using Microsoft.ML.Data;
+using Microsoft.ML.Data.Evaluators.Metrics;
 using Microsoft.ML.Functional.Tests.Datasets;
 using Xunit;
 
@@ -160,13 +161,86 @@ public static void AssertEqual(TypeTestData testType1, TypeTestData testType2)
             Assert.True(testType1.Ug.Equals(testType2.Ug));
         }
 
+        /// <summary>
+        /// Check that a <see cref="AnomalyDetectionMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
+        public static void AssertMetrics(AnomalyDetectionMetrics metrics)
+        {
+            Assert.InRange(metrics.Auc, 0, 1);
+            Assert.InRange(metrics.DrAtK, 0, 1);
+        }
+
+        /// <summary>
+        /// Check that a <see cref="BinaryClassificationMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
+        public static void AssertMetrics(BinaryClassificationMetrics metrics)
+        {
+            Assert.InRange(metrics.Accuracy, 0, 1);
+            Assert.InRange(metrics.Auc, 0, 1);
+            Assert.InRange(metrics.Auprc, 0, 1);
+            Assert.InRange(metrics.F1Score, 0, 1);
+            Assert.InRange(metrics.NegativePrecision, 0, 1);
+            Assert.InRange(metrics.NegativeRecall, 0, 1);
+            Assert.InRange(metrics.PositivePrecision, 0, 1);
+            Assert.InRange(metrics.PositiveRecall, 0, 1);
+        }
+
+        /// <summary>
+        /// Check that a <see cref="CalibratedBinaryClassificationMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
+        public static void AssertMetrics(CalibratedBinaryClassificationMetrics metrics)
+        {
+            Assert.InRange(metrics.Entropy, double.NegativeInfinity, 1);
+            Assert.InRange(metrics.LogLoss, double.NegativeInfinity, 1);
+            Assert.InRange(metrics.LogLossReduction, double.NegativeInfinity, 100);
+            AssertMetrics(metrics as BinaryClassificationMetrics);
+        }
+
+        /// <summary>
+        /// Check that a <see cref="ClusteringMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
+        public static void AssertMetrics(ClusteringMetrics metrics)
+        {
+            Assert.True(metrics.AvgMinScore >= 0);
+            Assert.True(metrics.Dbi >= 0);
+            if (!double.IsNaN(metrics.Nmi))
+                Assert.True(metrics.Nmi >= 0 && metrics.Nmi <= 1);
+        }
+
+        /// <summary>
+        /// Check that a <see cref="MultiClassClassifierMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
+        public static void AssertMetrics(MultiClassClassifierMetrics metrics)
+        {
+            Assert.InRange(metrics.AccuracyMacro, 0, 1);
+            Assert.InRange(metrics.AccuracyMicro, 0, 1);
+            Assert.True(metrics.LogLoss >= 0);
+            Assert.InRange(metrics.TopKAccuracy, 0, 1);
+        }
+
+        /// <summary>
+        /// Check that a <see cref="RankerMetrics"/> object is valid.
+        /// </summary>
+        /// <param name="metrics">The metrics object.</param>
+        public static void AssertMetrics(RankerMetrics metrics)
+        {
+            foreach (var dcg in metrics.Dcg)
+                Assert.True(dcg >= 0);
+            foreach (var ndcg in metrics.Ndcg)
+                Assert.InRange(ndcg, 0, 100);
+        }
+
         /// <summary>
-        /// Check that a object is valid.
+        /// Check that a <see cref="RegressionMetrics"/> object is valid.
         /// </summary>
         /// <param name="metrics">The metrics object.</param>
         public static void AssertMetrics(RegressionMetrics metrics)
         {
-            // Perform sanity checks on the metrics.
             Assert.True(metrics.Rms >= 0);
             Assert.True(metrics.L1 >= 0);
             Assert.True(metrics.L2 >= 0);
@@ -179,7 +253,6 @@ public static void AssertMetrics(RegressionMetrics metrics)
         /// <param name="metric">The <see cref="MetricStatistics"/> object.</param>
         public static void AssertMetricStatistics(MetricStatistics metric)
         {
-            // Perform sanity checks on the metrics.
             Assert.True(metric.StandardDeviation >= 0);
             Assert.True(metric.StandardError >= 0);
         }
@@ -190,7 +263,6 @@ public static void AssertMetricStatistics(MetricStatistics metric)
         /// <param name="metrics">The metrics object.</param>
         public static void AssertMetricsStatistics(RegressionMetricsStatistics metrics)
         {
-            // The mean can be any float; the standard deviation and error must be >=0.
             AssertMetricStatistics(metrics.Rms);
             AssertMetricStatistics(metrics.L1);
             AssertMetricStatistics(metrics.L2);
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs
new file mode 100644
index 0000000000..d1cbfa3fad
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/Iris.cs
@@ -0,0 +1,78 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+
+using System;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+    /// <summary>
+    /// A class for the Iris test dataset.
+    /// </summary>
+    internal sealed class Iris
+    {
+        [LoadColumn(0)]
+        public float Label { get; set; }
+
+        [LoadColumn(1)]
+        public float SepalLength { get; set; }
+
+        [LoadColumn(2)]
+        public float SepalWidth { get; set; }
+
+        [LoadColumn(3)]
+        public float PetalLength { get; set; }
+
+        [LoadColumn(4)]
+        public float PetalWidth { get; set; }
+
+        /// <summary>
+        /// The list of columns commonly used as features.
+        /// </summary>
+        public static readonly string[] Features = new string[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" };
+
+        public static IDataView LoadAsRankingProblem(MLContext mlContext, string filePath, bool hasHeader, char separatorChar, int seed = 1)
+        {
+            // Load the Iris data.
+            var data = mlContext.Data.ReadFromTextFile<Iris>(filePath, hasHeader: hasHeader, separatorChar: separatorChar);
+
+            // Create a function that generates a random groupId.
+            var rng = new Random(seed);
+            Action<Iris, IrisWithGroup> generateGroupId = (input, output) =>
+            {
+                output.Label = input.Label;
+                // The standard set used in tests has 150 rows.
+                output.GroupId = rng.Next(0, 30);
+                output.PetalLength = input.PetalLength;
+                output.PetalWidth = input.PetalWidth;
+                output.SepalLength = input.SepalLength;
+                output.SepalWidth = input.SepalWidth;
+            };
+
+            // Describe a pipeline that generates a groupId and converts it to a key.
+            var pipeline = mlContext.Transforms.CustomMapping<Iris, IrisWithGroup>(generateGroupId, null)
+                .Append(mlContext.Transforms.Conversion.MapValueToKey("GroupId"));
+
+            // Transform the data.
+            var transformedData = pipeline.Fit(data).Transform(data);
+
+            return transformedData;
+        }
+    }
+
+    /// <summary>
+    /// A class for the Iris dataset with a GroupId column.
+    /// </summary>
+    internal sealed class IrisWithGroup
+    {
+        public float Label { get; set; }
+        public int GroupId { get; set; }
+        public float SepalLength { get; set; }
+        public float SepalWidth { get; set; }
+        public float PetalLength { get; set; }
+        public float PetalWidth { get; set; }
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs
new file mode 100644
index 0000000000..07b26d3d9c
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs
@@ -0,0 +1,29 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+    internal sealed class MnistOneClass
+    {
+        private const int _featureLength = 783;
+
+        public float Label { get; set; }
+
+        public float[] Features { get; set; }
+
+        public static TextLoader GetTextLoader(MLContext mlContext, bool hasHeader, char separatorChar)
+        {
+            return mlContext.Data.CreateTextLoader(
+                new[] {
+                    new TextLoader.Column("Label", DataKind.R4, 0),
+                    new TextLoader.Column("Features", DataKind.R4, 1, 1 + _featureLength)
+                },
+                separatorChar: separatorChar,
+                hasHeader: hasHeader,
+                allowSparse: true);
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs
new file mode 100644
index 0000000000..2465e291b3
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/Sentiment.cs
@@ -0,0 +1,20 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+    /// <summary>
+    /// A class for reading in the Sentiment test dataset.
+    /// </summary>
+    internal sealed class TweetSentiment
+    {
+        [LoadColumn(0), ColumnName("Label")]
+        public bool Sentiment { get; set; }
+
+        [LoadColumn(1)]
+        public string SentimentText { get; set; }
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs
new file mode 100644
index 0000000000..005fc98c72
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/TrivialMatrixFactorization.cs
@@ -0,0 +1,41 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+
+using System;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+    /// <summary>
+    /// A class describing the TrivialMatrixFactorization test dataset.
+    /// </summary>
+    internal sealed class TrivialMatrixFactorization
+    {
+        [LoadColumn(0)]
+        public float Label { get; set; }
+
+        [LoadColumn(1)]
+        public uint MatrixColumnIndex { get; set; }
+
+        [LoadColumn(2)]
+        public uint MatrixRowIndex { get; set; }
+
+        public static IDataView LoadAndFeaturizeFromTextFile(MLContext mlContext, string filePath, bool hasHeader, char separatorChar)
+        {
+            // Load the data from a text file.
+            var data = mlContext.Data.ReadFromTextFile<TrivialMatrixFactorization>(filePath, hasHeader: hasHeader, separatorChar: separatorChar);
+
+            // Describe a pipeline to translate the uints to keys.
+            var pipeline = mlContext.Transforms.Conversion.MapValueToKey("MatrixColumnIndex")
+                .Append(mlContext.Transforms.Conversion.MapValueToKey("MatrixRowIndex"));
+
+            // Transform the data.
+            var transformedData = pipeline.Fit(data).Transform(data);
+
+            return transformedData;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs
new file mode 100644
index 0000000000..6ffec01b32
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs
@@ -0,0 +1,309 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Functional.Tests.Datasets;
+using Microsoft.ML.RunTests;
+using Microsoft.ML.TestFramework;
+using Microsoft.ML.TestFramework.Attributes;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.FastTree;
+using Microsoft.ML.Trainers.KMeans;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Functional.Tests
+{
+    public class Evaluation : BaseTestClass
+    {
+        public Evaluation(ITestOutputHelper output) : base(output)
+        {
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Anomaly Detection.
+        /// </summary>
+        [Fact]
+        public void TrainAndEvaluateAnomalyDetection()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var trainData = MnistOneClass.GetTextLoader(mlContext,
+                TestDatasets.mnistOneClass.fileHasHeader, TestDatasets.mnistOneClass.fileSeparator)
+                .Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename));
+            var testData = MnistOneClass.GetTextLoader(mlContext,
+                TestDatasets.mnistOneClass.fileHasHeader, TestDatasets.mnistOneClass.fileSeparator)
+                .Read(GetDataPath(TestDatasets.mnistOneClass.testFilename));
+
+            // Create a training pipeline.
+            var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca();
+
+            // Train the model.
+            var model = pipeline.Fit(trainData);
+
+            // Evaluate the model.
+            // TODO #2464: Using the train dataset will cause NaN metrics to be returned.
+            var scoredTest = model.Transform(testData);
+            var metrics = mlContext.AnomalyDetection.Evaluate(scoredTest);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Binary Classification with no calibration.
+        /// </summary>
+        [Fact]
+        public void TrainAndEvaluateBinaryClassification()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var data = mlContext.Data.ReadFromTextFile<TweetSentiment>(GetDataPath(TestDatasets.Sentiment.trainFilename),
+                hasHeader: TestDatasets.Sentiment.fileHasHeader,
+                separatorChar: TestDatasets.Sentiment.fileSeparator);
+
+            // Create a training pipeline.
+            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
+                .AppendCacheCheckpoint(mlContext)
+                .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
+                    new SdcaNonCalibratedBinaryTrainer.Options { NumThreads = 1 }));
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(scoredData);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Binary Classification with a calibrated predictor.
+        /// </summary>
+        [Fact]
+        public void TrainAndEvaluateBinaryClassificationWithCalibration()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var data = mlContext.Data.ReadFromTextFile<TweetSentiment>(GetDataPath(TestDatasets.Sentiment.trainFilename),
+                hasHeader: TestDatasets.Sentiment.fileHasHeader,
+                separatorChar: TestDatasets.Sentiment.fileSeparator);
+
+            // Create a training pipeline.
+            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
+                .AppendCacheCheckpoint(mlContext)
+                .Append(mlContext.BinaryClassification.Trainers.LogisticRegression(
+                    new LogisticRegression.Options { NumThreads = 1 }));
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.BinaryClassification.Evaluate(scoredData);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Clustering.
+        /// </summary>
+        [Fact]
+        public void TrainAndEvaluateClustering()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var data = mlContext.Data.ReadFromTextFile<Iris>(GetDataPath(TestDatasets.iris.trainFilename),
+                hasHeader: TestDatasets.iris.fileHasHeader,
+                separatorChar: TestDatasets.iris.fileSeparator);
+
+            // Create a training pipeline.
+            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
+                .AppendCacheCheckpoint(mlContext)
+                .Append(mlContext.Clustering.Trainers.KMeans(new KMeansPlusPlusTrainer.Options { NumThreads = 1 }));
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.Clustering.Evaluate(scoredData);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Multiclass Classification.
+        /// </summary>
+        [Fact]
+        public void TrainAndEvaluateMulticlassClassification()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var data = mlContext.Data.ReadFromTextFile<Iris>(GetDataPath(TestDatasets.iris.trainFilename),
+                hasHeader: TestDatasets.iris.fileHasHeader,
+                separatorChar: TestDatasets.iris.fileSeparator);
+
+            // Create a training pipeline.
+            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
+                .AppendCacheCheckpoint(mlContext)
+                .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(
+                    new SdcaMultiClassTrainer.Options { NumThreads = 1 }));
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.MulticlassClassification.Evaluate(scoredData);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Ranking.
+        /// </summary>
+        [Fact]
+        public void TrainAndEvaluateRanking()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var data = Iris.LoadAsRankingProblem(mlContext,
+                GetDataPath(TestDatasets.iris.trainFilename),
+                hasHeader: TestDatasets.iris.fileHasHeader,
+                separatorChar: TestDatasets.iris.fileSeparator);
+
+            // Create a training pipeline.
+            var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
+                .Append(mlContext.Ranking.Trainers.FastTree(new FastTreeRankingTrainer.Options { NumThreads = 1 }));
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.Ranking.Evaluate(scoredData, label: "Label", groupId: "GroupId");
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Recommendation.
+        /// </summary>
+        [MatrixFactorizationFact]
+        public void TrainAndEvaluateRecommendation()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Get the dataset.
+            var data = TrivialMatrixFactorization.LoadAndFeaturizeFromTextFile(
+                mlContext,
+                GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename),
+                TestDatasets.trivialMatrixFactorization.fileHasHeader,
+                TestDatasets.trivialMatrixFactorization.fileSeparator);
+
+            // Create a pipeline to train on the matrix factorization data.
+            var pipeline = mlContext.Recommendation().Trainers.MatrixFactorization(
+                new MatrixFactorizationTrainer.Options
+                {
+                    MatrixColumnIndexColumnName = "MatrixColumnIndex",
+                    MatrixRowIndexColumnName = "MatrixRowIndex",
+                    LabelColumnName = "Label",
+                    NumberOfIterations = 3,
+                    NumberOfThreads = 1,
+                    ApproximationRank = 4,
+                });
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.Recommendation().Evaluate(scoredData);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Train and Evaluate: Regression.
+        /// </summary>
+        [Fact]
+        public void TrainAndEvaluateRegression()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            // Get the dataset.
+            var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
+                hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
+                .Read(GetDataPath(TestDatasets.housing.trainFilename));
+
+            // Create a pipeline to train on the housing data.
+            var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
+                    "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
+                    "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"})
+                .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
+                .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumThreads = 1 }));
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.Regression.Evaluate(scoredData);
+
+            // Check that the metrics returned are valid.
+            Common.AssertMetrics(metrics);
+        }
+
+        /// <summary>
+        /// Evaluate With Precision-Recall Curves.
+        /// </summary>
+        /// <remarks>
+        /// This is currently not possible using the APIs.
+        /// </remarks>
+        [Fact]
+        public void TrainAndEvaluateWithPrecisionRecallCurves()
+        {
+            var mlContext = new MLContext(seed: 1, conc: 1);
+
+            var data = mlContext.Data.ReadFromTextFile<TweetSentiment>(GetDataPath(TestDatasets.Sentiment.trainFilename),
+                hasHeader: TestDatasets.Sentiment.fileHasHeader,
+                separatorChar: TestDatasets.Sentiment.fileSeparator);
+
+            // Create a training pipeline.
+            var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
+                .AppendCacheCheckpoint(mlContext)
+                .Append(mlContext.BinaryClassification.Trainers.LogisticRegression(
+                    new LogisticRegression.Options { NumThreads = 1 }));
+
+            // Train the model.
+            var model = pipeline.Fit(data);
+
+            // Evaluate the model.
+            var scoredData = model.Transform(data);
+            var metrics = mlContext.BinaryClassification.Evaluate(scoredData);
+
+            Common.AssertMetrics(metrics);
+
+            // This scenario is not possible with the current set of APIs.
+            // There could be two ways imaginable:
+            //  1. Getting a list of (P, R) pairs from the evaluator (as it calculates most of the information already).
+            //     Not currently possible.
+            //  2. Manually setting the classifier threshold and calling Evaluate many times.
+            //     Not currently possible: TODO #2465: Allow the setting of threshold and thresholdColumn for scoring.
+            // Technically, this scenario is possible using custom mappers like so:
+            //  1. Get a list of all unique probability scores,
+            //     e.g. by reading the IDataView as an IEnumerable and keeping a hash of known probabilities up to some precision.
+            //  2. For each value of probability:
+            //      a. Write a custom mapper to produce PredictedLabel at that probability threshold.
+            //      b. Calculate Precision and Recall with these labels.
+            //      c. Append the Precision and Recall to an IList.
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs
index 661aac3fcd..74ec111c92 100644
--- a/test/Microsoft.ML.Functional.Tests/Prediction.cs
+++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs
@@ -22,7 +22,8 @@ public void ReconfigurablePrediction()
             var mlContext = new MLContext(seed: 789);
 
             // Get the dataset, create a train and test
-            var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true)
+            var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
+                hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
                 .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
 
             var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2);
diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs
index eebe55b58c..ed1cccbf7c 100644
--- a/test/Microsoft.ML.Functional.Tests/Validation.cs
+++ b/test/Microsoft.ML.Functional.Tests/Validation.cs
@@ -27,7 +28,8 @@ void CrossValidation()
             var mlContext = new MLContext(seed: 1, conc: 1);
 
             // Get the dataset.
-            var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true)
+            var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
+                hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
                 .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
 
             // Create a pipeline to train on the sentiment data.
@@ -60,7 +61,8 @@ public void TrainWithValidationSet()
             var mlContext = new MLContext(seed: 1, conc: 1);
 
             // Get the dataset.
-            var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: true)
+            var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
+                hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
                 .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
             var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2);
             var trainData = dataSplit.TrainSet;
diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs
index 7197f1f64b..abc9862049 100644
--- a/test/Microsoft.ML.TestFramework/Datasets.cs
+++ b/test/Microsoft.ML.TestFramework/Datasets.cs
@@ -14,6 +14,8 @@ public class TestDataset
         public string testFilename;
         public string validFilename;
         public string labelFilename;
+        public char fileSeparator;
+        public bool fileHasHeader;
 
         // REVIEW: Replace these with appropriate SubComponents!
         public string settings;
 
@@ -158,6 +160,8 @@ public static class TestDatasets
             name = "housing",
             trainFilename = "housing.txt",
             testFilename = "housing.txt",
+            fileSeparator = '\t',
+            fileHasHeader = true,
             loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}",
             GetLoaderColumns = () =>
             {
@@ -206,6 +210,8 @@ public static class TestDatasets
             name = "sentiment",
             trainFilename = "wikipedia-detox-250-line-data.tsv",
             testFilename = "wikipedia-detox-250-line-test.tsv",
+            fileHasHeader = true,
+            fileSeparator = '\t',
             GetLoaderColumns = () =>
             {
                 return new[]
                 {
@@ -447,6 +453,8 @@ public static class TestDatasets
             name = "iris",
             trainFilename = @"iris.txt",
             testFilename = @"iris.txt",
+            fileHasHeader = true,
+            fileSeparator = '\t'
         };
 
         public static TestDataset irisMissing = new TestDataset()
@@ -655,6 +663,8 @@ public static class TestDatasets
             name = "mnistOneClass",
             trainFilename = @"MNIST.Train.0-class.tiny.txt",
             testFilename = @"MNIST.Test.tiny.txt",
+            fileHasHeader = false,
+            fileSeparator = '\t',
             settings = ""
         };
 
@@ -704,6 +714,8 @@ public static class TestDatasets
             name = "trivialMatrixFactorization",
             trainFilename = @"trivial-train.tsv",
             testFilename = @"trivial-test.tsv",
+            fileHasHeader = true,
+            fileSeparator = '\t',
             loaderSettings = "loader=Text{col=Label:R4:0 col=User:U4[0-19]:1 col=Item:U4[0-39]:2 header+}"
         };
     }
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs
deleted file mode 100644
index cad289872e..0000000000
--- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs
+++ /dev/null
@@ -1,39 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using Microsoft.ML.Data;
-using Microsoft.ML.RunTests;
-using Microsoft.ML.Trainers;
-using Xunit;
-
-namespace Microsoft.ML.Tests.Scenarios.Api
-{
-    public partial class ApiScenariosTests
-    {
-        /// <summary>
-        /// Evaluation: Similar to the simple train scenario, except instead of having some
-        /// predictive structure, be able to score another "test" data file, run the result
-        /// through an evaluator and get metrics like AUC, accuracy, PR curves, and whatnot.
-        /// Getting metrics out of this should be as straightforward and unannoying as possible.
-        /// </summary>
-        [Fact]
-        public void Evaluation()
-        {
-            var ml = new MLContext(seed: 1, conc: 1);
-
-            // Pipeline.
-            var pipeline = ml.Data.CreateTextLoader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true)
-                .Append(ml.Transforms.Text.FeaturizeText("Features", "SentimentText"))
-                .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
-                    new SdcaBinaryTrainer.Options { NumThreads = 1 }));
-
-            // Train.
-            var readerModel = pipeline.Fit(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename)));
-
-            // Evaluate on the test set.
-            var dataEval = readerModel.Read(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.testFilename)));
-            var metrics = ml.BinaryClassification.Evaluate(dataEval);
-        }
-    }
-}
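
Note on the precision-recall comment in TrainAndEvaluateWithPrecisionRecallCurves: the added test describes, in comments only, how (precision, recall) points could be computed manually by sweeping the probability threshold. The sketch below is not part of this change; it assumes the scored rows have already been pulled into memory (for example via an IEnumerable over the scored IDataView, as the comment suggests), and the ScoredRow type and ComputePrecisionRecallCurve helper are illustrative names rather than APIs from this PR.

using System;
using System.Collections.Generic;
using System.Linq;

internal sealed class ScoredRow
{
    public bool Label { get; set; }
    public float Probability { get; set; }
}

internal static class PrecisionRecallSketch
{
    // For each distinct probability used as a threshold, count TP/FP/FN and
    // record the resulting (threshold, precision, recall) point.
    public static IReadOnlyList<(float Threshold, double Precision, double Recall)>
        ComputePrecisionRecallCurve(IEnumerable<ScoredRow> rows)
    {
        var scored = rows.ToList();
        var thresholds = scored.Select(r => r.Probability).Distinct().OrderByDescending(p => p);

        var curve = new List<(float, double, double)>();
        foreach (var threshold in thresholds)
        {
            int tp = 0, fp = 0, fn = 0;
            foreach (var row in scored)
            {
                bool predicted = row.Probability >= threshold;
                if (predicted && row.Label) tp++;
                else if (predicted && !row.Label) fp++;
                else if (!predicted && row.Label) fn++;
            }

            // Convention: precision defaults to 1 and recall to 0 when a denominator is empty.
            double precision = tp + fp == 0 ? 1.0 : tp / (double)(tp + fp);
            double recall = tp + fn == 0 ? 0.0 : tp / (double)(tp + fn);
            curve.Add((threshold, precision, recall));
        }

        return curve;
    }
}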