Skip to content

Add Functional Tests for Data I/O #2518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Feb 14, 2019
158 changes: 155 additions & 3 deletions test/Microsoft.ML.Functional.Tests/Common.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,171 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.SamplesUtils;
using Microsoft.ML.Trainers.HalLearners;
using Microsoft.ML.Functional.Tests.Datasets;
using Xunit;

namespace Microsoft.ML.Functional.Tests
{
internal static class Common
{
/// <summary>
/// Assert that the schema of an <see cref="IDataView"/> matches the properties of <see cref="AllTypes"/>.
/// </summary>
/// <remarks>
/// Every public property of <see cref="AllTypes"/> must appear exactly once in the schema,
/// under the same name and with the same raw type. Array properties are expected to surface
/// as a <see cref="VBuffer{T}"/> of the array's element type.
/// </remarks>
/// <param name="allTypesDataset">An <see cref="IDataView"/>.</param>
public static void AssertAllTypesDataset(IDataView allTypesDataset)
{
    var toyClassProperties = typeof(AllTypes).GetProperties();

    // Check that the schema is of the right size.
    Assert.Equal(toyClassProperties.Length, allTypesDataset.Schema.Count);

    // Create a lookup table for the types and counts of all properties.
    var types = new Dictionary<string, Type>();
    var counts = new Dictionary<string, int>();
    foreach (var property in toyClassProperties)
    {
        var propertyType = property.PropertyType;

        // Arrays map to a VBuffer closed over the element type; building the closed
        // generic type only needs the element Type, not an instance of it.
        types[property.Name] = propertyType.IsArray
            ? typeof(VBuffer<>).MakeGenericType(propertyType.GetElementType())
            : propertyType;

        counts[property.Name] = 0;
    }

    foreach (var column in allTypesDataset.Schema)
    {
        // The column must correspond to a known property...
        Type expectedType;
        Assert.True(types.TryGetValue(column.Name, out expectedType));

        // ...must not have been seen before...
        Assert.Equal(1, ++counts[column.Name]);

        // ...and must carry the expected raw type.
        Assert.Equal(expectedType, column.Type.RawType);
    }

    // Make sure we didn't miss any columns.
    foreach (var value in counts.Values)
    {
        Assert.Equal(1, value);
    }
}

/// <summary>
/// Assert that two <see cref="AllTypes"/> datasets are equal.
/// </summary>
/// <param name="mlContext">The ML Context.</param>
/// <param name="data1">An <see cref="IDataView"/> of <see cref="AllTypes"/>.</param>
/// <param name="data2">An <see cref="IDataView"/> of <see cref="AllTypes"/>.</param>
public static void AssertAllTypesDatasetsAreEqual(MLContext mlContext, IDataView data1, IDataView data2)
{
    // Each view must, on its own, have the AllTypes row shape.
    AssertAllTypesDataset(data1);
    AssertAllTypesDataset(data2);

    // The two schemas must agree with each other.
    AssertEqual(data1.Schema, data2.Schema);

    // Turn both views into typed row sequences and compare them row by row.
    var firstRows = mlContext.CreateEnumerable<AllTypes>(data1, true);
    var secondRows = mlContext.CreateEnumerable<AllTypes>(data2, true);
    AssertEqual(firstRows, secondRows);
}

/// <summary>
/// Assert that two float arrays are equal.
/// </summary>
/// <param name="array1">An array of floats.</param>
/// <param name="array2">An array of floats.</param>
public static void AssertEqual(float[] array1, float[] array2)
Copy link
Contributor

@glebuk glebuk Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AssertEqual [](start = 27, length = 11)

Perhaps Enumerable.SequenceEqual(target1, target2); instead? from Linq
https://stackoverflow.com/questions/3232744/easiest-way-to-compare-arrays-in-c-sharp #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to use the assert logic here so that we can capture equality to some level of precision. Default equality will fail after serialization due to floating point drift sometimes.


In reply to: 256208383 [](ancestors = 256208383)

Copy link
Contributor Author

@rogancarr rogancarr Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plus I want to Assert and not check for equality.


In reply to: 256209571 [](ancestors = 256209571,256208383)

{
    Assert.NotNull(array1);
    Assert.NotNull(array2);

    // The arrays must have the same length before elements are compared pairwise.
    var expectedLength = array1.Length;
    Assert.Equal(expectedLength, array2.Length);

    var index = 0;
    while (index < expectedLength)
    {
        Assert.Equal(array1[index], array2[index]);
        index++;
    }
}

/// <summary>
/// Assert that two <see cref="Schema"/> objects are equal.
/// </summary>
/// <param name="schema1">A <see cref="Schema"/> object.</param>
/// <param name="schema2">A <see cref="Schema"/> object.</param>
public static void AssertEqual(Schema schema1, Schema schema2)
Copy link
Contributor

@glebuk glebuk Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Schema [](start = 39, length = 6)

Should Schema implement IEquatable?
https://stackoverflow.com/questions/8400028/comparing-two-instances-of-a-class #Closed

Copy link
Contributor Author

@rogancarr rogancarr Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be awesome, but probably not before 1.0.


In reply to: 256209052 [](ancestors = 256209052)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on implementing IEquatable. We should have an issue about it.


In reply to: 256209052 [](ancestors = 256209052)

{
Assert.NotNull(schema1);
Assert.NotNull(schema2);
Copy link
Contributor

@glebuk glebuk Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically if they are both null they are equal... #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically yes. But internally in our library we consider nulls to not equal each other. :shruggy-emoticon:


In reply to: 256209640 [](ancestors = 256209640)


// Both schemas must expose the same number of columns.
Assert.Equal(schema1.Count, schema2.Count);

// Walk the two schemas in lockstep and compare the columns position by position.
foreach (var schemaPair in schema1.Zip(schema2, Tuple.Create))
{
    Assert.Equal(schemaPair.Item1.Name, schemaPair.Item2.Name);
    Assert.Equal(schemaPair.Item1.Index, schemaPair.Item2.Index);
    Assert.Equal(schemaPair.Item1.IsHidden, schemaPair.Item2.IsHidden);

    // Compare the metadata schema of the first column against that of the second.
    // Can probably do a better comparison of Metadata.
    AssertEqual(schemaPair.Item1.Metadata.Schema, schemaPair.Item2.Metadata.Schema);

    // Column types match when they are equal or, failing that, share a raw type.
    Assert.True((schemaPair.Item1.Type == schemaPair.Item2.Type) ||
        (schemaPair.Item1.Type.RawType == schemaPair.Item2.Type.RawType));
}
}

/// <summary>
/// Assert that two <see cref="AllTypes"/> enumerables are equal.
/// </summary>
/// <remarks>
/// Rows are compared one pair at a time while both sequences are being enumerated;
/// neither sequence is buffered.
/// </remarks>
/// <param name="data1">An enumerable of <see cref="AllTypes"/>.</param>
/// <param name="data2">An enumerable of <see cref="AllTypes"/>.</param>
public static void AssertEqual(IEnumerable<AllTypes> data1, IEnumerable<AllTypes> data2)
{
    Assert.NotNull(data1);
    Assert.NotNull(data2);
    Assert.Equal(data1.Count(), data2.Count());

    // Advance both sequences in lockstep, stopping as soon as either runs out.
    using (var firstRows = data1.GetEnumerator())
    using (var secondRows = data2.GetEnumerator())
    {
        while (firstRows.MoveNext() && secondRows.MoveNext())
        {
            AssertEqual(firstRows.Current, secondRows.Current);
        }
    }
}

/// <summary>
/// Assert that two <see cref="AllTypes"/> rows are equal, property by property.
/// </summary>
/// <param name="allTypes1">An <see cref="AllTypes"/>.</param>
/// <param name="allTypes2">An <see cref="AllTypes"/>.</param>
public static void AssertEqual(AllTypes allTypes1, AllTypes allTypes2)
{
    var first = allTypes1;
    var second = allTypes2;

    // Label and feature array.
    Assert.Equal(first.Label, second.Label);
    AssertEqual(first.Features, second.Features);

    // I* / U* properties.
    Assert.Equal(first.I1, second.I1);
    Assert.Equal(first.U1, second.U1);
    Assert.Equal(first.I2, second.I2);
    Assert.Equal(first.U2, second.U2);
    Assert.Equal(first.I4, second.I4);
    Assert.Equal(first.U4, second.U4);
    Assert.Equal(first.I8, second.I8);
    Assert.Equal(first.U8, second.U8);

    // R* properties.
    Assert.Equal(first.R4, second.R4);
    Assert.Equal(first.R8, second.R8);

    // Tx is compared through its string form.
    Assert.Equal(first.Tx.ToString(), second.Tx.ToString());

    // The remaining properties are compared with their own Equals.
    Assert.True(first.Ts.Equals(second.Ts));
    Assert.True(first.Dt.Equals(second.Dt));
    Assert.True(first.Dz.Equals(second.Dz));
    Assert.True(first.Ug.Equals(second.Ug));
}

/// <summary>
/// Check that a <see cref="RegressionMetrics"/> object is valid.
/// </summary>
/// <param name="metrics">The metrics object.</param>
public static void CheckMetrics(RegressionMetrics metrics)
{
// Perform sanity checks on the metrics
// Perform sanity checks on the metrics.
Assert.True(metrics.Rms >= 0);
Assert.True(metrics.L1 >= 0);
Assert.True(metrics.L2 >= 0);
Expand Down
139 changes: 139 additions & 0 deletions test/Microsoft.ML.Functional.Tests/DataIO.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.Functional.Tests.Datasets;
using Microsoft.ML.TestFramework;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.ML.Functional.Tests
{
/// <summary>
/// Test data input and output formats.
/// </summary>
public class DataIO : BaseTestClass
{
// Separators to test
private readonly char[] _separators;

public DataIO(ITestOutputHelper output)
    : base(output)
{
    // SaveAsText expects a "space, tab, comma, semicolon, or bar"; exercise each of them.
    _separators = new[] { ' ', '\t', ',', ';', '|' };
}

/// <summary>
/// Read from Enumerable: In-memory objects can be read as enumerables into an IDataView.
/// </summary>
[Fact]
public void ReadFromIEnumerable()
{
    var context = new MLContext(seed: 1, conc: 1);

    // Wrap the generated rows in a data view and verify its schema.
    var dataView = context.Data.ReadFromEnumerable(AllTypes.GenerateDataset());
    Common.AssertAllTypesDataset(dataView);
}

/// <summary>
/// Export to Enumerable: IDataViews can be exported as enumerables of a class.
/// </summary>
[Fact]
public void ExportToIEnumerable()
{
    var context = new MLContext(seed: 1, conc: 1);

    // Start from in-memory rows and wrap them in a data view.
    var originalRows = AllTypes.GenerateDataset();
    var dataView = context.Data.ReadFromEnumerable(originalRows);

    // Convert the view back into typed rows and compare with what went in.
    var exportedRows = context.CreateEnumerable<AllTypes>(dataView, true);
    Common.AssertEqual(originalRows, exportedRows);
}

/// <summary>
/// Write to and read from a delimited file: Any DataKind can be written to and read from a delimited file.
/// </summary>
/// <remarks>
/// Tests the roundtrip through a file using explicit schematization.
/// </remarks>
[Fact]
public void WriteToAndReadFromADelimetedFile()
{
    var context = new MLContext(seed: 1, conc: 1);
    var original = context.Data.ReadFromEnumerable(AllTypes.GenerateDataset());

    for (var i = 0; i < _separators.Length; i++)
    {
        var delimiter = _separators[i];

        // Write the dataset out with this delimiter, then load it back through the AllTypes text loader.
        var path = SerializeDatasetToFile(context, original, delimiter);
        var reloaded = AllTypes.GetTextLoader(context, delimiter).Read(path);

        Common.AssertAllTypesDatasetsAreEqual(context, original, reloaded);
    }
}

/// <summary>
/// Write to and read from a delimited file: Schematized data of any DataKind can be read from a delimited file.
/// </summary>
/// <remarks>
/// Tests the roundtrip through a file using schema inference.
Copy link
Member

@sfilipi sfilipi Feb 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hrough [](start = 32, length = 6)

typo #Resolved

/// </remarks>
[Fact]
public void WriteToAndReadASchemaFromADelimitedFile()
{
    var context = new MLContext(seed: 1, conc: 1);
    var original = context.Data.ReadFromEnumerable(AllTypes.GenerateDataset());

    for (var i = 0; i < _separators.Length; i++)
    {
        var delimiter = _separators[i];

        // Write the dataset out with this delimiter, then read it back as AllTypes rows.
        var path = SerializeDatasetToFile(context, original, delimiter);
        var reloaded = context.Data.ReadFromTextFile<AllTypes>(path, hasHeader: true, separatorChar: delimiter);

        Common.AssertAllTypesDatasetsAreEqual(context, original, reloaded);
    }
}

/// <summary>
/// Write to and read from a binary file: a dataset saved in binary form can be read back unchanged.
/// </summary>
[Fact]
public void WriteAndReadAFromABinaryFile()
{
    var context = new MLContext(seed: 1, conc: 1);
    var original = context.Data.ReadFromEnumerable(AllTypes.GenerateDataset());

    // Save the dataset in binary form, then load it back from the same path.
    var path = SerializeDatasetToBinaryFile(context, original);
    var reloaded = context.Data.ReadFromBinary(path);

    Common.AssertAllTypesDatasetsAreEqual(context, original, reloaded);
}

/// <summary>
/// Save a dataset as delimited text, with a header row, to a randomly named output file.
/// </summary>
/// <param name="mlContext">The ML Context used to save the data.</param>
/// <param name="data">The dataset to save.</param>
/// <param name="separator">The column separator to write.</param>
/// <returns>The path of the file that was written.</returns>
private string SerializeDatasetToFile(MLContext mlContext, IDataView data, char separator)
{
    var outputPath = GetOutputPath(Path.GetRandomFileName());

    // The stream is closed before the path is returned.
    using (var stream = File.Create(outputPath))
    {
        mlContext.Data.SaveAsText(data, stream, separatorChar: separator, headerRow: true);
    }

    return outputPath;
}

/// <summary>
/// Save a dataset in binary form to a randomly named output file.
/// </summary>
/// <param name="mlContext">The ML Context used to save the data.</param>
/// <param name="data">The dataset to save.</param>
/// <returns>The path of the file that was written.</returns>
private string SerializeDatasetToBinaryFile(MLContext mlContext, IDataView data)
{
    var outputPath = GetOutputPath(Path.GetRandomFileName());

    // The stream is closed before the path is returned.
    using (var stream = File.Create(outputPath))
    {
        mlContext.Data.SaveAsBinary(data, stream);
    }

    return outputPath;
}
}
}
Loading