Skip to content

CollectionDataSource (train on top of memory collection instead of loading data from file) #106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
May 15, 2018
1 change: 1 addition & 0 deletions ZBaselines/Common/EntryPoints/core_ep-list.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
Data.DataViewReference Pass dataview from memory to experiment Microsoft.ML.Runtime.EntryPoints.InMemoryDataView ImportData Microsoft.ML.Runtime.EntryPoints.InMemoryDataView+Input Microsoft.ML.Runtime.EntryPoints.InMemoryDataView+Output
Data.IDataViewArrayConverter Create and array variable Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro MakeArray Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIDataViewInput Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIDataViewOutput
Data.PredictorModelArrayConverter Create and array variable Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro MakeArray Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIPredictorModelInput Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIPredictorModelOutput
Data.TextLoader Import a dataset from a text file Microsoft.ML.Runtime.EntryPoints.ImportTextData ImportText Microsoft.ML.Runtime.EntryPoints.ImportTextData+Input Microsoft.ML.Runtime.EntryPoints.ImportTextData+Output
Expand Down
26 changes: 26 additions & 0 deletions ZBaselines/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,31 @@
{
"EntryPoints": [
{
"Name": "Data.DataViewReference",
"Desc": "Pass dataview from memory to experiment",
"FriendlyName": null,
"ShortName": null,
"Inputs": [
{
"Name": "Data",
"Type": "DataView",
"Desc": "Pointer to IDataView in memory",
"Aliases": [
"data"
],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
}
],
"Outputs": [
{
"Name": "Data",
"Type": "DataView",
"Desc": "The resulting data view"
}
]
},
{
"Name": "Data.IDataViewArrayConverter",
"Desc": "Create and array variable",
Expand Down
28 changes: 28 additions & 0 deletions src/Microsoft.ML/CSharpApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,22 @@ public Microsoft.ML.Data.TextLoader.Output Add(Microsoft.ML.Data.TextLoader inpu
return output;
}

/// <summary>
/// Adds a DataViewReference input to the experiment, creating the matching output object on the caller's behalf.
/// </summary>
/// <param name="input">The DataViewReference input to register.</param>
/// <returns>The freshly created output that was registered together with <paramref name="input"/>.</returns>
public Microsoft.ML.Data.DataViewReference.Output Add(Microsoft.ML.Data.DataViewReference input)
{
    var result = new Microsoft.ML.Data.DataViewReference.Output();
    Add(input, result);
    return result;
}

/// <summary>
/// Serializes the given input/output pair under the entry point name "Data.TextLoader"
/// and appends the result to the list of JSON nodes.
/// </summary>
/// <param name="input">The TextLoader input to serialize.</param>
/// <param name="output">The output object paired with <paramref name="input"/>.</param>
public void Add(Microsoft.ML.Data.TextLoader input, Microsoft.ML.Data.TextLoader.Output output)
{
    var node = Serialize("Data.TextLoader", input, output);
    _jsonNodes.Add(node);
}

/// <summary>
/// Serializes the given input/output pair under the entry point name "Data.DataViewReference"
/// and appends the result to the list of JSON nodes.
/// </summary>
/// <param name="input">The DataViewReference input to serialize.</param>
/// <param name="output">The output object paired with <paramref name="input"/>.</param>
public void Add(Microsoft.ML.Data.DataViewReference input, Microsoft.ML.Data.DataViewReference.Output output)
{
    var node = Serialize("Data.DataViewReference", input, output);
    _jsonNodes.Add(node);
}
public Microsoft.ML.Models.AnomalyDetectionEvaluator.Output Add(Microsoft.ML.Models.AnomalyDetectionEvaluator input)
{
var output = new Microsoft.ML.Models.AnomalyDetectionEvaluator.Output();
Expand Down Expand Up @@ -1311,6 +1322,23 @@ public sealed partial class TextLoader
public string CustomSchema { get; set; }


/// <summary>
/// Holds the output variable of the enclosing entry point: a single data view variable.
/// </summary>
public sealed class Output
{
/// <summary>
/// The resulting data view
/// </summary>
public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

}
}

public sealed partial class DataViewReference
{
/// <summary>
/// Pointer to IDataView in memory
/// </summary>
public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

public sealed class Output
{
/// <summary>
Expand Down
65 changes: 65 additions & 0 deletions src/Microsoft.ML/MemoryCollection.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.EntryPoints;
using Microsoft.ML.Runtime.Internal.Utilities;

namespace Microsoft.ML
Copy link
Contributor

@glebuk glebuk May 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.ML [](start = 19, length = 3)

ML.Data #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TextLoader is part of just ML, should I change it as well?


In reply to: 187221578 [](ancestors = 187221578)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question... it seems they should both be in Data. The argument went roughly like this: can a user have an out-of-the-box experience with just the ML namespace?
I guess we can keep it in ML for now.


In reply to: 187221799 [](ancestors = 187221799,187221578)

Copy link
Member

@codemzs codemzs May 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please move to Data. My PR will move TextLoader to ML.Data. #Resolved

{
public class MemoryCollection<TInput> : ILearningPipelineLoader
where TInput : class
{
/// <summary>
/// Adds a new DataViewReference entry point to the experiment and wraps its output
/// variable in a pipeline data step.
/// </summary>
/// <param name="previousStep">Asserted to be null.</param>
/// <param name="experiment">The experiment the entry point is added to.</param>
/// <returns>A data step exposing the output variable of the added entry point.</returns>
public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
{
    Contracts.Assert(previousStep == null);

    // The entry point is kept in a field because SetInput later binds the
    // actual data view to its Data variable.
    _dataViewEntryPoint = new Data.DataViewReference();
    var entryPointOutput = experiment.Add(_dataViewEntryPoint);

    var step = new MemoryCollectionPipelineStep(entryPointOutput.Data);
    return step;
}

// Source data when the instance was built from an IList; not assigned by the IEnumerable constructor.
private readonly IList<TInput> _listCollection;
// Source data when the instance was built from an IEnumerable; not assigned by the IList constructor.
private readonly IEnumerable<TInput> _enumerableCollection;

// Entry point created in ApplyStep; SetInput binds _dataView to its Data variable.
private Data.DataViewReference _dataViewEntryPoint;
// Data view created from the source collection in SetInput.
private IDataView _dataView;

public MemoryCollection(IList<TInput> collection)
Copy link
Contributor Author

@Ivanidzo4ka Ivanidzo4ka May 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MemoryCollection [](start = 15, length = 16)

public constructor required comments. #Resolved

{
Contracts.CheckParamValue(Utils.Size(collection) > 0, collection, nameof(collection), "Must be non-empty");
_listCollection = collection;
}

public MemoryCollection(IEnumerable<TInput> collection)
{
Contracts.CheckParamValue(collection != null, collection, nameof(collection), "Must be non-null");
Copy link
Contributor

@TomFinley TomFinley May 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CheckParamValue(collection != null, collection [](start = 22, length = 46)

I'm sorry, I'm silly... I didn't mean CheckParamValue, I meant CheckValue. That'll simplify this a bit. #Closed

_enumerableCollection = collection;
}

public void SetInput(IHostEnvironment env, Experiment experiment)
{
// Each constructor assigns exactly one of the two backing collections, so
// build the data view from whichever one is present. The streaming view
// must be created from _enumerableCollection: _listCollection is null on
// that path.
if (_listCollection != null)
    _dataView = ComponentCreation.CreateDataView(env, _listCollection);
if (_enumerableCollection != null)
    _dataView = ComponentCreation.CreateStreamingDataView(env, _enumerableCollection);
Copy link
Contributor

@glebuk glebuk May 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_listCollection [](start = 75, length = 15)

Note that you are not using _enumerableCollection here. Likely a bug
Do we really need two fields here, one for list another for enumerable if only one is used at a time? Isn't a list an IEnumerable anyway? Why not just always store an _enumerable and just use that all the time?
If we want to use two (one is shuffleble and other is not) then we should split into two classes. Seem like this class tries to be two classes at once. #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for pointing that out. I updated the test to call both implementations, and I also split the class into two classes to get rid of the if condition.


In reply to: 187225560 [](ancestors = 187225560)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still want to have two different implementation for IList and IEnumerable, so I would prefer to keep it that way, but with two separate classes


In reply to: 187395933 [](ancestors = 187395933,187225560)

env.CheckValue(_dataView, nameof(_dataView));
experiment.SetInput(_dataViewEntryPoint.Data, _dataView);
}

private class MemoryCollectionPipelineStep : ILearningPipelineDataStep
{
public MemoryCollectionPipelineStep(Var<IDataView> data)
{
Data = data;
Model = null;
Copy link
Contributor

@TomFinley TomFinley May 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unnecessary, the default value is null.

That's fine, but I'd go one step further... What I'd do is change the public Var<ITransformModel> Model { get; } below to public Var<ITransformModel> Model => null;, that way you avoid having any backing field whatsoever. (Plus you save three characters, which totally makes it worth it. :D ) #Closed

}

public Var<IDataView> Data { get; }
public Var<ITransformModel> Model { get; }
}
}
}
37 changes: 37 additions & 0 deletions src/Microsoft.ML/Runtime/EntryPoints/InMemoryDataView.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.CommandLine;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.EntryPoints;

[assembly: LoadableClass(typeof(void), typeof(InMemoryDataView), null, typeof(SignatureEntryPointModule), "InMemoryDataView")]
namespace Microsoft.ML.Runtime.EntryPoints
{
public class InMemoryDataView
Copy link
Contributor

@glebuk glebuk May 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

InMemoryDataView [](start = 17, length = 16)

Where is this being used? I don't see any references to this class in code or tests. #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a entrypoint :) Data.DataViewReference it get used in MemoryCollection.cs (At least it entry point wrapper)


In reply to: 187222065 [](ancestors = 187222065)

Copy link
Contributor

@glebuk glebuk May 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, silly me, I was looking for "InMemoryDataView" not "Data.DataViewReference"


In reply to: 187222365 [](ancestors = 187222365,187222065)

Copy link
Contributor

@glebuk glebuk May 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

InMemoryDataView [](start = 17, length = 16)

So to clarify, all this entry point does is turn its input into its output? Should we name it accordingly, something like a data passthrough?
Because in reality, does this EP cares what kind of idv is input? #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it doesn't. but I wouldn't call it DataPass entrypoint either, since it allow you pass only dataview from you code to experiment, and DataViewReference is already taken by entrypoint class.
DataViewPasser?


In reply to: 187240561 [](ancestors = 187240561)

Copy link
Contributor

@glebuk glebuk May 14, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not DataViewReference or DataViewReferenceEp? Seems like very unrelated class name to the entrypoint name.


In reply to: 187395356 [](ancestors = 187395356,187240561)

{
public sealed class Input
{
[Argument(ArgumentType.Required, ShortName = "data", HelpText = "Pointer to IDataView in memory", SortOrder = 1)]
Copy link
Contributor

@TomFinley TomFinley May 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ShortName = "data" [](start = 45, length = 18)

Since shortname is same as longname, you can safely omit. #Resolved

public IDataView Data;
}

/// <summary>
/// Output of the entry point: the data view that ImportData copies from its input.
/// </summary>
public sealed class Output
{
[TlcModule.Output(Desc = "The resulting data view", SortOrder = 1)]
public IDataView Data;
}

/// <summary>
/// Entry point that returns the data view supplied in <paramref name="input"/> unchanged as its output.
/// </summary>
/// <param name="env">Host environment; checked for null and used to register a host named "DataViewReference".</param>
/// <param name="input">Entry point arguments carrying the data view; checked for null and validated.</param>
/// <returns>An <see cref="Output"/> whose Data is the same instance as <c>input.Data</c>.</returns>
[TlcModule.EntryPoint(Name = "Data.DataViewReference", Desc = "Pass dataview from memory to experiment")]
public static Output ImportData(IHostEnvironment env, Input input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("DataViewReference");
    env.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    // No transformation is applied: the very same data view is passed through.
    var result = new Output();
    result.Data = input.Data;
    return result;
}
}
}
201 changes: 201 additions & 0 deletions test/Microsoft.ML.Tests/MemoryCollectionTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.TestFramework;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms;
using System.Collections.Generic;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.ML.EntryPoints.Tests
{
public class MemoryCollectionTests : BaseTestClass
{
/// <summary>
/// Passes the xUnit output helper on to the base test class.
/// </summary>
public MemoryCollectionTests(ITestOutputHelper output)
    : base(output)
{
}

/// <summary>
/// Verifies that MemoryCollection can be constructed from a non-empty list or array,
/// and that construction throws for null, an empty list and an empty array.
/// </summary>
[Fact]
public void CheckConstructor()
{
    Assert.NotNull(new MemoryCollection<Input>(new List<Input>() { new Input { Number1 = 1, String1 = "1" } }));
    Assert.NotNull(new MemoryCollection<Input>(new Input[1] { new Input { Number1 = 1, String1 = "1" } }));

    // ThrowsAny fails the test with a descriptive message when nothing is thrown,
    // replacing the hand-rolled try/catch-and-flag pattern. The exception type is
    // fully qualified because this file does not import System.
    Assert.ThrowsAny<System.Exception>(() => new MemoryCollection<Input>(null));
    Assert.ThrowsAny<System.Exception>(() => new MemoryCollection<Input>(new List<Input>()));
    Assert.ThrowsAny<System.Exception>(() => new MemoryCollection<Input>(new Input[0]));
}

[Fact]
public void CanSuccessfullyApplyATransform()
{
var collection = new MemoryCollection<Input>(new List<Input>() { new Input { Number1 = 1, String1 = "1" } });
Copy link
Contributor

@TomFinley TomFinley May 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MemoryCollection [](start = 33, length = 23)

Just an observation... if we had instead structured this as a static utility method somewhere, then we could avoid having the double-specification of the Input class, as it would have been inferred by the compiler.

So: if we had a input type MyAwesomeInputType, then instead of MemoryCollection<MyAwesomeInputType>(new MyAwesomeInputType[] {...}), we would have MemoryCollection.Create(new MyAwesomeInputType[] {...}) since the compiler could have done the work of inferring the type and whatnot. #Closed

using (var environment = new TlcEnvironment())
{
Experiment experiment = environment.CreateExperiment();
ILearningPipelineDataStep output = collection.ApplyStep(null, experiment) as ILearningPipelineDataStep;

Assert.NotNull(output.Data);
Copy link
Contributor

@TomFinley TomFinley May 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assert.NotNull(output.Data); [](start = 16, length = 28)

If you really meant the as above, you should first Assert.NonNull on output. If you didn't mean the as above, then you should do a direct () style cast to ILearningPipelineDataStep. The only reason to use an as style cast is if you are entertaining the possibility that it might not implement that interface, in which case, given that this is a test, you should test that. (And if it wasn't a test, you would Contracts.Check* it.) #Closed

Assert.NotNull(output.Data.VarName);
Assert.Null(output.Model);
}
}

/// <summary>
/// Runs an experiment fed by a three-row MemoryCollection and verifies that the
/// resulting data view yields exactly those rows, in order, with matching values.
/// </summary>
[Fact]
public void CanSuccessfullyEnumerated()
{
    var rows = new List<Input>() {
        new Input { Number1 = 1, String1 = "1" },
        new Input { Number1 = 2, String1 = "2" },
        new Input { Number1 = 3, String1 = "3" }
    };
    var collection = new MemoryCollection<Input>(rows);

    using (var environment = new TlcEnvironment())
    {
        Experiment experiment = environment.CreateExperiment();

        // Direct cast instead of 'as': a step of the wrong type fails right here with
        // an InvalidCastException rather than a NullReferenceException further down.
        var output = (ILearningPipelineDataStep)collection.ApplyStep(null, experiment);

        experiment.Compile();
        collection.SetInput(environment, experiment);
        experiment.Run();

        IDataView data = experiment.GetOutput(output.Data);
        Assert.NotNull(data);

        using (var cursor = data.GetRowCursor((a => true)))
        {
            var idGetter = cursor.GetGetter<float>(0);
            var textGetter = cursor.GetGetter<DvText>(1);

            // The cursor must produce one row per source item, in source order.
            foreach (Input expected in rows)
            {
                Assert.True(cursor.MoveNext());

                float id = 0;
                idGetter(ref id);
                Assert.Equal(expected.Number1, id);

                DvText text = new DvText();
                textGetter(ref text);
                Assert.Equal(expected.String1, text.ToString());
            }

            // ...and nothing beyond the source rows.
            Assert.False(cursor.MoveNext());
        }
    }
}

/// <summary>
/// Smoke test: builds a pipeline on top of an in-memory collection, trains it and
/// requests one prediction. Nothing is asserted about the prediction itself; the
/// test passes as long as no step throws.
/// </summary>
[Fact]
public void CanTrain()
{
    var pipeline = new LearningPipeline();

    var trainingRows = new List<IrisData>()
    {
        new IrisData { SepalLength = 1f, SepalWidth = 1f, PetalLength = 0.3f, PetalWidth = 5.1f, Label = 1 },
        new IrisData { SepalLength = 1f, SepalWidth = 1f, PetalLength = 0.3f, PetalWidth = 5.1f, Label = 1 },
        new IrisData { SepalLength = 1.2f, SepalWidth = 0.5f, PetalLength = 0.3f, PetalWidth = 5.1f, Label = 0 }
    };
    var collection = new MemoryCollection<IrisData>(trainingRows);

    pipeline.Add(collection);
    pipeline.Add(new ColumnConcatenator(outputColumn: "Features",
        "SepalLength", "SepalWidth", "PetalLength", "PetalWidth"));
    pipeline.Add(new StochasticDualCoordinateAscentClassifier());

    PredictionModel<IrisData, IrisPrediction> model = pipeline.Train<IrisData, IrisPrediction>();

    var sample = new IrisData()
    {
        SepalLength = 3.3f,
        SepalWidth = 1.6f,
        PetalLength = 0.2f,
        PetalWidth = 5.1f,
    };
    IrisPrediction prediction = model.Predict(sample);
}

/// <summary>
/// Two-field row type used by the constructor, transform and enumeration tests.
/// </summary>
public class Input
{
// Read back by the enumeration test through the float getter for column 0.
[Column("0")]
public float Number1;

// Read back by the enumeration test through the DvText getter for column 1.
[Column("1")]
public string String1;
}

/// <summary>
/// Row type used by CanTrain both as training data and as prediction input.
/// </summary>
// NOTE(review): the Column("n") attributes presumably map each field to column ordinal n - confirm.
public class IrisData
{
[Column("0")]
public float Label;

[Column("1")]
public float SepalLength;

[Column("2")]
public float SepalWidth;

[Column("3")]
public float PetalLength;

[Column("4")]
public float PetalWidth;
}

/// <summary>
/// Prediction type used by CanTrain as the output type of the trained model.
/// </summary>
public class IrisPrediction
{
// Tagged with the column name "Score"; presumably filled from that output column - confirm.
[ColumnName("Score")]
public float[] PredictedLabels;
}

}
}