-
Notifications
You must be signed in to change notification settings - Fork 1.9k
CollectionDataSource (train on top of memory collection instead of loading data from file) #106
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
f69d659
55b6e46
b166f05
a1761b1
12d1b9e
ebcf448
110e205
62ab575
1da42ca
0cac7dc
ca9c031
d78afa3
ab86b09
ebe6f33
04ff469
9698d19
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.Collections.Generic; | ||
using Microsoft.ML.Runtime; | ||
using Microsoft.ML.Runtime.Api; | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.EntryPoints; | ||
using Microsoft.ML.Runtime.Internal.Utilities; | ||
|
||
namespace Microsoft.ML.Data | ||
{ | ||
/// <summary> | ||
/// Factory for pipeline loaders that feed an in-memory collection into a learning pipeline, | ||
/// so that training can run on objects already held by the caller instead of data read from a file. | ||
/// </summary> | ||
public static class CollectionDataSource | ||
{ | ||
/// <summary> | ||
/// Creates a pipeline data source on top of a list. Supports shuffling. | ||
/// </summary> | ||
/// <param name="data">Non-empty list of training examples.</param> | ||
/// <returns>A loader that can be added as the first step of a learning pipeline.</returns> | ||
public static ILearningPipelineLoader Create<T>(IList<T> data) where T : class | ||
{ | ||
return new ListDataSource<T>(data); | ||
} | ||
|
||
/// <summary> | ||
/// Creates a pipeline data source on top of an enumerable. It can't be shuffled. | ||
/// </summary> | ||
/// <param name="data">Non-null sequence of training examples.</param> | ||
/// <returns>A loader that can be added as the first step of a learning pipeline.</returns> | ||
public static ILearningPipelineLoader Create<T>(IEnumerable<T> data) where T : class | ||
{ | ||
return new EnumerableDataSource<T>(data); | ||
} | ||
|
||
/// <summary> | ||
/// Shared loader logic: registers a Data.DataViewReference entry point in the experiment | ||
/// and later binds the data view produced by <see cref="GetDataView"/> to it. | ||
/// </summary> | ||
private abstract class BaseDataSource<TInput> : ILearningPipelineLoader where TInput : class | ||
{ | ||
// Entry point node created by ApplyStep; stays null until ApplyStep has run. | ||
private Data.DataViewReference _dataViewEntryPoint; | ||
// Data view built from the wrapped collection when SetInput runs. | ||
private IDataView _dataView; | ||
|
||
/// <summary> | ||
/// Adds the data view reference entry point to <paramref name="experiment"/>. | ||
/// A loader is expected to be the first step, so <paramref name="previousStep"/> must be null. | ||
/// </summary> | ||
public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) | ||
{ | ||
Contracts.Assert(previousStep == null); | ||
Contracts.CheckValue(experiment, nameof(experiment)); | ||
_dataViewEntryPoint = new Data.DataViewReference(); | ||
var importOutput = experiment.Add(_dataViewEntryPoint); | ||
return new CollectionDataSourcePipelineStep(importOutput.Data); | ||
} | ||
|
||
/// <summary> | ||
/// Builds the data view for the wrapped collection and sets it as the input | ||
/// of the entry point that <see cref="ApplyStep"/> added to the experiment. | ||
/// </summary> | ||
public void SetInput(IHostEnvironment environment, Experiment experiment) | ||
{ | ||
Contracts.CheckValue(environment, nameof(environment)); | ||
environment.CheckValue(experiment, nameof(experiment)); | ||
// Without a prior ApplyStep there is no entry point to bind the data to; | ||
// fail with a clear message instead of a NullReferenceException below. | ||
environment.Check(_dataViewEntryPoint != null, "ApplyStep must be called before SetInput"); | ||
_dataView = GetDataView(environment); | ||
environment.CheckValue(_dataView, nameof(_dataView)); | ||
experiment.SetInput(_dataViewEntryPoint.Data, _dataView); | ||
} | ||
|
||
/// <summary> | ||
/// Wraps the underlying collection into an <see cref="IDataView"/>. | ||
/// </summary> | ||
public abstract IDataView GetDataView(IHostEnvironment environment); | ||
} | ||
|
||
/// <summary> | ||
/// Loader over an <see cref="IEnumerable{T}"/>; exposes it as a streaming data view. | ||
/// </summary> | ||
private sealed class EnumerableDataSource<TInput> : BaseDataSource<TInput> where TInput : class | ||
{ | ||
private readonly IEnumerable<TInput> _enumerableCollection; | ||
|
||
public EnumerableDataSource(IEnumerable<TInput> collection) | ||
{ | ||
Contracts.CheckValue(collection, nameof(collection)); | ||
_enumerableCollection = collection; | ||
} | ||
|
||
public override IDataView GetDataView(IHostEnvironment environment) | ||
{ | ||
return ComponentCreation.CreateStreamingDataView(environment, _enumerableCollection); | ||
} | ||
} | ||
|
||
/// <summary> | ||
/// Loader over an <see cref="IList{T}"/>; the list must contain at least one element. | ||
/// </summary> | ||
private sealed class ListDataSource<TInput> : BaseDataSource<TInput> where TInput : class | ||
{ | ||
private readonly IList<TInput> _listCollection; | ||
|
||
public ListDataSource(IList<TInput> collection) | ||
{ | ||
// Rejects both a null and an empty list with the same message. | ||
Contracts.CheckParamValue(Utils.Size(collection) > 0, collection, nameof(collection), "Must be non-empty"); | ||
_listCollection = collection; | ||
} | ||
|
||
public override IDataView GetDataView(IHostEnvironment environment) | ||
{ | ||
return ComponentCreation.CreateDataView(environment, _listCollection); | ||
} | ||
} | ||
|
||
/// <summary> | ||
/// Pipeline step returned by the loader: carries the data variable and no transform model. | ||
/// </summary> | ||
private sealed class CollectionDataSourcePipelineStep : ILearningPipelineDataStep | ||
{ | ||
public CollectionDataSourcePipelineStep(Var<IDataView> data) | ||
{ | ||
Data = data; | ||
} | ||
|
||
public Var<IDataView> Data { get; } | ||
public Var<ITransformModel> Model => null; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.ML.Runtime; | ||
using Microsoft.ML.Runtime.CommandLine; | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.EntryPoints; | ||
|
||
[assembly: LoadableClass(typeof(void), typeof(InMemoryDataView), null, typeof(SignatureEntryPointModule), "InMemoryDataView")] | ||
namespace Microsoft.ML.Runtime.EntryPoints | ||
{ | ||
public class InMemoryDataView | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Where is this being used? I don't see any references to this class in code or tests. #Closed There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an entrypoint :) Data.DataViewReference; it gets used in MemoryCollection.cs (at least its entry point wrapper does). In reply to: 187222065 [](ancestors = 187222065) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, silly me, I was looking for "InMemoryDataView", not "Data.DataViewReference". In reply to: 187222365 [](ancestors = 187222365,187222065) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
So to clarify, all this entrypoint does is turn input into output? Should we call it as such, something like a data passthrough or something? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, it doesn't. But I wouldn't call it a DataPass entrypoint either, since it allows you to pass only a dataview from your code to the experiment, and DataViewReference is already taken by the entrypoint class. In reply to: 187240561 [](ancestors = 187240561) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not DataViewReference or DataViewReferenceEp? The class name seems very unrelated to the entrypoint name. In reply to: 187395356 [](ancestors = 187395356,187240561) |
||
{ | ||
/// <summary> | ||
/// Arguments of the Data.DataViewReference entry point: the data view to hand over. | ||
/// </summary> | ||
public sealed class Input | ||
{ | ||
// Required argument; the data view instance that ImportData returns unchanged in its output. | ||
[Argument(ArgumentType.Required, HelpText = "Pointer to IDataView in memory", SortOrder = 1)] | ||
public IDataView Data; | ||
} | ||
|
||
/// <summary> | ||
/// Result of the Data.DataViewReference entry point. | ||
/// </summary> | ||
public sealed class Output | ||
{ | ||
// Set by ImportData to the very same data view instance that was supplied as input. | ||
[TlcModule.Output(Desc = "The resulting data view", SortOrder = 1)] | ||
public IDataView Data; | ||
} | ||
|
||
/// <summary> | ||
/// Entry point that exposes an in-memory data view to an experiment: after validating | ||
/// its arguments it returns the supplied data view unchanged as the output. | ||
/// </summary> | ||
/// <param name="env">Host environment; must not be null.</param> | ||
/// <param name="input">Entry point arguments carrying the data view; must not be null.</param> | ||
/// <returns>An <see cref="Output"/> whose Data is the same instance as the input's Data.</returns> | ||
[TlcModule.EntryPoint(Name = "Data.DataViewReference", Desc = "Pass dataview from memory to experiment")] | ||
public static Output ImportData(IHostEnvironment env, Input input) | ||
{ | ||
Contracts.CheckValue(env, nameof(env)); | ||
var registeredHost = env.Register("DataViewReference"); | ||
env.CheckValue(input, nameof(input)); | ||
EntryPointUtils.CheckInputArgs(registeredHost, input); | ||
// Nothing is copied or transformed: the output refers to the caller's data view. | ||
var result = new Output(); | ||
result.Data = input.Data; | ||
return result; | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add top-level XML help: what is this class, where is it used, and what's the purpose of its existence? #Closed