-
Notifications
You must be signed in to change notification settings - Fork 1.9k
API scenarios implementation with Estimators #688
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
e05ca2d
Squashed commit of the following:
8fdeae8
add extensibility, multithread prediction and visibility
80949a0
PR comments
6072b20
Some fixes too
48dbe68
Some code quality improvements (more to come)
7bd3e9b
Code quality
aee4720
Added OVA
fdb2f00
Added introspective training example.
007b0ce
Fixed OVA normalization
1d54d46
Merged from master
b3b9fc1
Merge remote-tracking branch 'upstream/master' into feature/api-est-s…
1b232d2
Merge
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.ML.Runtime; | ||
using Microsoft.ML.Runtime.Data; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
|
||
namespace Microsoft.ML.Core.Data | ||
{ | ||
/// <summary> | ||
/// A set of 'requirements' on the incoming schema, as well as a set of 'promises' about the outgoing schema. | ||
/// This is more relaxed than the proper <see cref="ISchema"/>, since it's only a subset of the columns, | ||
/// and also since it doesn't specify exact <see cref="ColumnType"/>'s for vectors and keys. | ||
/// </summary> | ||
public sealed class SchemaShape | ||
{ | ||
/// <summary> | ||
/// The columns of this shape. This is the very array passed to the constructor (it is not copied). | ||
/// </summary> | ||
public readonly Column[] Columns; | ||
|
||
/// <summary> | ||
/// The description of a single column of the shape: its name, vector kind, item kind, | ||
/// whether it is a key, and the names of the metadata kinds it carries. | ||
/// </summary> | ||
public sealed class Column | ||
{ | ||
/// <summary> | ||
/// The 'vector-ness' of a column, without the exact dimensions. | ||
/// </summary> | ||
public enum VectorKind | ||
{ | ||
/// <summary>The column is not a vector.</summary> | ||
Scalar, | ||
/// <summary>The column is a vector of known size.</summary> | ||
Vector, | ||
/// <summary>The column is a vector whose size is not known.</summary> | ||
VariableVector | ||
} | ||
|
||
/// <summary>The column name. Checked to be non-empty by the constructor.</summary> | ||
public readonly string Name; | ||
/// <summary>Whether the column is a scalar, a known-size vector or a variable-size vector.</summary> | ||
public readonly VectorKind Kind; | ||
/// <summary>The raw data kind of the column's item type.</summary> | ||
public readonly DataKind ItemKind; | ||
/// <summary>Whether the column's item type is a key.</summary> | ||
public readonly bool IsKey; | ||
/// <summary>The names of the metadata kinds of the column. The array is stored as passed in (not copied).</summary> | ||
public readonly string[] MetadataKinds; | ||
|
||
/// <summary> | ||
/// Create a column description. | ||
/// </summary> | ||
/// <param name="name">The column name; validated with <c>Contracts.CheckNonEmpty</c>.</param> | ||
/// <param name="vecKind">The vector kind of the column.</param> | ||
/// <param name="itemKind">The raw data kind of the item type.</param> | ||
/// <param name="isKey">Whether the item type is a key.</param> | ||
/// <param name="metadataKinds">The metadata kind names; validated with <c>Contracts.CheckValue</c>.</param> | ||
public Column(string name, VectorKind vecKind, DataKind itemKind, bool isKey, string[] metadataKinds) | ||
{ | ||
Contracts.CheckNonEmpty(name, nameof(name)); | ||
Contracts.CheckValue(metadataKinds, nameof(metadataKinds)); | ||
|
||
Name = name; | ||
Kind = vecKind; | ||
ItemKind = itemKind; | ||
IsKey = isKey; | ||
MetadataKinds = metadataKinds; | ||
} | ||
} | ||
|
||
/// <summary> | ||
/// Create a schema shape out of the given columns. | ||
/// </summary> | ||
/// <param name="columns">The columns of the shape; validated with <c>Contracts.CheckValue</c>. | ||
/// Individual elements are not validated here.</param> | ||
public SchemaShape(Column[] columns) | ||
{ | ||
Contracts.CheckValue(columns, nameof(columns)); | ||
Columns = columns; | ||
} | ||
|
||
/// <summary> | ||
/// Create a schema shape out of the fully defined schema. | ||
/// Hidden columns of <paramref name="schema"/> are skipped; every other column contributes one <see cref="Column"/>. | ||
/// </summary> | ||
/// <param name="schema">The schema to describe.</param> | ||
/// <returns>The shape with one column per non-hidden column of <paramref name="schema"/>, in schema order.</returns> | ||
public static SchemaShape Create(ISchema schema) | ||
{ | ||
Contracts.CheckValue(schema, nameof(schema)); | ||
var cols = new List<Column>(); | ||
|
||
for (int iCol = 0; iCol < schema.ColumnCount; iCol++) | ||
{ | ||
if (!schema.IsHidden(iCol)) | ||
{ | ||
Column.VectorKind vecKind; | ||
var type = schema.GetColumnType(iCol); | ||
// The known-size check goes first: anything that is still a vector after it is treated as variable-size. | ||
if (type.IsKnownSizeVector) | ||
vecKind = Column.VectorKind.Vector; | ||
else if (type.IsVector) | ||
vecKind = Column.VectorKind.VariableVector; | ||
else | ||
vecKind = Column.VectorKind.Scalar; | ||
|
||
// Kind and key-ness are taken from the item type, not from the column type itself. | ||
var kind = type.ItemType.RawKind; | ||
var isKey = type.ItemType.IsKey; | ||
|
||
// Only the names of the metadata kinds are retained; their types are dropped. | ||
var metadataNames = schema.GetMetadataTypes(iCol) | ||
.Select(kvp => kvp.Key) | ||
.ToArray(); | ||
cols.Add(new Column(schema.GetColumnName(iCol), vecKind, kind, isKey, metadataNames)); | ||
} | ||
} | ||
return new SchemaShape(cols.ToArray()); | ||
} | ||
|
||
/// <summary> | ||
/// Returns the column with a specified <paramref name="name"/>, and <c>null</c> if there is no such column. | ||
/// If several columns share the name, the first one in <see cref="Columns"/> is returned. | ||
/// </summary> | ||
public Column FindColumn(string name) | ||
{ | ||
Contracts.CheckValue(name, nameof(name)); | ||
return Columns.FirstOrDefault(x => x.Name == name); | ||
} | ||
|
||
// REVIEW: I think we should have an IsCompatible method to check if it's OK to use one schema shape | ||
// as an input to another schema shape. I started writing, but realized that there's more than one way to check for | ||
// the 'compatibility': as in, 'CAN be compatible' vs. 'WILL be compatible'. | ||
} | ||
|
||
/// <summary> | ||
/// The 'data reader' takes a certain kind of input and turns it into an <see cref="IDataView"/>. | ||
/// </summary> | ||
/// <typeparam name="TSource">The type of input the reader takes.</typeparam> | ||
public interface IDataReader<in TSource> | ||
{ | ||
/// <summary> | ||
/// Produce the data view from the specified input. | ||
/// Note that <see cref="IDataView"/>'s are lazy, so no actual reading happens here, just schema validation. | ||
/// </summary> | ||
/// <param name="input">The input to read from.</param> | ||
/// <returns>The data view over <paramref name="input"/>.</returns> | ||
IDataView Read(TSource input); | ||
|
||
/// <summary> | ||
/// The output schema of the reader. | ||
/// </summary> | ||
ISchema GetOutputSchema(); | ||
} | ||
|
||
/// <summary> | ||
/// Sometimes we need to 'fit' an <see cref="IDataReader{TSource}"/>. | ||
/// A DataReader estimator is the object that does it. | ||
/// </summary> | ||
/// <typeparam name="TSource">The type of input the estimator is fitted on, which is also the input type of the resulting reader.</typeparam> | ||
/// <typeparam name="TReader">The type of the data reader produced by <see cref="Fit"/>.</typeparam> | ||
public interface IDataReaderEstimator<in TSource, out TReader> | ||
where TReader : IDataReader<TSource> | ||
{ | ||
/// <summary> | ||
/// Train and return a data reader. | ||
/// | ||
/// REVIEW: you could consider letting the resulting reader take a different <typeparamref name="TSource"/>, but we don't have such components | ||
/// yet, so why complicate matters? | ||
/// </summary> | ||
/// <param name="input">The input to fit the reader on.</param> | ||
/// <returns>The fitted data reader.</returns> | ||
TReader Fit(TSource input); | ||
|
||
/// <summary> | ||
/// The 'promise' of the output schema. | ||
/// It will be used for schema propagation. | ||
/// </summary> | ||
SchemaShape GetOutputSchema(); | ||
} | ||
|
||
/// <summary> | ||
/// The transformer is a component that transforms data. | ||
/// It also supports 'schema propagation' to answer the question of 'what will data with this schema look like after you transform it?'. | ||
/// </summary> | ||
public interface ITransformer | ||
{ | ||
/// <summary> | ||
/// Schema propagation for transformers. | ||
/// Returns the output schema of the data, if the input schema is like the one provided. | ||
/// Returns <c>null</c> iff the schema is invalid (then a call to Transform with this data will fail). | ||
/// </summary> | ||
/// <param name="inputSchema">The schema of the data that would be passed to <see cref="Transform"/>.</param> | ||
/// <returns>The schema of the transformed data, or <c>null</c> if <paramref name="inputSchema"/> is invalid for this transformer.</returns> | ||
ISchema GetOutputSchema(ISchema inputSchema); | ||
|
||
/// <summary> | ||
/// Take the data in, make transformations, output the data. | ||
/// Note that <see cref="IDataView"/>'s are lazy, so no actual transformations happen here, just schema validation. | ||
/// </summary> | ||
/// <param name="input">The data to transform.</param> | ||
/// <returns>The transformed data view.</returns> | ||
IDataView Transform(IDataView input); | ||
} | ||
|
||
/// <summary> | ||
/// The estimator (in Spark terminology) is an 'untrained transformer'. It needs to 'fit' on the data to manufacture | ||
/// a transformer. | ||
/// It also provides the 'schema propagation' like transformers do, but over <see cref="SchemaShape"/> instead of <see cref="ISchema"/>. | ||
/// </summary> | ||
/// <typeparam name="TTransformer">The type of the transformer produced by <see cref="Fit"/>.</typeparam> | ||
public interface IEstimator<out TTransformer> | ||
where TTransformer : ITransformer | ||
{ | ||
/// <summary> | ||
/// Train and return a transformer. | ||
/// </summary> | ||
/// <param name="input">The data to fit on.</param> | ||
/// <returns>The fitted transformer.</returns> | ||
TTransformer Fit(IDataView input); | ||
|
||
/// <summary> | ||
/// Schema propagation for estimators. | ||
/// Returns the output schema shape of the estimator, if the input schema shape is like the one provided. | ||
/// Returns <c>null</c> iff the schema shape is invalid (then a call to <see cref="Fit"/> with this data will fail). | ||
/// </summary> | ||
/// <param name="inputSchema">The shape of the data that would be passed to <see cref="Fit"/>.</param> | ||
/// <returns>The output schema shape, or <c>null</c> if <paramref name="inputSchema"/> is invalid for this estimator.</returns> | ||
SchemaShape GetOutputSchema(SchemaShape inputSchema); | ||
} | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,9 @@ | |
<PropertyGroup> | ||
<TargetFramework>netcoreapp2.0</TargetFramework> | ||
</PropertyGroup> | ||
<ItemGroup> | ||
<Compile Remove="Scenarios\Api\AspirationalExamples.cs" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj" /> | ||
|
@@ -26,4 +29,8 @@ | |
<NativeAssemblyReference Include="SymSgdNative" /> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
since you are editing this file... |
||
<NativeAssemblyReference Include="MklImports" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<None Include="Scenarios\Api\AspirationalExamples.cs" /> | ||
</ItemGroup> | ||
</Project> |
60 changes: 60 additions & 0 deletions
60
test/Microsoft.ML.Tests/Scenarios/Api/AspirationalExamples.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Text; | ||
|
||
namespace Microsoft.ML.Tests.Scenarios.Api | ||
{ | ||
/// <summary> | ||
/// A sketch of the intended 'getting started' experience with the new API. | ||
/// As noted inside <see cref="FirstExperienceWithML"/>, this code does not currently compile. | ||
/// </summary> | ||
public class AspirationalExamples | ||
{ | ||
/// <summary> | ||
/// The prediction output: the predicted label as a string. | ||
/// </summary> | ||
public class IrisPrediction | ||
{ | ||
public string PredictedLabel; | ||
} | ||
|
||
/// <summary> | ||
/// The example input: four float measurements (sepal and petal width and length). | ||
/// </summary> | ||
public class IrisExample | ||
{ | ||
public float SepalWidth { get; set; } | ||
public float SepalLength { get; set; } | ||
public float PetalWidth { get; set; } | ||
public float PetalLength { get; set; } | ||
} | ||
|
||
/// <summary> | ||
/// Loads the iris data, builds a preprocessing and training pipeline, fits a model and predicts on one example. | ||
/// </summary> | ||
public void FirstExperienceWithML() | ||
{ | ||
// This is the 'getting started with ML' example, how we see it in our new API. | ||
// It currently doesn't compile, let alone work, but we still can discuss and improve the syntax. | ||
|
||
// Load the data into the system. | ||
// NOTE(review): 'env' is not declared anywhere in this file. | ||
string dataPath = "iris-data.txt"; | ||
var data = TextReader.FitAndRead(env, dataPath, row => ( | ||
Label: row.ReadString(0), | ||
SepalWidth: row.ReadFloat(1), | ||
SepalLength: row.ReadFloat(2), | ||
PetalWidth: row.ReadFloat(3), | ||
PetalLength: row.ReadFloat(4))); | ||
|
||
|
||
// Describe the preprocessing in terms of the columns declared when reading the data. | ||
var preprocess = data.Schema.MakeEstimator(row => ( | ||
// Convert string label to key. | ||
Label: row.Label.DictionarizeLabel(), | ||
// Concatenate all features into a vector. | ||
Features: row.SepalWidth.ConcatWith(row.SepalLength, row.PetalWidth, row.PetalLength))); | ||
|
||
var pipeline = preprocess | ||
// Append the trainer to the training pipeline. | ||
.AppendEstimator(row => row.Label.PredictWithSdca(row.Features)) | ||
// Convert the predicted label key back to its value. | ||
.AppendEstimator(row => row.PredictedLabel.KeyToValue()); | ||
|
||
// Train the model and make some predictions. | ||
var model = pipeline.Fit<IrisExample, IrisPrediction>(data); | ||
|
||
// Predict on a single hand-written example. | ||
IrisPrediction prediction = model.Predict(new IrisExample | ||
{ | ||
SepalWidth = 3.3f, | ||
SepalLength = 1.6f, | ||
PetalWidth = 0.2f, | ||
PetalLength = 5.1f | ||
}); | ||
} | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why? #Pending
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because right now it doesn't compile, and may never compile, since it uses my own imaginary version of Pigsty, which may differ from the real one, when it appears.
In reply to: 210986324 [](ancestors = 210986324)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are we even adding this file? What's the point?
In reply to: 211030708 [](ancestors = 211030708,210986324)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Umm, as an aspirational example? It's a way to document what we want to reach at the end. Another way would be to put it into Markdown somewhere, but I think I like this way somewhat better.
In reply to: 211688654 [](ancestors = 211688654,211030708,210986324)