Skip to content

Commit e05ca2d

Browse files
author
Pete Luferenko
committed
Squashed commit of the following:
commit 5409376 Author: Pete Luferenko <[email protected]> Date: Fri Aug 17 09:59:08 2018 -0700 Scenarios implementation commit d7a53ad Merge: 759dafb 11ff554 Author: Pete Luferenko <[email protected]> Date: Fri Aug 17 09:59:00 2018 -0700 Scenarios implementation: merge commit 11ff554 Author: Pete Luferenko <[email protected]> Date: Fri Aug 17 08:25:41 2018 -0700 CrossValidator class commit 7a66e19 Author: Pete Luferenko <[email protected]> Date: Wed Aug 15 08:55:11 2018 -0700 Reconfigurable prediction commit 231516a Author: Pete Luferenko <[email protected]> Date: Tue Aug 14 09:11:01 2018 -0700 Cross-validation commit adc4e4b Merge: 660ebeb e77f24e Author: Pete Luferenko <[email protected]> Date: Tue Aug 14 08:30:37 2018 -0700 Merge remote-tracking branch 'upstream/master' into feature/estimators commit 660ebeb Author: Pete Luferenko <[email protected]> Date: Mon Aug 13 10:01:09 2018 -0700 Added transformer scope enum for decomposability commit b82f4c6 Author: Pete Luferenko <[email protected]> Date: Fri Aug 10 14:14:29 2018 -0700 Saving/loading data commit bf097ab Author: Pete Luferenko <[email protected]> Date: Thu Aug 9 16:59:57 2018 -0700 Added auto-normalization to everything commit ef416f7 Author: Pete Luferenko <[email protected]> Date: Wed Aug 8 18:12:57 2018 -0700 No need to give schema to prediction engine commit 41df05a Merge: 215856d d0664c1 Author: Pete Luferenko <[email protected]> Date: Wed Aug 8 17:59:33 2018 -0700 Merge remote-tracking branch 'upstream/master' into feature/estimators commit 215856d Author: Pete Luferenko <[email protected]> Date: Wed Aug 8 09:45:53 2018 -0700 Added evaluation commit 184027b Merge: f85cd9c d932807 Author: Pete Luferenko <[email protected]> Date: Wed Aug 8 08:40:12 2018 -0700 Merge branch 'feature/api-proposal' into feature/estimators commit d932807 Author: Pete Luferenko <[email protected]> Date: Wed Aug 8 08:33:51 2018 -0700 Another rework of first example commit f85cd9c Author: Ivan Matantsev <[email protected]> 
Date: Mon Aug 6 14:41:55 2018 -0700 ReconfigurablePrediction and shorter execution commit fffa4eb Merge: 4ee52f8 77d42a5 Author: Ivan Matantsev <[email protected]> Date: Mon Aug 6 13:51:24 2018 -0700 Merge branch 'feature/estimators' of https://github.com/Zruty0/machinelearning into feature/estimators commit 4ee52f8 Author: Ivan Matantsev <[email protected]> Date: Mon Aug 6 13:51:08 2018 -0700 cross validation commit 77d42a5 Author: Pete Luferenko <[email protected]> Date: Mon Aug 6 13:32:24 2018 -0700 Small edits on top of Ivan's code commit 528a270 Merge: bf344a3 b468056 Author: Pete Luferenko <[email protected]> Date: Mon Aug 6 13:27:35 2018 -0700 Merge branch 'feature/estimators' of github.com:Zruty0/machinelearning into feature/estimators commit bf344a3 Author: Pete Luferenko <[email protected]> Date: Mon Aug 6 13:27:18 2018 -0700 Lowered execution times on some tests. Implemented new API for initial predictor and validation sets commit 1e90463 Author: Pete Luferenko <[email protected]> Date: Mon Aug 6 08:29:10 2018 -0700 Updated for PR comments commit b468056 Author: Ivan Matantsev <[email protected]> Date: Fri Aug 3 15:35:02 2018 -0700 idv and decomposableTrain commit d051138 Author: Pete Luferenko <[email protected]> Date: Fri Aug 3 09:08:50 2018 -0700 Minor tweaks to Ivan's tests commit 6c1d437 Merge: be90030 e3aea3b Author: Pete Luferenko <[email protected]> Date: Thu Aug 2 17:57:20 2018 -0700 Merge branch 'feature/estimators' of github.com:Zruty0/machinelearning into feature/estimators commit be90030 Merge: 9fd480e 96661bb Author: Pete Luferenko <[email protected]> Date: Thu Aug 2 17:56:46 2018 -0700 Merged with examples commit 96661bb Author: Pete Luferenko <[email protected]> Date: Thu Aug 2 17:47:43 2018 -0700 Rework the example with Tom commit 13bbc43 Author: Pete Luferenko <[email protected]> Date: Wed Aug 1 14:25:06 2018 -0700 Adding 'aspirational examples' and some more baseline scenarios commit e3aea3b Merge: 8b65829 9fd480e Author: Ivan 
Matantsev <[email protected]> Date: Wed Aug 1 15:12:28 2018 -0700 Merge branch 'feature/estimators' of https://github.com/Zruty0/machinelearning into feature/estimators commit 8b65829 Author: Ivan Matantsev <[email protected]> Date: Wed Aug 1 15:12:24 2018 -0700 add evaluation and normalizationAndCaching example commit 9fd480e Merge: 70d3fb4 89dfc82 Author: Pete Luferenko <[email protected]> Date: Wed Aug 1 14:38:33 2018 -0700 Merge remote-tracking branch 'upstream/master' into feature/estimators commit 70d3fb4 Author: Pete Luferenko <[email protected]> Date: Wed Aug 1 14:25:06 2018 -0700 Adding 'aspirational examples' and some more baseline scenarios commit 1999de8 Merge: a8dabb3 bdb742d Author: Pete Luferenko <[email protected]> Date: Tue Jul 31 16:48:25 2018 -0700 Merge remote-tracking branch 'upstream/master' into feature/estimators commit a8dabb3 Author: Pete Luferenko <[email protected]> Date: Tue Jul 31 16:47:38 2018 -0700 Added second test commit 75f09b6 Merge: 598e174 b727d10 Author: Pete Luferenko <[email protected]> Date: Mon Jul 30 18:15:01 2018 -0700 Merge remote-tracking branch 'upstream/master' into feature/estimators commit 598e174 Author: Pete Luferenko <[email protected]> Date: Thu Jul 26 20:32:23 2018 -0700 Converted one scenario to estimators commit 6299a1f Merge: 6aeb7cc 20e59a2 Author: Pete Luferenko <[email protected]> Date: Thu Jul 26 19:15:03 2018 -0700 Merge branch 'feature/api-examples' into feature/estimators commit 20e59a2 Author: Pete Luferenko <[email protected]> Date: Thu Jul 26 19:14:31 2018 -0700 Added the first two scenarios from the list. 
commit 6aeb7cc Author: Pete Luferenko <[email protected]> Date: Thu Jul 26 14:04:16 2018 -0700 Fixed whitespaces commit 5088ab4 Merge: f84e67a 0f94a3b Author: Pete Luferenko <[email protected]> Date: Thu Jul 26 13:59:46 2018 -0700 Merge remote-tracking branch 'upstream/master' into feature/estimators commit f84e67a Author: Pete Luferenko <[email protected]> Date: Thu Jul 26 13:56:13 2018 -0700 Some renaming to interfaces Removed non-typed estimator Fixed collections in ad-hoc tests commit 49730cb Author: Pete Luferenko <[email protected]> Date: Thu Jul 26 10:49:09 2018 -0700 Renamed and changed comments on data readers. commit 5a819d3 Author: Pete Luferenko <[email protected]> Date: Wed Jul 25 17:53:02 2018 -0700 Typed estimators commit 91dc0f2 Author: Pete Luferenko <[email protected]> Date: Tue Jul 24 20:00:57 2018 -0700 Added prediction engine to playground commit 3826648 Author: Pete Luferenko <[email protected]> Date: Tue Jul 24 19:21:04 2018 -0700 Added an ad hoc test playground commit 3b2edab Author: Pete Luferenko <[email protected]> Date: Tue Jul 24 17:44:46 2018 -0700 Initial take on IEstimator interfaces
1 parent 759dafb commit e05ca2d

15 files changed

+1645
-0
lines changed
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using Microsoft.ML.Runtime;
6+
using Microsoft.ML.Runtime.Data;
7+
using System.Collections.Generic;
8+
using System.Linq;
9+
10+
namespace Microsoft.ML.Core.Data
11+
{
12+
/// <summary>
/// Describes what a component 'requires' from an incoming schema and what it 'promises' about the outgoing one.
/// It is intentionally looser than a full <see cref="ISchema"/>: it lists only a subset of the columns,
/// and it does not pin down the exact <see cref="ColumnType"/> of vectors and keys.
/// </summary>
public sealed class SchemaShape
{
    /// <summary>
    /// The columns that make up this shape. The array passed to the constructor is stored as-is (no copy is made).
    /// </summary>
    public readonly Column[] Columns;

    /// <summary>
    /// One column of a <see cref="SchemaShape"/>: its name, vector kind, item kind, key flag and metadata kinds.
    /// </summary>
    public sealed class Column
    {
        /// <summary>
        /// Coarse vector-ness of a column. <see cref="SchemaShape.Create"/> assigns <see cref="Vector"/> to
        /// known-size vector types, <see cref="VariableVector"/> to all other vector types,
        /// and <see cref="Scalar"/> to everything else.
        /// </summary>
        public enum VectorKind
        {
            Scalar,
            Vector,
            VariableVector
        }

        public readonly string Name;
        public readonly VectorKind Kind;
        public readonly DataKind ItemKind;
        public readonly bool IsKey;
        public readonly string[] MetadataKinds;

        /// <summary>
        /// Creates a column description.
        /// </summary>
        /// <param name="name">The column name; validated with <c>Contracts.CheckNonEmpty</c>.</param>
        /// <param name="vecKind">Whether the column is a scalar, a vector, or a variable vector.</param>
        /// <param name="itemKind">The raw kind of the column's item type.</param>
        /// <param name="isKey">Whether the column's item type is a key.</param>
        /// <param name="metadataKinds">The metadata kinds of the column; validated with <c>Contracts.CheckValue</c>.</param>
        public Column(string name, VectorKind vecKind, DataKind itemKind, bool isKey, string[] metadataKinds)
        {
            Contracts.CheckNonEmpty(name, nameof(name));
            Contracts.CheckValue(metadataKinds, nameof(metadataKinds));

            Name = name;
            Kind = vecKind;
            ItemKind = itemKind;
            IsKey = isKey;
            MetadataKinds = metadataKinds;
        }
    }

    /// <summary>
    /// Creates a shape over the given columns.
    /// </summary>
    /// <param name="columns">The columns of the shape; validated with <c>Contracts.CheckValue</c>.</param>
    public SchemaShape(Column[] columns)
    {
        Contracts.CheckValue(columns, nameof(columns));
        Columns = columns;
    }

    /// <summary>
    /// Builds a schema shape from a fully defined schema.
    /// Hidden columns are left out; every other column contributes its name, vector kind,
    /// item raw kind, key flag and the names of its metadata.
    /// </summary>
    /// <param name="schema">The schema to summarize.</param>
    /// <returns>A shape with one <see cref="Column"/> per non-hidden column, in schema order.</returns>
    public static SchemaShape Create(ISchema schema)
    {
        Contracts.CheckValue(schema, nameof(schema));
        var shapeColumns = new List<Column>();

        for (int index = 0; index < schema.ColumnCount; index++)
        {
            // Hidden columns are not part of the shape.
            if (schema.IsHidden(index))
                continue;

            var columnType = schema.GetColumnType(index);

            // Known-size vectors are checked first; any remaining vector type is variable-size.
            var vectorKind = columnType.IsKnownSizeVector
                ? Column.VectorKind.Vector
                : columnType.IsVector
                    ? Column.VectorKind.VariableVector
                    : Column.VectorKind.Scalar;

            var itemKind = columnType.ItemType.RawKind;
            var itemIsKey = columnType.ItemType.IsKey;

            // Only the metadata names are retained, not their types.
            var metadataNames = schema.GetMetadataTypes(index)
                .Select(pair => pair.Key)
                .ToArray();

            var columnName = schema.GetColumnName(index);
            shapeColumns.Add(new Column(columnName, vectorKind, itemKind, itemIsKey, metadataNames));
        }

        return new SchemaShape(shapeColumns.ToArray());
    }

    /// <summary>
    /// Returns the column with a specified <paramref name="name"/>, and <c>null</c> if there is no such column.
    /// If several columns share the name, the first one in <see cref="Columns"/> is returned.
    /// </summary>
    public Column FindColumn(string name)
    {
        Contracts.CheckValue(name, nameof(name));

        foreach (var candidate in Columns)
        {
            if (candidate.Name == name)
                return candidate;
        }

        return null;
    }

    // REVIEW: I think we should have an IsCompatible method to check whether it's OK to use one schema shape
    // as an input to another schema shape. I started writing it, but realized that there is more than one way
    // to check for 'compatibility': as in, 'CAN be compatible' vs. 'WILL be compatible'.
}
101+
102+
/// <summary>
/// A 'data reader' accepts a certain kind of input and turns it into an <see cref="IDataView"/>.
/// </summary>
/// <typeparam name="TSource">The type of input the reader takes.</typeparam>
public interface IDataReader<in TSource>
{
    /// <summary>
    /// Produces a data view over the specified input.
    /// Because <see cref="IDataView"/>'s are lazy, no actual reading happens here, just schema validation.
    /// </summary>
    /// <param name="input">The input to read from.</param>
    /// <returns>The data view over <paramref name="input"/>.</returns>
    IDataView Read(TSource input);

    /// <summary>
    /// Gets the output schema of the reader.
    /// </summary>
    ISchema GetOutputSchema();
}
119+
120+
/// <summary>
/// Sometimes an <see cref="IDataReader{TSource}"/> has to be 'fit' before it can be used.
/// A data reader estimator is the object that does the fitting.
/// </summary>
/// <typeparam name="TSource">The type of input the estimator fits on; the fitted reader takes the same type.</typeparam>
/// <typeparam name="TReader">The type of data reader produced by <see cref="Fit"/>.</typeparam>
public interface IDataReaderEstimator<in TSource, out TReader>
    where TReader : IDataReader<TSource>
{
    /// <summary>
    /// Trains and returns a data reader.
    ///
    /// REVIEW: one could consider letting the transformer take a different <typeparamref name="TSource"/>,
    /// but we don't have such components yet, so why complicate matters?
    /// </summary>
    /// <param name="input">The input to fit on.</param>
    TReader Fit(TSource input);

    /// <summary>
    /// The 'promise' of the output schema.
    /// It is used for schema propagation.
    /// </summary>
    SchemaShape GetOutputSchema();
}
141+
142+
/// <summary>
/// A transformer is a component that transforms data.
/// It also supports 'schema propagation', which answers the question
/// 'how will data with this schema look after you transform it?'.
/// </summary>
public interface ITransformer
{
    /// <summary>
    /// Schema propagation for transformers.
    /// Returns the output schema of the data, assuming the input schema is like the one provided.
    /// Returns <c>null</c> iff the schema is invalid (in which case a call to <see cref="Transform"/> with such data will fail).
    /// </summary>
    /// <param name="inputSchema">The schema of the prospective input data.</param>
    ISchema GetOutputSchema(ISchema inputSchema);

    /// <summary>
    /// Takes the data in, applies the transformations, and outputs the data.
    /// Because <see cref="IDataView"/>'s are lazy, no actual transformations happen here, just schema validation.
    /// </summary>
    /// <param name="input">The data to transform.</param>
    IDataView Transform(IDataView input);
}
161+
162+
/// <summary>
/// An estimator (in Spark terminology) is an 'untrained transformer': it has to be 'fit' on data
/// in order to manufacture a transformer.
/// Like transformers, it provides 'schema propagation', but over <see cref="SchemaShape"/>
/// rather than <see cref="ISchema"/>.
/// </summary>
/// <typeparam name="TTransformer">The type of transformer produced by <see cref="Fit"/>.</typeparam>
public interface IEstimator<out TTransformer>
    where TTransformer : ITransformer
{
    /// <summary>
    /// Trains and returns a transformer.
    /// </summary>
    /// <param name="input">The data to fit on.</param>
    TTransformer Fit(IDataView input);

    /// <summary>
    /// Schema propagation for estimators.
    /// Returns the output schema shape of the estimator, assuming the input schema shape is like the one provided.
    /// Returns <c>null</c> iff the schema shape is invalid (in which case a call to <see cref="Fit"/> with such data will fail).
    /// </summary>
    /// <param name="inputSchema">The shape of the prospective input data.</param>
    SchemaShape GetOutputSchema(SchemaShape inputSchema);
}
182+
}

test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
<ProjectReference Include="..\..\src\Microsoft.ML.Transforms\Microsoft.ML.Transforms.csproj" />
2121
<ProjectReference Include="..\..\src\Microsoft.ML\Microsoft.ML.csproj" />
2222
<ProjectReference Include="..\Microsoft.ML.TestFramework\Microsoft.ML.TestFramework.csproj" />
23+
<ProjectReference Include="..\Microsoft.ML.Tests\Microsoft.ML.Tests.csproj" />
2324
</ItemGroup>
2425

2526
<ItemGroup>

test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
<PropertyGroup>
33
<TargetFramework>netcoreapp2.0</TargetFramework>
44
</PropertyGroup>
5+
<ItemGroup>
6+
<Compile Remove="Scenarios\Api\AspirationalExamples.cs" />
7+
</ItemGroup>
58

69
<ItemGroup>
710
<ProjectReference Include="..\..\src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj" />
@@ -26,4 +29,8 @@
2629
<NativeAssemblyReference Include="SymSgdNative" />
2730
<NativeAssemblyReference Include="MklImports" />
2831
</ItemGroup>
32+
33+
<ItemGroup>
34+
<None Include="Scenarios\Api\AspirationalExamples.cs" />
35+
</ItemGroup>
2936
</Project>
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
5+
namespace Microsoft.ML.Tests.Scenarios.Api
6+
{
7+
/// <summary>
/// Sketches of how the new API is intended to read. The code here is a syntax proposal:
/// it is not expected to compile yet.
/// </summary>
public class AspirationalExamples
{
    /// <summary>
    /// The prediction type requested from the fitted model.
    /// </summary>
    public class IrisPrediction
    {
        public string PredictedLabel;
    }

    /// <summary>
    /// The example type fed to the fitted model: four float measurements.
    /// </summary>
    public class IrisExample
    {
        public float SepalWidth { get; set; }
        public float SepalLength { get; set; }
        public float PetalWidth { get; set; }
        public float PetalLength { get; set; }
    }

    /// <summary>
    /// The 'getting started with ML' walkthrough: read the data, build a pipeline, train, predict.
    /// </summary>
    public void FirstExperienceWithML()
    {
        // This is the 'getting started with ML' example, the way we envision it in the new API.
        // It currently doesn't compile, let alone work, but we can still discuss and improve the syntax.

        // Load the data into the system.
        string irisFile = "iris-data.txt";
        var irisData = TextReader.FitAndRead(env, irisFile, row => (
            Label: row.ReadString(0),
            SepalWidth: row.ReadFloat(1),
            SepalLength: row.ReadFloat(2),
            PetalWidth: row.ReadFloat(3),
            PetalLength: row.ReadFloat(4)));

        var featurizer = irisData.Schema.MakeEstimator(row => (
            // Convert string label to key.
            Label: row.Label.DictionarizeLabel(),
            // Concatenate all features into a vector.
            Features: row.SepalWidth.ConcatWith(row.SepalLength, row.PetalWidth, row.PetalLength)));

        // Append the trainer to the training pipeline, then map the predicted key back to its value.
        var withTrainer = featurizer.AppendEstimator(row => row.Label.PredictWithSdca(row.Features));
        var learningPipeline = withTrainer.AppendEstimator(row => row.PredictedLabel.KeyToValue());

        // Train the model and make some predictions.
        var trainedModel = learningPipeline.Fit<IrisExample, IrisPrediction>(irisData);

        var sample = new IrisExample
        {
            SepalWidth = 3.3f,
            SepalLength = 1.6f,
            PetalWidth = 0.2f,
            PetalLength = 5.1f
        };
        IrisPrediction predicted = trainedModel.Predict(sample);
    }
}
60+
}

0 commit comments

Comments
 (0)