Skip to content

Commit 3e7d118

Browse files
authored
Merge pull request dotnet#4 from dotnet/master
Merge with latest dotnet/master
2 parents bad9cd2 + 73b0308 commit 3e7d118

File tree

67 files changed

+4755
-2250
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+4755
-2250
lines changed

DotnetCLIVersion.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.1.200
1+
2.1.401

pkg/Directory.Build.props

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
<PackageProjectUrl>https://dot.net/ml</PackageProjectUrl>
2626
<PackageIconUrl>https://aka.ms/mlnetlogo</PackageIconUrl>
2727
<PackageReleaseNotes>https://aka.ms/mlnetreleasenotes</PackageReleaseNotes>
28+
<!-- space separated -->
29+
<PackageTags>ML.NET ML Machine Learning</PackageTags>
2830
</PropertyGroup>
2931

3032
<ItemGroup>
+190
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using Microsoft.ML.Runtime;
6+
using Microsoft.ML.Runtime.Data;
7+
using System;
8+
using System.Collections.Generic;
9+
using System.Linq;
10+
11+
namespace Microsoft.ML.Core.Data
12+
{
13+
/// <summary>
/// Describes the 'requirements' placed on an incoming schema together with the 'promises' made about
/// the outgoing schema. It is looser than a full <see cref="ISchema"/>: it lists only a subset of the
/// columns, and it does not pin down exact <see cref="ColumnType"/>'s for vectors and keys.
/// </summary>
public sealed class SchemaShape
{
    /// <summary>
    /// The columns that make up this shape, exactly as handed to the constructor (the array is not copied).
    /// </summary>
    public readonly Column[] Columns;

    /// <summary>
    /// The description of a single column inside a <see cref="SchemaShape"/>.
    /// </summary>
    public sealed class Column
    {
        /// <summary>
        /// Whether the column holds a single value, a vector of known size, or a vector of unknown size.
        /// </summary>
        public enum VectorKind
        {
            Scalar,
            Vector,
            VariableVector
        }

        public readonly string Name;
        public readonly VectorKind Kind;
        public readonly DataKind ItemKind;
        public readonly bool IsKey;
        public readonly string[] MetadataKinds;

        /// <summary>
        /// Builds a column description.
        /// </summary>
        /// <param name="name">The column name; validated with <c>Contracts.CheckNonEmpty</c>.</param>
        /// <param name="vecKind">Scalar / vector / variable-size vector.</param>
        /// <param name="itemKind">The raw kind of the item type.</param>
        /// <param name="isKey">Whether the item type is a key.</param>
        /// <param name="metadataKinds">The metadata kinds of the column; validated with <c>Contracts.CheckValue</c>.</param>
        public Column(string name, VectorKind vecKind, DataKind itemKind, bool isKey, string[] metadataKinds)
        {
            // Validation order is kept: the name is checked before the metadata array.
            Contracts.CheckNonEmpty(name, nameof(name));
            Contracts.CheckValue(metadataKinds, nameof(metadataKinds));

            MetadataKinds = metadataKinds;
            IsKey = isKey;
            ItemKind = itemKind;
            Kind = vecKind;
            Name = name;
        }
    }

    /// <summary>
    /// Wraps the given columns into a schema shape.
    /// </summary>
    /// <param name="columns">The columns of the shape; validated with <c>Contracts.CheckValue</c>.</param>
    public SchemaShape(Column[] columns)
    {
        Contracts.CheckValue(columns, nameof(columns));
        Columns = columns;
    }

    /// <summary>
    /// Derives a schema shape from a fully defined schema. Hidden columns are left out.
    /// </summary>
    /// <param name="schema">The schema to summarize; validated with <c>Contracts.CheckValue</c>.</param>
    /// <returns>A shape with one <see cref="Column"/> per non-hidden column of <paramref name="schema"/>.</returns>
    public static SchemaShape Create(ISchema schema)
    {
        Contracts.CheckValue(schema, nameof(schema));

        var visibleColumns = new List<Column>();
        for (int index = 0; index < schema.ColumnCount; index++)
        {
            if (schema.IsHidden(index))
                continue;

            var columnType = schema.GetColumnType(index);

            // Known-size is tested first, so a vector that falls through to the second test is variable-size.
            Column.VectorKind shapeKind = columnType.IsKnownSizeVector
                ? Column.VectorKind.Vector
                : columnType.IsVector
                    ? Column.VectorKind.VariableVector
                    : Column.VectorKind.Scalar;

            var rawKind = columnType.ItemType.RawKind;
            var itemIsKey = columnType.ItemType.IsKey;

            // Only the metadata kind names are kept; their types are dropped.
            var kindNames = new List<string>();
            foreach (var entry in schema.GetMetadataTypes(index))
                kindNames.Add(entry.Key);
            var metadataKinds = kindNames.ToArray();

            visibleColumns.Add(new Column(schema.GetColumnName(index), shapeKind, rawKind, itemIsKey, metadataKinds));
        }

        return new SchemaShape(visibleColumns.ToArray());
    }

    /// <summary>
    /// Looks up the first column called <paramref name="name"/>; yields <c>null</c> when no column has that name.
    /// </summary>
    public Column FindColumn(string name)
    {
        Contracts.CheckValue(name, nameof(name));

        foreach (var candidate in Columns)
        {
            if (candidate.Name == name)
                return candidate;
        }

        return null;
    }

    // REVIEW: I think we should have an IsCompatible method to check if it's OK to use one schema shape
    // as an input to another schema shape. I started writing, but realized that there's more than one way to check for
    // the 'compatibility': as in, 'CAN be compatible' vs. 'WILL be compatible'.
}
102+
103+
/// <summary>
/// Exception class for schema validation errors.
/// </summary>
public class SchemaException : Exception
{
    /// <summary>
    /// Creates a schema exception with the default message.
    /// </summary>
    public SchemaException()
    {
    }

    /// <summary>
    /// Creates a schema exception that describes what is wrong with the schema.
    /// </summary>
    /// <param name="message">The description of the validation failure.</param>
    public SchemaException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Creates a schema exception that wraps the exception that caused the validation failure.
    /// </summary>
    /// <param name="message">The description of the validation failure.</param>
    /// <param name="innerException">The underlying cause, or <c>null</c> if there is none.</param>
    public SchemaException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
109+
110+
/// <summary>
/// A 'data reader' accepts some kind of input and exposes it as an <see cref="IDataView"/>.
/// </summary>
/// <typeparam name="TSource">The kind of input this reader consumes.</typeparam>
public interface IDataReader<in TSource>
{
    /// <summary>
    /// Builds the data view for the given input.
    /// Because <see cref="IDataView"/>'s are lazy, nothing is actually read at this point; only the schema is validated.
    /// </summary>
    /// <param name="input">The input to read from.</param>
    /// <returns>The data view over <paramref name="input"/>.</returns>
    IDataView Read(TSource input);

    /// <summary>
    /// The schema of the data this reader produces.
    /// </summary>
    ISchema GetOutputSchema();
}
127+
128+
/// <summary>
/// An <see cref="IDataReader{TIn}"/> occasionally has to be 'fitted' before it can be used.
/// A data reader estimator is the object that performs that fitting.
/// </summary>
/// <typeparam name="TSource">The kind of input the estimator is fitted on.</typeparam>
/// <typeparam name="TReader">The kind of data reader that fitting produces.</typeparam>
public interface IDataReaderEstimator<in TSource, out TReader>
    where TReader : IDataReader<TSource>
{
    /// <summary>
    /// Trains on the input and hands back the resulting data reader.
    ///
    /// REVIEW: you could consider the transformer to take a different <typeparamref name="TSource"/>, but we don't have such components
    /// yet, so why complicate matters?
    /// </summary>
    /// <param name="input">The input to fit on.</param>
    /// <returns>The fitted data reader.</returns>
    TReader Fit(TSource input);

    /// <summary>
    /// The 'promise' about the output schema.
    /// It is what schema propagation relies on.
    /// </summary>
    SchemaShape GetOutputSchema();
}
149+
150+
/// <summary>
/// A transformer is a component that transforms data.
/// It also supports 'schema propagation', which answers the question
/// 'what will data with this schema look like after it has been transformed?'.
/// </summary>
public interface ITransformer
{
    /// <summary>
    /// Schema propagation for transformers.
    /// Gives the schema the output data would have if the input had the provided schema.
    /// Throws <see cref="SchemaException"/> iff the input schema is not valid for the transformer.
    /// </summary>
    /// <param name="inputSchema">The schema of the hypothetical input data.</param>
    /// <returns>The schema of the corresponding output data.</returns>
    ISchema GetOutputSchema(ISchema inputSchema);

    /// <summary>
    /// Accepts the data, applies the transformations, and returns the transformed data.
    /// Because <see cref="IDataView"/>'s are lazy, no transformation actually runs at this point; only the schema is validated.
    /// </summary>
    /// <param name="input">The data to transform.</param>
    /// <returns>The data view over the transformed data.</returns>
    IDataView Transform(IDataView input);
}
169+
170+
/// <summary>
/// An estimator (in Spark terminology) is an 'untrained transformer': it has to be 'fitted' on data
/// in order to manufacture a transformer.
/// Like transformers it offers 'schema propagation', but it works over <see cref="SchemaShape"/> rather than <see cref="ISchema"/>.
/// </summary>
/// <typeparam name="TTransformer">The kind of transformer that fitting produces.</typeparam>
public interface IEstimator<out TTransformer>
    where TTransformer : ITransformer
{
    /// <summary>
    /// Trains on the data and hands back the resulting transformer.
    /// </summary>
    /// <param name="input">The data to fit on.</param>
    /// <returns>The fitted transformer.</returns>
    TTransformer Fit(IDataView input);

    /// <summary>
    /// Schema propagation for estimators.
    /// Gives the output schema shape of the estimator if the input had the provided schema shape.
    /// Throws <see cref="SchemaException"/> iff the input schema is not valid for the estimator.
    /// </summary>
    /// <param name="inputSchema">The shape of the hypothetical input schema.</param>
    /// <returns>The shape of the corresponding output schema.</returns>
    SchemaShape GetOutputSchema(SchemaShape inputSchema);
}
190+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using Microsoft.ML.Core.Data;
6+
using Microsoft.ML.Runtime.Model;
7+
using System.IO;
8+
9+
namespace Microsoft.ML.Runtime.Data
10+
{
11+
/// <summary>
/// A data reader that first reads with an underlying reader and then runs the result through a transformer chain.
/// It can also save itself to a stream as a model file.
/// </summary>
public sealed class CompositeDataReader<TSource, TLastTransformer> : IDataReader<TSource>
    where TLastTransformer : class, ITransformer
{
    /// <summary>
    /// The underlying data reader.
    /// </summary>
    public readonly IDataReader<TSource> Reader;

    /// <summary>
    /// The (possibly empty) chain of transformers applied to the data after it is read.
    /// </summary>
    public readonly TransformerChain<TLastTransformer> Transformer;

    /// <summary>
    /// Combines a reader with a transformer chain.
    /// </summary>
    /// <param name="reader">The underlying reader; validated with <c>Contracts.CheckValue</c>.</param>
    /// <param name="transformerChain">The chain to apply after reading; when <c>null</c>, a new empty chain is used.</param>
    public CompositeDataReader(IDataReader<TSource> reader, TransformerChain<TLastTransformer> transformerChain = null)
    {
        Contracts.CheckValue(reader, nameof(reader));
        Contracts.CheckValueOrNull(transformerChain);

        Reader = reader;
        if (transformerChain != null)
            Transformer = transformerChain;
        else
            Transformer = new TransformerChain<TLastTransformer>();
    }

    /// <summary>
    /// Reads the input with <see cref="Reader"/> and passes the result through <see cref="Transformer"/>.
    /// </summary>
    public IDataView Read(TSource input)
    {
        return Transformer.Transform(Reader.Read(input));
    }

    /// <summary>
    /// The reader's output schema after it has been propagated through the transformer chain.
    /// </summary>
    public ISchema GetOutputSchema()
    {
        return Transformer.GetOutputSchema(Reader.GetOutputSchema());
    }

    /// <summary>
    /// Append a new transformer to the end.
    /// </summary>
    /// <param name="transformer">The transformer to append; validated with <c>Contracts.CheckValue</c>.</param>
    /// <returns>The new composite data reader</returns>
    public CompositeDataReader<TSource, TNewLast> AppendTransformer<TNewLast>(TNewLast transformer)
        where TNewLast : class, ITransformer
    {
        Contracts.CheckValue(transformer, nameof(transformer));

        var extendedChain = Transformer.Append(transformer);
        return new CompositeDataReader<TSource, TNewLast>(Reader, extendedChain);
    }

    /// <summary>
    /// Save the contents to a stream, as a "model file".
    /// </summary>
    /// <param name="env">The host environment; validated with <c>Contracts.CheckValue</c>.</param>
    /// <param name="outputStream">The destination stream; it must be both writable and seekable.</param>
    public void SaveTo(IHostEnvironment env, Stream outputStream)
    {
        Contracts.CheckValue(env, nameof(env));
        env.CheckValue(outputStream, nameof(outputStream));

        env.Check(outputStream.CanWrite && outputStream.CanSeek, "Need a writable and seekable stream to save");

        // Stacked usings: the repository writer is disposed first, then the channel.
        using (var channel = env.Start("Saving pipeline"))
        using (var repository = RepositoryWriter.CreateNew(outputStream, channel))
        {
            channel.Trace("Saving data reader");
            ModelSaveContext.SaveModel(repository, Reader, "Reader");

            channel.Trace("Saving transformer chain");
            ModelSaveContext.SaveModel(repository, Transformer, TransformerChain.LoaderSignature);
            repository.Commit();
        }
    }
}
84+
85+
/// <summary>
/// Utility class to facilitate loading from a stream.
/// </summary>
public static class CompositeDataReader
{
    /// <summary>
    /// Load the pipeline from stream.
    /// </summary>
    /// <param name="env">The host environment; validated with <c>Contracts.CheckValue</c>.</param>
    /// <param name="stream">The source stream; it must be both readable and seekable.</param>
    /// <returns>A composite data reader built from the loaded reader and the loaded transformer chain.</returns>
    public static CompositeDataReader<IMultiStreamSource, ITransformer> LoadFrom(IHostEnvironment env, Stream stream)
    {
        Contracts.CheckValue(env, nameof(env));
        env.CheckValue(stream, nameof(stream));

        env.Check(stream.CanRead && stream.CanSeek, "Need a readable and seekable stream to load");
        using (var rep = RepositoryReader.Open(stream, env))
        using (var ch = env.Start("Loading pipeline"))
        {
            // The reader is stored under the "Reader" name that CompositeDataReader<,>.SaveTo writes.
            ch.Trace("Loading data reader");
            ModelLoadContext.LoadModel<IDataReader<IMultiStreamSource>, SignatureLoadModel>(env, out var reader, rep, "Reader");

            // Trace text fixed: it previously read "Loader transformer chain".
            ch.Trace("Loading transformer chain");
            ModelLoadContext.LoadModel<TransformerChain<ITransformer>, SignatureLoadModel>(env, out var transformerChain, rep, TransformerChain.LoaderSignature);
            return new CompositeDataReader<IMultiStreamSource, ITransformer>(reader, transformerChain);
        }
    }
}
111+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using Microsoft.ML.Core.Data;
6+
7+
namespace Microsoft.ML.Runtime.Data
8+
{
9+
/// <summary>
/// The estimator counterpart of the composite data reader.
/// It allows building a 'trainable smart data reader', although this pattern is not very common.
/// </summary>
public sealed class CompositeReaderEstimator<TSource, TLastTransformer> : IDataReaderEstimator<TSource, CompositeDataReader<TSource, TLastTransformer>>
    where TLastTransformer : class, ITransformer
{
    private readonly IDataReaderEstimator<TSource, IDataReader<TSource>> _start;
    private readonly EstimatorChain<TLastTransformer> _estimatorChain;

    /// <summary>
    /// Combines a reader estimator with a chain of estimators.
    /// </summary>
    /// <param name="start">The reader estimator that comes first; validated with <c>Contracts.CheckValue</c>.</param>
    /// <param name="estimatorChain">The estimators that follow; when <c>null</c>, a new empty chain is used.</param>
    public CompositeReaderEstimator(IDataReaderEstimator<TSource, IDataReader<TSource>> start, EstimatorChain<TLastTransformer> estimatorChain = null)
    {
        Contracts.CheckValue(start, nameof(start));
        Contracts.CheckValueOrNull(estimatorChain);

        _start = start;
        if (estimatorChain != null)
            _estimatorChain = estimatorChain;
        else
            _estimatorChain = new EstimatorChain<TLastTransformer>();

        // REVIEW: enforce that estimator chain can read the reader's schema.
        // Right now it throws.
        // GetOutputSchema();
    }

    /// <summary>
    /// Fits the reader estimator on the input, reads the same input with the fitted reader,
    /// and then fits the estimator chain on the data that was read.
    /// </summary>
    /// <param name="input">The input used both for fitting the reader and for reading the training data.</param>
    /// <returns>A composite data reader made of the fitted reader and the fitted transformer chain.</returns>
    public CompositeDataReader<TSource, TLastTransformer> Fit(TSource input)
    {
        var fittedReader = _start.Fit(input);
        var fittedChain = _estimatorChain.Fit(fittedReader.Read(input));
        return new CompositeDataReader<TSource, TLastTransformer>(fittedReader, fittedChain);
    }

    /// <summary>
    /// The reader estimator's promised output shape after it has been propagated through the estimator chain.
    /// </summary>
    public SchemaShape GetOutputSchema()
    {
        return _estimatorChain.GetOutputSchema(_start.GetOutputSchema());
    }

    /// <summary>
    /// Append another estimator to the end.
    /// </summary>
    /// <param name="estimator">The estimator to append; validated with <c>Contracts.CheckValue</c>.</param>
    /// <returns>A new composite reader estimator that shares the same reader estimator and has the extended chain.</returns>
    public CompositeReaderEstimator<TSource, TNewTrans> Append<TNewTrans>(IEstimator<TNewTrans> estimator)
        where TNewTrans : class, ITransformer
    {
        Contracts.CheckValue(estimator, nameof(estimator));

        var extendedChain = _estimatorChain.Append(estimator);
        return new CompositeReaderEstimator<TSource, TNewTrans>(_start, extendedChain);
    }
}
58+
59+
}

0 commit comments

Comments
 (0)