Skip to content

Commit 3b2edab

Browse files
author
Pete Luferenko
committed
Initial take on IEstimator interfaces
1 parent 8cfa2ed commit 3b2edab

File tree

1 file changed

+163
-0
lines changed

1 file changed

+163
-0
lines changed
+163
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using Microsoft.ML.Runtime;
6+
using Microsoft.ML.Runtime.Data;
7+
using System.Collections.Generic;
8+
using System.Linq;
9+
10+
namespace Microsoft.ML.Core.Data
11+
{
12+
/// <summary>
/// A set of 'requirements' on the incoming schema, as well as a set of 'promises' about the outgoing schema.
/// This is more relaxed than the proper <see cref="ISchema"/>, since it's only a subset of the columns,
/// and also since it doesn't specify exact <see cref="ColumnType"/>'s for vectors and keys.
/// </summary>
public sealed class SchemaShape
{
    /// <summary>
    /// The columns that make up this shape. This is the very array passed to the constructor (no copy is made).
    /// </summary>
    public readonly ColumnBase[] Columns;

    /// <summary>
    /// Common base for column descriptions; it carries only the column name.
    /// </summary>
    public abstract class ColumnBase
    {
        /// <summary>The name of the column.</summary>
        public readonly string Name;

        /// <summary>
        /// Initializes the column with its name, after validating it with <c>Contracts.CheckNonEmpty</c>.
        /// </summary>
        /// <param name="name">The column name.</param>
        public ColumnBase(string name)
        {
            Contracts.CheckNonEmpty(name, nameof(name));
            Name = name;
        }
    }

    /// <summary>
    /// A column described loosely: by its vector kind, its item kind, and whether it is a key,
    /// rather than by an exact <see cref="ColumnType"/>.
    /// </summary>
    public sealed class RelaxedColumn : ColumnBase
    {
        /// <summary>
        /// The vector-ness of a relaxed column.
        /// </summary>
        public enum VectorKind
        {
            Scalar,
            Vector,
            VariableVector
        }

        /// <summary>The vector kind of the column.</summary>
        public readonly VectorKind Kind;
        /// <summary>The data kind of the column's items.</summary>
        public readonly DataKind ItemKind;
        /// <summary>Whether the column is a key column.</summary>
        public readonly bool IsKey;

        /// <summary>
        /// Creates a relaxed column description. Only <paramref name="name"/> is validated (by the base constructor).
        /// </summary>
        /// <param name="name">The column name.</param>
        /// <param name="kind">The vector kind of the column.</param>
        /// <param name="itemKind">The data kind of the column's items.</param>
        /// <param name="isKey">Whether the column is a key column.</param>
        public RelaxedColumn(string name, VectorKind kind, DataKind itemKind, bool isKey)
            : base(name)
        {
            Kind = kind;
            ItemKind = itemKind;
            IsKey = isKey;
        }
    }

    /// <summary>
    /// A column described by its exact <see cref="ColumnType"/>.
    /// </summary>
    public sealed class StrictColumn : ColumnBase
    {
        // REVIEW: do we ever need strict columns? Maybe we should only have relaxed?
        /// <summary>The exact type of the column.</summary>
        public readonly ColumnType ColumnType;

        /// <summary>
        /// Creates a strict column description.
        /// </summary>
        /// <param name="name">The column name; validated by the base constructor.</param>
        /// <param name="columnType">The exact column type; validated with <c>Contracts.CheckValue</c>.</param>
        public StrictColumn(string name, ColumnType columnType)
            : base(name)
        {
            Contracts.CheckValue(columnType, nameof(columnType));
            ColumnType = columnType;
        }
    }

    /// <summary>
    /// Creates a schema shape over the given columns.
    /// </summary>
    /// <param name="columns">The columns of the shape; validated with <c>Contracts.CheckValue</c> and stored as-is.</param>
    public SchemaShape(ColumnBase[] columns)
    {
        Contracts.CheckValue(columns, nameof(columns));
        Columns = columns;
    }

    /// <summary>
    /// Create a schema shape out of the fully defined schema.
    /// Every non-hidden column of <paramref name="schema"/> becomes a <see cref="StrictColumn"/>,
    /// in the order of the column indices.
    /// </summary>
    /// <param name="schema">The schema to describe; validated with <c>Contracts.CheckValue</c>.</param>
    /// <returns>A new <see cref="SchemaShape"/> containing one strict column per non-hidden schema column.</returns>
    public static SchemaShape Create(ISchema schema)
    {
        Contracts.CheckValue(schema, nameof(schema));
        var cols = new List<ColumnBase>();

        for (int iCol = 0; iCol < schema.ColumnCount; iCol++)
        {
            // Must be List<T>.Add: Enumerable.Append only returns a new sequence and leaves the list
            // unchanged, which would make this method always produce an empty shape.
            if (!schema.IsHidden(iCol))
                cols.Add(new StrictColumn(schema.GetColumnName(iCol), schema.GetColumnType(iCol)));
        }
        return new SchemaShape(cols.ToArray());
    }

    /// <summary>
    /// Returns the column with a specified <paramref name="name"/>, and <c>null</c> if there is no such column.
    /// If several columns share the name, the first one in <see cref="Columns"/> is returned.
    /// </summary>
    /// <param name="name">The column name to look for; validated with <c>Contracts.CheckValue</c>.</param>
    public ColumnBase FindColumn(string name)
    {
        Contracts.CheckValue(name, nameof(name));
        return Columns.FirstOrDefault(x => x.Name == name);
    }

    // REVIEW: I think we should have an IsCompatible method to check if it's OK to use one schema shape
    // as an input to another schema shape. I started writing, but realized that there's more than one way to check for
    // the 'compatibility': as in, 'CAN be compatible' vs. 'WILL be compatible'.
}
101+
102+
/// <summary>
/// A generic transformer: it accepts input of any kind and produces an <see cref="IDataView"/> from it.
/// Data loaders fit this description. Data transformers do as well, but those additionally
/// implement <see cref="IDataTransformer"/>.
/// </summary>
/// <typeparam name="TIn">The type of input the transformer takes.</typeparam>
public interface ITransformer<TIn>
{
    /// <summary>
    /// Accepts the input data, applies the transformations, and hands back the resulting data.
    /// Since <see cref="IDataView"/>'s are lazy, nothing is actually transformed by this call; only the schema is validated.
    /// </summary>
    /// <param name="input">The data to transform.</param>
    /// <returns>The (lazily evaluated) transformed data.</returns>
    IDataView Transform(TIn input);
}
115+
116+
/// <summary>
/// 'Estimator' is the Spark term for a trainable component, for example a normalizer or an SvmLightLoader.
/// An estimator has to be 'fitted' in order to produce an <see cref="ITransformer{TIn}"/>.
/// </summary>
/// <typeparam name="TIn">The type of input the estimator (and eventually transformer) takes.</typeparam>
public interface IEstimator<TIn>
{
    /// <summary>
    /// Trains on the input and returns the resulting transformer.
    ///
    /// REVIEW: the returned transformer could in principle take a different <typeparamref name="TIn"/>, but no such
    /// components exist yet, so why complicate matters?
    /// </summary>
    /// <param name="input">The data to train on.</param>
    /// <returns>The fitted transformer.</returns>
    ITransformer<TIn> Fit(TIn input);

    /// <summary>
    /// The 'promise' about the output schema.
    /// It is meant to be used for schema propagation.
    /// </summary>
    SchemaShape GetOutputSchema();
}
137+
138+
/// <summary>
/// A data transformer is a transformer over <see cref="IDataView"/> that can additionally report, for a given
/// input schema, the schema it would output. This is handy for evaluating what kind of columns the transformer expects.
/// </summary>
public interface IDataTransformer : ITransformer<IDataView>
{
    /// <summary>
    /// Schema propagation for transformers.
    /// Given an input schema like <paramref name="inputSchema"/>, returns the schema of the output data.
    /// Returns <c>null</c> iff the schema is invalid (in which case calling Transform on such data will fail).
    /// </summary>
    /// <param name="inputSchema">The schema of the prospective input data.</param>
    ISchema GetOutputSchema(ISchema inputSchema);
}
151+
152+
/// <summary>
/// An estimator over <see cref="IDataView"/> input whose <see cref="Fit"/> yields an <see cref="IDataTransformer"/>,
/// and which supports schema-shape propagation.
/// </summary>
public interface IDataEstimator : IEstimator<IDataView>
{
    /// <summary>
    /// Trains on the input data. Hides the inherited Fit so that the result is typed as <see cref="IDataTransformer"/>.
    /// </summary>
    /// <param name="input">The data to train on.</param>
    new IDataTransformer Fit(IDataView input);

    /// <summary>
    /// Schema propagation for estimators.
    /// Given an input schema shape like <paramref name="inputSchema"/>, returns the estimator's output schema shape.
    /// Returns <c>null</c> iff the schema shape is invalid (in which case calling <see cref="Fit"/> on such data will fail).
    /// </summary>
    /// <param name="inputSchema">The schema shape of the prospective input data.</param>
    SchemaShape GetOutputSchema(SchemaShape inputSchema);
}
163+
}

0 commit comments

Comments
 (0)