-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Term transformer implementation #759
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Term transformer implementation #759
Conversation
/// </summary> | ||
private TermTransform(IHostEnvironment env, TermTransform transform, IDataView newSource) | ||
: base(env, RegistrationName, transform, newSource, TestIsKnownDataKind) | ||
public void CreateInfos(SourceNameColumnBase[] column, ISchema input) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CreateInfos [](start = 20, length = 11)
need be shared with Mapper #Closed
#pragma warning restore MSML_NoMessagesForLoadContext | ||
boundMap = termMap; | ||
} | ||
|
||
private static string TestIsKnownDataKind(ColumnType type) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TestIsKnownDataKind [](start = 30, length = 19)
remove or incorporate into current logic #Closed
var info = new MetadataInfo<VBuffer<DvText>>(columnType, getter); | ||
colMetaInfo.Add(MetadataUtils.Kinds.KeyValues, info); | ||
/*bldr.AddGetter<VBuffer<DvText>>(MetadataUtils.Kinds.KeyValues, | ||
new VectorType(TextType.Instance, TypedMap.OutputType.KeyCount), getter);*/ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove #Closed
var info = new MetadataInfo<VBuffer<T>>(columnType, getter); | ||
colMetaInfo.Add(MetadataUtils.Kinds.KeyValues, info); | ||
/*bldr.AddGetter<VBuffer<T>>(MetadataUtils.Kinds.KeyValues, | ||
new VectorType(TypedMap.ItemType, TypedMap.OutputType.KeyCount), getter);*/ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove #Closed
var info = new MetadataInfo<VBuffer<DvText>>(columnType, mgetter); | ||
colMetaInfo.Add(MetadataUtils.Kinds.KeyValues, info); | ||
/*bldr.AddGetter<VBuffer<DvText>>(MetadataUtils.Kinds.KeyValues, | ||
new VectorType(TextType.Instance, TypedMap.OutputType.KeyCount), mgetter);*/ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove #Closed
var info = new MetadataInfo<VBuffer<TMeta>>(columnType, mgetter); | ||
colMetaInfo.Add(MetadataUtils.Kinds.KeyValues, info); | ||
/*bldr.AddGetter<VBuffer<TMeta>>(MetadataUtils.Kinds.KeyValues, | ||
new VectorType(srcMetaType.ItemType.AsPrimitive, TypedMap.OutputType.KeyCount), mgetter);*/ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove #Closed
} | ||
} | ||
|
||
private sealed class Row : IRow |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Row [](start = 29, length = 3)
Can I use SimpleRow? #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A better question is: can you reuse RowCursor
somehow? Maybe derive RowCursor
from Row
or something. It's bad to have two implementations of GetGetter
In reply to: 214105932 [](ancestors = 214105932,214102719)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's rude to intervene my dialogue with myself!
In reply to: 214106201 [](ancestors = 214106201,214105932,214102719)
@@ -318,6 +324,96 @@ public override IRowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolid | |||
return cursors; | |||
} | |||
|
|||
public void SaveAsOnnx(OnnxContext ctx) | |||
{ | |||
if (_mapper is ISaveAsOnnx onnx) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Host.CheckValue(ctx)
#Closed
public Func<int, bool> GetDependencies(Func<int, bool> predicate) | ||
{ | ||
Func<int, bool> predicateInput; | ||
var active = _bindings.GetActive(predicate, out predicateInput); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
var active = [](start = 12, length = 12)
no need to assign anything, just call GetActive
#Closed
return fn; | ||
} | ||
|
||
public ValueGetter<UInt128> GetIdGetter() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
GetIdGetter [](start = 40, length = 11)
=>
#Closed
/// <summary> | ||
/// This data model component is savable as ONNX. | ||
/// </summary> | ||
public interface ITransformCanSaveOnnx : ICanSaveOnnx, ISaveAsOnnx, IDataTransform |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ICanSaveOnnx, ISaveAsOnnx [](start = 45, length = 25)
this is confusing: why are there two interfaces? #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see a reason, but it doesn't make it less confusing to be honest
In reply to: 214106801 [](ancestors = 214106801)
var originalColumn = inputSchema.FindColumn(column.Source); | ||
if (originalColumn != null) | ||
{ | ||
var col = new SchemaShape.Column(column.Name, originalColumn.Kind, DataKind.U4, true, originalColumn.MetadataKinds); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MetadataKinds [](start = 121, length = 13)
is this correct? #Closed
throw _host.ExceptParam(nameof(inputSchema), $"{column.Source} not found in {nameof(inputSchema)}"); | ||
} | ||
} | ||
return new SchemaShape(resultDic.Values.ToArray()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
.ToArray() [](start = 51, length = 10)
No need #Closed
using System.IO; | ||
using System.Linq; | ||
using System.Text; | ||
using static Microsoft.ML.Runtime.Data.TermTransform; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
using static [](start = 0, length = 12)
can we avoid this? #Closed
@@ -226,70 +227,154 @@ private CodecFactory CodecFactory | |||
/// Public constructor corresponding to SignatureDataTransform. | |||
/// </summary> | |||
public TermTransform(IHostEnvironment env, Arguments args, IDataView input) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
public TermTransform(IHostEnvironment env, Arguments args, IDataView input) [](start = 8, length = 75)
it doesn't correspond to the signature any more, right? #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, I'd suggest having the 'training' happen in public static TermTransform Train(env, args, trainData)
, and ctors should be private.
In reply to: 214131316 [](ancestors = 214131316)
internal readonly ISchema Schema; | ||
private readonly Dictionary<int, int> _colNewToOldMapping; | ||
private readonly ColumnType[] _types; | ||
private readonly (string Source, string Name)[] _columns; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you look at the image transforms, I adopted the paradigm that row mapper has a transformer _parent
. When you load a row mapper, you actually load the parent, and call MakeRowMapper
on it.
This way you don't need a static method to 'save stuff' / 'load stuff': you make only the transformer truly save-able, and the mapper will just save the parent. #Closed
@@ -209,6 +211,10 @@ private static VersionInfo GetVersionInfo() | |||
|
|||
public override ISchema Schema { get { return _bindings; } } | |||
|
|||
public bool CanSaveOnnx => _mapper is ISaveAsOnnx onnxMapper ? true : false; | |||
|
|||
public bool CanSavePfa => _mapper is ISaveAsOnnx pfaMapper ? true : false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ISaveAsOnnx [](start = 45, length = 11)
You meant PFA I'm sure. #Closed
/// </summary> | ||
public interface ITransformCanSavePfa : ICanSavePfa, IDataTransform | ||
public interface ISaveAsPfa |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ISaveAsPfa [](start = 21, length = 10)
Same comment... can't do this. #Closed
/// </summary> | ||
public interface ITransformCanSaveOnnx: ICanSaveOnnx, IDataTransform | ||
public interface ISaveAsOnnx |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ISaveAsOnnx [](start = 21, length = 11)
This thing needs to descend from ICanSaveOnnx
, because it is often the case, as you'll see if you inspect the actual implementations of CanSaveOnnx
, that the behavior of whether or not we can in fact save as ONNX is far more complex than whether we implement the interface or not. #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For example, if there were an implementation of IRowMapper
that wrapped any existing mapper, obviously that meta-wrapper would have to report it can save as ONNX, but internally any of the things it is wrapping do not it is hosed, which is bad.
By having the core interface always have this runtime check, we avoid the problem that we have elsewhere with things like exporting as text, exporting as INI, and so on, where we have things that report they can save as something, but really cannot, and fail in strange and interesting ways.
In reply to: 214432408 [](ancestors = 214432408)
@@ -209,6 +211,10 @@ private static VersionInfo GetVersionInfo() | |||
|
|||
public override ISchema Schema { get { return _bindings; } } | |||
|
|||
public bool CanSaveOnnx => _mapper is ISaveAsOnnx onnxMapper ? true : false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_mapper is ISaveAsOnnx onnxMapper ? true : false; [](start = 35, length = 49)
Once you correct the interface this could be:
(_mapper as ISaveAsOnnx)?.CanSaveOnnx ?? false
See other instances of CanSaveOnnx
to see this pattern.
Moot point, note that if you have an expression of the form <boolExpr> ? true : false
this can be simplified to <boolExpr>
. #Closed
{ | ||
Host.CheckValue(ctx, nameof(ctx)); | ||
if (_mapper is ISaveAsOnnx onnx) | ||
onnx.SaveAsOnnx(ctx); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think elsewhere we fail if CanSaveOnnx
is false, but double check me. #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is fine I guess. Alternate would just be to have at the head of this method Host.Check(CanSaveOnnx)
but that's oK.
In reply to: 214433162 [](ancestors = 214433162)
@@ -97,7 +100,7 @@ public enum SortOrder : byte | |||
// other things, like case insensitive (where appropriate), culturally aware, etc.? | |||
} | |||
|
|||
private static class Defaults | |||
public static class Defaults |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
public static class Defaults [](start = 8, length = 28)
Must this be public? Can it not be `internal? #Closed
|
||
public class ColumnInfo | ||
{ | ||
public ColumnInfo(string input, string output, int maxNumTerms = Defaults.MaxNumTerms, SortOrder sort = Defaults.Sort, string[] term = null, string terms = null, bool textKeyValues = false) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
string terms = null [](start = 153, length = 19)
Certainly don't need both of these. #Pending
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Considering how TermMap class and Train function implemented...
I can spend probably few hours and refactor it, but this PR already covering so much stuff.
In reply to: 214463561 [](ancestors = 214463561)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One easy thing we can do is to pass terms = null
everywhere, without allowing user to override it
In reply to: 214469361 [](ancestors = 214469361,214463561)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right. In the public facing method, we definitely do not want both exposed. If an internal method requires it (why would you, if terms
only possible use is the command line?), that's fine? I guess?
In reply to: 214479433 [](ancestors = 214479433,214469361,214463561)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
} | ||
} | ||
|
||
public class ColumnInfo |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ColumnInfo [](start = 21, length = 10)
This is a little unfortunate. It may be necessary in the short term, but it seems like we have two objects to control the behavior, is it not so?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One class(ColumnInfo) used to pass parameters and basically a copy of Arguments.Column and used to pass parameters of how to train transformer, second is just convenient class to store information about schema we work with.
In reply to: 214463662 [](ancestors = 214463662)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, I was talking about this being a copy of Arguments.Column
.
In reply to: 214468160 [](ancestors = 214468160,214463662)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a weak excuse, but I'm just following what @Zruty0 doing right now. I also don't like duplication, but in same time, Arguments.Column and ColumnInfo is a slightly different, ColumnInfo doesn't have any nullable fields, which is the case for Arguments.Column.
In reply to: 214482328 [](ancestors = 214482328,214468160,214463662)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the pattern I have been following up until now is, I keep Arguments.Column
intact, and create a separate set of ColumnInfo
when needed, to pass to my estimators/transformers.
The idea is, ColumnInfo
doesn't have to be mutable and only have fields that can be populated via cmdline.
Ultimately, I guess, we will sweep up all the Arguments
into the Microsoft.ML.CommandLine
assembly, together with the corresponding loadable classes and factory methods
In reply to: 214484933 [](ancestors = 214484933,214482328,214468160,214463662)
private static volatile CodecFactory _codecFactory; | ||
private static object _factoryLock = new object(); | ||
|
||
private static CodecFactory CodecFactory(IHostEnvironment env) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
private static [](start = 12, length = 14)
By making this object static, we are effectively making the environment and host composed out of that environment static, which we have indicated many, many times we do not want to do. If this CodecFactory
must be defined with an environment, then it is a mistake to make instances of it static, by that token. #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interlocked.CompareExchange(ref _codecFactory, new CodecFactory(env, _codecFactoryPool), null); | ||
} | ||
} | ||
env.Assert(_codecFactory != null); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Having a lock, or using InterlockedCompare.Exchange
, are both possible choices. (We tend to favor InterlockedCompare.Exchange
throughout the rest of the codebase, but whatever.) Using both however seems somewhat pointless. #Closed
Contracts.Assert(CanSavePfa); | ||
private readonly ColumnType[] _types; | ||
private readonly TermTransform _parent; | ||
private readonly ColInfo[] _infos; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_infos [](start = 39, length = 6)
can you zip _types
into _infos
? #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mostly use ColInfo to pass information about schema to TermMap builders. So I can go scan data and build dictionaries.
_types is output type of the column, and depends on TermMap (it's gonna be Key of U4, but you don't know cardinality of it until you build map.
I can put them together, but they would be somewhat weird.
In reply to: 214480038 [](ancestors = 214480038)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
new TermTransform.ColumnInfo("C", "TermC") | ||
}); | ||
var invalidData = ComponentCreation.CreateDataView(env, xydata); | ||
TestEstimatorCore(pipe, dataView, null, invalidData); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TestEstimatorCore [](start = 16, length = 17)
do another one with stringData. Or make TestEstimatorCore accept an array of invalidData
's #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TestEstimator except to fail in both cases, one for schema, one for transformation, but for schema string fields are perfectly fine, so this lead to test failure. and I'm not sure I want to change TestEstimatorCore
In reply to: 214480934 [](ancestors = 214480934)
} | ||
|
||
[Fact] | ||
void TestBadTransformSchema() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TestBadTransformSchema [](start = 13, length = 22)
I suspect that all the tests except TestWorking
are now not needed, as all is covered by TestEstimatorCore
. #Closed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Keep only TestWorking
, and also baseline the output.
In reply to: 214481129 [](ancestors = 214481129)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🕐
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mustFail(() => loadedTransformer.GetOutputSchema(validForFitNotValidForTransformInput.Schema)); | ||
mustFail(() => loadedTransformer.Transform(validForFitNotValidForTransformInput)); | ||
} | ||
|
||
|
||
// Schema verification between estimator and transformer. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove empty line
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All right thanks @Ivanidzo4ka !
implementation for #754