-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Metadata fixes for the ValueMappingEstimator #2098
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
7aeb5f7
904975a
3b3f6da
c4729ff
5759218
fa27d84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -100,13 +100,14 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) | |||||||
var isKey = Transformer.ValueColumnType.IsKey; | ||||||||
var columnType = (isKey) ? PrimitiveType.FromKind(DataKind.U4) : | ||||||||
Transformer.ValueColumnType; | ||||||||
var metadata = SchemaShape.Create(Transformer.ValueColumnMetadata.Schema); | ||||||||
foreach (var (Input, Output) in _columns) | ||||||||
{ | ||||||||
if (!inputSchema.TryFindColumn(Input, out var originalColumn)) | ||||||||
throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", Input); | ||||||||
|
||||||||
// Get the type from TOutputType | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is TOutputType here? Is it not Output? #Resolved There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That is old; I used to have the generic arguments be TInputType and TOutputType, but then changed them to TKeyType and TValueType. I will update the comment. In reply to: 247282739 [](ancestors = 247282739) |
||||||||
var col = new SchemaShape.Column(Output, vectorKind, columnType, isKey, originalColumn.Metadata); | ||||||||
var col = new SchemaShape.Column(Output, vectorKind, columnType, isKey, metadata); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Although Name is more specific to what the object actually is, I think it goes against thinking of these as columns, since you are talking about an input/output (or source/name). I would prefer to keep it as Output/Input. BTW, this will also go through another revision with #2064. In reply to: 247282892 [](ancestors = 247282892) |
||||||||
resultDic[Output] = col; | ||||||||
} | ||||||||
return new SchemaShape(resultDic.Values); | ||||||||
|
@@ -191,18 +192,14 @@ internal static IDataView CreateDataView<TKey, TValue>(IHostEnvironment env, | |||||||
// set of values. This is used for generating the metadata of | ||||||||
// the column. | ||||||||
HashSet<TValue> valueSet = new HashSet<TValue>(); | ||||||||
HashSet<TKey> keySet = new HashSet<TKey>(); | ||||||||
for (int i = 0; i < values.Count(); ++i) | ||||||||
foreach(var v in values) | ||||||||
{ | ||||||||
var v = values.ElementAt(i); | ||||||||
if (valueSet.Contains(v)) | ||||||||
continue; | ||||||||
valueSet.Add(v); | ||||||||
|
||||||||
var k = keys.ElementAt(i); | ||||||||
keySet.Add(k); | ||||||||
} | ||||||||
var metaKeys = keySet.ToArray(); | ||||||||
|
||||||||
var metaKeys = valueSet.ToArray(); | ||||||||
|
||||||||
// Key Values are treated in one of two ways: | ||||||||
// If the values are of type uint or ulong, these values are used directly as the keys types and no new keys are created. | ||||||||
|
@@ -387,7 +384,7 @@ protected ValueMappingTransformer(IHostEnvironment env, IDataView lookupMap, | |||||||
Host.CheckNonEmpty(valueColumn, nameof(valueColumn), "A value column must be specified when passing in an IDataView for the value mapping"); | ||||||||
_valueMap = CreateValueMapFromDataView(lookupMap, keyColumn, valueColumn); | ||||||||
int valueColumnIdx = 0; | ||||||||
Host.Assert(lookupMap.Schema.TryGetColumnIndex(valueColumn, out valueColumnIdx)); | ||||||||
Host.Check(lookupMap.Schema.TryGetColumnIndex(valueColumn, out valueColumnIdx)); | ||||||||
_valueMetadata = lookupMap.Schema[valueColumnIdx].Metadata; | ||||||||
|
||||||||
// Create the byte array of the original IDataView, this is used for saving out the data. | ||||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -52,17 +52,17 @@ public void ValueMapOneValueTest() | |||||
var data = new[] { new TestClass() { A = "bar", B = "test", C = "foo" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
IEnumerable<int> values = new List<int>() { 1, 2, 3, 4 }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var values = new List<int>() { 1, 2, 3, 4 }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, int>(Env, keys, values, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
var t = estimator.Fit(dataView); | ||||||
|
||||||
var result = t.Transform(dataView); | ||||||
var cursor = result.GetRowCursor((col) => true); | ||||||
var getterD = cursor.GetGetter<int>(3); | ||||||
var getterE = cursor.GetGetter<int>(4); | ||||||
var getterF = cursor.GetGetter<int>(5); | ||||||
var getterD = cursor.GetGetter<int>(result.Schema["D"].Index); | ||||||
var getterE = cursor.GetGetter<int>(result.Schema["E"].Index); | ||||||
var getterF = cursor.GetGetter<int>(result.Schema["F"].Index); | ||||||
cursor.MoveNext(); | ||||||
|
||||||
int dValue = 0; | ||||||
|
@@ -93,9 +93,9 @@ public void ValueMapVectorValueTest() | |||||
|
||||||
var result = t.Transform(dataView); | ||||||
var cursor = result.GetRowCursor((col) => true); | ||||||
var getterD = cursor.GetGetter<VBuffer<int>>(3); | ||||||
var getterE = cursor.GetGetter<VBuffer<int>>(4); | ||||||
var getterF = cursor.GetGetter<VBuffer<int>>(5); | ||||||
var getterD = cursor.GetGetter<VBuffer<int>>(result.Schema["D"].Index); | ||||||
var getterE = cursor.GetGetter<VBuffer<int>>(result.Schema["E"].Index); | ||||||
var getterF = cursor.GetGetter<VBuffer<int>>(result.Schema["F"].Index); | ||||||
cursor.MoveNext(); | ||||||
|
||||||
var valuesArray = values.ToArray(); | ||||||
|
@@ -116,17 +116,17 @@ public void ValueMappingMissingKey() | |||||
var data = new[] { new TestClass() { A = "barTest", B = "test", C = "foo" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
IEnumerable<int> values = new List<int>() { 1, 2, 3, 4 }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var values = new List<int>() { 1, 2, 3, 4 }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, int>(Env, keys, values, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
var t = estimator.Fit(dataView); | ||||||
|
||||||
var result = t.Transform(dataView); | ||||||
var cursor = result.GetRowCursor((col) => true); | ||||||
var getterD = cursor.GetGetter<int>(3); | ||||||
var getterE = cursor.GetGetter<int>(4); | ||||||
var getterF = cursor.GetGetter<int>(5); | ||||||
var getterD = cursor.GetGetter<int>(result.Schema["D"].Index); | ||||||
var getterE = cursor.GetGetter<int>(result.Schema["E"].Index); | ||||||
var getterF = cursor.GetGetter<int>(result.Schema["F"].Index); | ||||||
cursor.MoveNext(); | ||||||
|
||||||
int dValue = 1; | ||||||
|
@@ -146,8 +146,8 @@ void TestDuplicateKeys() | |||||
var data = new[] { new TestClass() { A = "barTest", B = "test", C = "foo" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "foo".AsMemory() }; | ||||||
IEnumerable<int> values = new List<int>() { 1, 2 }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "foo".AsMemory() }; | ||||||
var values = new List<int>() { 1, 2 }; | ||||||
|
||||||
Assert.Throws<InvalidOperationException>(() => new ValueMappingEstimator<ReadOnlyMemory<char>, int>(Env, keys, values, new[] { ("A", "D"), ("B", "E"), ("C", "F") })); | ||||||
} | ||||||
|
@@ -158,8 +158,8 @@ public void ValueMappingOutputSchema() | |||||
var data = new[] { new TestClass() { A = "barTest", B = "test", C = "foo" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
IEnumerable<int> values = new List<int>() { 1, 2, 3, 4 }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var values = new List<int>() { 1, 2, 3, 4 }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, int>(Env, keys, values, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
var outputSchema = estimator.GetOutputSchema(SchemaShape.Create(dataView.Schema)); | ||||||
|
@@ -184,8 +184,8 @@ public void ValueMappingWithValuesAsKeyTypesOutputSchema() | |||||
var data = new[] { new TestClass() { A = "bar", B = "test", C = "foo" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
IEnumerable<ReadOnlyMemory<char>> values = new List<ReadOnlyMemory<char>>() { "t".AsMemory(), "s".AsMemory(), "u".AsMemory(), "v".AsMemory() }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var values = new List<ReadOnlyMemory<char>>() { "t".AsMemory(), "s".AsMemory(), "u".AsMemory(), "v".AsMemory() }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, ReadOnlyMemory<char>>(Env, keys, values, true, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
var outputSchema = estimator.GetOutputSchema(SchemaShape.Create(dataView.Schema)); | ||||||
|
@@ -212,20 +212,20 @@ public void ValueMappingValuesAsUintKeyTypes() | |||||
var data = new[] { new TestClass() { A = "bar", B = "test2", C = "wahoo" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
|
||||||
// These are the expected key type values | ||||||
IEnumerable<uint> values = new List<uint>() { 51, 25, 42, 61 }; | ||||||
var values = new List<uint>() { 51, 25, 42, 61 }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, uint>(Env, keys, values, true, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
|
||||||
var t = estimator.Fit(dataView); | ||||||
|
||||||
var result = t.Transform(dataView); | ||||||
var cursor = result.GetRowCursor((col) => true); | ||||||
var getterD = cursor.GetGetter<uint>(3); | ||||||
var getterE = cursor.GetGetter<uint>(4); | ||||||
var getterF = cursor.GetGetter<uint>(5); | ||||||
var getterD = cursor.GetGetter<uint>(result.Schema["D"].Index); | ||||||
var getterE = cursor.GetGetter<uint>(result.Schema["E"].Index); | ||||||
var getterF = cursor.GetGetter<uint>(result.Schema["F"].Index); | ||||||
cursor.MoveNext(); | ||||||
|
||||||
// The expected values will contain the actual uints and are not generated. | ||||||
|
@@ -251,20 +251,20 @@ public void ValueMappingValuesAsUlongKeyTypes() | |||||
var data = new[] { new TestClass() { A = "bar", B = "test2", C = "wahoo" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
|
||||||
// These are the expected key type values | ||||||
IEnumerable<ulong> values = new List<ulong>() { 51, Int32.MaxValue, 42, 61 }; | ||||||
var values = new List<ulong>() { 51, Int32.MaxValue, 42, 61 }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, ulong>(Env, keys, values, true, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
|
||||||
var t = estimator.Fit(dataView); | ||||||
|
||||||
var result = t.Transform(dataView); | ||||||
var cursor = result.GetRowCursor((col) => true); | ||||||
var getterD = cursor.GetGetter<ulong>(3); | ||||||
var getterE = cursor.GetGetter<ulong>(4); | ||||||
var getterF = cursor.GetGetter<ulong>(5); | ||||||
var getterD = cursor.GetGetter<ulong>(result.Schema["D"].Index); | ||||||
var getterE = cursor.GetGetter<ulong>(result.Schema["E"].Index); | ||||||
var getterF = cursor.GetGetter<ulong>(result.Schema["F"].Index); | ||||||
cursor.MoveNext(); | ||||||
|
||||||
// The expected values will contain the actual uints and are not generated. | ||||||
|
@@ -289,19 +289,19 @@ public void ValueMappingValuesAsStringKeyTypes() | |||||
var data = new[] { new TestClass() { A = "bar", B = "test", C = "notfound" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
|
||||||
// Generating the list of strings for the key type values, note that foo1 is duplicated as intended to test that the same index value is returned | ||||||
IEnumerable<ReadOnlyMemory<char>> values = new List<ReadOnlyMemory<char>>() { "foo1".AsMemory(), "foo2".AsMemory(), "foo1".AsMemory(), "foo3".AsMemory() }; | ||||||
var values = new List<ReadOnlyMemory<char>>() { "foo1".AsMemory(), "foo2".AsMemory(), "foo1".AsMemory(), "foo3".AsMemory() }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, ReadOnlyMemory<char>>(Env, keys, values, true, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
var t = estimator.Fit(dataView); | ||||||
|
||||||
var result = t.Transform(dataView); | ||||||
var cursor = result.GetRowCursor((col) => true); | ||||||
var getterD = cursor.GetGetter<uint>(3); | ||||||
var getterE = cursor.GetGetter<uint>(4); | ||||||
var getterF = cursor.GetGetter<uint>(5); | ||||||
var getterD = cursor.GetGetter<uint>(result.Schema["D"].Index); | ||||||
var getterE = cursor.GetGetter<uint>(result.Schema["E"].Index); | ||||||
var getterF = cursor.GetGetter<uint>(result.Schema["F"].Index); | ||||||
cursor.MoveNext(); | ||||||
|
||||||
// The expected values will contain the generated key type values starting from 1. | ||||||
|
@@ -320,6 +320,32 @@ public void ValueMappingValuesAsStringKeyTypes() | |||||
Assert.Equal<uint>(0, fValue); | ||||||
} | ||||||
|
||||||
[Fact] | ||||||
public void ValueMappingValuesAsKeyTypesReverseLookup() | ||||||
{ | ||||||
var data = new[] { new TestClass() { A = "bar", B = "test", C = "notfound" } }; | ||||||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||||||
|
||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
|
||||||
// Generating the list of strings for the key type values, note that foo1 is duplicated as intended to test that the same index value is returned | ||||||
var values = new List<ReadOnlyMemory<char>>() { "foo1".AsMemory(), "foo2".AsMemory(), "foo1".AsMemory(), "foo3".AsMemory() }; | ||||||
|
||||||
var estimator = new ValueMappingEstimator<ReadOnlyMemory<char>, ReadOnlyMemory<char>>(Env, keys, values, true, new[] { ("A", "D"), ("B", "E"), ("C", "F") }) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do you need three columns? Only "D" is used below. #Resolved There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. True - that is a result of copying code from another test. I have removed the other two columns. In reply to: 247285144 [](ancestors = 247285144) |
||||||
.Append(new KeyToValueMappingEstimator(Env, ("D","DOutput"))); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Please format all files touched. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
var t = estimator.Fit(dataView); | ||||||
|
||||||
var result = t.Transform(dataView); | ||||||
var cursor = result.GetRowCursor((col) => true); | ||||||
var getterD = cursor.GetGetter<ReadOnlyMemory<char>>(result.Schema["DOutput"].Index); | ||||||
cursor.MoveNext(); | ||||||
|
||||||
// The expected values will contain the generated key type values starting from 1. | ||||||
ReadOnlyMemory<char> dValue = default; | ||||||
getterD(ref dValue); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Can we validate that it's "bar"? #Closed |
||||||
Assert.Equal("foo2".AsMemory(), dValue); | ||||||
} | ||||||
|
||||||
[Fact] | ||||||
public void ValueMappingWorkout() | ||||||
{ | ||||||
|
@@ -328,8 +354,8 @@ public void ValueMappingWorkout() | |||||
var badData = new[] { new TestWrong() { A = "bar", B = 1.2f } }; | ||||||
var badDataView = ComponentCreation.CreateDataView(Env, badData); | ||||||
|
||||||
IEnumerable<ReadOnlyMemory<char>> keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
IEnumerable<int> values = new List<int>() { 1, 2, 3, 4 }; | ||||||
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; | ||||||
var values = new List<int>() { 1, 2, 3, 4 }; | ||||||
|
||||||
// Workout on value mapping | ||||||
var est = ML.Transforms.Conversion.ValueMap(keys, values, new[] { ("A", "D"), ("B", "E"), ("C", "F") }); | ||||||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe? #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I will update.
In reply to: 247282604 [](ancestors = 247282604)