Skip to content

Commit ef638f4

Browse files
authored
Metadata fixes for the ValueMappingEstimator (dotnet#2098)
* The ValueMappingEstimator had a couple of issues when setting the Values as KeyTypes: 1) The output schema for the Estimator did not contain the KeyType information in the metadata. 2) The reverse lookup of the metadata had the incorrect value. This change sets the correct metadata on the output schema and uses the value data for the reverse lookup. A test was added to confirm the changes using the KeyToValueMapping appended to a ValueMappingEstimator for the reverse lookup. Fixes dotnet#2086 Fixes dotnet#2083
1 parent 4bfd7a1 commit ef638f4

File tree

2 files changed

+84
-61
lines changed

2 files changed

+84
-61
lines changed

src/Microsoft.ML.Data/Transforms/ValueMappingTransformer.cs

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,14 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
100100
var isKey = Transformer.ValueColumnType is KeyType;
101101
var columnType = (isKey) ? PrimitiveType.FromKind(DataKind.U4) :
102102
Transformer.ValueColumnType;
103+
var metadataShape = SchemaShape.Create(Transformer.ValueColumnMetadata.Schema);
103104
foreach (var (Input, Output) in _columns)
104105
{
105106
if (!inputSchema.TryFindColumn(Input, out var originalColumn))
106107
throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", Input);
107108

108-
// Get the type from TOutputType
109-
var col = new SchemaShape.Column(Output, vectorKind, columnType, isKey, originalColumn.Metadata);
109+
// Create the Value column
110+
var col = new SchemaShape.Column(Output, vectorKind, columnType, isKey, metadataShape);
110111
resultDic[Output] = col;
111112
}
112113
return new SchemaShape(resultDic.Values);
@@ -191,18 +192,14 @@ internal static IDataView CreateDataView<TKey, TValue>(IHostEnvironment env,
191192
// set of values. This is used for generating the metadata of
192193
// the column.
193194
HashSet<TValue> valueSet = new HashSet<TValue>();
194-
HashSet<TKey> keySet = new HashSet<TKey>();
195-
for (int i = 0; i < values.Count(); ++i)
195+
foreach (var v in values)
196196
{
197-
var v = values.ElementAt(i);
198197
if (valueSet.Contains(v))
199198
continue;
200199
valueSet.Add(v);
201-
202-
var k = keys.ElementAt(i);
203-
keySet.Add(k);
204200
}
205-
var metaKeys = keySet.ToArray();
201+
202+
var metaKeys = valueSet.ToArray();
206203

207204
// Key Values are treated in one of two ways:
208205
// If the values are of type uint or ulong, these values are used directly as the keys types and no new keys are created.
@@ -387,7 +384,7 @@ protected ValueMappingTransformer(IHostEnvironment env, IDataView lookupMap,
387384
Host.CheckNonEmpty(valueColumn, nameof(valueColumn), "A value column must be specified when passing in an IDataView for the value mapping");
388385
_valueMap = CreateValueMapFromDataView(lookupMap, keyColumn, valueColumn);
389386
int valueColumnIdx = 0;
390-
Host.Assert(lookupMap.Schema.TryGetColumnIndex(valueColumn, out valueColumnIdx));
387+
Host.Check(lookupMap.Schema.TryGetColumnIndex(valueColumn, out valueColumnIdx));
391388
_valueMetadata = lookupMap.Schema[valueColumnIdx].Metadata;
392389

393390
// Create the byte array of the original IDataView, this is used for saving out the data.

0 commit comments

Comments
 (0)