-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Hash estimator #944
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hash estimator #944
Changes from 5 commits
bd01a0c
249a4e7
6bc51a7
1abf183
2f887ce
a8a9125
4262aea
47db03c
7a61173
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.ML.Runtime.Api; | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.Model; | ||
using Microsoft.ML.Runtime.RunTests; | ||
using Microsoft.ML.Runtime.Tools; | ||
using Microsoft.ML.Transforms; | ||
using System.IO; | ||
using System.Linq; | ||
using Xunit; | ||
using Xunit.Abstractions; | ||
|
||
namespace Microsoft.ML.Tests.Transformers | ||
{ | ||
public class HashTests : TestDataPipeBase | ||
{ | ||
/// <summary>
/// Creates the test fixture, forwarding the xUnit output sink to the base test class.
/// </summary>
/// <param name="output">xUnit helper passed through to <see cref="TestDataPipeBase"/>.</param>
public HashTests(ITestOutputHelper output)
    : base(output)
{
}
|
||
/// <summary>
/// Input row with three scalar float columns, used as the source data for the hash tests.
/// </summary>
private class TestClass
{
    // Kept as public fields, declared in A, B, C order, exactly as the data view is built from them.
    public float A, B, C;
}
|
||
/// <summary>
/// Input row mixing two-element vector columns and scalar columns, in both float and double.
/// </summary>
private class TestMeta
{
    [VectorType(2)] public float[] A;
    public float B;
    [VectorType(2)] public double[] C;
    public double D;
}
|
||
/// <summary>
/// Runs the standard estimator checks over a hash pipeline that mixes bit widths,
/// ordered hashing, a custom seed and inverse-hash tracking.
/// </summary>
[Fact]
public void HashWorkout()
{
    var rows = new[]
    {
        new TestClass { A = 1, B = 2, C = 3 },
        new TestClass { A = 4, B = 5, C = 6 },
    };
    var input = ComponentCreation.CreateDataView(Env, rows);

    var columns = new[]
    {
        new HashConverterTransformer.ColumnInfo("A", "HashA", hashBits: 4, invertHash: -1),
        new HashConverterTransformer.ColumnInfo("B", "HashB", hashBits: 3, ordered: true),
        new HashConverterTransformer.ColumnInfo("C", "HashC", seed: 42),
        new HashConverterTransformer.ColumnInfo("A", "HashD"),
    };
    var estimator = new HashConverter(Env, columns);

    TestEstimatorCore(estimator, input);
    Done();
}
|
||
[Fact] | ||
public void TestMetadata() | ||
{ | ||
|
||
var data = new[] { | ||
new TestMeta() { A=new float[2] { 3.5f, 2.5f}, B=1, C= new double[2] { 5.1f, 6.1f}, D= 7}, | ||
new TestMeta() { A=new float[2] { 3.5f, 2.5f}, B=1, C= new double[2] { 5.1f, 6.1f}, D= 7}, | ||
new TestMeta() { A=new float[2] { 3.5f, 2.5f}, B=1, C= new double[2] { 5.1f, 6.1f}, D= 7}}; | ||
|
||
|
||
var dataView = ComponentCreation.CreateDataView(Env, data); | ||
var pipe = new HashConverter(Env, new[] { | ||
new HashConverterTransformer.ColumnInfo("A", "HashA", invertHash:1, hashBits:10), | ||
Review thread on this line:
- hmm, now that I think of it, we should probably have […] (the rest of this comment was not captured)
- Do you want this to be part of this PR or a separate PR? In reply to: 218936515 [](ancestors = 218936515)
- (comment text not captured)
- I mean, we have this pattern in all estimators. Wouldn't it be better to move stuff in one PR, rather than in separate ones? Or do you want me to update all estimators as well? In reply to: 218944749 [](ancestors = 218944749,218941758,218936515)
- I would rather keep one style in the code and, if needed, flip everything at once, than have two styles in the code. In reply to: 218945227 [](ancestors = 218945227,218944749,218941758,218936515)
- Fair. Let's do it in a different PR for all at once, where applicable. In reply to: 218954028 [](ancestors = 218954028,218945227,218944749,218941758,218936515) |
||
new HashConverterTransformer.ColumnInfo("A", "HashAUnlim", invertHash:-1, hashBits:10), | ||
new HashConverterTransformer.ColumnInfo("A", "HashAUnlimOrdered", invertHash:-1, hashBits:10, ordered:true) | ||
}); | ||
var result = pipe.Fit(dataView).Transform(dataView); | ||
ValidateMetadata(result); | ||
Done(); | ||
} | ||
|
||
/// <summary>
/// Verifies the metadata of the three hashed output columns: each must expose key values
/// as its only metadata kind, and the inverted-hash key values must match the input data.
/// </summary>
/// <param name="result">Transformed data containing the HashA, HashAUnlim and HashAUnlimOrdered columns.</param>
private void ValidateMetadata(IDataView result)
{
    AssertKeyValues(result, "HashA", "2.5", "3.5");

    // Previously this section (and the next) read the metadata of HashA again, so the
    // HashAUnlim and HashAUnlimOrdered key values were never actually checked.
    AssertKeyValues(result, "HashAUnlim", "2.5", "3.5");

    // NOTE(review): with ordered hashing the slot position is expected to be part of the
    // inverted key ("position:value"), so the expected strings differ from the unordered
    // columns. Confirm these literals by running the test against HashConverterTransformer.
    AssertKeyValues(result, "HashAUnlimOrdered", "0:3.5", "1:2.5");
}

/// <summary>
/// Asserts that <paramref name="columnName"/> exists, that key values are its only metadata
/// kind, that the key-value vector has 1024 slots (the columns under test use hashBits:10),
/// and that its explicitly stored items equal <paramref name="expectedKeys"/> in order.
/// </summary>
private static void AssertKeyValues(IDataView result, string columnName, params string[] expectedKeys)
{
    Assert.True(result.Schema.TryGetColumnIndex(columnName, out int col));

    var metadataKinds = result.Schema.GetMetadataTypes(col).Select(x => x.Key);
    Assert.Equal(new[] { MetadataUtils.Kinds.KeyValues }, metadataKinds);

    VBuffer<DvText> keys = default;
    result.Schema.GetMetadata(MetadataUtils.Kinds.KeyValues, col, ref keys);

    // xUnit convention is (expected, actual); this also reports the real length on failure.
    Assert.Equal(1024, keys.Length);
    Assert.Equal(expectedKeys, keys.Items().Select(x => x.Value.ToString()));
}
/// <summary>
/// Invokes the Hash transform through the Maml command-line entry point and expects
/// a zero exit code.
/// </summary>
[Fact]
public void TestCommandLine()
{
    var exitCode = Maml.Main(new[] { @"showschema loader=Text{col=A:R4:0} xf=Hash{col=B:A} in=f:\2.txt" });

    // xUnit expects (expected, actual); the original order produced a misleading failure message.
    Assert.Equal(0, exitCode);
}
|
||
/// <summary>
/// Fits the hash pipeline, saves the transformed data's model to an in-memory stream,
/// and loads the transforms back from that stream over the original data.
/// </summary>
[Fact]
public void TestOldSavingAndLoading()
{
    var rows = new[]
    {
        new TestClass { A = 1, B = 2, C = 3 },
        new TestClass { A = 4, B = 5, C = 6 },
    };
    var input = ComponentCreation.CreateDataView(Env, rows);

    var columns = new[]
    {
        new HashConverterTransformer.ColumnInfo("A", "HashA", hashBits: 4, invertHash: -1),
        new HashConverterTransformer.ColumnInfo("B", "HashB", hashBits: 3, ordered: true),
        new HashConverterTransformer.ColumnInfo("C", "HashC", seed: 42),
        new HashConverterTransformer.ColumnInfo("A", "HashD"),
    };
    var estimator = new HashConverter(Env, columns);

    var transformed = estimator.Fit(input).Transform(input);
    var roleMapped = new RoleMappedData(transformed);

    using (var stream = new MemoryStream())
    {
        TrainUtils.SaveModel(Env, Env.Start("saving"), stream, null, roleMapped);

        // Rewind so the loader reads the model from the start of the stream.
        stream.Position = 0;
        ModelFileUtils.LoadTransforms(Env, input, stream);
    }
}
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(placeholder for Tom's comment) #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hey Ivan, Pete asked me to tell you to sort your usings. :)
In reply to: 218936325 [](ancestors = 218936325)