Skip to content

Commit ad07320

Browse files
authored
Change back to the previous version's hashing of 8-byte types (#5152)
* Fix old hashing for 8-byte types, and key ONNX conversion.
* Fix more hashing and unit tests.
* Hash the length of the vector when hashing into a single value.
1 parent bc9abda commit ad07320

File tree

4 files changed

+98
-83
lines changed

4 files changed

+98
-83
lines changed

src/Microsoft.ML.Data/Transforms/Hashing.cs

Lines changed: 45 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -556,20 +556,19 @@ public uint HashCore(uint seed, uint mask, in VBuffer<float> values)
556556
return 0;
557557
hash = Hashing.MurmurRound(hash, FloatUtils.GetBits(value == 0 ? 0 : value));
558558
}
559-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
559+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
560560
}
561561
}
562562

563563
private readonly struct HashDouble : IHasher<double>
564564
{
565565
[MethodImpl(MethodImplOptions.AggressiveInlining)]
566-
567566
public uint HashCoreOld(uint seed, uint mask, in double value)
568567
{
569568
if (double.IsNaN(value))
570569
return 0;
571570

572-
return (Hashing.MixHash(HashRound(seed, value)) & mask) + 1;
571+
return (Hashing.MixHash(HashRound(seed, value, true)) & mask) + 1;
573572
}
574573

575574
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -578,7 +577,7 @@ public uint HashCore(uint seed, uint mask, in double value)
578577
if (double.IsNaN(value))
579578
return 0;
580579

581-
return (Hashing.MixHash(HashRound(seed, value), sizeof(double)) & mask) + 1;
580+
return (Hashing.MixHash(HashRound(seed, value, false), sizeof(double)) & mask) + 1;
582581
}
583582

584583
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -589,17 +588,19 @@ public uint HashCore(uint seed, uint mask, in VBuffer<double> values)
589588
{
590589
if (double.IsNaN(value))
591590
return 0;
592-
hash = HashRound(hash, value);
591+
hash = HashRound(hash, value, false);
593592
}
594-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
593+
return (Hashing.MixHash(hash, values.Length * sizeof(double)) & mask) + 1;
595594
}
596595

597596
[MethodImpl(MethodImplOptions.AggressiveInlining)]
598-
private uint HashRound(uint seed, double value)
597+
private uint HashRound(uint seed, double value, bool old)
599598
{
600599
ulong v = FloatUtils.GetBits(value == 0 ? 0 : value);
601600
var hash = Hashing.MurmurRound(seed, Utils.GetLo(v));
602601
var hi = Utils.GetHi(v);
602+
if (old && hi == 0)
603+
return hash;
603604
return Hashing.MurmurRound(hash, hi);
604605
}
605606
}
@@ -648,7 +649,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<byte> values)
648649
return 0;
649650
hash = Hashing.MurmurRound(hash, value);
650651
}
651-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
652+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
652653
}
653654
}
654655

@@ -672,7 +673,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<ushort> values)
672673
return 0;
673674
hash = Hashing.MurmurRound(hash, value);
674675
}
675-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
676+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
676677
}
677678
}
678679

@@ -696,7 +697,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<uint> values)
696697
return 0;
697698
hash = Hashing.MurmurRound(hash, value);
698699
}
699-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
700+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
700701
}
701702
}
702703

@@ -707,15 +708,15 @@ public uint HashCoreOld(uint seed, uint mask, in ulong value)
707708
{
708709
if (value == 0)
709710
return 0;
710-
return (Hashing.MixHash(HashRound(seed, value)) & mask) + 1;
711+
return (Hashing.MixHash(HashRound(seed, value, true)) & mask) + 1;
711712
}
712713

713714
[MethodImpl(MethodImplOptions.AggressiveInlining)]
714715
public uint HashCore(uint seed, uint mask, in ulong value)
715716
{
716717
if (value == 0)
717718
return 0;
718-
return (Hashing.MixHash(HashRound(seed, value), sizeof(uint)) & mask) + 1;
719+
return (Hashing.MixHash(HashRound(seed, value, false), sizeof(ulong)) & mask) + 1;
719720
}
720721

721722
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -726,17 +727,17 @@ public uint HashCore(uint seed, uint mask, in VBuffer<ulong> values)
726727
{
727728
if (value == 0)
728729
return 0;
729-
hash = HashRound(hash, value);
730+
hash = HashRound(hash, value, false);
730731
}
731-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
732+
return (Hashing.MixHash(hash, values.Length * sizeof(ulong)) & mask) + 1;
732733
}
733734

734735
[MethodImpl(MethodImplOptions.AggressiveInlining)]
735-
private uint HashRound(uint seed, ulong value)
736+
private uint HashRound(uint seed, ulong value, bool old)
736737
{
737738
var hash = Hashing.MurmurRound(seed, Utils.GetLo(value));
738739
var hi = Utils.GetHi(value);
739-
if (hi == 0)
740+
if (old && hi == 0)
740741
return hash;
741742
return Hashing.MurmurRound(hash, hi);
742743
}
@@ -758,7 +759,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<byte> values)
758759
var hash = seed;
759760
foreach (var value in values.DenseValues())
760761
hash = Hashing.MurmurRound(hash, value);
761-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
762+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
762763
}
763764
}
764765

@@ -778,7 +779,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<ushort> values)
778779
var hash = seed;
779780
foreach (var value in values.DenseValues())
780781
hash = Hashing.MurmurRound(hash, value);
781-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
782+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
782783
}
783784
}
784785

@@ -798,7 +799,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<uint> values)
798799
var hash = seed;
799800
foreach (var value in values.DenseValues())
800801
hash = Hashing.MurmurRound(hash, value);
801-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
802+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
802803
}
803804
}
804805

@@ -807,29 +808,31 @@ public uint HashCore(uint seed, uint mask, in VBuffer<uint> values)
807808
[MethodImpl(MethodImplOptions.AggressiveInlining)]
808809
public uint HashCoreOld(uint seed, uint mask, in ulong value)
809810
{
810-
return (Hashing.MixHash(HashRound(seed, value)) & mask) + 1;
811+
return (Hashing.MixHash(HashRound(seed, value, true)) & mask) + 1;
811812
}
812813

813814
[MethodImpl(MethodImplOptions.AggressiveInlining)]
814815
public uint HashCore(uint seed, uint mask, in ulong value)
815816
{
816-
return (Hashing.MixHash(HashRound(seed, value), sizeof(ulong)) & mask) + 1;
817+
return (Hashing.MixHash(HashRound(seed, value, false), sizeof(ulong)) & mask) + 1;
817818
}
818819

819820
[MethodImpl(MethodImplOptions.AggressiveInlining)]
820821
public uint HashCore(uint seed, uint mask, in VBuffer<ulong> values)
821822
{
822823
var hash = seed;
823824
foreach (var value in values.DenseValues())
824-
hash = HashRound(hash, value);
825-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
825+
hash = HashRound(hash, value, false);
826+
return (Hashing.MixHash(hash, values.Length * sizeof(ulong)) & mask) + 1;
826827
}
827828

828829
[MethodImpl(MethodImplOptions.AggressiveInlining)]
829-
private uint HashRound(uint seed, ulong value)
830+
private uint HashRound(uint seed, ulong value, bool old)
830831
{
831832
var hash = Hashing.MurmurRound(seed, Utils.GetLo(value));
832833
var hi = Utils.GetHi(value);
834+
if (old && hi == 0)
835+
return hash;
833836
return Hashing.MurmurRound(hash, hi);
834837
}
835838
}
@@ -839,32 +842,32 @@ private uint HashRound(uint seed, ulong value)
839842
[MethodImpl(MethodImplOptions.AggressiveInlining)]
840843
public uint HashCoreOld(uint seed, uint mask, in DataViewRowId value)
841844
{
842-
return (Hashing.MixHash(HashRound(seed, value)) & mask) + 1;
845+
return (Hashing.MixHash(HashRound(seed, value, true)) & mask) + 1;
843846
}
844847

845848
[MethodImpl(MethodImplOptions.AggressiveInlining)]
846849
public uint HashCore(uint seed, uint mask, in DataViewRowId value)
847850
{
848-
return (Hashing.MixHash(HashRound(seed, value), sizeof(uint)) & mask) + 1;
851+
return (Hashing.MixHash(HashRound(seed, value, false), 2 * sizeof(ulong)) & mask) + 1;
849852
}
850853

851854
[MethodImpl(MethodImplOptions.AggressiveInlining)]
852855
public uint HashCore(uint seed, uint mask, in VBuffer<DataViewRowId> values)
853856
{
854857
var hash = seed;
855858
foreach (var value in values.DenseValues())
856-
hash = HashRound(hash, value);
857-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
859+
hash = HashRound(hash, value, false);
860+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
858861
}
859862

860863
[MethodImpl(MethodImplOptions.AggressiveInlining)]
861-
private uint HashRound(uint seed, DataViewRowId value)
864+
private uint HashRound(uint seed, DataViewRowId value, bool old)
862865
{
863866
var hash = Hashing.MurmurRound(seed, Utils.GetLo(value.Low));
864867
var hi = Utils.GetHi(value.Low);
865-
if (hi != 0)
868+
if (old && hi != 0)
866869
hash = Hashing.MurmurRound(hash, hi);
867-
if (value.High != 0)
870+
if (old && value.High != 0)
868871
{
869872
hash = Hashing.MurmurRound(hash, Utils.GetLo(value.High));
870873
hi = Utils.GetHi(value.High);
@@ -891,7 +894,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<bool> values)
891894
var hash = seed;
892895
foreach (var value in values.DenseValues())
893896
hash = Hashing.MurmurRound(hash, value ? 1u : 0u);
894-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
897+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
895898
}
896899
}
897900

@@ -911,7 +914,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<sbyte> values)
911914
var hash = seed;
912915
foreach (var value in values.DenseValues())
913916
hash = Hashing.MurmurRound(hash, (uint)value);
914-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
917+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
915918
}
916919
}
917920

@@ -931,7 +934,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<short> values)
931934
var hash = seed;
932935
foreach (var value in values.DenseValues())
933936
hash = Hashing.MurmurRound(hash, (uint)value);
934-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
937+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
935938
}
936939
}
937940

@@ -951,7 +954,7 @@ public uint HashCore(uint seed, uint mask, in VBuffer<int> values)
951954
var hash = seed;
952955
foreach (var value in values.DenseValues())
953956
hash = Hashing.MurmurRound(hash, (uint)value);
954-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
957+
return (Hashing.MixHash(hash, values.Length * sizeof(uint)) & mask) + 1;
955958
}
956959
}
957960

@@ -960,29 +963,31 @@ public uint HashCore(uint seed, uint mask, in VBuffer<int> values)
960963
[MethodImpl(MethodImplOptions.AggressiveInlining)]
961964
public uint HashCoreOld(uint seed, uint mask, in long value)
962965
{
963-
return (Hashing.MixHash(HashRound(seed, value)) & mask) + 1;
966+
return (Hashing.MixHash(HashRound(seed, value, true)) & mask) + 1;
964967
}
965968

966969
[MethodImpl(MethodImplOptions.AggressiveInlining)]
967970
public uint HashCore(uint seed, uint mask, in long value)
968971
{
969-
return (Hashing.MixHash(HashRound(seed, value), sizeof(long)) & mask) + 1;
972+
return (Hashing.MixHash(HashRound(seed, value, false), sizeof(long)) & mask) + 1;
970973
}
971974

972975
[MethodImpl(MethodImplOptions.AggressiveInlining)]
973976
public uint HashCore(uint seed, uint mask, in VBuffer<long> values)
974977
{
975978
var hash = seed;
976979
foreach (var value in values.DenseValues())
977-
hash = HashRound(hash, value);
978-
return (Hashing.MixHash(hash, sizeof(uint)) & mask) + 1;
980+
hash = HashRound(hash, value, false);
981+
return (Hashing.MixHash(hash, values.Length * sizeof(long)) & mask) + 1;
979982
}
980983

981984
[MethodImpl(MethodImplOptions.AggressiveInlining)]
982-
private uint HashRound(uint seed, long value)
985+
private uint HashRound(uint seed, long value, bool old)
983986
{
984987
var hash = Hashing.MurmurRound(seed, Utils.GetLo((ulong)value));
985988
var hi = Utils.GetHi((ulong)value);
989+
if (old && hi == 0)
990+
return hash;
986991
return Hashing.MurmurRound(hash, hi);
987992
}
988993
}

test/BaselineOutput/Common/SavePipe/SavePipeHash-Data.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#@ col=VarComb:U4[128]:32-**
1919
#@ }
2020
SingleHash 31 27:Hash9 28:Hash10 29:Hash11 30:Hash12
21-
14 14 14 14 6 0 13 24 47 44 32 16 40 22 24 32 16 40 22 56 59 22 56 59 53 22 56 22 120 0 3 112 33 31 117 22 120 51 31 39 51 31 39 51 31 39 51 31 39
22-
0 0 1 4 0 13 0 32 16 20 52 31 44 24 24 52 31 44 56 59 56 56 59 56 22 56 22 24 123 3 0 112 50 41 36 117 123 75 127 51 47 109 108 51 47 109 108 51 47 109 108 51 47 109 108
23-
14 14 11 4 6 6 0 24 60 20 32 47 44 22 22 24 32 47 44 22 22 56 22 22 56 53 53 22 22 150 6 0 38 22 68 68 68 68
24-
74 3:10 6:6 9:10 12:20 15:22 18:20 21:22 24:22 27:53 31:6 36:35 38:47 43:51 45:22 50:66 52:96 57:66 59:96 64:66 66:96 71:66 73:96
21+
14 14 14 14 6 0 13 24 47 44 8 31 17 22 24 32 16 40 22 56 59 35 23 23 53 22 56 22 120 0 7 112 33 31 117 22 120 51 31 39 51 31 39 51 31 39 17 51 35
22+
0 0 1 4 0 13 0 32 16 20 49 51 54 24 24 52 31 44 56 59 56 23 23 23 22 56 22 24 123 3 7 112 50 41 36 117 123 75 127 51 47 109 108 51 47 109 108 51 47 109 108 17 91 57 49
23+
14 14 11 4 6 6 0 24 60 20 8 6 54 22 22 24 32 47 44 22 22 56 35 35 23 53 53 22 22 150 6 7 38 22 68 68 68 5
24+
74 3:10 6:6 9:10 12:40 15:22 18:20 21:22 24:35 27:53 31:3 36:35 38:47 43:51 45:22 50:66 52:96 57:66 59:96 64:66 66:96 71:2 73:55

test/Microsoft.ML.Tests/OnnxConversionTest.cs

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
using System.Collections.Generic;
77
using System.IO;
88
using System.Linq;
9-
using System.Runtime.InteropServices;
109
using System.Text.RegularExpressions;
1110
using Google.Protobuf;
1211
using Microsoft.ML.Data;
@@ -1200,29 +1199,32 @@ private class HashData
12001199
public uint Value { get; set; }
12011200
}
12021201

1203-
[Fact]
1204-
public void MurmurHashKeyTest()
1202+
[Theory]
1203+
[CombinatorialData]
1204+
public void MurmurHashKeyTest(
1205+
[CombinatorialValues(/*DataKind.Byte, DataKind.UInt16, */DataKind.UInt32/*, DataKind.UInt64*/)]DataKind keyType)
12051206
{
1206-
var mlContext = new MLContext();
1207+
var dataFile = DeleteOutputPath("KeysToOnnx.txt");
1208+
File.WriteAllLines(dataFile,
1209+
new[]
1210+
{
1211+
"2",
1212+
"5",
1213+
"19"
1214+
});
12071215

1208-
var samples = new[]
1216+
var data = ML.Data.LoadFromTextFile(dataFile, new[]
12091217
{
1210-
new HashData {Value = 232},
1211-
new HashData {Value = 42},
1212-
new HashData {Value = 0},
1213-
};
1214-
1215-
IDataView data = mlContext.Data.LoadFromEnumerable(samples);
1218+
new TextLoader.Column("Value", keyType, new[]
1219+
{
1220+
new TextLoader.Range(0)
1221+
}, new KeyCount(10))
1222+
});
12161223

1217-
var hashEstimator = mlContext.Transforms.Conversion.MapValueToKey("Value").Append(mlContext.Transforms.Conversion.Hash(new[]
1218-
{
1219-
new HashingEstimator.ColumnOptions(
1220-
"ValueHashed",
1221-
"Value")
1222-
}));
1224+
var hashEstimator = ML.Transforms.Conversion.Hash("ValueHashed", "Value");
12231225
var model = hashEstimator.Fit(data);
12241226
var transformedData = model.Transform(data);
1225-
var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data);
1227+
var onnxModel = ML.Model.ConvertToOnnxProtobuf(model, data);
12261228

12271229
var onnxFileName = "MurmurHashV2.onnx";
12281230
var onnxTextName = "MurmurHashV2.txt";
@@ -1236,7 +1238,7 @@ public void MurmurHashKeyTest()
12361238
// Evaluate the saved ONNX model using the data used to train the ML.NET pipeline.
12371239
string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray();
12381240
string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray();
1239-
var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath);
1241+
var onnxEstimator = ML.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath);
12401242
var onnxTransformer = onnxEstimator.Fit(data);
12411243
var onnxResult = onnxTransformer.Transform(data);
12421244
CompareSelectedColumns<uint>("ValueHashed", "ValueHashed", transformedData, onnxResult);

0 commit comments

Comments
 (0)