From 5cd4918679ba2c90e19b55e27f4b70e4a975b307 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 6 Aug 2018 14:42:52 -0700 Subject: [PATCH 1/8] Implemented all remaining active SSE intrinsics --- .../CpuMathUtils.netcoreapp.cs | 154 +++++ .../CpuMathUtils.netstandard.cs | 13 + src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 559 ++++++++++++++++++ .../CpuMathNativeUtils.cs | 32 + .../SsePerformanceTests.cs | 22 +- 5 files changed, 779 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 6c6c1fe6ad..a213c8d7a2 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -9,6 +9,139 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { + public const int CbAlign = 16; + + private static bool Compat(AlignedArray a) + { + Contracts.AssertValue(a); + Contracts.Assert(a.Size > 0); + return a.CbAlign == CbAlign; + } + + internal static unsafe float* Ptr(AlignedArray a, float* p) + { + Contracts.AssertValue(a); + float* q = p + a.GetBase((long)p); + Contracts.Assert(((long)q & (CbAlign - 1)) == 0); + return q; + } + + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) + { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + Contracts.Assert(mat.Size == dst.Size * src.Size); + + if (Sse.IsSupported) + { + if (!tran) + { + Contracts.Assert(0 <= crun && crun <= dst.Size); + SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size); + } + else + { + Contracts.Assert(0 <= crun && crun <= src.Size); + SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun); + } + } + else + { + // TODO: Software fallback + } + } + + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, + int posMin, int iposMin, int 
iposEnd, AlignedArray dst, int crun) + { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(srcValues)); + Contracts.Assert(Compat(dst)); + Contracts.AssertValue(rgposSrc); + Contracts.Assert(0 <= iposMin && iposMin <= iposEnd && iposEnd <= rgposSrc.Length); + Contracts.Assert(mat.Size == dst.Size * srcValues.Size); + + if (iposMin >= iposEnd) + { + if (!add) + dst.ZeroItems(); + return; + } + + Contracts.AssertNonEmpty(rgposSrc); + + if (Sse.IsSupported) + { + if (!tran) + { + Contracts.Assert(0 <= crun && crun <= dst.Size); + SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, crun, srcValues.Size); + } + else + { + Contracts.Assert(0 <= crun && crun <= srcValues.Size); + SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, dst.Size); + } + } + else + { + // TODO: Software fallback + } + } + + public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, + AlignedArray src, AlignedArray dst, int crow) + { + Contracts.AssertNonEmpty(starts); + Contracts.Assert(starts.Length == crow + 1); + Contracts.Assert(starts[0] == 0); + Contracts.AssertNonEmpty(indices); + Contracts.Assert(starts[crow] == indices.Length); + Contracts.AssertNonEmpty(coefs); + Contracts.Assert(indices.Length == coefs.Length); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + Contracts.Assert(0 < crow && crow <= dst.Size); + Contracts.Assert(crow * src.Size >= coefs.Length); + + if (Sse.IsSupported) + { + SseIntrinsics.MatMulRU(add, starts, indices, coefs, src, dst, crow); + } + else + { + // TODO: Software fallback + } + } + + public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, + int[] mprowrun, int[] runs, float[] coefs, + AlignedArray src, AlignedArray dst, int crow) + { + Contracts.AssertNonEmpty(mprowiv); + Contracts.Assert(mprowiv.Length == crow); + Contracts.AssertNonEmpty(mprowcol); + Contracts.Assert(mprowcol.Length == crow); + 
Contracts.Assert(mprowrun == null || mprowrun.Length == crow); + Contracts.AssertNonEmpty(runs); + Contracts.AssertNonEmpty(coefs); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + Contracts.Assert(0 < crow && crow <= dst.Size); + + if (mprowrun == null) + { + SseIntrinsics.MatMulCU(add, mprowiv, mprowcol, runs, coefs, + src, dst, crow); + } + else + { + SseIntrinsics.MatMulDU(add, mprowiv, mprowcol, mprowrun, runs, coefs, + src, dst, crow); + } + } + public static void Scale(float a, float[] dst, int count) { Contracts.AssertNonEmpty(dst); @@ -392,5 +525,26 @@ private static float L2DistSquared(Span a, Span b) return norm; } } + + public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) + { + Contracts.Assert(0 < ccol && ccol <= cfltRow); + + if (Sse.IsSupported) + { + if (ccol == cfltRow) + { + SseIntrinsics.ZeroItemsU(dst, dst.Size, indices, indices.Length); + } + else + { + SseIntrinsics.ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length); + } + } + else + { + // TODO: Software fallback + } + } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 501fc9082e..a71316ebb3 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -6,6 +6,17 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); + + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, + int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun); + + public static void MatTimesSrc(bool add, int[] starts, int[] 
indices, float[] coefs, + AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, starts, indices, coefs, src, dst, crow); + + public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun, int[] runs, float[] coefs, + AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, mprowiv, mprowcol, mprowrun, runs, coefs, src, dst, crow); + public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count); public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count); @@ -43,5 +54,7 @@ public static partial class CpuMathUtils public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) => SseUtils.DotProductSparse(a, offset, b, indices, count); public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count); + + public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices); } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index d11676f283..1124ee663a 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -71,6 +71,509 @@ private static Vector128 VectorSum(in Vector128 vector) } } + // Multiply matrix times vector into vector. 
+        internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        {
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pDstCurrent = pdst;
+                float* pMatCurrent = pmat;
+
+                while (pDstCurrent < pDstEnd)
+                {
+                    Vector128 res0 = Sse.SetZeroVector128();
+                    Vector128 res1 = res0;
+                    Vector128 res2 = res0;
+                    Vector128 res3 = res0;
+                    float* pSrcCurrent = psrc; // Rewind src for every group of four rows; without this only the first group is ever multiplied.
+
+                    while (pSrcCurrent < pSrcEnd)
+                    {
+                        float* pMatTemp = pMatCurrent; // Steps by ccol to read the same column offset in four consecutive rows.
+
+                        Vector128 x01 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128 x11 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128 x21 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128 x31 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128 x02 = Sse.LoadAlignedVector128(pSrcCurrent);
+
+                        res0 = Sse.Add(res0, Sse.Multiply(x01, x02));
+                        res1 = Sse.Add(res1, Sse.Multiply(x11, x02));
+                        res2 = Sse.Add(res2, Sse.Multiply(x21, x02));
+                        res3 = Sse.Add(res3, Sse.Multiply(x31, x02));
+
+                        pSrcCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    // Add up the entries of each, with the 4 results in res0
+                    res0 = Sse3.HorizontalAdd(res0, res1);
+                    res2 = Sse3.HorizontalAdd(res2, res3);
+                    res0 = Sse3.HorizontalAdd(res0, res2);
+
+                    if (add)
+                    {
+                        res0 = Sse.Add(res0, Sse.LoadAlignedVector128(pDstCurrent));
+                    }
+                    Sse.StoreAligned(pDstCurrent, res0);
+
+                    pDstCurrent += 4;
+                    pMatCurrent += 3 * ccol; // The inner loop walked pMatCurrent across the first row; skip the three rows read through pMatTemp.
+                }
+            }
+        }
+
+        // Partial sparse source vector.
+        internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+            int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
+        {
+            // REVIEW: For extremely sparse inputs, interchanging the loops would
+            // likely be more efficient.
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                int* pposMin = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+                float* pm0 = pmat - posMin; // Biased by -posMin so the raw position value read from rgposSrc indexes it directly.
+                float* pSrcCurrent = psrc - posMin; // Same bias as pm0; this pointer is only indexed, never advanced.
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd)
+                {
+                    float* pm1 = pm0 + ccol;
+                    float* pm2 = pm1 + ccol;
+                    float* pm3 = pm2 + ccol;
+                    Vector128 res = Sse.SetZeroVector128();
+
+                    int* ppos = pposMin;
+
+                    while (ppos < pposEnd)
+                    {
+                        int col = *ppos;
+                        Vector128 x1 = Sse.SetVector128(pm3[col], pm2[col], pm1[col], pm0[col]);
+                        Vector128 x2 = Sse.SetAllVector128(pSrcCurrent[col]);
+                        x2 = Sse.Multiply(x2, x1);
+                        res = Sse.Add(res, x2);
+
+                        ppos++;
+                    }
+
+                    if (add)
+                    {
+                        res = Sse.Add(res, Sse.LoadAlignedVector128(pDstCurrent));
+                    }
+                    Sse.StoreAligned(pDstCurrent, res);
+
+                    pDstCurrent += 4;
+                    pm0 += 4 * ccol; // Four rows (pm0..pm3) are consumed per output vector.
+                }
+            }
+        }
+
+        internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        {
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pMatCurrent = pmat;
+
+                if (!add)
+                {
+                    Vector128 x01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each slot of x01 into its own register.
+                    Vector128 x11 = Sse.Shuffle(x01, x01, 0x55);
+                    Vector128 x21 = Sse.Shuffle(x01, x01, 0xAA);
+                    Vector128 x31 = Sse.Shuffle(x01, x01, 0xFF);
+                    x01 = Sse.Shuffle(x01, x01, 0x00);
+
+                    pSrcCurrent += 4;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+                        Vector128 x02 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128 x12 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128 x22 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128 x32 = Sse.LoadAlignedVector128(pMatTemp += crow);
+
+                        x02 = Sse.Multiply(x01, x02);
+                        x12 = Sse.Multiply(x11, x12);
+                        x22 = Sse.Multiply(x21, x22);
+                        x32 = Sse.Multiply(x31, x32);
+
+                        x02 = Sse.Add(x02, x12);
+                        x22 = Sse.Add(x22, x32);
+                        x02 = Sse.Add(x02, x22);
+
+                        Sse.StoreAligned(pDstCurrent, x02); // First pass overwrites dst, so no prior zeroing is needed when add is false.
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    pMatCurrent += 3 * crow; // Skip the three extra rows that were read through pMatTemp.
+                }
+
+                while (pSrcCurrent < pSrcEnd)
+                {
+                    Vector128 x01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each slot of x01 into its own register.
+                    Vector128 x11 = Sse.Shuffle(x01, x01, 0x55);
+                    Vector128 x21 = Sse.Shuffle(x01, x01, 0xAA);
+                    Vector128 x31 = Sse.Shuffle(x01, x01, 0xFF);
+                    x01 = Sse.Shuffle(x01, x01, 0x00);
+
+                    pDstCurrent = pdst;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+
+                        Vector128 x02 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128 x12 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128 x22 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128 x32 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128 x3 = Sse.LoadAlignedVector128(pDstCurrent);
+
+                        x02 = Sse.Multiply(x01, x02);
+                        x12 = Sse.Multiply(x11, x12);
+                        x22 = Sse.Multiply(x21, x22);
+                        x32 = Sse.Multiply(x31, x32);
+
+                        x02 = Sse.Add(x02, x12);
+                        x22 = Sse.Add(x22, x32);
+                        x02 = Sse.Add(x02, x22);
+                        x3 = Sse.Add(x02, x3);
+
+                        Sse.StoreAligned(pDstCurrent, x3);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    pMatCurrent += 3 * crow;
+                    pSrcCurrent += 4;
+                }
+            }
+        }
+
+        // Partial sparse source vector.
+        internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+            int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
+        {
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                int* ppos = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+
+                if (!add)
+                {
+                    int col = *ppos - posMin;
+                    ppos++; // The first position is consumed here; it initialises dst instead of accumulating into it.
+
+                    Vector128 x0 = Sse.SetAllVector128(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector128 x1 = Sse.LoadAlignedVector128(pMatCurrent);
+                        x1 = Sse.Multiply(x1, x0);
+                        Sse.StoreAligned(pDstCurrent, x1);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+                }
+
+                // REVIEW: Should we explore unrolling the outer loop?
+                while (ppos < pposEnd)
+                {
+                    int col = *ppos - posMin;
+                    ppos++;
+
+                    Vector128 x0 = Sse.SetAllVector128(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector128 x1 = Sse.LoadAlignedVector128(pMatCurrent);
+                        Vector128 x2 = Sse.LoadAlignedVector128(pDstCurrent);
+                        x1 = Sse.Multiply(x1, x0);
+                        x2 = Sse.Add(x2, x1);
+                        Sse.StoreAligned(pDstCurrent, x2);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    // ppos was already advanced right after col was read; a second increment here skipped every other position.
+                }
+            }
+        }
+
+        // Sparse matrix.
+        internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, float[] coefs,
+            AlignedArray src, AlignedArray dst, int crow)
+        {
+            fixed (int* pstarts = &starts[0])
+            fixed (int* pindices = &indices[0])
+            fixed (float* pcoefs = &coefs[0])
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+
+                int* pii = pstarts + 1;
+                int* pIdxCurrent = pindices;
+                float* pMatCurrent = pcoefs;
+                float* pDstEnd = pdst + crow;
+
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd)
+                {
+                    int* pIdxEnd = pindices + *pii; // starts[row + 1] is the end offset of this row's entries in indices/coefs.
+                    pii++;
+
+                    Vector128 result = Sse.SetZeroVector128();
+
+                    while (pIdxCurrent + 4 <= pIdxEnd)
+                    {
+                        Vector128 x = Sse.Multiply(Load4(psrc, pIdxCurrent), Sse.LoadVector128(pMatCurrent));
+                        result = Sse.Add(result, x);
+
+                        pIdxCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    while (pIdxCurrent < pIdxEnd)
+                    {
+                        Vector128 x = Sse.MultiplyScalar(Load1(psrc, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent));
+                        result = Sse.AddScalar(result, x);
+
+                        pIdxCurrent++;
+                        pMatCurrent++;
+                    }
+
+                    result = VectorSum(in result);
+
+                    if (add)
+                    {
+                        result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent));
+                    }
+                    Sse.StoreScalar(pDstCurrent, result);
+
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        // Unpadded convolution.
+ internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol, + int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow) + { + + fixed (int* pmprowiv = &mprowiv[0]) + fixed (int* pmprowcol = &mprowcol[0]) + fixed (int* pruns = &runs[0]) + fixed (float* pcoefs = &coefs[0]) + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + { + float* psrc = CpuMathUtils.Ptr(src, pSrcStart); + float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + + int size = pruns[1]; + int* psupport = pruns + 2; + int* piv = pmprowiv; + int* pcol = pmprowcol; + int* pIdxEnd = psupport + size; + float* pDstEnd = pdst + crow; + + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pMatCurrent = pcoefs + *piv; + piv++; + float* pSrcCurrent = psrc + *pcol; + pcol++; + int* pIdxCurrent = psupport; + + Vector128 result = Sse.SetZeroVector128(); + + while (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent)); + result = Sse.Add(result, x); + + pIdxCurrent += 4; + pMatCurrent += 4; + } + + while (pIdxCurrent < pIdxEnd) + { + Vector128 x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent)); + result = Sse.AddScalar(result, x); + + pIdxCurrent++; + pMatCurrent++; + } + + result = VectorSum(result); + + // Add the bias. + result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatCurrent)); + + if (add) + { + result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent)); + } + Sse.StoreScalar(pDstCurrent, result); + + pDstCurrent++; + } + } + } + + // Padded convolution. 
+ internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun, + int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow) + { + fixed (int* pmprowiv = &mprowiv[0]) + fixed (int* pmprowcol = &mprowcol[0]) + fixed (int* pmprowrun = &mprowrun[0]) + fixed (int* pruns = &runs[0]) + fixed (float* pcoefs = &coefs[0]) + fixed (float* pSrcStart = &src.Items[0]) + fixed (float* pDstStart = &dst.Items[0]) + { + float* psrc = CpuMathUtils.Ptr(src, pSrcStart); + float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + + int* piv = pmprowiv; + int* pcol = pmprowcol; + float* pDstEnd = pdst + crow; + int kernelSize = pruns[1]; + + int* pirun = pmprowrun; + float* pDstCurrent = pdst; + + while (pDstCurrent < pDstEnd) + { + float* pMatCurrent = pcoefs + *piv; + piv++; + float* pMatBias = pMatCurrent + kernelSize; + float* pSrcCurrent = psrc + *pcol; + pcol++; + int irun = *pirun; + pirun++; + + int* pIdxCurrent = pruns + 2 + irun; + int* pIdxEnd = pIdxCurrent + pIdxCurrent[-1]; + + Vector128 result = Sse.SetZeroVector128(); + + if (irun == 0) + { + // No masking needed. + while (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent)); + result = Sse.Add(result, x); + + pIdxCurrent += 4; + pMatCurrent += 4; + } + + while (pIdxCurrent < pIdxEnd) + { + Vector128 x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent)); + result = Sse.AddScalar(result, x); + + pIdxCurrent++; + pMatCurrent++; + } + } + else + { + // Need masking. + pMatCurrent += pIdxCurrent[-2]; + // REVIEW NEEDED: Is it the correct translation from: "const float * pmask = reinterpret_cast(piLim);"? 
+ float* pmask = (float*)pIdxEnd; + + while (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.And(Sse.LoadVector128(pmask), Sse.LoadVector128(pMatCurrent))); + result = Sse.Add(result, x); + + pIdxCurrent += 4; + pMatCurrent += 4; + pmask += 4; + } + + while (pIdxCurrent < pIdxEnd) + { + Vector128 x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.And(Sse.SetScalarVector128(*pmask), Sse.SetScalarVector128(*pMatCurrent))); + result = Sse.AddScalar(result, x); + + pIdxCurrent++; + pMatCurrent++; + pmask++; + } + } + + result = VectorSum(result); + + // Add the bias. + result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatBias)); + + if (add) + { + result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent)); + } + Sse.StoreScalar(pDstCurrent, result); + + pDstCurrent++; + } + } + } + internal static unsafe void ScaleU(float scale, Span dst) { Vector128 scaleVector = Sse.SetAllVector128(scale); @@ -472,5 +975,61 @@ internal static unsafe float Dist2(Span src, Span dst) } } + internal static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices) + { + fixed (float* pDstStart = &dst.Items[0]) + fixed (int* pidx = &indices[0]) + { + float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + + // REVIEW NEEDED: This line expands to (void)(c); but is it necessary? + // DEBUG_ONLY(c); + + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(0 <= index && index < c); + pdst[index] = 0; + } + } + } + + internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices) + { + fixed (float* pDstStart = &dst.Items[0]) + fixed (int* pidx = &indices[0]) + { + float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + + // REVIEW NEEDED: This line expands to (void)(c); but is it necessary? 
+ // DEBUG_ONLY(c); + + int ivLogMin = 0; + int ivLogLim = ccol; + int ivPhyMin = 0; + + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(0 <= index && index < c); + + int col = index - ivLogMin; + if ((uint)col >= (uint)ccol) + { + Contracts.Assert(ivLogMin > index || index >= ivLogLim); + + int row = index / ccol; + ivLogMin = row * ccol; + ivLogLim = ivLogMin + ccol; + ivPhyMin = row * cfltRow; + + Contracts.Assert(ivLogMin <= index && index < ivLogLim); + col = index - ivLogMin; + } + + pdst[ivPhyMin + col] = 0; + } + } + } } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 90f362de3e..2528fbe0f4 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -9,6 +9,32 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { internal static class CpuMathNativeUtils { + [DllImport("CpuMathNative", EntryPoint = "MatMulA"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MatMulA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol); + + [DllImport("CpuMathNative", EntryPoint = "MatMulPA"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MatMulPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc, + int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow, int ccol); + + [DllImport("CpuMathNative", EntryPoint = "MatMulTranA"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MatMulTranA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol); + + [DllImport("CpuMathNative", EntryPoint = "MatMulTranPA"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float* pmat, 
/*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc, + int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow); + + [DllImport("CpuMathNative", EntryPoint = "MatMulRU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MatMulRU(bool add, /*_In_ const*/ int* pstarts, /*_In_ const*/ int* pindices, /*_In_ const*/ float* pcoefs, + /*_In_ const*/ float* ps, /*_Inout_*/ float* pdst, int crow); + + [DllImport("CpuMathNative", EntryPoint = "MatMulCU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MatMulCU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol, + /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow); + + [DllImport("CpuMathNative", EntryPoint = "MatMulDU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MatMulDU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol, /*_In_ const*/ int* pmprowrun, + /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow); + [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c); @@ -41,5 +67,11 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "ZeroItemsU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ZeroItemsU(/*_Inout_*/ float* pd, int c, /*_In_ const*/ int* pindices, int cindices); + + [DllImport("CpuMathNative", EntryPoint = "ZeroMatrixItemsCore"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ZeroMatrixItemsCore(/*_Inout_*/ float* pd, int c, int ccol, int 
cfltRow, /*_In_ const*/ int* pindices, int cindices); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 92752a0018..f560fcd048 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -19,8 +19,11 @@ public class SsePerformanceTests private const int EXP_RANGE = EXP_MAX / 2; private const int DEFAULT_SEED = 253421; private const float DEFAULT_SCALE = 1.11f; + private const int DEFAULT_CROW = 1000; + private const int DEFAULT_CCOL = 1000; + private const bool ADD = true; - private float[] src, dst, original, src1, src2; + private float[] src, dst, original, src1, src2, mat; private int[] idx; private int seed = DEFAULT_SEED; @@ -66,6 +69,7 @@ public void Setup() src2 = new float[LEN]; original = new float[LEN]; idx = new int[IDXLEN]; + mat = new float[DEFAULT_CROW * DEFAULT_CCOL]; seed = GetSeed(); Random rand = new Random(seed); @@ -83,6 +87,11 @@ public void Setup() { idx[i] = rand.Next(0, LEN); } + + for (int i = 0; i < mat.Length; i++) + { + mat[i] = NextFloat(rand, EXP_RANGE); + } } [GlobalCleanup] @@ -91,6 +100,17 @@ public void GlobalCleanup() original.CopyTo(dst, 0); } + [Benchmark] + public unsafe void NativeMatMulAPerf() + { + fixed (float* pmat = mat) + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + CpuMathNativeUtils.MatMulA(ADD, pmat, psrc, pdst, DEFAULT_CROW, DEFAULT_CCOL); + } + } + [Benchmark] public unsafe float NativeDotUPerf() { From 079dd485695c0ff57f617d1be0df8e43e8ddface Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Mon, 6 Aug 2018 16:38:16 -0700 Subject: [PATCH 2/8] Moved CpuMathUtils' architecture-dependent members into SseIntrinsics --- .../CpuMathUtils.netcoreapp.cs | 27 ------- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 81 ++++++++++++++----- 2 files changed, 61 insertions(+), 47 deletions(-) diff --git 
a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index a213c8d7a2..4bd9eefae7 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -9,28 +9,8 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { - public const int CbAlign = 16; - - private static bool Compat(AlignedArray a) - { - Contracts.AssertValue(a); - Contracts.Assert(a.Size > 0); - return a.CbAlign == CbAlign; - } - - internal static unsafe float* Ptr(AlignedArray a, float* p) - { - Contracts.AssertValue(a); - float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (CbAlign - 1)) == 0); - return q; - } - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); Contracts.Assert(mat.Size == dst.Size * src.Size); if (Sse.IsSupported) @@ -55,9 +35,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crun) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(srcValues)); - Contracts.Assert(Compat(dst)); Contracts.AssertValue(rgposSrc); Contracts.Assert(0 <= iposMin && iposMin <= iposEnd && iposEnd <= rgposSrc.Length); Contracts.Assert(mat.Size == dst.Size * srcValues.Size); @@ -100,8 +77,6 @@ public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] co Contracts.Assert(starts[crow] == indices.Length); Contracts.AssertNonEmpty(coefs); Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); Contracts.Assert(0 < crow && crow <= dst.Size); Contracts.Assert(crow * src.Size >= coefs.Length); @@ 
-126,8 +101,6 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, Contracts.Assert(mprowrun == null || mprowrun.Length == crow); Contracts.AssertNonEmpty(runs); Contracts.AssertNonEmpty(coefs); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); Contracts.Assert(0 < crow && crow <= dst.Size); if (mprowrun == null) diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 1124ee663a..1d61496c47 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -16,6 +16,23 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class SseIntrinsics { + private const int CbAlign = 16; + + private static bool Compat(AlignedArray a) + { + Contracts.AssertValue(a); + Contracts.Assert(a.Size > 0); + return a.CbAlign == CbAlign; + } + + private static unsafe float* Ptr(AlignedArray a, float* p) + { + Contracts.AssertValue(a); + float* q = p + a.GetBase((long)p); + Contracts.Assert(((long)q & (CbAlign - 1)) == 0); + return q; + } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static unsafe Vector128 Load1(float* src, int* idx) { @@ -74,13 +91,17 @@ private static Vector128 VectorSum(in Vector128 vector) // Multiply matrix times vector into vector. 
internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = CpuMathUtils.Ptr(src, pSrcStart); - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); - float* pmat = CpuMathUtils.Ptr(mat, pMatStart); + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -135,6 +156,10 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + // REVIEW: For extremely sparse inputs, interchanging the loops would // likely be more efficient. 
fixed (float* pSrcStart = &src.Items[0]) @@ -142,9 +167,9 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = CpuMathUtils.Ptr(src, pSrcStart); - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); - float* pmat = CpuMathUtils.Ptr(mat, pMatStart); + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); int* pposMin = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -187,13 +212,17 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) { - float* psrc = CpuMathUtils.Ptr(src, pSrcStart); - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); - float* pmat = CpuMathUtils.Ptr(mat, pMatStart); + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -285,14 +314,18 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) fixed (float* pMatStart = &mat.Items[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = CpuMathUtils.Ptr(src, pSrcStart); - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); - float* pmat = CpuMathUtils.Ptr(mat, 
pMatStart); + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); + float* pmat = Ptr(mat, pMatStart); int* ppos = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -349,14 +382,17 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, float[] coefs, AlignedArray src, AlignedArray dst, int crow) { + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + fixed (int* pstarts = &starts[0]) fixed (int* pindices = &indices[0]) fixed (float* pcoefs = &coefs[0]) fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) { - float* psrc = CpuMathUtils.Ptr(src, pSrcStart); - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); int* pii = pstarts + 1; int* pIdxCurrent = pindices; @@ -407,6 +443,8 @@ internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, floa internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol, int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow) { + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); fixed (int* pmprowiv = &mprowiv[0]) fixed (int* pmprowcol = &mprowcol[0]) @@ -415,8 +453,8 @@ internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol, fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) { - float* psrc = CpuMathUtils.Ptr(src, pSrcStart); - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); int size = pruns[1]; int* psupport = pruns + 2; @@ -475,6 +513,9 @@ internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol, internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun, int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow) { + 
Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + fixed (int* pmprowiv = &mprowiv[0]) fixed (int* pmprowcol = &mprowcol[0]) fixed (int* pmprowrun = &mprowrun[0]) @@ -483,8 +524,8 @@ internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, in fixed (float* pSrcStart = &src.Items[0]) fixed (float* pDstStart = &dst.Items[0]) { - float* psrc = CpuMathUtils.Ptr(src, pSrcStart); - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + float* psrc = Ptr(src, pSrcStart); + float* pdst = Ptr(dst, pDstStart); int* piv = pmprowiv; int* pcol = pmprowcol; @@ -980,7 +1021,7 @@ internal static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, i fixed (float* pDstStart = &dst.Items[0]) fixed (int* pidx = &indices[0]) { - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + float* pdst = Ptr(dst, pDstStart); // REVIEW NEEDED: This line expands to (void)(c); but is it necessary? // DEBUG_ONLY(c); @@ -999,7 +1040,7 @@ internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int cco fixed (float* pDstStart = &dst.Items[0]) fixed (int* pidx = &indices[0]) { - float* pdst = CpuMathUtils.Ptr(dst, pDstStart); + float* pdst = Ptr(dst, pDstStart); // REVIEW NEEDED: This line expands to (void)(c); but is it necessary? // DEBUG_ONLY(c); From 5e1854d83906144ee09334ddb1705ced197cf139 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Wed, 8 Aug 2018 14:45:10 -0700 Subject: [PATCH 3/8] Implemented all new referenced SSE intrinsics, with software fallbacks, passing unit tests, and performance tests Note: Performance tests for functions that involve AlignedArray are not implemented. 
--- .../CpuMathUtils.netcoreapp.cs | 458 +++++++++++-- .../CpuMathUtils.netstandard.cs | 32 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 618 ++++++++++++------ .../CpuMathNativeUtils.cs | 82 ++- .../SsePerformanceTests.cs | 211 ++++-- .../UnitTests.cs | 491 ++++++++++++-- 6 files changed, 1508 insertions(+), 384 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 4bd9eefae7..8adff83a2f 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -28,18 +28,59 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr } else { - // TODO: Software fallback + if (!tran) + { + Contracts.Assert(0 <= crun && crun <= dst.Size); + for (int i = 0; i < crun; i++) + { + float dotProduct = 0; + for (int j = 0; j < src.Size; j++) + { + dotProduct += mat[i * src.Size + j] * src[j]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + } + else + { + Contracts.Assert(0 <= crun && crun <= src.Size); + for (int i = 0; i < dst.Size; i++) + { + float dotProduct = 0; + for (int j = 0; j < crun; j++) + { + dotProduct += mat[j * src.Size + i] * src[j]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + } } } public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, - int posMin, int iposMin, int iposEnd, AlignedArray dst, int crun) + int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) { Contracts.AssertValue(rgposSrc); - Contracts.Assert(0 <= iposMin && iposMin <= iposEnd && iposEnd <= rgposSrc.Length); + Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); Contracts.Assert(mat.Size == dst.Size * srcValues.Size); - if (iposMin >= iposEnd) + if (iposMin >= iposLim) { if (!add) dst.ZeroItems(); @@ -53,65 +94,86 @@ public static void 
MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo if (!tran) { Contracts.Assert(0 <= crun && crun <= dst.Size); - SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, crun, srcValues.Size); + SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); } else { Contracts.Assert(0 <= crun && crun <= srcValues.Size); - SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, dst.Size); + SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); } } else { - // TODO: Software fallback + if (!tran) + { + Contracts.Assert(0 <= crun && crun <= dst.Size); + for (int i = 0; i < crun; i++) + { + float dotProduct = 0; + for (int j = iposMin; j < iposLim; j++) + { + int col = rgposSrc[j] - posMin; + dotProduct += mat[i * srcValues.Size + col] * srcValues[col]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + } + else + { + Contracts.Assert(0 <= crun && crun <= srcValues.Size); + for (int i = 0; i < dst.Size; i++) + { + float dotProduct = 0; + for (int j = iposMin; j < iposLim; j++) + { + int col = rgposSrc[j] - posMin; + dotProduct += mat[col * dst.Size + i] * srcValues[col]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + + } } } - public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) + public static void Add(float a, float[] dst, int count) { - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == crow + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(0 < crow && crow <= dst.Size); - Contracts.Assert(crow * src.Size >= coefs.Length); + 
Contracts.AssertNonEmpty(dst); + Contracts.Assert(0 < count); + Contracts.Assert(0 < count && count <= dst.Length); - if (Sse.IsSupported) - { - SseIntrinsics.MatMulRU(add, starts, indices, coefs, src, dst, crow); - } - else - { - // TODO: Software fallback - } + Add(a, new Span(dst, 0, count)); } - public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, - int[] mprowrun, int[] runs, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) + // dst += a + private static void Add(float a, Span dst) { - Contracts.AssertNonEmpty(mprowiv); - Contracts.Assert(mprowiv.Length == crow); - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowrun == null || mprowrun.Length == crow); - Contracts.AssertNonEmpty(runs); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(0 < crow && crow <= dst.Size); - - if (mprowrun == null) + if (Sse.IsSupported) { - SseIntrinsics.MatMulCU(add, mprowiv, mprowcol, runs, coefs, - src, dst, crow); + SseIntrinsics.AddScalarU(a, dst); } else { - SseIntrinsics.MatMulDU(add, mprowiv, mprowcol, mprowrun, runs, coefs, - src, dst, crow); + for (int i = 0; i < dst.Length; i++) + { + dst[i] += a; + } } } @@ -147,6 +209,57 @@ private static void Scale(float a, Span dst) } } + // dst = a * src + public static void Scale(float a, float[] src, float[] dst, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(0 < count && count <= src.Length); + Contracts.AssertNonEmpty(dst); + Contracts.Assert(count <= dst.Length); + + Scale(a, new Span(src, 0, count), new Span(dst, 0, count)); + } + + private static void Scale(float a, Span src, Span dst) + { + if (Sse.IsSupported) + { + SseIntrinsics.ScaleSrcU(a, src, dst); + } + else + { + for (int i = 0; i < dst.Length; i++) + { + dst[i] = a * src[i]; + } + } + } + + // dst[i] = a * (dst[i] + b) + public static void ScaleAdd(float a, float b, float[] dst, int count) + { + Contracts.AssertNonEmpty(dst); + Contracts.Assert(0 < count); 
+ Contracts.Assert(0 < count && count <= dst.Length); + + ScaleAdd(a, b, new Span(dst, 0, count)); + } + + private static void ScaleAdd(float a, float b, Span dst) + { + if (Sse.IsSupported) + { + SseIntrinsics.ScaleAddU(a, b, dst); + } + else + { + for (int i = 0; i < dst.Length; i++) + { + dst[i] = a * (dst[i] + b); + } + } + } + public static void AddScale(float a, float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); @@ -225,6 +338,33 @@ private static void AddScale(float a, Span src, Span indices, Span(src, 0, count), new Span(dst, 0, count), new Span(res, 0, count)); + } + + private static void AddScaleCopy(float a, Span src, Span dst, Span res) + { + if (Sse.IsSupported) + { + SseIntrinsics.AddScaleCopyU(a, src, dst, res); + } + else + { + for (int i = 0; i < res.Length; i++) + { + res[i] = a * src[i] + dst[i]; + } + } + } + public static void Add(float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); @@ -319,6 +459,40 @@ private static void MulElementWise(Span src1, Span src2, Span(src, 0, count)); + } + + public static float Sum(float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(0 < count); + Contracts.Assert(0 <= offset && offset <= src.Length - count); + + return Sum(new Span(src, offset, count)); + } + + private static float Sum(Span src) + { + if (Sse.IsSupported) + { + return SseIntrinsics.SumU(src); + } + else + { + float sum = 0; + for (int i = 0; i < src.Length; i++) + { + sum += src[i]; + } + return sum; + } + } + public static float SumSq(float[] src, int count) { Contracts.AssertNonEmpty(src); @@ -353,6 +527,39 @@ private static float SumSq(Span src) } } + public static float SumSq(float mean, float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(0 < count); + Contracts.Assert(0 <= offset && offset <= src.Length - count); + + return SumSq(mean, new Span(src, offset, count)); + } + + private static float SumSq(float mean, Span src) + { + 
if (Sse.IsSupported) + { + if (mean == 0) + { + return SseIntrinsics.SumSqU(src); + } + else + { + return SseIntrinsics.SumSqDiffU(mean, src); + } + } + else + { + float result = 0; + for (int i = 0; i < src.Length; i++) + { + result += (src[i] - mean) * (src[i] - mean); + } + return result; + } + } + public static float SumAbs(float[] src, int count) { Contracts.AssertNonEmpty(src); @@ -387,6 +594,106 @@ private static float SumAbs(Span src) } } + public static float SumAbs(float mean, float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(0 < count); + Contracts.Assert(0 <= offset && offset <= src.Length - count); + + return SumAbs(mean, new Span(src, offset, count)); + } + + private static float SumAbs(float mean, Span src) + { + if (Sse.IsSupported) + { + if (mean == 0) + { + return SseIntrinsics.SumAbsU(src); + } + else + { + return SseIntrinsics.SumAbsDiffU(mean, src); + } + } + else + { + float sum = 0; + for (int i = 0; i < src.Length; i++) + { + sum += Math.Abs(src[i] - mean); + } + return sum; + } + } + + public static float MaxAbs(float[] src, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(0 < count && count <= src.Length); + + return MaxAbs(new Span(src, 0, count)); + } + + public static float MaxAbs(float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(0 < count); + Contracts.Assert(0 <= offset && offset <= src.Length - count); + + return MaxAbs(new Span(src, offset, count)); + } + + private static float MaxAbs(Span src) + { + if (Sse.IsSupported) + { + return SseIntrinsics.MaxAbsU(src); + } + else + { + float max = 0; + for (int i = 0; i < src.Length; i++) + { + float abs = Math.Abs(src[i]); + if (abs > max) + { + max = abs; + } + } + return max; + } + } + + public static float MaxAbsDiff(float mean, float[] src, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(0 < count && count <= src.Length); + + return MaxAbsDiff(mean, new Span(src, 0, 
count)); + } + + private static float MaxAbsDiff(float mean, Span src) + { + if (Sse.IsSupported) + { + return SseIntrinsics.MaxAbsDiffU(mean, src); + } + else + { + float max = 0; + for (int i = 0; i < src.Length; i++) + { + float abs = Math.Abs(src[i] - mean); + if (abs > max) + { + max = abs; + } + } + return max; + } + } + public static float DotProductDense(float[] a, float[] b, int count) { Contracts.AssertNonEmpty(a); @@ -503,6 +810,8 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[ { Contracts.Assert(0 < ccol && ccol <= cfltRow); + // REVIEW NEEDED: Since the two methods below do not involve any SSE hardware intrinsics, no software fallback is needed. + // REVIEW NEEDED; Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files if (Sse.IsSupported) { if (ccol == cfltRow) @@ -514,9 +823,70 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[ SseIntrinsics.ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length); } } + } + + public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(length <= src.Length); + Contracts.AssertNonEmpty(v); + Contracts.Assert(length <= v.Length); + Contracts.AssertNonEmpty(w); + Contracts.Assert(length <= w.Length); + Contracts.Assert(length > 0); + + SdcaL1UpdateDense(primalUpdate, new Span(src, 0, length), threshold, new Span(v, 0, length), new Span(w, 0, length)); + } + + private static void SdcaL1UpdateDense(float primalUpdate, Span src, float threshold, Span v, Span w) + { + if (Sse.IsSupported) + { + SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); + } + else + { + for (int i = 0; i < src.Length; i++) + { + v[i] += src[i] * primalUpdate; + float value = v[i]; + w[i] = Math.Abs(value) > threshold ? (value > 0 ? 
value - threshold : value + threshold) : 0; + } + } + } + + // REVIEW NEEDED: The second argument "length" is unused even in the existing code. + public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(count <= src.Length); + Contracts.AssertNonEmpty(indices); + Contracts.Assert(count <= indices.Length); + Contracts.AssertNonEmpty(w); + Contracts.Assert(length <= w.Length); + Contracts.AssertNonEmpty(v); + Contracts.Assert(length <= v.Length); + Contracts.Assert(0 < count); + Contracts.Assert(count < length); + + SdcaL1UpdateSparse(primalUpdate, new Span(src, 0, count), new Span(indices, 0, count), threshold, new Span(v), new Span(w)); + } + + private static void SdcaL1UpdateSparse(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) + { + if (Sse.IsSupported) + { + SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); + } else { - // TODO: Software fallback + for (int i = 0; i < indices.Length; i++) + { + int index = indices[i]; + v[index] += src[i] * primalUpdate; + float value = v[index]; + w[index] = Math.Abs(value) > threshold ? (value > 0 ? 
value - threshold : value + threshold) : 0; + } } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index a71316ebb3..730fb10be7 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -11,16 +11,16 @@ public static partial class CpuMathUtils public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun); - public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, starts, indices, coefs, src, dst, crow); - - public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun, int[] runs, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, mprowiv, mprowcol, mprowrun, runs, coefs, src, dst, crow); + public static void Add(float a, float[] dst, int count) => SseUtils.Add(a, dst, count); public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count); public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count); + public static void Scale(float a, float[] src, float[] dst, int count) => SseUtils.Scale(a, src, dst, count); + + public static void ScaleAdd(float a, float b, float[] dst, int count) => SseUtils.ScaleAdd(a, b, dst, count); + public static void AddScale(float a, float[] src, float[] dst, int count) => SseUtils.AddScale(a, src, dst, count); public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, dst, dstOffset, count); @@ -29,6 +29,8 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, 
int[] mp public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, indices, dst, dstOffset, count); + public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, int count) => SseUtils.AddScaleCopy(a, src, dst, res, count); + public static void Add(float[] src, float[] dst, int count) => SseUtils.Add(src, dst, count); public static void Add(float[] src, int[] indices, float[] dst, int count) => SseUtils.Add(src, indices, dst, count); @@ -37,14 +39,28 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mp public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) => SseUtils.MulElementWise(src1, src2, dst, count); + public static float Sum(float[] src, int count) => SseUtils.Sum(src, count); + + public static float Sum(float[] src, int offset, int count) => SseUtils.Sum(src, offset, count); + public static float SumSq(float[] src, int count) => SseUtils.SumSq(src, count); public static float SumSq(float[] src, int offset, int count) => SseUtils.SumSq(src, offset, count); + public static float SumSq(float mean, float[] src, int offset, int count) => SseUtils.SumSq(mean, src, offset, count); + public static float SumAbs(float[] src, int count) => SseUtils.SumAbs(src, count); public static float SumAbs(float[] src, int offset, int count) => SseUtils.SumAbs(src, offset, count); + public static float SumAbs(float mean, float[] src, int offset, int count) => SseUtils.SumAbs(mean, src, offset, count); + + public static float MaxAbs(float[] src, int count) => SseUtils.MaxAbs(src, count); + + public static float MaxAbs(float[] src, int offset, int count) => SseUtils.MaxAbs(src, offset, count); + + public static float MaxAbsDiff(float mean, float[] src, int count) => SseUtils.MaxAbsDiff(mean, src, count); + public static float DotProductDense(float[] a, float[] b, int count) => SseUtils.DotProductDense(a, b, count); public 
static float DotProductDense(float[] a, int offset, float[] b, int count) => SseUtils.DotProductDense(a, offset, b, count); @@ -56,5 +72,11 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mp public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count); public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices); + + public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) + => SseUtils.SdcaL1UpdateDense(primalUpdate, length, src, threshold, v, w); + + public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w) + => SseUtils.SdcaL1UpdateSparse(primalUpdate, length, src, indices, count, threshold, v, w); } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 1d61496c47..a57382a4d7 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -4,8 +4,14 @@ // The exported function names need to be unique (can't be disambiguated based on signature), hence // we introduce suffix letters to indicate the general patterns used. +// * A suffix means aligned and padded for SSE operations. // * U suffix means unaligned and unpadded. // * S suffix means sparse (unaligned) vector. +// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector. +// * R suffix means sparse matrix. +// * C suffix means convolution matrix. +// * D suffix means convolution matrix, with implicit source padding. +// * Tran means the matrix is transposed. 
using System; using System.Runtime.CompilerServices; @@ -72,7 +78,7 @@ private static unsafe void Store4(Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorSum(in Vector128 vector) + private static Vector128 VectorSum(Vector128 vector) { if (Sse3.IsSupported) { @@ -355,7 +361,6 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos while (ppos < pposEnd) { int col = *ppos - posMin; - ppos++; Vector128 x0 = Sse.SetAllVector128(psrc[col]); float* pDstCurrent = pdst; @@ -378,267 +383,126 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos } } - // Sparse matrix. - internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) + // dst[i] += scale + internal static unsafe void AddScalarU(float scale, Span dst) { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* pSrcStart = &src.Items[0]) - fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pdst = dst) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - - int* pii = pstarts + 1; - int* pIdxCurrent = pindices; - float* pMatCurrent = pcoefs; - float* pDstEnd = pdst + crow; - + float* pDstEnd = pdst + dst.Length; float* pDstCurrent = pdst; - while (pDstCurrent < pDstEnd) - { - int* pIdxEnd = pindices + *pii; - pii++; - - Vector128 result = Sse.SetZeroVector128(); - - while (pIdxCurrent + 4 <= pIdxEnd) - { - Vector128 x = Sse.Multiply(Load4(psrc, pIdxCurrent), Sse.LoadVector128(pMatCurrent)); - result = Sse.Add(result, x); - - pIdxCurrent += 4; - pMatCurrent += 4; - } - - while (pIdxCurrent < pIdxEnd) - { - Vector128 x = Sse.MultiplyScalar(Load1(psrc, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent)); - result = Sse.AddScalar(result, x); 
+ Vector128 x1 = Sse.SetAllVector128(scale); - pIdxCurrent++; - pMatCurrent++; - } + while (pDstCurrent + 4 <= pDstEnd) + { + Vector128 x2 = Sse.LoadVector128(pDstCurrent); + x2 = Sse.Add(x2, x1); + Sse.Store(pDstCurrent, x2); - result = VectorSum(in result); + pDstCurrent += 4; + } - if (add) - { - result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent)); - } - Sse.StoreScalar(pDstCurrent, result); + while (pDstCurrent < pDstEnd) + { + Vector128 x2 = Sse.LoadScalarVector128(pDstCurrent); + x2 = Sse.AddScalar(x2, x1); + Sse.StoreScalar(pDstCurrent, x2); pDstCurrent++; } } } - // Unpadded convolution. - internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol, - int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow) + internal static unsafe void ScaleU(float scale, Span dst) { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (int* pmprowiv = &mprowiv[0]) - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* pSrcStart = &src.Items[0]) - fixed (float* pDstStart = &dst.Items[0]) + fixed (float* pdst = dst) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - - int size = pruns[1]; - int* psupport = pruns + 2; - int* piv = pmprowiv; - int* pcol = pmprowcol; - int* pIdxEnd = psupport + size; - float* pDstEnd = pdst + crow; - float* pDstCurrent = pdst; + float* pEnd = pdst + dst.Length; - while (pDstCurrent < pDstEnd) + while (pDstCurrent + 4 <= pEnd) { - float* pMatCurrent = pcoefs + *piv; - piv++; - float* pSrcCurrent = psrc + *pcol; - pcol++; - int* pIdxCurrent = psupport; - - Vector128 result = Sse.SetZeroVector128(); - - while (pIdxCurrent + 4 <= pIdxEnd) - { - Vector128 x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent)); - result = Sse.Add(result, x); - - pIdxCurrent += 4; - pMatCurrent += 4; - } - - while 
(pIdxCurrent < pIdxEnd) - { - Vector128 x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent)); - result = Sse.AddScalar(result, x); + Vector128 dstVector = Sse.LoadVector128(pDstCurrent); - pIdxCurrent++; - pMatCurrent++; - } + dstVector = Sse.Multiply(scaleVector, dstVector); + Sse.Store(pDstCurrent, dstVector); - result = VectorSum(result); + pDstCurrent += 4; + } - // Add the bias. - result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatCurrent)); + while (pDstCurrent < pEnd) + { + Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); - if (add) - { - result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent)); - } - Sse.StoreScalar(pDstCurrent, result); + dstVector = Sse.MultiplyScalar(scaleVector, dstVector); + Sse.StoreScalar(pDstCurrent, dstVector); pDstCurrent++; } } } - // Padded convolution. - internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun, - int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow) + internal static unsafe void ScaleSrcU(float scale, Span src, Span dst) { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); + Vector128 scaleVector = Sse.SetAllVector128(scale); - fixed (int* pmprowiv = &mprowiv[0]) - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pmprowrun = &mprowrun[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* pSrcStart = &src.Items[0]) - fixed (float* pDstStart = &dst.Items[0]) + fixed (float* psrc = src) + fixed (float* pdst = dst) { - float* psrc = Ptr(src, pSrcStart); - float* pdst = Ptr(dst, pDstStart); - - int* piv = pmprowiv; - int* pcol = pmprowcol; - float* pDstEnd = pdst + crow; - int kernelSize = pruns[1]; - - int* pirun = pmprowrun; + float* pDstEnd = pdst + dst.Length; + float* pSrcCurrent = psrc; float* pDstCurrent = pdst; - while (pDstCurrent < pDstEnd) + while (pDstCurrent + 4 <= pDstEnd) { - float* pMatCurrent = pcoefs + *piv; - piv++; - 
float* pMatBias = pMatCurrent + kernelSize; - float* pSrcCurrent = psrc + *pcol; - pcol++; - int irun = *pirun; - pirun++; - - int* pIdxCurrent = pruns + 2 + irun; - int* pIdxEnd = pIdxCurrent + pIdxCurrent[-1]; - - Vector128 result = Sse.SetZeroVector128(); - - if (irun == 0) - { - // No masking needed. - while (pIdxCurrent + 4 <= pIdxEnd) - { - Vector128 x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent)); - result = Sse.Add(result, x); - - pIdxCurrent += 4; - pMatCurrent += 4; - } - - while (pIdxCurrent < pIdxEnd) - { - Vector128 x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent)); - result = Sse.AddScalar(result, x); - - pIdxCurrent++; - pMatCurrent++; - } - } - else - { - // Need masking. - pMatCurrent += pIdxCurrent[-2]; - // REVIEW NEEDED: Is it the correct translation from: "const float * pmask = reinterpret_cast(piLim);"? - float* pmask = (float*)pIdxEnd; - - while (pIdxCurrent + 4 <= pIdxEnd) - { - Vector128 x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.And(Sse.LoadVector128(pmask), Sse.LoadVector128(pMatCurrent))); - result = Sse.Add(result, x); - - pIdxCurrent += 4; - pMatCurrent += 4; - pmask += 4; - } - - while (pIdxCurrent < pIdxEnd) - { - Vector128 x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.And(Sse.SetScalarVector128(*pmask), Sse.SetScalarVector128(*pMatCurrent))); - result = Sse.AddScalar(result, x); - - pIdxCurrent++; - pMatCurrent++; - pmask++; - } - } - - result = VectorSum(result); + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Multiply(srcVector, scaleVector); + Sse.Store(pDstCurrent, srcVector); - // Add the bias. 
- result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatBias)); + pSrcCurrent += 4; + pDstCurrent += 4; + } - if (add) - { - result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent)); - } - Sse.StoreScalar(pDstCurrent, result); + while (pDstCurrent < pDstEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.MultiplyScalar(srcVector, scaleVector); + Sse.StoreScalar(pDstCurrent, srcVector); + pSrcCurrent++; pDstCurrent++; } } } - internal static unsafe void ScaleU(float scale, Span dst) + // dst[i] = a * (dst[i] + b) + internal static unsafe void ScaleAddU(float a, float b, Span dst) { - Vector128 scaleVector = Sse.SetAllVector128(scale); + Vector128 x1 = Sse.SetAllVector128(a); + Vector128 x2 = Sse.SetAllVector128(b); fixed (float* pdst = dst) { + float* pDstEnd = pdst + dst.Length; float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; - while (pDstCurrent + 4 <= pEnd) + while (pDstCurrent + 4 <= pDstEnd) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); - - dstVector = Sse.Multiply(scaleVector, dstVector); + dstVector = Sse.Add(dstVector, x2); + dstVector = Sse.Multiply(dstVector, x1); Sse.Store(pDstCurrent, dstVector); pDstCurrent += 4; } - while (pDstCurrent < pEnd) + while (pDstCurrent < pDstEnd) { Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); - - dstVector = Sse.MultiplyScalar(scaleVector, dstVector); + dstVector = Sse.AddScalar(dstVector, x2); + dstVector = Sse.MultiplyScalar(dstVector, x1); Sse.StoreScalar(pDstCurrent, dstVector); pDstCurrent++; @@ -685,6 +549,47 @@ internal static unsafe void AddScaleU(float scale, Span src, Span } } + internal static unsafe void AddScaleCopyU(float scale, Span src, Span dst, Span result) + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (float* pres = result) + { + float* pResEnd = pres + result.Length; + float* pSrcCurrent = psrc; + float* pDstCurrent = pdst; + float* pResCurrent = pres; + + Vector128 x1 = 
Sse.SetAllVector128(scale); + + while (pResCurrent + 4 <= pResEnd) + { + Vector128 x2 = Sse.LoadVector128(pSrcCurrent); + Vector128 x3 = Sse.LoadVector128(pDstCurrent); + x2 = Sse.Multiply(x2, x1); + x3 = Sse.Add(x3, x2); + Sse.Store(pResCurrent, x3); + + pSrcCurrent += 4; + pDstCurrent += 4; + pResCurrent += 4; + } + + while (pResCurrent < pResEnd) + { + Vector128 x2 = Sse.LoadScalarVector128(pSrcCurrent); + Vector128 x3 = Sse.LoadScalarVector128(pDstCurrent); + x2 = Sse.MultiplyScalar(x2, x1); + x3 = Sse.AddScalar(x3, x2); + Sse.StoreScalar(pResCurrent, x3); + + pSrcCurrent++; + pDstCurrent++; + pResCurrent++; + } + } + } + internal static unsafe void AddScaleSU(float scale, Span src, Span idx, Span dst) { Vector128 scaleVector = Sse.SetAllVector128(scale); @@ -826,6 +731,33 @@ internal static unsafe void MulElementWiseU(Span src1, Span src2, } } + internal static unsafe float SumU(Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector128 result = Sse.SetZeroVector128(); + + while (pSrcCurrent + 4 <= pSrcEnd) /* '<=' so a final full group of 4 floats is summed by the vector path, not the scalar tail */ + { + result = Sse.Add(result, Sse.LoadVector128(pSrcCurrent)); + pSrcCurrent += 4; + } + + result = VectorSum(result); + + while (pSrcCurrent < pSrcEnd) + { + result = Sse.AddScalar(result, Sse.LoadScalarVector128(pSrcCurrent)); + pSrcCurrent++; + } + + return Sse.ConvertToSingle(result); + } + } + internal static unsafe float SumSqU(Span src) { Vector128 result = Sse.SetZeroVector128(); @@ -843,7 +775,7 @@ internal static unsafe float SumSqU(Span src) pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum(result); while (pSrcCurrent < pEnd) { @@ -857,6 +789,40 @@ internal static unsafe float SumSqU(Span src) return Sse.ConvertToSingle(result); } + internal static unsafe float SumSqDiffU(float mean, Span src) + { + fixed (float* psrc = src) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + + Vector128 result = Sse.SetZeroVector128(); + Vector128 
meanVector = Sse.SetAllVector128(mean); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 x = Sse.LoadVector128(pSrcCurrent); + x = Sse.Subtract(x, meanVector); + result = Sse.Add(result, Sse.Multiply(x, x)); + + pSrcCurrent += 4; + } + + result = VectorSum(result); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 x = Sse.LoadScalarVector128(pSrcCurrent); + x = Sse.SubtractScalar(x, meanVector); + result = Sse.AddScalar(result, Sse.MultiplyScalar(x, x)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(result); + } + } + internal static unsafe float SumAbsU(Span src) { Vector128 result = Sse.SetZeroVector128(); @@ -884,13 +850,148 @@ internal static unsafe float SumAbsU(Span src) pSrcCurrent += 4; } - result = VectorSum(in result); + result = VectorSum(result); while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + + pSrcCurrent++; + } + } + + return Sse.ConvertToSingle(result); + } + + internal static unsafe float SumAbsDiffU(float mean, Span src) + { + Vector128 result = Sse.SetZeroVector128(); + Vector128 meanVector = Sse.SetAllVector128(mean); + Vector128 mask; + + if (Sse2.IsSupported) + { + mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); + } + else + { + mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + } + + fixed (float* psrc = src) + { + float* pSrcCurrent = psrc; + float* pEnd = psrc + src.Length; + + while (pSrcCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector); result = Sse.Add(result, Sse.And(srcVector, mask)); + pSrcCurrent += 4; + } + + result = VectorSum(result); + + while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector); + result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + + pSrcCurrent++; + } + } + + return 
Sse.ConvertToSingle(result); + } + + internal static unsafe float MaxAbsU(Span src) + { + Vector128 result = Sse.SetZeroVector128(); + Vector128 mask; + + if (Sse2.IsSupported) + { + mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); + } + else + { + mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + } + + fixed (float* psrc = src) + { + float* pSrcCurrent = psrc; + float* pEnd = psrc + src.Length; + + while (pSrcCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result = Sse.Max(result, Sse.And(srcVector, mask)); + + pSrcCurrent += 4; + } + + Vector128 x1 = Sse.Shuffle(result, result, 0xB1); + result = Sse.Max(result, x1); + x1 = Sse.Shuffle(result, result, 0x02); + result = Sse.MaxScalar(result, x1); + + while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result = Sse.MaxScalar(result, Sse.And(srcVector, mask)); + + pSrcCurrent++; + } + } + + return Sse.ConvertToSingle(result); + } + + internal static unsafe float MaxAbsDiffU(float mean, Span src) + { + Vector128 result = Sse.SetZeroVector128(); + Vector128 meanVector = Sse.SetAllVector128(mean); + Vector128 mask; + + if (Sse2.IsSupported) + { + mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); + } + else + { + mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + } + + fixed (float* psrc = src) + { + float* pSrcCurrent = psrc; + float* pEnd = psrc + src.Length; + + while (pSrcCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector); + result = Sse.Max(result, Sse.And(srcVector, mask)); + + pSrcCurrent += 4; + } + + Vector128 x1 = Sse.Shuffle(result, result, 0xB1); + result = Sse.Max(result, x1); + x1 = Sse.Shuffle(result, result, 0x02); + result = Sse.MaxScalar(result, x1); + + while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = 
Sse.SubtractScalar(srcVector, meanVector); + result = Sse.MaxScalar(result, Sse.And(srcVector, mask)); + pSrcCurrent++; } } @@ -920,7 +1021,7 @@ internal static unsafe float DotU(Span src, Span dst) pDstCurrent += 4; } - result = VectorSum(in result); + result = VectorSum(result); while (pSrcCurrent < pEnd) { @@ -961,7 +1062,7 @@ internal static unsafe float DotSU(Span src, Span dst, Span i pDstCurrent += 4; } - result = VectorSum(in result); + result = VectorSum(result); while (pIdxCurrent < pEnd) { @@ -1000,7 +1101,7 @@ internal static unsafe float Dist2(Span src, Span dst) pDstCurrent += 4; } - sqDistanceVector = VectorSum(in sqDistanceVector); + sqDistanceVector = VectorSum(sqDistanceVector); float norm = Sse.ConvertToSingle(sqDistanceVector); while (pSrcCurrent < pEnd) @@ -1072,5 +1173,102 @@ internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int cco } } } + + internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) + { + fixed (float* psrc = src) + fixed (float* pdst1 = v) + fixed (float* pdst2 = w) + { + float* pSrcEnd = psrc + src.Length; + float* pSrcCurrent = psrc; + float* pDst1Current = pdst1; + float* pDst2Current = pdst2; + + Vector128 xPrimal = Sse.SetAllVector128(primalUpdate); + + Vector128 signMask = Sse.SetAllVector128(-0.0f); // 1000 0000 ... + Vector128 xThreshold = Sse.SetAllVector128(threshold); + + while (pSrcCurrent + 4 <= pSrcEnd) + { + Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); + Vector128 xDst1 = Sse.LoadVector128(pDst1Current); + xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); + Sse.Store(pDst1Current, xDst1); + + Vector128 xSign = Sse.And(xDst1, signMask); // result = 10000... 
if xDst1 is negative or 00000 otherwise + Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); + Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // all 1's if true + Vector128 x2 = Sse.Xor(xSign, xThreshold); // -threshold if xDst1 is negative and +threshold otherwise + Vector128 xDst2 = Sse.And(Sse.Subtract(xDst1, x2), xCond); + Sse.Store(pDst2Current, xDst2); + + pSrcCurrent += 4; + pDst1Current += 4; + pDst2Current += 4; + } + + while (pSrcCurrent < pSrcEnd) + { + *pDst1Current += (*pSrcCurrent) * primalUpdate; + float dst1 = *pDst1Current; + *pDst2Current = Math.Abs(dst1) > threshold ? (dst1 > 0 ? dst1 - threshold : dst1 + threshold) : 0; + + pSrcCurrent++; + pDst1Current++; + pDst2Current++; + } + } + } + + internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) + { + fixed (float* psrc = src) + fixed (int* pidx = indices) + fixed (float* pdst1 = v) + fixed (float* pdst2 = w) + { + int* pIdxEnd = pidx + indices.Length; + float* pSrcCurrent = psrc; + int* pIdxCurrent = pidx; + + Vector128 xPrimal = Sse.SetAllVector128(primalUpdate); + + Vector128 signMask = Sse.SetAllVector128(-0.0f); // 1000 0000 ... + Vector128 xThreshold = Sse.SetAllVector128(threshold); + + while (pIdxCurrent + 4 <= pIdxEnd) + { + Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); + + Vector128 xDst1 = Load4(pdst1, pIdxCurrent); + xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); + + Vector128 xSign = Sse.And(xDst1, signMask); // result = 10000... 
if xDst1 is negative or 00000 otherwise + Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); + Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // all 1's if true + Vector128 x2 = Sse.Xor(xSign, xThreshold); // -threshold if xDst1 is negative and +threshold otherwise + Vector128 xDst2 = Sse.And(Sse.Subtract(xDst1, x2), xCond); + + Store4(xDst1, pdst1, pIdxCurrent); + Store4(xDst2, pdst2, pIdxCurrent); + + pIdxCurrent += 4; + pSrcCurrent += 4; + } + + while (pIdxCurrent < pIdxEnd) + { + int index = *pIdxCurrent; + pdst1[index] += (*pSrcCurrent) * primalUpdate; + float dst1 = pdst1[index]; + pdst2[index] = Math.Abs(dst1) > threshold ? (dst1 > 0 ? dst1 - threshold : dst1 + threshold) : 0; + + pIdxCurrent++; + pSrcCurrent++; + } + } + } } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 2528fbe0f4..92227abe78 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -2,6 +2,17 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +// The exported function names need to be unique (can't be disambiguated based on signature), hence +// we introduce suffix letters to indicate the general patterns used. +// * A suffix means aligned and padded for SSE operations. +// * U suffix means unaligned and unpadded. +// * S suffix means sparse (unaligned) vector. +// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector. +// * R suffix means sparse matrix. +// * C suffix means convolution matrix. +// * D suffix means convolution matrix, with implicit source padding. +// * Tran means the matrix is transposed. 
+ using System.Runtime.InteropServices; using System.Security; @@ -23,26 +34,26 @@ internal static extern unsafe void MatMulPA(bool add, /*_In_ const*/ float* pmat internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc, int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow); - [DllImport("CpuMathNative", EntryPoint = "MatMulRU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MatMulRU(bool add, /*_In_ const*/ int* pstarts, /*_In_ const*/ int* pindices, /*_In_ const*/ float* pcoefs, - /*_In_ const*/ float* ps, /*_Inout_*/ float* pdst, int crow); + [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "MatMulCU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MatMulCU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol, - /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow); + [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "MatMulDU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MatMulDU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol, /*_In_ const*/ int* pmprowrun, - /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow); + [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity] - 
internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c); + [DllImport("CpuMathNative", EntryPoint = "ScaleAddU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ScaleAddU(float a, float b, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c); + [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float SumSqU(/*const*/ float* ps, int c); + [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "AddScaleCopyU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void AddScaleCopyU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ float* pd, /*_Inout_*/ float* pr, int c); [DllImport("CpuMathNative", EntryPoint = "AddU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void AddU(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); @@ -50,28 +61,49 @@ internal static extern unsafe void MatMulDU(bool add, /*_In_ const*/ int* pmprow [DllImport("CpuMathNative", EntryPoint = "AddSU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void AddSU(/*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); + 
[DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "SumU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumU(/*const*/ float* ps, int c); - [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumSqU(/*const*/ float* ps, int c); - [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c); + [DllImport("CpuMathNative", EntryPoint = "SumSqDiffU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumSqDiffU(float mean, /*const*/ float* ps, int c); [DllImport("CpuMathNative", EntryPoint = "SumAbsU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float SumAbsU(/*const*/ float* ps, int c); - [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "SumAbsDiffU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumAbsDiffU(float mean, /*const*/ float* ps, int c); + + [DllImport("CpuMathNative", EntryPoint = "MaxAbsU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float 
MaxAbsU(/*const*/ float* ps, int c); + + [DllImport("CpuMathNative", EntryPoint = "MaxAbsDiffU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float MaxAbsDiffU(float mean, /*const*/ float* ps, int c); + + [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c); + + [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c); + + [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c); [DllImport("CpuMathNative", EntryPoint = "ZeroItemsU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void ZeroItemsU(/*_Inout_*/ float* pd, int c, /*_In_ const*/ int* pindices, int cindices); [DllImport("CpuMathNative", EntryPoint = "ZeroMatrixItemsCore"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void ZeroMatrixItemsCore(/*_Inout_*/ float* pd, int c, int ccol, int cfltRow, /*_In_ const*/ int* pindices, int cindices); + + [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void SdcaL1UpdateU(float primalUpdate, /*_In_ const*/ float* ps, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c); + + [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index f560fcd048..42dec27378 100644 --- 
a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -19,11 +19,14 @@ public class SsePerformanceTests private const int EXP_RANGE = EXP_MAX / 2; private const int DEFAULT_SEED = 253421; private const float DEFAULT_SCALE = 1.11f; - private const int DEFAULT_CROW = 1000; - private const int DEFAULT_CCOL = 1000; + private const int DEFAULT_CROW = 500; + private const int DEFAULT_CCOL = 2000; private const bool ADD = true; - private float[] src, dst, original, src1, src2, mat; + // Naming follows from SseIntrinsics. + private const int CbAlign = 16; + + private float[] src, dst, original, src1, src2, result; private int[] idx; private int seed = DEFAULT_SEED; @@ -68,8 +71,8 @@ public void Setup() src1 = new float[LEN]; src2 = new float[LEN]; original = new float[LEN]; + result = new float[LEN]; idx = new int[IDXLEN]; - mat = new float[DEFAULT_CROW * DEFAULT_CCOL]; seed = GetSeed(); Random rand = new Random(seed); @@ -79,6 +82,7 @@ public void Setup() src[i] = NextFloat(rand, EXP_RANGE); dst[i] = NextFloat(rand, EXP_RANGE); original[i] = dst[i]; + result[i] = dst[i]; src1[i] = NextFloat(rand, EXP_RANGE); src2[i] = NextFloat(rand, EXP_RANGE); } @@ -87,68 +91,104 @@ public void Setup() { idx[i] = rand.Next(0, LEN); } - - for (int i = 0; i < mat.Length; i++) - { - mat[i] = NextFloat(rand, EXP_RANGE); - } } [GlobalCleanup] public void GlobalCleanup() { original.CopyTo(dst, 0); + original.CopyTo(result, 0); + } + + [Benchmark] + public unsafe float NativeAddScalarUPerf() + { + fixed (float* pdst = dst) + { + return CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN); + } } [Benchmark] - public unsafe void NativeMatMulAPerf() + public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN); + + [Benchmark] + public unsafe void NativeScaleUPerf() + { + fixed (float* pdst = dst) + { + CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN); + } + } + + [Benchmark] + 
public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + + [Benchmark] + public unsafe void NativeScaleSrcUPerf() { - fixed (float* pmat = mat) fixed (float* psrc = src) fixed (float* pdst = dst) { - CpuMathNativeUtils.MatMulA(ADD, pmat, psrc, pdst, DEFAULT_CROW, DEFAULT_CCOL); + CpuMathNativeUtils.ScaleSrcU(DEFAULT_SCALE, psrc, pdst, LEN); } } [Benchmark] - public unsafe float NativeDotUPerf() + public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN); + + [Benchmark] + public unsafe void NativeScaleAddUPerf() + { + fixed (float* pdst = dst) + { + CpuMathNativeUtils.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, pdst, LEN); + } + } + + [Benchmark] + public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN); + + [Benchmark] + public unsafe void NativeAddScaleUPerf() { fixed (float* psrc = src) fixed (float* pdst = dst) { - return CpuMathNativeUtils.DotU(psrc, pdst, LEN); + CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN); } } [Benchmark] - public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); + public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); [Benchmark] - public unsafe float NativeDotSUPerf() + public unsafe void NativeAddScaleSUPerf() { fixed (float* psrc = src) fixed (float* pdst = dst) fixed (int* pidx = idx) { - return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN); + CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN); } } [Benchmark] - public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); + public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); [Benchmark] - public unsafe float NativeSumSqUPerf() + public unsafe void NativeAddScaleCopyUPerf() { fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (float* pres = result) { - return CpuMathNativeUtils.SumSqU(psrc, LEN); + 
CpuMathNativeUtils.AddScaleCopyU(DEFAULT_SCALE, psrc, pdst, pres, LEN); } } [Benchmark] - public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN); [Benchmark] public unsafe void NativeAddUPerf() @@ -177,44 +217,132 @@ public unsafe void NativeAddSUPerf() [Benchmark] public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN); + [Benchmark] - public unsafe void NativeAddScaleUPerf() + public unsafe void NativeMulElementWiseUPerf() { - fixed (float* psrc = src) + fixed (float* psrc1 = src1) + fixed (float* psrc2 = src2) fixed (float* pdst = dst) { - CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN); + CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN); } } [Benchmark] - public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); [Benchmark] - public unsafe void NativeAddScaleSUPerf() + public unsafe float NativeSumUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN); + + [Benchmark] + public unsafe float NativeSumSqUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumSqU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + + [Benchmark] + public unsafe float NativeSumSqDiffUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumSqDiffU(DEFAULT_SCALE, psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN); + + [Benchmark] + public unsafe float NativeSumAbsUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumAbsU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumAbsUPerf() => 
CpuMathUtils.SumAbs(src, LEN); + + [Benchmark] + public unsafe float NativeSumAbsDiffUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumAbsDiffU(DEFAULT_SCALE, psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN); + + [Benchmark] + public unsafe float NativeMaxAbsUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.MaxAbsU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN); + + [Benchmark] + public unsafe float NativeMaxAbsDiffUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.MaxAbsDiffU(DEFAULT_SCALE, psrc, LEN); + } + } + + [Benchmark] + public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN); + // TODO: MaxAbsU!!! + + [Benchmark] + public unsafe float NativeDotUPerf() { fixed (float* psrc = src) fixed (float* pdst = dst) - fixed (int* pidx = idx) { - CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN); + return CpuMathNativeUtils.DotU(psrc, pdst, LEN); } } [Benchmark] - public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); + public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); [Benchmark] - public unsafe void NativeScaleUPerf() + public unsafe float NativeDotSUPerf() { + fixed (float* psrc = src) fixed (float* pdst = dst) + fixed (int* pidx = idx) { - CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN); + return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN); } } [Benchmark] - public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); [Benchmark] public unsafe float NativeDist2Perf() @@ -230,29 +358,32 @@ public unsafe float NativeDist2Perf() public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN); [Benchmark] - 
public unsafe float NativeSumAbsUPerf() + public unsafe void NativeSdcaL1UpdateUPerf() { fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (float* pres = result) { - return CpuMathNativeUtils.SumAbsU(psrc, LEN); + CpuMathNativeUtils.SdcaL1UpdateU(DEFAULT_SCALE, psrc, DEFAULT_SCALE, pdst, pres, LEN); } } [Benchmark] - public float ManagedSumAbsqUPerf() => CpuMathUtils.SumAbs(src, LEN); + public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result); [Benchmark] - public unsafe void NativeMulElementWiseUPerf() + public unsafe void NativeSdcaL1UpdateSUPerf() { - fixed (float* psrc1 = src1) - fixed (float* psrc2 = src2) + fixed (float* psrc = src) fixed (float* pdst = dst) + fixed (float* pres = result) + fixed (int* pidx = idx) { - CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN); + CpuMathNativeUtils.SdcaL1UpdateSU(DEFAULT_SCALE, psrc, pidx, DEFAULT_SCALE, pdst, pres, IDXLEN); } } [Benchmark] - public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); + public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result); } } diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs index 6fc2596ef7..6d6a68bd32 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs @@ -13,7 +13,10 @@ public class CpuMathUtilsUnitTests { private readonly float[][] testArrays; private readonly int[] testIndexArray; + private readonly AlignedArray[] testMatrices; + private readonly AlignedArray[] testSrcVectors; private const float DEFAULT_SCALE = 1.7f; + private const int SseCbAlign = 16; private FloatEqualityComparer comparer; public CpuMathUtilsUnitTests() @@ -25,75 +28,211 @@ public CpuMathUtilsUnitTests() testArrays = new 
float[][] { testArray1, testArray2 }; testIndexArray = new int[4] { 0, 2, 5, 6 }; comparer = new FloatEqualityComparer(); + + // Padded matrices whose dimensions are multiples of 4 + float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testMatrix2 = new float[4 * 8]; + + for (int i = 0; i < testMatrix2.Length; i++) + { + testMatrix2[i] = i + 1; + } + + AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, SseCbAlign); + AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, SseCbAlign); + testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); + testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); + + testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; + + // Padded source vectors whose dimensions are multiples of 4 + float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f }; + float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; + + AlignedArray testSrcVectorAligned1 = new AlignedArray(4, SseCbAlign); + AlignedArray testSrcVectorAligned2 = new AlignedArray(8, SseCbAlign); + testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length); + testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); + + testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; } [Theory] - [InlineData(0, 13306.0376f)] - [InlineData(1, 13291.9235f)] - public void DotUTest(int test, float expected) + [InlineData(0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] + [InlineData(1, new float[] { 204f, 492f, 780f, 1068f })] + public void MatMulATest(int test, float[] expected) { - float[] src = (float[]) testArrays[test].Clone(); - float[] dst = (float[]) src.Clone(); - - for (int i = 0; i < dst.Length; i++) + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[test]; + AlignedArray dst = new AlignedArray(4, 
SseCbAlign); + + CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] + [InlineData(1, new float[] { 204f, 493f, 782f, 1071f })] + public void MatMulAAddTest(int test, float[] expected) + { + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[test]; + AlignedArray dst = new AlignedArray(4, SseCbAlign); + + for (int i = 0; i < dst.Size; i++) { - dst[i] += 1; + dst[i] = i; } - var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); - Assert.Equal(expected, actual, 2); + CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 736.7352f)] - [InlineData(1, 736.7352f)] - public void DotSUTest(int test, float expected) + [InlineData(0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] + [InlineData(1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] + public void MatMulTranATest(int test, float[] expected) { - float[] src = (float[])testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[0]; + AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign); + + CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] + [InlineData(1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] + public void MatMulTranAAddTest(int test, float[] expected) + { + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[0]; + AlignedArray dst = new 
AlignedArray(4 + 4 * test, SseCbAlign); + + for (int i = 0; i < dst.Size; i++) + { + dst[i] = i; + } + + CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] + [InlineData(1, new float[] { 95f, 231f, 367f, 503f })] + public void MatMulPATest(int test, float[] expected) + { + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[test]; + AlignedArray dst = new AlignedArray(4, SseCbAlign); int[] idx = testIndexArray; - // Ensures src and dst are different arrays - for (int i = 0; i < dst.Length; i++) + CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * test, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] + [InlineData(1, new float[] { 95f, 232f, 369f, 506f })] + public void MatMulPAAddTest(int test, float[] expected) + { + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[test]; + AlignedArray dst = new AlignedArray(4, SseCbAlign); + int[] idx = testIndexArray; + + for (int i = 0; i < dst.Size; i++) { - dst[i] += 1; + dst[i] = i; } - var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); - Assert.Equal(expected, actual, 4); + CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * test, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 13399.9376f)] - [InlineData(1, 13389.1135f)] - public void SumSqUTest(int test, float expected) + [InlineData(0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] + [InlineData(1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] + public void 
MatMulTranPATest(int test, float[] expected) { - float[] src = (float[])testArrays[test].Clone(); - var actual = CpuMathUtils.SumSq(src, src.Length); - Assert.Equal(expected, actual, 2); + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[0]; + AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign); + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] + [InlineData(1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] + public void MatMulTranPAAddTest(int test, float[] expected) + { + AlignedArray mat = testMatrices[test]; + AlignedArray src = testSrcVectors[0]; + AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign); + int[] idx = testIndexArray; + + for (int i = 0; i < dst.Size; i++) + { + dst[i] = i; + } + + CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); } [Theory] [InlineData(0)] [InlineData(1)] - public void AddUTest(int test) + public void AddScalarUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - float[] expected = (float[])src.Clone(); + float[] dst = (float[])testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); - // Ensures src and dst are different arrays - for (int i = 0; i < dst.Length; i++) + for (int i = 0; i < expected.Length; i++) { - dst[i] += 1; + expected[i] += DEFAULT_SCALE; } + CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void ScaleUTest(int test) + { + float[] dst = 
(float[])testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); + for (int i = 0; i < expected.Length; i++) { - expected[i] = 2 * expected[i] + 1; + expected[i] *= DEFAULT_SCALE; } - CpuMathUtils.Add(src, dst, dst.Length); + CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); var actual = dst; Assert.Equal(expected, actual, comparer); } @@ -101,19 +240,36 @@ public void AddUTest(int test) [Theory] [InlineData(0)] [InlineData(1)] - public void AddSUTest(int test) + public void ScaleSrcUTest(int test) { float[] src = (float[])testArrays[test].Clone(); float[] dst = (float[])src.Clone(); - int[] idx = testIndexArray; float[] expected = (float[])dst.Clone(); - expected[0] = 3.92f; - expected[2] = -12.14f; - expected[5] = -36.69f; - expected[6] = 46.29f; + for (int i = 0; i < expected.Length; i++) + { + expected[i] *= DEFAULT_SCALE; + } - CpuMathUtils.Add(src, idx, dst, idx.Length); + CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void ScaleAddUTest(int test) + { + float[] dst = (float[])testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE); + } + + CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length); var actual = dst; Assert.Equal(expected, actual, comparer); } @@ -160,28 +316,31 @@ public void AddScaleSUTest(int test) [Theory] [InlineData(0)] [InlineData(1)] - public void ScaleUTest(int test) + public void AddScaleCopyUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); - float[] expectedOutput = (float[])dst.Clone(); + float[] src = (float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + float[] result = (float[])dst.Clone(); + float[] expected = (float[])dst.Clone(); - for (int i = 0; i < expectedOutput.Length; i++) + for (int i = 0; i < 
expected.Length; i++) { - expectedOutput[i] *= DEFAULT_SCALE; + expected[i] *= (1 + DEFAULT_SCALE); } - CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); - var managedOutput = dst; - Assert.Equal(expectedOutput, managedOutput, comparer); + CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length); + var actual = result; + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 8.0f)] - [InlineData(1, 7.0f)] - public void Dist2Test(int test, float expected) + [InlineData(0)] + [InlineData(1)] + public void AddUTest(int test) { float[] src = (float[])testArrays[test].Clone(); float[] dst = (float[])src.Clone(); + float[] expected = (float[])src.Clone(); // Ensures src and dst are different arrays for (int i = 0; i < dst.Length; i++) @@ -189,18 +348,34 @@ public void Dist2Test(int test, float expected) dst[i] += 1; } - var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length); - Assert.Equal(expected, actual, 0); + for (int i = 0; i < expected.Length; i++) + { + expected[i] = 2 * expected[i] + 1; + } + + CpuMathUtils.Add(src, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 196.98f)] - [InlineData(1, 193.69f)] - public void SumAbsUTest(int test, float expected) + [InlineData(0)] + [InlineData(1)] + public void AddSUTest(int test) { float[] src = (float[])testArrays[test].Clone(); - var actual = CpuMathUtils.SumAbs(src, src.Length); - Assert.Equal(expected, actual, 2); + float[] dst = (float[])src.Clone(); + int[] idx = testIndexArray; + float[] expected = (float[])dst.Clone(); + + expected[0] = 3.92f; + expected[2] = -12.14f; + expected[5] = -36.69f; + expected[6] = 46.29f; + + CpuMathUtils.Add(src, idx, dst, idx.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); } [Theory] @@ -229,6 +404,202 @@ public void MulElementWiseUTest(int test) var actual = dst; Assert.Equal(expected, actual, comparer); } + + [Theory] + [InlineData(0, -93.9f)] + [InlineData(1, 
-97.19f)] + public void SumUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.Sum(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13399.9376f)] + [InlineData(1, 13389.1135f)] + public void SumSqUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumSq(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13742.3176f)] + [InlineData(1, 13739.7895f)] + public void SumSqDiffUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 196.98f)] + [InlineData(1, 193.69f)] + public void SumAbsUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumAbs(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 196.98f)] + [InlineData(1, 195.39f)] + public void SumAbsDiffUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 106.37f)] + [InlineData(1, 106.37f)] + public void MaxAbsUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.MaxAbs(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 108.07f)] + [InlineData(1, 108.07f)] + public void MaxAbsDiffUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13306.0376f)] + [InlineData(1, 13291.9235f)] + public void DotUTest(int test, 
float expected) + { + float[] src = (float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 736.7352f)] + [InlineData(1, 736.7352f)] + public void DotSUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + int[] idx = testIndexArray; + + // Ensures src and dst are different arrays + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); + Assert.Equal(expected, actual, 4); + } + + [Theory] + [InlineData(0, 8.0f)] + [InlineData(1, 7.0f)] + public void Dist2Test(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + + // Ensures src and dst are different arrays + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length); + Assert.Equal(expected, actual, 0); + } + + [Theory] + [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })] + [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + public void ZeroItemsUTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); + src.CopyFrom(testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })] + [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] + public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new 
AlignedArray(4 + 4 * test, SseCbAlign); + src.CopyFrom(testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void SdcaL1UpdateUTest(int test) + { + float[] src = (float[])testArrays[test].Clone(); + float[] v = (float[])src.Clone(); + float[] w = (float[])src.Clone(); + float[] expected = (float[])w.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + float value = src[i] * (1 + DEFAULT_SCALE); + expected[i] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; + } + + CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w); + var actual = w; + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void SdcaL1UpdateSUTest(int test) + { + float[] src = (float[])testArrays[test].Clone(); + float[] v = (float[])src.Clone(); + float[] w = (float[])src.Clone(); + int[] idx = testIndexArray; + float[] expected = (float[])w.Clone(); + + for (int i = 0; i < idx.Length; i++) + { + int index = idx[i]; + float value = v[index] + src[i] * DEFAULT_SCALE; + expected[index] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? 
value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; + } + + CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w); + var actual = w; + Assert.Equal(expected, actual, comparer); + } } internal class FloatEqualityComparer : IEqualityComparer From 81d0c29f86eda1ededcd090f6bf37170a5997a40 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Wed, 8 Aug 2018 14:52:59 -0700 Subject: [PATCH 4/8] Minor clean-up before submitting PR --- .../CpuMathUtils.netcoreapp.cs | 2 +- .../CpuMathNativeUtils.cs | 20 ------------------- .../UnitTests.cs | 3 ++- 3 files changed, 3 insertions(+), 22 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 8adff83a2f..fbcbb2c192 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -811,7 +811,7 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[ Contracts.Assert(0 < ccol && ccol <= cfltRow); // REVIEW NEEDED: Since the two methods below do not involve any SSE hardware intrinsics, no software fallback is needed. 
- // REVIEW NEEDED; Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files + // REVIEW NEEDED: Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files if (Sse.IsSupported) { if (ccol == cfltRow) diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 92227abe78..8df3352556 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -20,20 +20,6 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { internal static class CpuMathNativeUtils { - [DllImport("CpuMathNative", EntryPoint = "MatMulA"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MatMulA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol); - - [DllImport("CpuMathNative", EntryPoint = "MatMulPA"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MatMulPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc, - int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow, int ccol); - - [DllImport("CpuMathNative", EntryPoint = "MatMulTranA"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MatMulTranA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol); - - [DllImport("CpuMathNative", EntryPoint = "MatMulTranPA"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc, - int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow); - [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] internal static 
extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c); @@ -94,12 +80,6 @@ internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float* [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c); - [DllImport("CpuMathNative", EntryPoint = "ZeroItemsU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ZeroItemsU(/*_Inout_*/ float* pd, int c, /*_In_ const*/ int* pindices, int cindices); - - [DllImport("CpuMathNative", EntryPoint = "ZeroMatrixItemsCore"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ZeroMatrixItemsCore(/*_Inout_*/ float* pd, int c, int ccol, int cfltRow, /*_In_ const*/ int* pindices, int cindices); - [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void SdcaL1UpdateU(float primalUpdate, /*_In_ const*/ float* ps, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c); diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs index 6d6a68bd32..1d4b668f55 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs @@ -30,7 +30,8 @@ public CpuMathUtilsUnitTests() comparer = new FloatEqualityComparer(); // Padded matrices whose dimensions are multiples of 4 - float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; float[] testMatrix2 = new float[4 * 8]; for (int i = 0; i < testMatrix2.Length; i++) From 
02bfbe6c0f2f8d12ef7ef37006557d4f0616a054 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Wed, 8 Aug 2018 17:30:54 -0700 Subject: [PATCH 5/8] Minor changes --- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index a57382a4d7..24a4bcbf3a 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -78,7 +78,7 @@ private static unsafe void Store4(Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 VectorSum(Vector128 vector) + private static Vector128 VectorSum(in Vector128 vector) { if (Sse3.IsSupported) { From be3281d58112b4eeb853efebd0542d081ddc0176 Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 9 Aug 2018 15:08:19 -0700 Subject: [PATCH 6/8] Respond to PR feedback, except for implementing new unit tests (coming soon) --- .../CpuMathUtils.netcoreapp.cs | 266 +++++++++++------- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 252 ++++++----------- .../SsePerformanceTests.cs | 3 - 3 files changed, 251 insertions(+), 270 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index fbcbb2c192..e17019ffa7 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -17,12 +17,14 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= dst.Size); SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size); } else { - Contracts.Assert(0 <= crun && crun <= src.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= src.Size); SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun); } } @@ -30,7 +32,8 @@ public static 
void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= dst.Size); for (int i = 0; i < crun; i++) { float dotProduct = 0; @@ -51,7 +54,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr } else { - Contracts.Assert(0 <= crun && crun <= src.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= src.Size); for (int i = 0; i < dst.Size; i++) { float dotProduct = 0; @@ -77,7 +81,9 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) { Contracts.AssertValue(rgposSrc); - Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); + Contracts.Assert(iposMin >= 0); + Contracts.Assert(iposMin <= iposLim); + Contracts.Assert(iposLim <= rgposSrc.Length); Contracts.Assert(mat.Size == dst.Size * srcValues.Size); if (iposMin >= iposLim) @@ -93,12 +99,14 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= dst.Size); SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); } else { - Contracts.Assert(0 <= crun && crun <= srcValues.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= srcValues.Size); SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); } } @@ -106,7 +114,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= dst.Size); for (int i = 0; i < crun; i++) { float dotProduct = 0; @@ -128,7 +137,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray 
mat, int[] rgpo } else { - Contracts.Assert(0 <= crun && crun <= srcValues.Size); + Contracts.Assert(crun >= 0); + Contracts.Assert(crun <= srcValues.Size); for (int i = 0; i < dst.Size; i++) { float dotProduct = 0; @@ -155,8 +165,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo public static void Add(float a, float[] dst, int count) { Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count); - Contracts.Assert(0 < count && count <= dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= dst.Length); Add(a, new Span(dst, 0, count)); } @@ -180,7 +190,8 @@ private static void Add(float a, Span dst) public static void Scale(float a, float[] dst, int count) { Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count && count <= dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= dst.Length); Scale(a, new Span(dst, 0, count)); } @@ -188,8 +199,9 @@ public static void Scale(float a, float[] dst, int count) public static void Scale(float a, float[] dst, int offset, int count) { Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset < dst.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset < (dst.Length - count)); Scale(a, new Span(dst, offset, count)); } @@ -213,8 +225,9 @@ private static void Scale(float a, Span dst) public static void Scale(float a, float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); Contracts.Assert(count <= dst.Length); Scale(a, new Span(src, 0, count), new Span(dst, 0, count)); @@ -239,8 +252,8 @@ private static void Scale(float a, Span src, Span dst) public static void ScaleAdd(float a, float b, float[] dst, int count) { Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count); - 
Contracts.Assert(0 < count && count <= dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= dst.Length); ScaleAdd(a, b, new Span(dst, 0, count)); } @@ -263,8 +276,9 @@ private static void ScaleAdd(float a, float b, Span dst) public static void AddScale(float a, float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); Contracts.Assert(count <= dst.Length); AddScale(a, new Span(src, 0, count), new Span(dst, 0, count)); @@ -273,10 +287,12 @@ public static void AddScale(float a, float[] src, float[] dst, int count) public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(count <= src.Length); Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length); - Contracts.Assert(0 < count && count <= dst.Length - dstOffset); + Contracts.Assert(dstOffset >= 0); + Contracts.Assert(dstOffset < dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= (dst.Length - dstOffset)); AddScale(a, new Span(src, 0, count), new Span(dst, dstOffset, count)); } @@ -299,10 +315,11 @@ private static void AddScale(float a, Span src, Span dst) public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); Contracts.Assert(count < dst.Length); AddScale(a, new Span(src), new Span(indices, 0, count), new Span(dst)); @@ -311,12 +328,14 @@ public static void AddScale(float a, float[] src, int[] indices, float[] 
dst, in public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length); - Contracts.Assert(count < dst.Length - dstOffset); + Contracts.Assert(dstOffset >= 0); + Contracts.Assert(dstOffset < dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); + Contracts.Assert(count < (dst.Length - dstOffset)); AddScale(a, new Span(src), new Span(indices, 0, count), new Span(dst, dstOffset, dst.Length - dstOffset)); @@ -340,11 +359,12 @@ private static void AddScale(float a, Span src, Span indices, Span 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= dst.Length); Contracts.Assert(count <= res.Length); AddScaleCopy(a, new Span(src, 0, count), new Span(dst, 0, count), new Span(res, 0, count)); @@ -368,8 +388,9 @@ private static void AddScaleCopy(float a, Span src, Span dst, Span public static void Add(float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); Contracts.Assert(count <= dst.Length); Add(new Span(src, 0, count), new Span(dst, 0, count)); @@ -393,10 +414,11 @@ private static void Add(Span src, Span dst) public static void Add(float[] src, int[] indices, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); Contracts.Assert(count < 
dst.Length); Add(new Span(src), new Span(indices, 0, count), new Span(dst)); @@ -405,12 +427,14 @@ public static void Add(float[] src, int[] indices, float[] dst, int count) public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length); - Contracts.Assert(count <= dst.Length - dstOffset); + Contracts.Assert(dstOffset >= 0); + Contracts.Assert(dstOffset < dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); + Contracts.Assert(count <= (dst.Length - dstOffset)); Add(new Span(src), new Span(indices, 0, count), new Span(dst, dstOffset, dst.Length - dstOffset)); @@ -435,10 +459,11 @@ private static void Add(Span src, Span indices, Span dst) public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) { Contracts.AssertNonEmpty(src1); - Contracts.Assert(0 < count && count <= src1.Length); Contracts.AssertNonEmpty(src2); - Contracts.Assert(0 < count && count <= src2.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src1.Length); + Contracts.Assert(count <= src2.Length); MulElementWise(new Span(src1, 0, count), new Span(src2, 0, count), new Span(dst, 0, count)); @@ -462,7 +487,8 @@ private static void MulElementWise(Span src1, Span src2, Span 0); + Contracts.Assert(count <= src.Length); return Sum(new Span(src, 0, count)); } @@ -470,8 +496,9 @@ public static float Sum(float[] src, int count) public static float Sum(float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + 
Contracts.Assert(offset <= (src.Length - count)); return Sum(new Span(src, offset, count)); } @@ -496,7 +523,8 @@ private static float Sum(Span src) public static float SumSq(float[] src, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); return SumSq(new Span(src, 0, count)); } @@ -504,8 +532,9 @@ public static float SumSq(float[] src, int count) public static float SumSq(float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); return SumSq(new Span(src, offset, count)); } @@ -530,8 +559,9 @@ private static float SumSq(Span src) public static float SumSq(float mean, float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); return SumSq(mean, new Span(src, offset, count)); } @@ -540,14 +570,7 @@ private static float SumSq(float mean, Span src) { if (Sse.IsSupported) { - if (mean == 0) - { - return SseIntrinsics.SumSqU(src); - } - else - { - return SseIntrinsics.SumSqDiffU(mean, src); - } + return (mean == 0) ? 
SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src); } else { @@ -563,7 +586,8 @@ private static float SumSq(float mean, Span src) public static float SumAbs(float[] src, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); return SumAbs(new Span(src, 0, count)); } @@ -571,8 +595,9 @@ public static float SumAbs(float[] src, int count) public static float SumAbs(float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); return SumAbs(new Span(src, offset, count)); } @@ -597,8 +622,9 @@ private static float SumAbs(Span src) public static float SumAbs(float mean, float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); return SumAbs(mean, new Span(src, offset, count)); } @@ -607,14 +633,7 @@ private static float SumAbs(float mean, Span src) { if (Sse.IsSupported) { - if (mean == 0) - { - return SseIntrinsics.SumAbsU(src); - } - else - { - return SseIntrinsics.SumAbsDiffU(mean, src); - } + return (mean == 0) ? 
SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src); } else { @@ -630,7 +649,8 @@ private static float SumAbs(float mean, Span src) public static float MaxAbs(float[] src, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); return MaxAbs(new Span(src, 0, count)); } @@ -638,8 +658,9 @@ public static float MaxAbs(float[] src, int count) public static float MaxAbs(float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); return MaxAbs(new Span(src, offset, count)); } @@ -668,7 +689,8 @@ private static float MaxAbs(Span src) public static float MaxAbsDiff(float mean, float[] src, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); return MaxAbsDiff(mean, new Span(src, 0, count)); } @@ -698,7 +720,7 @@ public static float DotProductDense(float[] a, float[] b, int count) { Contracts.AssertNonEmpty(a); Contracts.AssertNonEmpty(b); - Contracts.Assert(0 < count); + Contracts.Assert(count > 0); Contracts.Assert(a.Length >= count); Contracts.Assert(b.Length >= count); @@ -708,10 +730,11 @@ public static float DotProductDense(float[] a, float[] b, int count) public static float DotProductDense(float[] a, int offset, float[] b, int count) { Contracts.AssertNonEmpty(a); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= a.Length - count); Contracts.AssertNonEmpty(b); - Contracts.Assert(b.Length >= count); + Contracts.Assert(count > 0); + Contracts.Assert(count <= b.Length); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (a.Length - count)); return DotProductDense(new Span(a, 
offset, count), new Span(b, 0, count)); } @@ -737,7 +760,8 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co { Contracts.AssertNonEmpty(a); Contracts.AssertNonEmpty(b); - Contracts.Assert(0 < count); + Contracts.AssertNonEmpty(indices); + Contracts.Assert(count > 0); Contracts.Assert(count < a.Length); Contracts.Assert(count <= b.Length); Contracts.Assert(count <= indices.Length); @@ -749,12 +773,14 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) { Contracts.AssertNonEmpty(a); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset < a.Length); - Contracts.Assert(a.Length - offset > count); Contracts.AssertNonEmpty(b); + Contracts.AssertNonEmpty(indices); + Contracts.Assert(count > 0); + Contracts.Assert(count < (a.Length - offset)); Contracts.Assert(count <= b.Length); Contracts.Assert(count <= indices.Length); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset < a.Length); return DotProductSparse(new Span(a, offset, a.Length - offset), new Span(b), new Span(indices, 0, count)); @@ -782,7 +808,8 @@ public static float L2DistSquared(float[] a, float[] b, int count) { Contracts.AssertNonEmpty(a); Contracts.AssertNonEmpty(b); - Contracts.Assert(0 < count && count <= a.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= a.Length); Contracts.Assert(count <= b.Length); return L2DistSquared(new Span(a, 0, count), new Span(b, 0, count)); @@ -808,19 +835,62 @@ private static float L2DistSquared(Span a, Span b) public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) { - Contracts.Assert(0 < ccol && ccol <= cfltRow); + Contracts.Assert(ccol > 0); + Contracts.Assert(ccol <= cfltRow); - // REVIEW NEEDED: Since the two methods below do not involve any SSE hardware intrinsics, no software fallback is needed. 
- // REVIEW NEEDED: Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files - if (Sse.IsSupported) + if (ccol == cfltRow) { - if (ccol == cfltRow) + ZeroItemsU(dst, dst.Size, indices, indices.Length); + } + else + { + ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length); + } + } + + private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices) + { + fixed (float* pdst = &dst.Items[0]) + fixed (int* pidx = &indices[0]) + { + for (int i = 0; i < cindices; ++i) { - SseIntrinsics.ZeroItemsU(dst, dst.Size, indices, indices.Length); + int index = pidx[i]; + Contracts.Assert(0 <= index && index < c); + pdst[index] = 0; } - else + } + } + + private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices) + { + fixed (float* pdst = &dst.Items[0]) + fixed (int* pidx = &indices[0]) + { + int ivLogMin = 0; + int ivLogLim = ccol; + int ivPhyMin = 0; + + for (int i = 0; i < cindices; ++i) { - SseIntrinsics.ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length); + int index = pidx[i]; + Contracts.Assert(0 <= index && index < c); + + int col = index - ivLogMin; + if ((uint)col >= (uint)ccol) + { + Contracts.Assert(ivLogMin > index || index >= ivLogLim); + + int row = index / ccol; + ivLogMin = row * ccol; + ivLogLim = ivLogMin + ccol; + ivPhyMin = row * cfltRow; + + Contracts.Assert(ivLogMin <= index && index < ivLogLim); + col = index - ivLogMin; + } + + pdst[ivPhyMin + col] = 0; } } } @@ -828,12 +898,12 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) { Contracts.AssertNonEmpty(src); - Contracts.Assert(length <= src.Length); Contracts.AssertNonEmpty(v); - Contracts.Assert(length <= v.Length); Contracts.AssertNonEmpty(w); - 
Contracts.Assert(length <= w.Length); Contracts.Assert(length > 0); + Contracts.Assert(length <= src.Length); + Contracts.Assert(length <= v.Length); + Contracts.Assert(length <= w.Length); SdcaL1UpdateDense(primalUpdate, new Span(src, 0, length), threshold, new Span(v, 0, length), new Span(w, 0, length)); } @@ -859,15 +929,15 @@ private static void SdcaL1UpdateDense(float primalUpdate, Span src, float public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w) { Contracts.AssertNonEmpty(src); - Contracts.Assert(count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); - Contracts.AssertNonEmpty(w); - Contracts.Assert(length <= w.Length); Contracts.AssertNonEmpty(v); - Contracts.Assert(length <= v.Length); - Contracts.Assert(0 < count); + Contracts.AssertNonEmpty(w); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); Contracts.Assert(count < length); + Contracts.Assert(length <= v.Length); + Contracts.Assert(length <= w.Length); SdcaL1UpdateSparse(primalUpdate, new Span(src, 0, count), new Span(indices, 0, count), threshold, new Span(v), new Span(w)); } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 24a4bcbf3a..659e5d3e90 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -52,29 +52,22 @@ private static unsafe Vector128 Load4(float* src, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 Rotate(Vector128 x) + private static Vector128 Rotate(in Vector128 x) { // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA. 
return Sse.Shuffle(x, x, 0x39); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static Vector128 RotateReverse(Vector128 x) - { - // The control byte shuffles the four 32-bit floats of x: ABCD -> DABC. - return Sse.Shuffle(x, x, 0x93); - } - - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe void Store4(Vector128 x, float* dst, int* idx) + private static unsafe void Store4(in Vector128 x, float* dst, int* idx) { Sse.StoreScalar(dst + idx[0], x); - x = Rotate(x); - Sse.StoreScalar(dst + idx[1], x); - x = Rotate(x); - Sse.StoreScalar(dst + idx[2], x); - x = Rotate(x); - Sse.StoreScalar(dst + idx[3], x); + Vector128 rotated = Rotate(in x); + Sse.StoreScalar(dst + idx[1], rotated); + rotated = Rotate(in rotated); + Sse.StoreScalar(dst + idx[2], rotated); + rotated = Rotate(in rotated); + Sse.StoreScalar(dst + idx[3], rotated); } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] @@ -82,18 +75,44 @@ private static Vector128 VectorSum(in Vector128 vector) { if (Sse3.IsSupported) { - Vector128 tmp = Sse3.HorizontalAdd(vector, vector); - return Sse3.HorizontalAdd(tmp, tmp); + Vector128 partialSum = Sse3.HorizontalAdd(vector, vector); + return Sse3.HorizontalAdd(partialSum, partialSum); } else { - // SSE3 is not supported. - Vector128 tmp = Sse.Add(vector, Sse.MoveHighToLow(vector, vector)); - // The control byte shuffles the four 32-bit floats of tmp: ABCD -> BADC. - return Sse.Add(tmp, Sse.Shuffle(tmp, tmp, 0xb1)); + Vector128 partialSum = Sse.Add(vector, Sse.MoveHighToLow(vector, vector)); + // The control byte shuffles the four 32-bit floats of partialSum: ABCD -> BADC. 
+ return Sse.Add(partialSum, Sse.Shuffle(partialSum, partialSum, 0xB1)); } } + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 VectorMax(in Vector128 vector) + { + Vector128 x1 = Sse.Shuffle(vector, vector, 0xB1); + Vector128 partialMax = Sse.Max(vector, x1); + x1 = Sse.Shuffle(partialMax, partialMax, 0x02); + return Sse.MaxScalar(partialMax, x1); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 GetAbsMask() + { + return (Sse2.IsSupported) ? + Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : + Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + } + + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + private static Vector128 GetNewDst(in Vector128 xDst1, in Vector128 signMask, in Vector128 xThreshold) + { + Vector128 xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise + Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); + Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true + Vector128 x2 = Sse.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise + return Sse.And(Sse.Subtract(xDst1, x2), xCond); + } + // Multiply matrix times vector into vector. 
internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) { @@ -111,7 +130,6 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; - float* pSrcCurrent = psrc; float* pDstCurrent = pdst; float* pMatCurrent = pmat; @@ -122,6 +140,8 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src Vector128 res2 = res0; Vector128 res3 = res0; + float* pSrcCurrent = psrc; + while (pSrcCurrent < pSrcEnd) { float* pMatTemp = pMatCurrent; @@ -189,7 +209,7 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, float* pm1 = pm0 + ccol; float* pm2 = pm1 + ccol; float* pm3 = pm2 + ccol; - Vector128 res = Sse.SetZeroVector128(); + Vector128 result = Sse.SetZeroVector128(); int* ppos = pposMin; @@ -199,16 +219,16 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, Vector128 x1 = Sse.SetVector128(pm3[col], pm2[col], pm1[col], pm0[col]); Vector128 x2 = Sse.SetAllVector128(pSrcCurrent[col]); x2 = Sse.Multiply(x2, x1); - res = Sse.Add(res, x2); + result = Sse.Add(result, x2); ppos++; } if (add) { - res = Sse.Add(res, Sse.LoadAlignedVector128(pDstCurrent)); + result = Sse.Add(result, Sse.LoadAlignedVector128(pDstCurrent)); } - Sse.StoreAligned(pDstCurrent, res); + Sse.StoreAligned(pDstCurrent, result); pDstCurrent += 4; pm0 += 4 * ccol; @@ -233,20 +253,21 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; float* pSrcCurrent = psrc; - float* pDstCurrent = pdst; float* pMatCurrent = pmat; if (!add) { Vector128 x01 = Sse.LoadAlignedVector128(pSrcCurrent); - // Replicate each slot of x01 into its own register. 
- Vector128 x11 = Sse.Shuffle(x01, x01, 0x55); - Vector128 x21 = Sse.Shuffle(x01, x01, 0xAA); - Vector128 x31 = Sse.Shuffle(x01, x01, 0xFF); - x01 = Sse.Shuffle(x01, x01, 0x00); + // Replicate each 32-bit slot of x01 (ABCD) into its own register. + Vector128 x11 = Sse.Shuffle(x01, x01, 0x55); // B + Vector128 x21 = Sse.Shuffle(x01, x01, 0xAA); // C + Vector128 x31 = Sse.Shuffle(x01, x01, 0xFF); // D + x01 = Sse.Shuffle(x01, x01, 0x00); // A pSrcCurrent += 4; + float* pDstCurrent = pdst; + while (pDstCurrent < pDstEnd) { float* pMatTemp = pMatCurrent; @@ -276,13 +297,13 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray while (pSrcCurrent < pSrcEnd) { Vector128 x01 = Sse.LoadAlignedVector128(pSrcCurrent); - // Replicate each slot of x01 into its own register. - Vector128 x11 = Sse.Shuffle(x01, x01, 0x55); - Vector128 x21 = Sse.Shuffle(x01, x01, 0xAA); - Vector128 x31 = Sse.Shuffle(x01, x01, 0xFF); - x01 = Sse.Shuffle(x01, x01, 0x00); + // Replicate each 32-bit slot of x01 (ABCD) into its own register. 
+ Vector128 x11 = Sse.Shuffle(x01, x01, 0x55); // B + Vector128 x21 = Sse.Shuffle(x01, x01, 0xAA); // C + Vector128 x31 = Sse.Shuffle(x01, x01, 0xFF); // D + x01 = Sse.Shuffle(x01, x01, 0x00); // A - pDstCurrent = pdst; + float* pDstCurrent = pdst; while (pDstCurrent < pDstEnd) { @@ -610,7 +631,7 @@ internal static unsafe void AddScaleSU(float scale, Span src, Span i srcVector = Sse.Multiply(srcVector, scaleVector); dstVector = Sse.Add(dstVector, srcVector); - Store4(dstVector, pDstCurrent, pIdxCurrent); + Store4(in dstVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; @@ -678,7 +699,7 @@ internal static unsafe void AddSU(Span src, Span idx, Span ds Vector128 dstVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Add(srcVector, dstVector); - Store4(srcVector, pDstCurrent, pIdxCurrent); + Store4(in srcVector, pDstCurrent, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; @@ -746,7 +767,7 @@ internal static unsafe float SumU(Span src) pSrcCurrent += 4; } - result = VectorSum(result); + result = VectorSum(in result); while (pSrcCurrent < pSrcEnd) { @@ -775,7 +796,7 @@ internal static unsafe float SumSqU(Span src) pSrcCurrent += 4; } - result = VectorSum(result); + result = VectorSum(in result); while (pSrcCurrent < pEnd) { @@ -808,7 +829,7 @@ internal static unsafe float SumSqDiffU(float mean, Span src) pSrcCurrent += 4; } - result = VectorSum(result); + result = VectorSum(in result); while (pSrcCurrent < pSrcEnd) { @@ -826,16 +847,7 @@ internal static unsafe float SumSqDiffU(float mean, Span src) internal static unsafe float SumAbsU(Span src) { Vector128 result = Sse.SetZeroVector128(); - Vector128 mask; - - if (Sse2.IsSupported) - { - mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); - } - else - { - mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); - } + Vector128 mask = GetAbsMask(); fixed (float* psrc = src) { @@ -850,7 +862,7 @@ internal static unsafe float SumAbsU(Span src) pSrcCurrent += 4; } - result = 
VectorSum(result); + result = VectorSum(in result); while (pSrcCurrent < pEnd) { @@ -868,16 +880,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span src) { Vector128 result = Sse.SetZeroVector128(); Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask; - - if (Sse2.IsSupported) - { - mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); - } - else - { - mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); - } + Vector128 mask = GetAbsMask(); fixed (float* psrc = src) { @@ -893,7 +896,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span src) pSrcCurrent += 4; } - result = VectorSum(result); + result = VectorSum(in result); while (pSrcCurrent < pEnd) { @@ -911,16 +914,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span src) internal static unsafe float MaxAbsU(Span src) { Vector128 result = Sse.SetZeroVector128(); - Vector128 mask; - - if (Sse2.IsSupported) - { - mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); - } - else - { - mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); - } + Vector128 mask = GetAbsMask(); fixed (float* psrc = src) { @@ -935,10 +929,7 @@ internal static unsafe float MaxAbsU(Span src) pSrcCurrent += 4; } - Vector128 x1 = Sse.Shuffle(result, result, 0xB1); - result = Sse.Max(result, x1); - x1 = Sse.Shuffle(result, result, 0x02); - result = Sse.MaxScalar(result, x1); + result = VectorMax(in result); while (pSrcCurrent < pEnd) { @@ -956,16 +947,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span src) { Vector128 result = Sse.SetZeroVector128(); Vector128 meanVector = Sse.SetAllVector128(mean); - Vector128 mask; - - if (Sse2.IsSupported) - { - mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); - } - else - { - mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); - } + Vector128 mask = GetAbsMask(); fixed (float* psrc = src) { @@ -981,10 +963,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span src) 
pSrcCurrent += 4; } - Vector128 x1 = Sse.Shuffle(result, result, 0xB1); - result = Sse.Max(result, x1); - x1 = Sse.Shuffle(result, result, 0x02); - result = Sse.MaxScalar(result, x1); + result = VectorMax(in result); while (pSrcCurrent < pEnd) { @@ -1021,7 +1000,7 @@ internal static unsafe float DotU(Span src, Span dst) pDstCurrent += 4; } - result = VectorSum(result); + result = VectorSum(in result); while (pSrcCurrent < pEnd) { @@ -1062,7 +1041,7 @@ internal static unsafe float DotSU(Span src, Span dst, Span i pDstCurrent += 4; } - result = VectorSum(result); + result = VectorSum(in result); while (pIdxCurrent < pEnd) { @@ -1101,7 +1080,7 @@ internal static unsafe float Dist2(Span src, Span dst) pDstCurrent += 4; } - sqDistanceVector = VectorSum(sqDistanceVector); + sqDistanceVector = VectorSum(in sqDistanceVector); float norm = Sse.ConvertToSingle(sqDistanceVector); while (pSrcCurrent < pEnd) @@ -1117,63 +1096,6 @@ internal static unsafe float Dist2(Span src, Span dst) } } - internal static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices) - { - fixed (float* pDstStart = &dst.Items[0]) - fixed (int* pidx = &indices[0]) - { - float* pdst = Ptr(dst, pDstStart); - - // REVIEW NEEDED: This line expands to (void)(c); but is it necessary? - // DEBUG_ONLY(c); - - for (int i = 0; i < cindices; ++i) - { - int index = pidx[i]; - Contracts.Assert(0 <= index && index < c); - pdst[index] = 0; - } - } - } - - internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices) - { - fixed (float* pDstStart = &dst.Items[0]) - fixed (int* pidx = &indices[0]) - { - float* pdst = Ptr(dst, pDstStart); - - // REVIEW NEEDED: This line expands to (void)(c); but is it necessary? 
- // DEBUG_ONLY(c); - - int ivLogMin = 0; - int ivLogLim = ccol; - int ivPhyMin = 0; - - for (int i = 0; i < cindices; ++i) - { - int index = pidx[i]; - Contracts.Assert(0 <= index && index < c); - - int col = index - ivLogMin; - if ((uint)col >= (uint)ccol) - { - Contracts.Assert(ivLogMin > index || index >= ivLogLim); - - int row = index / ccol; - ivLogMin = row * ccol; - ivLogLim = ivLogMin + ccol; - ivPhyMin = row * cfltRow; - - Contracts.Assert(ivLogMin <= index && index < ivLogLim); - col = index - ivLogMin; - } - - pdst[ivPhyMin + col] = 0; - } - } - } - internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, float threshold, Span v, Span w) { fixed (float* psrc = src) @@ -1187,21 +1109,18 @@ internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, f Vector128 xPrimal = Sse.SetAllVector128(primalUpdate); - Vector128 signMask = Sse.SetAllVector128(-0.0f); // 1000 0000 ... + Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold = Sse.SetAllVector128(threshold); while (pSrcCurrent + 4 <= pSrcEnd) { Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); + Vector128 xDst1 = Sse.LoadVector128(pDst1Current); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); - Sse.Store(pDst1Current, xDst1); + Vector128 xDst2 = GetNewDst(xDst1, signMask, xThreshold); - Vector128 xSign = Sse.And(xDst1, signMask); // result = 10000... 
if xDst1 is negative or 00000 otherwise - Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); - Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // all 1's if true - Vector128 x2 = Sse.Xor(xSign, xThreshold); // -threshold if xDst1 is negative and +threshold otherwise - Vector128 xDst2 = Sse.And(Sse.Subtract(xDst1, x2), xCond); + Sse.Store(pDst1Current, xDst1); Sse.Store(pDst2Current, xDst2); pSrcCurrent += 4; @@ -1235,7 +1154,7 @@ internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Vector128 xPrimal = Sse.SetAllVector128(primalUpdate); - Vector128 signMask = Sse.SetAllVector128(-0.0f); // 1000 0000 ... + Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold = Sse.SetAllVector128(threshold); while (pIdxCurrent + 4 <= pIdxEnd) @@ -1244,15 +1163,10 @@ internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Vector128 xDst1 = Load4(pdst1, pIdxCurrent); xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal)); + Vector128 xDst2 = GetNewDst(xDst1, signMask, xThreshold); - Vector128 xSign = Sse.And(xDst1, signMask); // result = 10000... 
if xDst1 is negative or 00000 otherwise - Vector128 xDst1Abs = Sse.Xor(xDst1, xSign); - Vector128 xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // all 1's if true - Vector128 x2 = Sse.Xor(xSign, xThreshold); // -threshold if xDst1 is negative and +threshold otherwise - Vector128 xDst2 = Sse.And(Sse.Subtract(xDst1, x2), xCond); - - Store4(xDst1, pdst1, pIdxCurrent); - Store4(xDst2, pdst2, pIdxCurrent); + Store4(in xDst1, pdst1, pIdxCurrent); + Store4(in xDst2, pdst2, pIdxCurrent); pIdxCurrent += 4; pSrcCurrent += 4; diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 42dec27378..ade2ea6a0e 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -23,9 +23,6 @@ public class SsePerformanceTests private const int DEFAULT_CCOL = 2000; private const bool ADD = true; - // Naming follows from SseIntrinsics. 
- private const int CbAlign = 16; - private float[] src, dst, original, src1, src2, result; private int[] idx; private int seed = DEFAULT_SEED; From 17b2f79c1ec8b92768512776b9e695ecd1210c9a Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Thu, 9 Aug 2018 15:41:55 -0700 Subject: [PATCH 7/8] Respond to PR feedback: Implemented new unit tests --- .../UnitTests.cs | 144 +++++++++--------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs index 1d4b668f55..d1d5955a8e 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs @@ -15,6 +15,7 @@ public class CpuMathUtilsUnitTests private readonly int[] testIndexArray; private readonly AlignedArray[] testMatrices; private readonly AlignedArray[] testSrcVectors; + private readonly AlignedArray[] testDstVectors; private const float DEFAULT_SCALE = 1.7f; private const int SseCbAlign = 16; private FloatEqualityComparer comparer; @@ -56,16 +57,28 @@ public CpuMathUtilsUnitTests() testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; + + // Padded destination vectors whose dimensions are multiples of 4 + float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f }; + float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; + + AlignedArray testDstVectorAligned1 = new AlignedArray(4, SseCbAlign); + AlignedArray testDstVectorAligned2 = new AlignedArray(8, SseCbAlign); + testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length); + testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); + + testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } [Theory] - [InlineData(0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] - [InlineData(1, 
new float[] { 204f, 492f, 780f, 1068f })] - public void MatMulATest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] + [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] + [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] + public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[test]; - AlignedArray dst = new AlignedArray(4, SseCbAlign); + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; @@ -74,18 +87,14 @@ public void MatMulATest(int test, float[] expected) } [Theory] - [InlineData(0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] - [InlineData(1, new float[] { 204f, 493f, 782f, 1071f })] - public void MatMulAAddTest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] + [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] + [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] + public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[test]; - AlignedArray dst = new AlignedArray(4, SseCbAlign); - - for (int i = 0; i < dst.Size; i++) - { - dst[i] = i; - } + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); float[] actual = new float[dst.Size]; @@ -94,13 +103,14 @@ public void MatMulAAddTest(int test, float[] expected) } [Theory] - [InlineData(0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] - [InlineData(1, new float[] { 170f, 180f, 
190f, 200f, 210f, 220f, 230f, 240f })] - public void MatMulTranATest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] + [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] + [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] + public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[0]; - AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign); + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; @@ -109,18 +119,14 @@ public void MatMulTranATest(int test, float[] expected) } [Theory] - [InlineData(0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] - [InlineData(1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] - public void MatMulTranAAddTest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] + [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] + [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] + public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[0]; - AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign); - - for (int i = 0; i < dst.Size; i++) - { - dst[i] = i; - } + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); float[] actual = new float[dst.Size]; @@ -129,74 +135,68 @@ public void MatMulTranAAddTest(int test, float[] expected) } [Theory] - [InlineData(0, new float[] { -27.32f, 
-9.02f, -27.32f, -9.02f })] - [InlineData(1, new float[] { 95f, 231f, 367f, 503f })] - public void MatMulPATest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] + [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] + [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] + public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[test]; - AlignedArray dst = new AlignedArray(4, SseCbAlign); + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; int[] idx = testIndexArray; - CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * test, dst, dst.Size); + CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] - [InlineData(1, new float[] { 95f, 232f, 369f, 506f })] - public void MatMulPAAddTest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })] + [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] + [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] + public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[test]; - AlignedArray dst = new AlignedArray(4, SseCbAlign); + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; int[] idx = testIndexArray; - for (int i = 0; i < dst.Size; i++) - { - dst[i] = i; - } - - CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * test, dst, 
dst.Size); + CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] - [InlineData(1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] - public void MatMulTranPATest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] + [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] + [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] + public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[0]; - AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign); + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; int[] idx = testIndexArray; - CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2, dst, src.Size); + CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] - [InlineData(1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] - public void MatMulTranPAAddTest(int test, float[] expected) + [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] + [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] + [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] + public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - AlignedArray mat = testMatrices[test]; - AlignedArray src = testSrcVectors[0]; - AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign); 
+ AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; int[] idx = testIndexArray; - for (int i = 0; i < dst.Size; i++) - { - dst[i] = i; - } - - CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2, dst, src.Size); + CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); float[] actual = new float[dst.Size]; dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, comparer); From ef979b1abc12954ed16b6422e3b0c9d0b1b9755b Mon Sep 17 00:00:00 2001 From: Brian Lui Date: Fri, 10 Aug 2018 14:49:15 -0700 Subject: [PATCH 8/8] Respond to PR feedback: Style changes --- .../CpuMathUtils.netcoreapp.cs | 20 ++++++++----------- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index e17019ffa7..b238d602b0 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -12,18 +12,17 @@ public static partial class CpuMathUtils public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) { Contracts.Assert(mat.Size == dst.Size * src.Size); + Contracts.Assert(crun >= 0); if (Sse.IsSupported) { if (!tran) { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= dst.Size); SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size); } else { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= src.Size); SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun); } @@ -32,7 +31,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr { if (!tran) { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= dst.Size); for (int i = 0; i < crun; i++) { @@ -54,7 +52,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, 
AlignedArr } else { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= src.Size); for (int i = 0; i < dst.Size; i++) { @@ -94,18 +91,17 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo } Contracts.AssertNonEmpty(rgposSrc); + Contracts.Assert(crun >= 0); if (Sse.IsSupported) { if (!tran) { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= dst.Size); SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); } else { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= srcValues.Size); SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); } @@ -114,7 +110,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo { if (!tran) { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= dst.Size); for (int i = 0; i < crun; i++) { @@ -137,7 +132,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo } else { - Contracts.Assert(crun >= 0); Contracts.Assert(crun <= srcValues.Size); for (int i = 0; i < dst.Size; i++) { @@ -171,7 +165,6 @@ public static void Add(float a, float[] dst, int count) Add(a, new Span(dst, 0, count)); } - // dst += a private static void Add(float a, Span dst) { if (Sse.IsSupported) @@ -856,7 +849,8 @@ private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, in for (int i = 0; i < cindices; ++i) { int index = pidx[i]; - Contracts.Assert(0 <= index && index < c); + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); pdst[index] = 0; } } @@ -874,7 +868,8 @@ private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol for (int i = 0; i < cindices; ++i) { int index = pidx[i]; - Contracts.Assert(0 <= index && index < c); + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); int col = index - ivLogMin; if ((uint)col >= (uint)ccol) @@ -886,7 +881,8 @@ private static unsafe void 
ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol ivLogLim = ivLogMin + ccol; ivPhyMin = row * cfltRow; - Contracts.Assert(ivLogMin <= index && index < ivLogLim); + Contracts.Assert(index >= ivLogMin); + Contracts.Assert(index < ivLogLim); col = index - ivLogMin; } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 659e5d3e90..2ac1f56f14 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -98,7 +98,7 @@ private static Vector128 VectorMax(in Vector128 vector) [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static Vector128 GetAbsMask() { - return (Sse2.IsSupported) ? + return Sse2.IsSupported ? Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)) : Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); }