From 5cd4918679ba2c90e19b55e27f4b70e4a975b307 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 6 Aug 2018 14:42:52 -0700
Subject: [PATCH 1/8] Implemented all remaining active SSE intrinsics

---
 .../CpuMathUtils.netcoreapp.cs                | 154 +++++
 .../CpuMathUtils.netstandard.cs               |  13 +
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     | 559 ++++++++++++++++++
 .../CpuMathNativeUtils.cs                     |  32 +
 .../SsePerformanceTests.cs                    |  22 +-
 5 files changed, 779 insertions(+), 1 deletion(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 6c6c1fe6ad..a213c8d7a2 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -9,6 +9,139 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
+        public const int CbAlign = 16;
+
+        private static bool Compat(AlignedArray a) // true when a's alignment (a.CbAlign) equals this class's CbAlign
+        {
+            Contracts.AssertValue(a);
+            Contracts.Assert(a.Size > 0); // an empty array is treated as a caller error
+            return a.CbAlign == CbAlign;
+        }
+
+        internal static unsafe float* Ptr(AlignedArray a, float* p) // returns p advanced by the offset a.GetBase reports for p's address
+        {
+            Contracts.AssertValue(a);
+            float* q = p + a.GetBase((long)p); // float* arithmetic, so the offset is counted in floats
+            Contracts.Assert(((long)q & (CbAlign - 1)) == 0); // the result must sit on a CbAlign-byte boundary
+            return q;
+        }
+
+        // Dense multiply: dispatches to SseIntrinsics.MatMulTranA when tran is set, otherwise to SseIntrinsics.MatMulA.
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+            Contracts.Assert(mat.Size == dst.Size * src.Size);
+
+            if (!Sse.IsSupported)
+            {
+                // TODO: Software fallback
+                return;
+            }
+
+            if (tran)
+            {
+                Contracts.Assert(0 <= crun && crun <= src.Size);
+                SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun);
+            }
+            else
+            {
+                Contracts.Assert(0 <= crun && crun <= dst.Size);
+                SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size);
+            }
+        }
+
+        // Sparse-source multiply over rgposSrc[iposMin..iposEnd): an empty range only clears dst (when !add) and returns.
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues,
+            int posMin, int iposMin, int iposEnd, AlignedArray dst, int crun)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(srcValues));
+            Contracts.Assert(Compat(dst));
+            Contracts.AssertValue(rgposSrc);
+            Contracts.Assert(0 <= iposMin && iposMin <= iposEnd && iposEnd <= rgposSrc.Length);
+            Contracts.Assert(mat.Size == dst.Size * srcValues.Size);
+
+            if (iposEnd <= iposMin)
+            {
+                if (!add)
+                    dst.ZeroItems();
+                return;
+            }
+
+            Contracts.AssertNonEmpty(rgposSrc);
+
+            if (!Sse.IsSupported)
+            {
+                // TODO: Software fallback
+                return;
+            }
+
+            if (tran)
+            {
+                Contracts.Assert(0 <= crun && crun <= srcValues.Size);
+                SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, dst.Size);
+            }
+            else
+            {
+                Contracts.Assert(0 <= crun && crun <= dst.Size);
+                SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, crun, srcValues.Size);
+            }
+        }
+
+        // Sparse-matrix multiply: validates the starts/indices/coefs layout, then hands off to SseIntrinsics.MatMulRU.
+        public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs,
+            AlignedArray src, AlignedArray dst, int crow)
+        {
+            Contracts.AssertNonEmpty(starts);
+            Contracts.Assert(starts.Length == crow + 1);
+            Contracts.Assert(starts[0] == 0);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(starts[crow] == indices.Length);
+            Contracts.AssertNonEmpty(coefs);
+            Contracts.Assert(indices.Length == coefs.Length);
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+            Contracts.Assert(0 < crow && crow <= dst.Size);
+            Contracts.Assert(crow * src.Size >= coefs.Length);
+
+            if (!Sse.IsSupported)
+            {
+                // TODO: Software fallback
+                return;
+            }
+
+            SseIntrinsics.MatMulRU(add, starts, indices, coefs, src, dst, crow);
+        }
+
+        public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, // convolution multiply: picks MatMulCU or MatMulDU
+            int[] mprowrun, int[] runs, float[] coefs,
+            AlignedArray src, AlignedArray dst, int crow)
+        {
+            Contracts.AssertNonEmpty(mprowiv);
+            Contracts.Assert(mprowiv.Length == crow); // one entry per output row
+            Contracts.AssertNonEmpty(mprowcol);
+            Contracts.Assert(mprowcol.Length == crow); // one entry per output row
+            Contracts.Assert(mprowrun == null || mprowrun.Length == crow); // mprowrun is optional
+            Contracts.AssertNonEmpty(runs);
+            Contracts.AssertNonEmpty(coefs);
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+            Contracts.Assert(0 < crow && crow <= dst.Size);
+
+            if (mprowrun == null) // NOTE(review): no Sse.IsSupported guard here, unlike the other MatTimesSrc overloads - confirm this is intended
+            {
+                SseIntrinsics.MatMulCU(add, mprowiv, mprowcol, runs, coefs, // no per-row run table: unpadded kernel
+                    src, dst, crow);
+            }
+            else
+            {
+                SseIntrinsics.MatMulDU(add, mprowiv, mprowcol, mprowrun, runs, coefs, // per-row run table present: padded kernel
+                    src, dst, crow);
+            }
+        }
+
         public static void Scale(float a, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(dst);
@@ -392,5 +525,26 @@ private static float L2DistSquared(Span<float> a, Span<float> b)
                 return norm;
             }
         }
+
+        // Zeroes the entries of dst selected by indices. When ccol equals cfltRow the indices are
+        // passed straight to ZeroItemsU; otherwise ZeroMatrixItemsCore is used with both widths.
+        // Nothing happens when SSE is unavailable (software fallback not written yet).
+        public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices)
+        {
+            Contracts.Assert(0 < ccol && ccol <= cfltRow);
+
+            if (!Sse.IsSupported)
+            {
+                // TODO: Software fallback
+                return;
+            }
+
+            int total = dst.Size;
+            int count = indices.Length;
+            if (ccol != cfltRow)
+                SseIntrinsics.ZeroMatrixItemsCore(dst, total, ccol, cfltRow, indices, count);
+            else
+                SseIntrinsics.ZeroItemsU(dst, total, indices, count);
+        }
     }
 }
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index 501fc9082e..a71316ebb3 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -6,6 +6,17 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); // dense overload: forwards unchanged to SseUtils
+
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, // sparse-source overload
+            int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun); // NOTE(review): named iposLim here but iposEnd in the netcoreapp file - confirm which is intended
+
+        public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, // sparse-matrix overload
+            AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, starts, indices, coefs, src, dst, crow); // forwards unchanged to SseUtils
+
+        public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun, int[] runs, float[] coefs, // convolution overload
+            AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, mprowiv, mprowcol, mprowrun, runs, coefs, src, dst, crow); // forwards unchanged to SseUtils
+
         public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count);
 
         public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count);
@@ -43,5 +54,7 @@ public static partial class CpuMathUtils
         public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) => SseUtils.DotProductSparse(a, offset, b, indices, count);
 
         public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count);
+
+        public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices); // forwards unchanged to SseUtils
     }
 }
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index d11676f283..1124ee663a 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -71,6 +71,509 @@ private static Vector128<float> VectorSum(in Vector128<float> vector)
             }
         }
 
+        // Multiply matrix times vector into vector: dst = mat * src, or dst += mat * src when add is true.
+        internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        {
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pDstCurrent = pdst;
+                float* pMatCurrent = pmat;
+
+                while (pDstCurrent < pDstEnd) // each pass produces four dst entries from four consecutive rows of mat
+                {
+                    // Every group of four rows must scan src from its start; otherwise only the first group is computed.
+                    float* pSrcCurrent = psrc;
+                    Vector128<float> res0 = Sse.SetZeroVector128();
+                    Vector128<float> res1 = res0;
+                    Vector128<float> res2 = res0;
+                    Vector128<float> res3 = res0;
+
+                    while (pSrcCurrent < pSrcEnd)
+                    {
+                        float* pMatTemp = pMatCurrent; // walks down the four rows at the current column offset
+                        Vector128<float> x01 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128<float> x11 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128<float> x21 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128<float> x31 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128<float> x02 = Sse.LoadAlignedVector128(pSrcCurrent);
+
+                        res0 = Sse.Add(res0, Sse.Multiply(x01, x02));
+                        res1 = Sse.Add(res1, Sse.Multiply(x11, x02));
+                        res2 = Sse.Add(res2, Sse.Multiply(x21, x02));
+                        res3 = Sse.Add(res3, Sse.Multiply(x31, x02));
+
+                        pSrcCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    // Add up the entries of each, with the 4 results in res0
+                    res0 = Sse3.HorizontalAdd(res0, res1); // NOTE(review): uses Sse3, while the visible caller only checks Sse.IsSupported - confirm
+                    res2 = Sse3.HorizontalAdd(res2, res3);
+                    res0 = Sse3.HorizontalAdd(res0, res2);
+
+                    if (add)
+                    {
+                        res0 = Sse.Add(res0, Sse.LoadAlignedVector128(pDstCurrent));
+                    }
+                    Sse.StoreAligned(pDstCurrent, res0);
+
+                    pDstCurrent += 4;
+                    pMatCurrent += 3 * ccol; // the inner loop already advanced one row; skip the other three
+                }
+            }
+        }
+
+        // Partial sparse source vector: only the src positions listed in rgposSrc[iposMin..iposEnd) contribute to dst.
+        internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+                                        int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
+        {
+            // REVIEW: For extremely sparse inputs, interchanging the loops would
+            // likely be more efficient.
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                int* pposMin = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+                float* pm0 = pmat - posMin; // biased by -posMin so that pm0[col] addresses element (col - posMin) of the row
+                float* pSrcCurrent = psrc - posMin; // same bias applied to src
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd) // each pass produces four dst entries
+                {
+                    float* pm1 = pm0 + ccol; // pm0..pm3 point at four consecutive rows, ccol floats apart
+                    float* pm2 = pm1 + ccol;
+                    float* pm3 = pm2 + ccol;
+                    Vector128<float> res = Sse.SetZeroVector128();
+
+                    int* ppos = pposMin; // rescan the position list for every group of rows
+
+                    while (ppos < pposEnd)
+                    {
+                        int col = *ppos;
+                        Vector128<float> x1 = Sse.SetVector128(pm3[col], pm2[col], pm1[col], pm0[col]); // same column taken from each of the four rows
+                        Vector128<float> x2 = Sse.SetAllVector128(pSrcCurrent[col]); // the matching src value, broadcast
+                        x2 = Sse.Multiply(x2, x1);
+                        res = Sse.Add(res, x2);
+
+                        ppos++;
+                    }
+
+                    if (add)
+                    {
+                        res = Sse.Add(res, Sse.LoadAlignedVector128(pDstCurrent));
+                    }
+                    Sse.StoreAligned(pDstCurrent, res);
+
+                    pDstCurrent += 4;
+                    pm0 += 4 * ccol; // move down four rows
+                }
+            }
+        }
+
+        internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) // transposed dense multiply; mat is walked in runs of crow floats
+        {
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pMatCurrent = pmat;
+
+                if (!add) // overwrite mode: the first four src values initialize dst instead of accumulating into it
+                {
+                    Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each slot of x01 into its own register.
+                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55);
+                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA);
+                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF);
+                    x01 = Sse.Shuffle(x01, x01, 0x00);
+
+                    pSrcCurrent += 4;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+                        Vector128<float> x02 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128<float> x12 = Sse.LoadAlignedVector128(pMatTemp += crow); // successive runs are crow floats apart
+                        Vector128<float> x22 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x32 = Sse.LoadAlignedVector128(pMatTemp += crow);
+
+                        x02 = Sse.Multiply(x01, x02);
+                        x12 = Sse.Multiply(x11, x12);
+                        x22 = Sse.Multiply(x21, x22);
+                        x32 = Sse.Multiply(x31, x32);
+
+                        x02 = Sse.Add(x02, x12);
+                        x22 = Sse.Add(x22, x32);
+                        x02 = Sse.Add(x02, x22);
+
+                        Sse.StoreAligned(pDstCurrent, x02); // plain store: no read of the previous dst contents
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    pMatCurrent += 3 * crow; // the loop advanced one run of crow floats; skip the other three
+                }
+
+                while (pSrcCurrent < pSrcEnd) // remaining src values (all of them when add is true) accumulate into dst
+                {
+                    Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each slot of x01 into its own register.
+                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55);
+                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA);
+                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF);
+                    x01 = Sse.Shuffle(x01, x01, 0x00);
+
+                    pDstCurrent = pdst; // restart at the head of dst for every group of four src values
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+
+                        Vector128<float> x02 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128<float> x12 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x22 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x32 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x3 = Sse.LoadAlignedVector128(pDstCurrent); // current dst values to accumulate into
+
+                        x02 = Sse.Multiply(x01, x02);
+                        x12 = Sse.Multiply(x11, x12);
+                        x22 = Sse.Multiply(x21, x22);
+                        x32 = Sse.Multiply(x31, x32);
+
+                        x02 = Sse.Add(x02, x12);
+                        x22 = Sse.Add(x22, x32);
+                        x02 = Sse.Add(x02, x22);
+                        x3 = Sse.Add(x02, x3);
+
+                        Sse.StoreAligned(pDstCurrent, x3);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    pMatCurrent += 3 * crow;
+                    pSrcCurrent += 4;
+                }
+            }
+        }
+
+        // Partial sparse source vector (transposed): each listed position adds src[col] * mat[col * crow ..] into dst, col = position - posMin.
+        internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+                                        int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
+        {
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+
+                int* ppos = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+
+                // When not accumulating, the first position overwrites dst, so no separate zeroing pass is needed.
+                // This reads *ppos unconditionally, so the caller must guarantee iposMin < iposEnd.
+                if (!add)
+                {
+                    int col = *ppos - posMin;
+                    ppos++;
+
+                    Vector128<float> x0 = Sse.SetAllVector128(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector128<float> x1 = Sse.LoadAlignedVector128(pMatCurrent);
+                        x1 = Sse.Multiply(x1, x0);
+                        Sse.StoreAligned(pDstCurrent, x1);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+                }
+
+                // REVIEW: Should we explore unrolling the outer loop?
+                while (ppos < pposEnd)
+                {
+                    int col = *ppos - posMin;
+                    ppos++; // the only advance per iteration; a second increment would skip every other position
+
+                    Vector128<float> x0 = Sse.SetAllVector128(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector128<float> x1 = Sse.LoadAlignedVector128(pMatCurrent);
+                        Vector128<float> x2 = Sse.LoadAlignedVector128(pDstCurrent);
+                        x1 = Sse.Multiply(x1, x0);
+                        x2 = Sse.Add(x2, x1);
+                        Sse.StoreAligned(pDstCurrent, x2);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+                }
+            }
+        }
+
+        // Sparse matrix: row r's coefficients/indices end at offset starts[r + 1]; dst[r] gets (or accumulates) the row's dot product with src.
+        internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, float[] coefs,
+                                                AlignedArray src, AlignedArray dst, int crow)
+        {
+            fixed (int* pstarts = &starts[0])
+            fixed (int* pindices = &indices[0])
+            fixed (float* pcoefs = &coefs[0])
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+
+                int* pii = pstarts + 1; // starts[r + 1] is the end offset of row r
+                int* pIdxCurrent = pindices; // indices and coefs are consumed in lock-step across all rows
+                float* pMatCurrent = pcoefs;
+                float* pDstEnd = pdst + crow;
+
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd) // one dst entry per pass
+                {
+                    int* pIdxEnd = pindices + *pii;
+                    pii++;
+
+                    Vector128<float> result = Sse.SetZeroVector128();
+
+                    while (pIdxCurrent + 4 <= pIdxEnd) // four entries at a time
+                    {
+                        Vector128<float> x = Sse.Multiply(Load4(psrc, pIdxCurrent), Sse.LoadVector128(pMatCurrent));
+                        result = Sse.Add(result, x);
+
+                        pIdxCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    while (pIdxCurrent < pIdxEnd) // leftover entries, one at a time in the scalar lane
+                    {
+                        Vector128<float> x = Sse.MultiplyScalar(Load1(psrc, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent));
+                        result = Sse.AddScalar(result, x);
+
+                        pIdxCurrent++;
+                        pMatCurrent++;
+                    }
+
+                    result = VectorSum(in result);
+
+                    if (add)
+                    {
+                        result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent));
+                    }
+                    Sse.StoreScalar(pDstCurrent, result);
+
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        // Unpadded convolution: every output row uses the same index list (runs[2..2 + runs[1])) with per-row offsets into coefs and src.
+        internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol,
+            int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow)
+        {
+
+            fixed (int* pmprowiv = &mprowiv[0])
+            fixed (int* pmprowcol = &mprowcol[0])
+            fixed (int* pruns = &runs[0])
+            fixed (float* pcoefs = &coefs[0])
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+
+                int size = pruns[1]; // number of indices in the shared list
+                int* psupport = pruns + 2; // the shared index list starts at runs[2]
+                int* piv = pmprowiv;
+                int* pcol = pmprowcol;
+                int* pIdxEnd = psupport + size;
+                float* pDstEnd = pdst + crow;
+
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd) // one dst entry per pass
+                {
+                    float* pMatCurrent = pcoefs + *piv; // this row's offset into coefs
+                    piv++;
+                    float* pSrcCurrent = psrc + *pcol; // this row's offset into src
+                    pcol++;
+                    int* pIdxCurrent = psupport;
+
+                    Vector128<float> result = Sse.SetZeroVector128();
+
+                    while (pIdxCurrent + 4 <= pIdxEnd) // four entries at a time
+                    {
+                        Vector128<float> x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent));
+                        result = Sse.Add(result, x);
+
+                        pIdxCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    while (pIdxCurrent < pIdxEnd) // leftover entries in the scalar lane
+                    {
+                        Vector128<float> x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent));
+                        result = Sse.AddScalar(result, x);
+
+                        pIdxCurrent++;
+                        pMatCurrent++;
+                    }
+
+                    result = VectorSum(result);
+
+                    // Add the bias.
+                    result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatCurrent)); // pMatCurrent now sits just past the row's coefficients
+
+                    if (add)
+                    {
+                        result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent));
+                    }
+                    Sse.StoreScalar(pDstCurrent, result);
+
+                    pDstCurrent++;
+                }
+            }
+        }
+
+        // Padded convolution: each output row picks its own index run in runs via mprowrun; non-zero runs are masked.
+        internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun,
+            int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow)
+        {
+            fixed (int* pmprowiv = &mprowiv[0])
+            fixed (int* pmprowcol = &mprowcol[0])
+            fixed (int* pmprowrun = &mprowrun[0])
+            fixed (int* pruns = &runs[0])
+            fixed (float* pcoefs = &coefs[0])
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            {
+                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+
+                int* piv = pmprowiv;
+                int* pcol = pmprowcol;
+                float* pDstEnd = pdst + crow;
+                int kernelSize = pruns[1]; // the bias sits kernelSize floats past the row's first coefficient
+
+                int* pirun = pmprowrun;
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd) // one dst entry per pass
+                {
+                    float* pMatCurrent = pcoefs + *piv;
+                    piv++;
+                    float* pMatBias = pMatCurrent + kernelSize; // captured before pMatCurrent is advanced below
+                    float* pSrcCurrent = psrc + *pcol;
+                    pcol++;
+                    int irun = *pirun;
+                    pirun++;
+
+                    int* pIdxCurrent = pruns + 2 + irun;
+                    int* pIdxEnd = pIdxCurrent + pIdxCurrent[-1]; // the int just before a run holds its length
+
+                    Vector128<float> result = Sse.SetZeroVector128();
+
+                    if (irun == 0)
+                    {
+                        // No masking needed.
+                        while (pIdxCurrent + 4 <= pIdxEnd)
+                        {
+                            Vector128<float> x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent));
+                            result = Sse.Add(result, x);
+
+                            pIdxCurrent += 4;
+                            pMatCurrent += 4;
+                        }
+
+                        while (pIdxCurrent < pIdxEnd)
+                        {
+                            Vector128<float> x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent));
+                            result = Sse.AddScalar(result, x);
+
+                            pIdxCurrent++;
+                            pMatCurrent++;
+                        }
+                    }
+                    else
+                    {
+                        // Need masking.
+                        pMatCurrent += pIdxCurrent[-2]; // the int two before the run is an extra offset into the coefficients
+                        // REVIEW NEEDED: Is it the correct translation from: "const float * pmask = reinterpret_cast<const float *>(piLim);"?
+                        float* pmask = (float*)pIdxEnd; // the ints following the run are read as float bit patterns and ANDed with the coefficients
+
+                        while (pIdxCurrent + 4 <= pIdxEnd)
+                        {
+                            Vector128<float> x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.And(Sse.LoadVector128(pmask), Sse.LoadVector128(pMatCurrent)));
+                            result = Sse.Add(result, x);
+
+                            pIdxCurrent += 4;
+                            pMatCurrent += 4;
+                            pmask += 4;
+                        }
+
+                        while (pIdxCurrent < pIdxEnd)
+                        {
+                            Vector128<float> x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.And(Sse.SetScalarVector128(*pmask), Sse.SetScalarVector128(*pMatCurrent)));
+                            result = Sse.AddScalar(result, x);
+
+                            pIdxCurrent++;
+                            pMatCurrent++;
+                            pmask++;
+                        }
+                    }
+
+                    result = VectorSum(result);
+
+                    // Add the bias.
+                    result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatBias));
+
+                    if (add)
+                    {
+                        result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent));
+                    }
+                    Sse.StoreScalar(pDstCurrent, result);
+
+                    pDstCurrent++;
+                }
+            }
+        }
+
         internal static unsafe void ScaleU(float scale, Span<float> dst)
         {
             Vector128<float> scaleVector = Sse.SetAllVector128(scale);
@@ -472,5 +975,61 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
             }
         }
 
+        // Writes zero to dst at each of the first cindices positions listed in indices; c is only the bound checked by the assert.
+        internal static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices)
+        {
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (int* pidx = &indices[0])
+            {
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                int* pCurrent = pidx;
+                int* pEnd = pidx + cindices;
+
+                while (pCurrent < pEnd)
+                {
+                    int index = *pCurrent++;
+                    Contracts.Assert(0 <= index && index < c);
+                    pdst[index] = 0;
+                }
+            }
+        }
+
+        internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices) // zeroes dst entries, mapping logical index (row * ccol + col) to physical (row * cfltRow + col)
+        {
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (int* pidx = &indices[0])
+            {
+                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+
+                // REVIEW NEEDED: This line expands to (void)(c); but is it necessary?
+                // DEBUG_ONLY(c);
+
+                int ivLogMin = 0; // logical index range [ivLogMin, ivLogLim) of the row currently cached
+                int ivLogLim = ccol;
+                int ivPhyMin = 0; // physical offset in dst where the cached row starts
+
+                for (int i = 0; i < cindices; ++i)
+                {
+                    int index = pidx[i];
+                    Contracts.Assert(0 <= index && index < c);
+
+                    int col = index - ivLogMin;
+                    if ((uint)col >= (uint)ccol) // one unsigned compare catches both col < 0 and col >= ccol
+                    {
+                        Contracts.Assert(ivLogMin > index || index >= ivLogLim);
+
+                        int row = index / ccol; // index falls outside the cached row: recompute the row window
+                        ivLogMin = row * ccol;
+                        ivLogLim = ivLogMin + ccol;
+                        ivPhyMin = row * cfltRow;
+
+                        Contracts.Assert(ivLogMin <= index && index < ivLogLim);
+                        col = index - ivLogMin;
+                    }
+
+                    pdst[ivPhyMin + col] = 0;
+                }
+            }
+        }
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
index 90f362de3e..2528fbe0f4 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -9,6 +9,32 @@ namespace Microsoft.ML.CpuMath.PerformanceTests
 {
     internal static class CpuMathNativeUtils
     {
+        [DllImport("CpuMathNative", EntryPoint = "MatMulA"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MatMulA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol);
+
+        [DllImport("CpuMathNative", EntryPoint = "MatMulPA"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MatMulPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc,
+            int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow, int ccol);
+
+        [DllImport("CpuMathNative", EntryPoint = "MatMulTranA"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MatMulTranA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol);
+
+        [DllImport("CpuMathNative", EntryPoint = "MatMulTranPA"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc,
+            int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow);
+
+        [DllImport("CpuMathNative", EntryPoint = "MatMulRU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MatMulRU(bool add, /*_In_ const*/ int* pstarts, /*_In_ const*/ int* pindices, /*_In_ const*/ float* pcoefs,
+            /*_In_ const*/ float* ps, /*_Inout_*/ float* pdst, int crow);
+
+        [DllImport("CpuMathNative", EntryPoint = "MatMulCU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MatMulCU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol,
+            /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow);
+
+        [DllImport("CpuMathNative", EntryPoint = "MatMulDU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MatMulDU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol, /*_In_ const*/ int* pmprowrun,
+            /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow);
+
         [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c);
 
@@ -41,5 +67,11 @@ internal static class CpuMathNativeUtils
 
         [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "ZeroItemsU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ZeroItemsU(/*_Inout_*/ float* pd, int c, /*_In_ const*/ int* pindices, int cindices);
+
+        [DllImport("CpuMathNative", EntryPoint = "ZeroMatrixItemsCore"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ZeroMatrixItemsCore(/*_Inout_*/ float* pd, int c, int ccol, int cfltRow, /*_In_ const*/ int* pindices, int cindices);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index 92752a0018..f560fcd048 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -19,8 +19,11 @@ public class SsePerformanceTests
         private const int EXP_RANGE = EXP_MAX / 2;
         private const int DEFAULT_SEED = 253421;
         private const float DEFAULT_SCALE = 1.11f;
+        private const int DEFAULT_CROW = 1000;
+        private const int DEFAULT_CCOL = 1000;
+        private const bool ADD = true;
 
-        private float[] src, dst, original, src1, src2;
+        private float[] src, dst, original, src1, src2, mat;
         private int[] idx;
         private int seed = DEFAULT_SEED;
 
@@ -66,6 +69,7 @@ public void Setup()
             src2 = new float[LEN];
             original = new float[LEN];
             idx = new int[IDXLEN];
+            mat = new float[DEFAULT_CROW * DEFAULT_CCOL];
 
             seed = GetSeed();
             Random rand = new Random(seed);
@@ -83,6 +87,11 @@ public void Setup()
             {
                 idx[i] = rand.Next(0, LEN);
             }
+
+            for (int i = 0; i < mat.Length; i++)
+            {
+                mat[i] = NextFloat(rand, EXP_RANGE);
+            }
         }
 
         [GlobalCleanup]
@@ -91,6 +100,17 @@ public void GlobalCleanup()
             original.CopyTo(dst, 0);
         }
 
+        [Benchmark]
+        public unsafe void NativeMatMulAPerf()
+        {
+            fixed (float* pmat = mat)
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.MatMulA(ADD, pmat, psrc, pdst, DEFAULT_CROW, DEFAULT_CCOL);
+            }
+        }
+
         [Benchmark]
         public unsafe float NativeDotUPerf()
         {

From 079dd485695c0ff57f617d1be0df8e43e8ddface Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Mon, 6 Aug 2018 16:38:16 -0700
Subject: [PATCH 2/8] Moved CpuMathUtils' architecture-dependent members into
 SseIntrinsics

---
 .../CpuMathUtils.netcoreapp.cs                | 27 -------
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     | 81 ++++++++++++++-----
 2 files changed, 61 insertions(+), 47 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index a213c8d7a2..4bd9eefae7 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -9,28 +9,8 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
-        public const int CbAlign = 16;
-
-        private static bool Compat(AlignedArray a)
-        {
-            Contracts.AssertValue(a);
-            Contracts.Assert(a.Size > 0);
-            return a.CbAlign == CbAlign;
-        }
-
-        internal static unsafe float* Ptr(AlignedArray a, float* p)
-        {
-            Contracts.AssertValue(a);
-            float* q = p + a.GetBase((long)p);
-            Contracts.Assert(((long)q & (CbAlign - 1)) == 0);
-            return q;
-        }
-
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
             Contracts.Assert(mat.Size == dst.Size * src.Size);
 
             if (Sse.IsSupported)
@@ -55,9 +35,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues,
             int posMin, int iposMin, int iposEnd, AlignedArray dst, int crun)
         {
-            Contracts.Assert(Compat(mat));
-            Contracts.Assert(Compat(srcValues));
-            Contracts.Assert(Compat(dst));
             Contracts.AssertValue(rgposSrc);
             Contracts.Assert(0 <= iposMin && iposMin <= iposEnd && iposEnd <= rgposSrc.Length);
             Contracts.Assert(mat.Size == dst.Size * srcValues.Size);
@@ -100,8 +77,6 @@ public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] co
             Contracts.Assert(starts[crow] == indices.Length);
             Contracts.AssertNonEmpty(coefs);
             Contracts.Assert(indices.Length == coefs.Length);
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
             Contracts.Assert(0 < crow && crow <= dst.Size);
             Contracts.Assert(crow * src.Size >= coefs.Length);
 
@@ -126,8 +101,6 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol,
             Contracts.Assert(mprowrun == null || mprowrun.Length == crow);
             Contracts.AssertNonEmpty(runs);
             Contracts.AssertNonEmpty(coefs);
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
             Contracts.Assert(0 < crow && crow <= dst.Size);
 
             if (mprowrun == null)
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 1124ee663a..1d61496c47 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -16,6 +16,23 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     internal static class SseIntrinsics
     {
+        private const int CbAlign = 16;
+
+        private static bool Compat(AlignedArray a)
+        {
+            Contracts.AssertValue(a);
+            Contracts.Assert(a.Size > 0);
+            return a.CbAlign == CbAlign;
+        }
+
+        private static unsafe float* Ptr(AlignedArray a, float* p)
+        {
+            Contracts.AssertValue(a);
+            float* q = p + a.GetBase((long)p);
+            Contracts.Assert(((long)q & (CbAlign - 1)) == 0);
+            return q;
+        }
+
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe Vector128<float> Load1(float* src, int* idx)
         {
@@ -74,13 +91,17 @@ private static Vector128<float> VectorSum(in Vector128<float> vector)
         // Multiply matrix times vector into vector.
         internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             {
-                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
-                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
 
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
@@ -135,6 +156,10 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src
         internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
         {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
             // REVIEW: For extremely sparse inputs, interchanging the loops would
             // likely be more efficient.
             fixed (float* pSrcStart = &src.Items[0])
@@ -142,9 +167,9 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc,
             fixed (float* pMatStart = &mat.Items[0])
             fixed (int* pposSrc = &rgposSrc[0])
             {
-                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
-                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
 
                 int* pposMin = pposSrc + iposMin;
                 int* pposEnd = pposSrc + iposEnd;
@@ -187,13 +212,17 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc,
 
         internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             {
-                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
-                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
 
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
@@ -285,14 +314,18 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray
         internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
                                         int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
         {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             fixed (float* pMatStart = &mat.Items[0])
             fixed (int* pposSrc = &rgposSrc[0])
             {
-                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
-                float* pmat = CpuMathUtils.Ptr(mat, pMatStart);
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
 
                 int* ppos = pposSrc + iposMin;
                 int* pposEnd = pposSrc + iposEnd;
@@ -349,14 +382,17 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos
         internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, float[] coefs,
                                                 AlignedArray src, AlignedArray dst, int crow)
         {
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
             fixed (int* pstarts = &starts[0])
             fixed (int* pindices = &indices[0])
             fixed (float* pcoefs = &coefs[0])
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             {
-                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
 
                 int* pii = pstarts + 1;
                 int* pIdxCurrent = pindices;
@@ -407,6 +443,8 @@ internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, floa
         internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol,
             int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow)
         {
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
 
             fixed (int* pmprowiv = &mprowiv[0])
             fixed (int* pmprowcol = &mprowcol[0])
@@ -415,8 +453,8 @@ internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol,
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             {
-                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
 
                 int size = pruns[1];
                 int* psupport = pruns + 2;
@@ -475,6 +513,9 @@ internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol,
         internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun,
             int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow)
         {
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
             fixed (int* pmprowiv = &mprowiv[0])
             fixed (int* pmprowcol = &mprowcol[0])
             fixed (int* pmprowrun = &mprowrun[0])
@@ -483,8 +524,8 @@ internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, in
             fixed (float* pSrcStart = &src.Items[0])
             fixed (float* pDstStart = &dst.Items[0])
             {
-                float* psrc = CpuMathUtils.Ptr(src, pSrcStart);
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
 
                 int* piv = pmprowiv;
                 int* pcol = pmprowcol;
@@ -980,7 +1021,7 @@ internal static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, i
             fixed (float* pDstStart = &dst.Items[0])
             fixed (int* pidx = &indices[0])
             {
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pdst = Ptr(dst, pDstStart);
 
                 // REVIEW NEEDED: This line expands to (void)(c); but is it necessary?
                 // DEBUG_ONLY(c);
@@ -999,7 +1040,7 @@ internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int cco
             fixed (float* pDstStart = &dst.Items[0])
             fixed (int* pidx = &indices[0])
             {
-                float* pdst = CpuMathUtils.Ptr(dst, pDstStart);
+                float* pdst = Ptr(dst, pDstStart);
 
                 // REVIEW NEEDED: This line expands to (void)(c); but is it necessary?
                 // DEBUG_ONLY(c);

From 5e1854d83906144ee09334ddb1705ced197cf139 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Wed, 8 Aug 2018 14:45:10 -0700
Subject: [PATCH 3/8] Implemented all new referenced SSE intrinsics, with
 software fallbacks, passing unit tests, and performance tests

Note: Performance tests for functions that involve AlignedArray are not implemented.
---
 .../CpuMathUtils.netcoreapp.cs                | 458 +++++++++++--
 .../CpuMathUtils.netstandard.cs               |  32 +-
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     | 618 ++++++++++++------
 .../CpuMathNativeUtils.cs                     |  82 ++-
 .../SsePerformanceTests.cs                    | 211 ++++--
 .../UnitTests.cs                              | 491 ++++++++++++--
 6 files changed, 1508 insertions(+), 384 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 4bd9eefae7..8adff83a2f 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -28,18 +28,59 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
             }
             else
             {
-                // TODO: Software fallback
+                if (!tran)
+                {
+                    Contracts.Assert(0 <= crun && crun <= dst.Size);
+                    for (int i = 0; i < crun; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = 0; j < src.Size; j++)
+                        {
+                            dotProduct += mat[i * src.Size + j] * src[j];
+                        }
+
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+                }
+                else
+                {
+                    Contracts.Assert(0 <= crun && crun <= src.Size);
+                    for (int i = 0; i < dst.Size; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = 0; j < crun; j++)
+                        {
+                            dotProduct += mat[j * dst.Size + i] * src[j]; // transposed layout: row stride is dst.Size, as in the sparse overload's fallback
+                        }
+
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+                }
             }
         }
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues,
-            int posMin, int iposMin, int iposEnd, AlignedArray dst, int crun)
+            int posMin, int iposMin, int iposLim, AlignedArray dst, int crun)
         {
             Contracts.AssertValue(rgposSrc);
-            Contracts.Assert(0 <= iposMin && iposMin <= iposEnd && iposEnd <= rgposSrc.Length);
+            Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length);
             Contracts.Assert(mat.Size == dst.Size * srcValues.Size);
 
-            if (iposMin >= iposEnd)
+            if (iposMin >= iposLim)
             {
                 if (!add)
                     dst.ZeroItems();
@@ -53,65 +94,86 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
                 if (!tran)
                 {
                     Contracts.Assert(0 <= crun && crun <= dst.Size);
-                    SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, crun, srcValues.Size);
+                    SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size);
                 }
                 else
                 {
                     Contracts.Assert(0 <= crun && crun <= srcValues.Size);
-                    SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposEnd, dst, dst.Size);
+                    SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
                 }
             }
             else
             {
-                // TODO: Software fallback
+                if (!tran)
+                {
+                    Contracts.Assert(0 <= crun && crun <= dst.Size);
+                    for (int i = 0; i < crun; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = iposMin; j < iposLim; j++)
+                        {
+                            int col = rgposSrc[j] - posMin;
+                            dotProduct += mat[i * srcValues.Size + col] * srcValues[col];
+                        }
+
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+                }
+                else
+                {
+                    Contracts.Assert(0 <= crun && crun <= srcValues.Size);
+                    for (int i = 0; i < dst.Size; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = iposMin; j < iposLim; j++)
+                        {
+                            int col = rgposSrc[j] - posMin;
+                            dotProduct += mat[col * dst.Size + i] * srcValues[col];
+                        }
+
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+
+                }
             }
         }
 
-        public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs,
-            AlignedArray src, AlignedArray dst, int crow)
+        public static void Add(float a, float[] dst, int count)
         {
-            Contracts.AssertNonEmpty(starts);
-            Contracts.Assert(starts.Length == crow + 1);
-            Contracts.Assert(starts[0] == 0);
-            Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(starts[crow] == indices.Length);
-            Contracts.AssertNonEmpty(coefs);
-            Contracts.Assert(indices.Length == coefs.Length);
-            Contracts.Assert(0 < crow && crow <= dst.Size);
-            Contracts.Assert(crow * src.Size >= coefs.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 < count && count <= dst.Length);
 
-            if (Sse.IsSupported)
-            {
-                SseIntrinsics.MatMulRU(add, starts, indices, coefs, src, dst, crow);
-            }
-            else
-            {
-                // TODO: Software fallback
-            }
+            Add(a, new Span<float>(dst, 0, count));
         }
 
-        public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol,
-            int[] mprowrun, int[] runs, float[] coefs,
-            AlignedArray src, AlignedArray dst, int crow)
+        // dst += a
+        private static void Add(float a, Span<float> dst)
         {
-            Contracts.AssertNonEmpty(mprowiv);
-            Contracts.Assert(mprowiv.Length == crow);
-            Contracts.AssertNonEmpty(mprowcol);
-            Contracts.Assert(mprowcol.Length == crow);
-            Contracts.Assert(mprowrun == null || mprowrun.Length == crow);
-            Contracts.AssertNonEmpty(runs);
-            Contracts.AssertNonEmpty(coefs);
-            Contracts.Assert(0 < crow && crow <= dst.Size);
-
-            if (mprowrun == null)
+            if (Sse.IsSupported)
             {
-                SseIntrinsics.MatMulCU(add, mprowiv, mprowcol, runs, coefs,
-                    src, dst, crow);
+                SseIntrinsics.AddScalarU(a, dst);
             }
             else
             {
-                SseIntrinsics.MatMulDU(add, mprowiv, mprowcol, mprowrun, runs, coefs,
-                    src, dst, crow);
+                for (int i = 0; i < dst.Length; i++)
+                {
+                    dst[i] += a;
+                }
             }
         }
 
@@ -147,6 +209,57 @@ private static void Scale(float a, Span<float> dst)
             }
         }
 
+        // dst = a * src
+        public static void Scale(float a, float[] src, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count <= dst.Length);
+
+            Scale(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
+        }
+
+        private static void Scale(float a, Span<float> src, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.ScaleSrcU(a, src, dst);
+            }
+            else
+            {
+                for (int i = 0; i < dst.Length; i++)
+                {
+                    dst[i] = a * src[i];
+                }
+            }
+        }
+
+        // dst[i] = a * (dst[i] + b)
+        public static void ScaleAdd(float a, float b, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 < count && count <= dst.Length);
+
+            ScaleAdd(a, b, new Span<float>(dst, 0, count));
+        }
+
+        private static void ScaleAdd(float a, float b, Span<float> dst)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.ScaleAddU(a, b, dst);
+            }
+            else
+            {
+                for (int i = 0; i < dst.Length; i++)
+                {
+                    dst[i] = a * (dst[i] + b);
+                }
+            }
+        }
+
         public static void AddScale(float a, float[] src, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
@@ -225,6 +338,33 @@ private static void AddScale(float a, Span<float> src, Span<int> indices, Span<f
             }
         }
 
+        public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(0 < count && count <= dst.Length);
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count <= src.Length);
+            Contracts.AssertNonEmpty(res);
+            Contracts.Assert(count <= res.Length);
+
+            AddScaleCopy(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count), new Span<float>(res, 0, count));
+        }
+
+        private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span<float> res)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddScaleCopyU(a, src, dst, res);
+            }
+            else
+            {
+                for (int i = 0; i < res.Length; i++)
+                {
+                    res[i] = a * src[i] + dst[i];
+                }
+            }
+        }
+
         public static void Add(float[] src, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
@@ -319,6 +459,40 @@ private static void MulElementWise(Span<float> src1, Span<float> src2, Span<floa
             }
         }
 
+        public static float Sum(float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+
+            return Sum(new Span<float>(src, 0, count));
+        }
+
+        public static float Sum(float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+
+            return Sum(new Span<float>(src, offset, count));
+        }
+
+        private static float Sum(Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.SumU(src);
+            }
+            else
+            {
+                float sum = 0;
+                for (int i = 0; i < src.Length; i++)
+                {
+                    sum += src[i];
+                }
+                return sum;
+            }
+        }
+
         public static float SumSq(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
@@ -353,6 +527,39 @@ private static float SumSq(Span<float> src)
             }
         }
 
+        public static float SumSq(float mean, float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+
+            return SumSq(mean, new Span<float>(src, offset, count));
+        }
+
+        private static float SumSq(float mean, Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                if (mean == 0)
+                {
+                    return SseIntrinsics.SumSqU(src);
+                }
+                else
+                {
+                    return SseIntrinsics.SumSqDiffU(mean, src);
+                }
+            }
+            else
+            {
+                float result = 0;
+                for (int i = 0; i < src.Length; i++)
+                {
+                    result += (src[i] - mean) * (src[i] - mean);
+                }
+                return result;
+            }
+        }
+
         public static float SumAbs(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
@@ -387,6 +594,106 @@ private static float SumAbs(Span<float> src)
             }
         }
 
+        public static float SumAbs(float mean, float[] src, int offset, int count) // Sum of |src[i] - mean| over src[offset .. offset + count).
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= src.Length - count); // the whole window must lie inside src
+
+            return SumAbs(mean, new Span<float>(src, offset, count));
+        }
+
+        private static float SumAbs(float mean, Span<float> src) // Sum of absolute deviations of src from mean.
+        {
+            if (Sse.IsSupported)
+            {
+                if (mean == 0)
+                {
+                    return SseIntrinsics.SumAbsU(src); // zero mean: use the plain sum-of-absolute-values kernel
+                }
+                else
+                {
+                    return SseIntrinsics.SumAbsDiffU(mean, src);
+                }
+            }
+            else
+            {
+                float sum = 0; // software fallback when SSE is not available
+                for (int i = 0; i < src.Length; i++)
+                {
+                    sum += Math.Abs(src[i] - mean);
+                }
+                return sum;
+            }
+        }
+
+        public static float MaxAbs(float[] src, int count) // Largest |src[i]| among the first count elements of src.
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+
+            return MaxAbs(new Span<float>(src, 0, count));
+        }
+
+        public static float MaxAbs(float[] src, int offset, int count) // Largest |src[i]| over src[offset .. offset + count).
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(0 <= offset && offset <= src.Length - count); // the whole window must lie inside src
+
+            return MaxAbs(new Span<float>(src, offset, count));
+        }
+
+        private static float MaxAbs(Span<float> src) // Largest absolute value found in src.
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.MaxAbsU(src);
+            }
+            else
+            {
+                float max = 0; // software fallback; starts at 0, so an empty span yields 0
+                for (int i = 0; i < src.Length; i++)
+                {
+                    float abs = Math.Abs(src[i]);
+                    if (abs > max) // false for NaN, so NaN elements never replace the running maximum here
+                    {
+                        max = abs;
+                    }
+                }
+                return max;
+            }
+        }
+
+        public static float MaxAbsDiff(float mean, float[] src, int count) // Largest |src[i] - mean| among the first count elements of src.
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(0 < count && count <= src.Length);
+
+            return MaxAbsDiff(mean, new Span<float>(src, 0, count));
+        }
+
+        private static float MaxAbsDiff(float mean, Span<float> src) // Largest absolute deviation of src from mean.
+        {
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.MaxAbsDiffU(mean, src);
+            }
+            else
+            {
+                float max = 0; // software fallback; starts at 0, so an empty span yields 0
+                for (int i = 0; i < src.Length; i++)
+                {
+                    float abs = Math.Abs(src[i] - mean);
+                    if (abs > max) // false for NaN, so NaN deviations never replace the running maximum here
+                    {
+                        max = abs;
+                    }
+                }
+                return max;
+            }
+        }
+
         public static float DotProductDense(float[] a, float[] b, int count)
         {
             Contracts.AssertNonEmpty(a);
@@ -503,6 +810,8 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[
         {
             Contracts.Assert(0 < ccol && ccol <= cfltRow);
 
+            // REVIEW NEEDED: Since the two methods below do not involve any SSE hardware intrinsics, no software fallback is needed.
+            // REVIEW NEEDED: Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files.
             if (Sse.IsSupported)
             {
                 if (ccol == cfltRow)
@@ -514,9 +823,70 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[
                     SseIntrinsics.ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length);
                 }
             }
+        }
+
+        public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) // Updates the first length entries of v and w from src (dense SDCA L1 step).
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(length <= src.Length);
+            Contracts.AssertNonEmpty(v);
+            Contracts.Assert(length <= v.Length);
+            Contracts.AssertNonEmpty(w);
+            Contracts.Assert(length <= w.Length);
+            Contracts.Assert(length > 0);
+
+            SdcaL1UpdateDense(primalUpdate, new Span<float>(src, 0, length), threshold, new Span<float>(v, 0, length), new Span<float>(w, 0, length)); // all three spans are trimmed to length
+        }
+
+        private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w) // v += primalUpdate * src, then w = v shrunk toward zero by threshold.
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
+            }
+            else
+            {
+                for (int i = 0; i < src.Length; i++) // software fallback when SSE is not available
+                {
+                    v[i] += src[i] * primalUpdate;
+                    float value = v[i];
+                    w[i] = Math.Abs(value) > threshold ? (value > 0 ? value - threshold : value + threshold) : 0; // |value| <= threshold maps to 0, otherwise move value toward zero by threshold
+                }
+            }
+        }
+
+        // REVIEW NEEDED: The second argument "length" is only used by the assertions below and is not forwarded to the core routine (as in the existing code).
+        public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count <= src.Length);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.AssertNonEmpty(w);
+            Contracts.Assert(length <= w.Length);
+            Contracts.AssertNonEmpty(v);
+            Contracts.Assert(length <= v.Length);
+            Contracts.Assert(0 < count);
+            Contracts.Assert(count < length);
+
+            SdcaL1UpdateSparse(primalUpdate, new Span<float>(src, 0, count), new Span<int>(indices, 0, count), threshold, new Span<float>(v), new Span<float>(w)); // src/indices are trimmed to count; v and w are passed whole because they are addressed through indices
+        }
+
+        private static void SdcaL1UpdateSparse(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w) // Sparse variant: only the positions listed in indices are updated.
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
+            }
             else
             {
-                // TODO: Software fallback
+                for (int i = 0; i < indices.Length; i++) // software fallback when SSE is not available
+                {
+                    int index = indices[i]; // src[i] pairs with position indices[i] of v and w
+                    v[index] += src[i] * primalUpdate;
+                    float value = v[index];
+                    w[index] = Math.Abs(value) > threshold ? (value > 0 ? value - threshold : value + threshold) : 0; // |value| <= threshold maps to 0, otherwise move value toward zero by threshold
+                }
             }
         }
     }
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index a71316ebb3..730fb10be7 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -11,16 +11,16 @@ public static partial class CpuMathUtils
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues,
             int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun);
 
-        public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs,
-            AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, starts, indices, coefs, src, dst, crow);
-
-        public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun, int[] runs, float[] coefs,
-            AlignedArray src, AlignedArray dst, int crow) => SseUtils.MatTimesSrc(add, mprowiv, mprowcol, mprowrun, runs, coefs, src, dst, crow);
+        public static void Add(float a, float[] dst, int count) => SseUtils.Add(a, dst, count); // netstandard build: pass-through to SseUtils (presumably dst[i] += a; confirm against SseUtils)
 
         public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count);
 
         public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count);
 
+        public static void Scale(float a, float[] src, float[] dst, int count) => SseUtils.Scale(a, src, dst, count); // netstandard build: pass-through to SseUtils (presumably dst[i] = a * src[i]; confirm)
+
+        public static void ScaleAdd(float a, float b, float[] dst, int count) => SseUtils.ScaleAdd(a, b, dst, count); // netstandard build: pass-through to SseUtils (presumably dst[i] = a * (dst[i] + b); confirm)
+
         public static void AddScale(float a, float[] src, float[] dst, int count) => SseUtils.AddScale(a, src, dst, count);
 
         public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, dst, dstOffset, count);
@@ -29,6 +29,8 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mp
 
         public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, indices, dst, dstOffset, count);
 
+        public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, int count) => SseUtils.AddScaleCopy(a, src, dst, res, count); // netstandard build: pass-through to SseUtils (presumably res[i] = dst[i] + a * src[i]; confirm)
+
         public static void Add(float[] src, float[] dst, int count) => SseUtils.Add(src, dst, count);
 
         public static void Add(float[] src, int[] indices, float[] dst, int count) => SseUtils.Add(src, indices, dst, count);
@@ -37,14 +39,28 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mp
 
         public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) => SseUtils.MulElementWise(src1, src2, dst, count);
 
+        public static float Sum(float[] src, int count) => SseUtils.Sum(src, count); // netstandard build: pass-through to SseUtils
+
+        public static float Sum(float[] src, int offset, int count) => SseUtils.Sum(src, offset, count); // netstandard build: pass-through to SseUtils
+
         public static float SumSq(float[] src, int count) => SseUtils.SumSq(src, count);
 
         public static float SumSq(float[] src, int offset, int count) => SseUtils.SumSq(src, offset, count);
 
+        public static float SumSq(float mean, float[] src, int offset, int count) => SseUtils.SumSq(mean, src, offset, count); // netstandard build: pass-through to SseUtils; counterpart of the netcoreapp overload with the same signature
+
         public static float SumAbs(float[] src, int count) => SseUtils.SumAbs(src, count);
 
         public static float SumAbs(float[] src, int offset, int count) => SseUtils.SumAbs(src, offset, count);
 
+        public static float SumAbs(float mean, float[] src, int offset, int count) => SseUtils.SumAbs(mean, src, offset, count); // netstandard build: pass-through to SseUtils; counterpart of the netcoreapp overload with the same signature
+
+        public static float MaxAbs(float[] src, int count) => SseUtils.MaxAbs(src, count); // netstandard build: pass-through to SseUtils
+
+        public static float MaxAbs(float[] src, int offset, int count) => SseUtils.MaxAbs(src, offset, count); // netstandard build: pass-through to SseUtils
+
+        public static float MaxAbsDiff(float mean, float[] src, int count) => SseUtils.MaxAbsDiff(mean, src, count); // netstandard build: pass-through to SseUtils
+
         public static float DotProductDense(float[] a, float[] b, int count) => SseUtils.DotProductDense(a, b, count);
 
         public static float DotProductDense(float[] a, int offset, float[] b, int count) => SseUtils.DotProductDense(a, offset, b, count);
@@ -56,5 +72,11 @@ public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, int[] mp
         public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count);
 
         public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices);
+
+        public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w)
+            => SseUtils.SdcaL1UpdateDense(primalUpdate, length, src, threshold, v, w); // netstandard build: pass-through to SseUtils
+
+        public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w)
+            => SseUtils.SdcaL1UpdateSparse(primalUpdate, length, src, indices, count, threshold, v, w); // netstandard build: pass-through to SseUtils
     }
 }
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 1d61496c47..a57382a4d7 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -4,8 +4,14 @@
 
 // The exported function names need to be unique (can't be disambiguated based on signature), hence
 // we introduce suffix letters to indicate the general patterns used.
+// * A suffix means aligned and padded for SSE operations.
 // * U suffix means unaligned and unpadded.
 // * S suffix means sparse (unaligned) vector.
+// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector.
+// * R suffix means sparse matrix.
+// * C suffix means convolution matrix.
+// * D suffix means convolution matrix, with implicit source padding.
+// * Tran means the matrix is transposed.
 
 using System;
 using System.Runtime.CompilerServices;
@@ -72,7 +78,7 @@ private static unsafe void Store4(Vector128<float> x, float* dst, int* idx)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> VectorSum(in Vector128<float> vector)
+        private static Vector128<float> VectorSum(Vector128<float> vector)
         {
             if (Sse3.IsSupported)
             {
@@ -355,7 +361,6 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos
                 while (ppos < pposEnd)
                 {
                     int col = *ppos - posMin;
-                    ppos++;
 
                     Vector128<float> x0 = Sse.SetAllVector128(psrc[col]);
                     float* pDstCurrent = pdst;
@@ -378,267 +383,126 @@ internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgpos
             }
         }
 
-        // Sparse matrix.
-        internal static unsafe void MatMulRU(bool add, int[] starts, int[] indices, float[] coefs,
-                                                AlignedArray src, AlignedArray dst, int crow)
+        // dst[i] += scale for every element of dst (unaligned, unpadded input).
+        internal static unsafe void AddScalarU(float scale, Span<float> dst)
         {
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
-
-            fixed (int* pstarts = &starts[0])
-            fixed (int* pindices = &indices[0])
-            fixed (float* pcoefs = &coefs[0])
-            fixed (float* pSrcStart = &src.Items[0])
-            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pdst = dst)
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-
-                int* pii = pstarts + 1;
-                int* pIdxCurrent = pindices;
-                float* pMatCurrent = pcoefs;
-                float* pDstEnd = pdst + crow;
-
+                float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
 
-                while (pDstCurrent < pDstEnd)
-                {
-                    int* pIdxEnd = pindices + *pii;
-                    pii++;
-
-                    Vector128<float> result = Sse.SetZeroVector128();
-
-                    while (pIdxCurrent + 4 <= pIdxEnd)
-                    {
-                        Vector128<float> x = Sse.Multiply(Load4(psrc, pIdxCurrent), Sse.LoadVector128(pMatCurrent));
-                        result = Sse.Add(result, x);
-
-                        pIdxCurrent += 4;
-                        pMatCurrent += 4;
-                    }
-
-                    while (pIdxCurrent < pIdxEnd)
-                    {
-                        Vector128<float> x = Sse.MultiplyScalar(Load1(psrc, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent));
-                        result = Sse.AddScalar(result, x);
+                Vector128<float> x1 = Sse.SetAllVector128(scale); // scale replicated into all four lanes
 
-                        pIdxCurrent++;
-                        pMatCurrent++;
-                    }
+                while (pDstCurrent + 4 <= pDstEnd) // main loop: four floats per iteration
+                {
+                    Vector128<float> x2 = Sse.LoadVector128(pDstCurrent);
+                    x2 = Sse.Add(x2, x1);
+                    Sse.Store(pDstCurrent, x2);
 
-                    result = VectorSum(in result);
+                    pDstCurrent += 4;
+                }
 
-                    if (add)
-                    {
-                        result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent));
-                    }
-                    Sse.StoreScalar(pDstCurrent, result);
+                while (pDstCurrent < pDstEnd) // scalar tail: the remaining (fewer than four) elements
+                {
+                    Vector128<float> x2 = Sse.LoadScalarVector128(pDstCurrent);
+                    x2 = Sse.AddScalar(x2, x1);
+                    Sse.StoreScalar(pDstCurrent, x2);
 
                     pDstCurrent++;
                 }
             }
         }
 
-        // Unpadded convolution.
-        internal static unsafe void MatMulCU(bool add, int[] mprowiv, int[] mprowcol,
-            int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow)
+        internal static unsafe void ScaleU(float scale, Span<float> dst) // dst[i] *= scale for every element of dst (unaligned, unpadded input).
         {
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale); // scale replicated into all four lanes
 
-            fixed (int* pmprowiv = &mprowiv[0])
-            fixed (int* pmprowcol = &mprowcol[0])
-            fixed (int* pruns = &runs[0])
-            fixed (float* pcoefs = &coefs[0])
-            fixed (float* pSrcStart = &src.Items[0])
-            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pdst = dst)
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-
-                int size = pruns[1];
-                int* psupport = pruns + 2;
-                int* piv = pmprowiv;
-                int* pcol = pmprowcol;
-                int* pIdxEnd = psupport + size;
-                float* pDstEnd = pdst + crow;
-
                 float* pDstCurrent = pdst;
+                float* pEnd = pdst + dst.Length;
 
-                while (pDstCurrent < pDstEnd)
+                while (pDstCurrent + 4 <= pEnd) // main loop: four floats per iteration
                 {
-                    float* pMatCurrent = pcoefs + *piv;
-                    piv++;
-                    float* pSrcCurrent = psrc + *pcol;
-                    pcol++;
-                    int* pIdxCurrent = psupport;
-
-                    Vector128<float> result = Sse.SetZeroVector128();
-
-                    while (pIdxCurrent + 4 <= pIdxEnd)
-                    {
-                        Vector128<float> x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent));
-                        result = Sse.Add(result, x);
-
-                        pIdxCurrent += 4;
-                        pMatCurrent += 4;
-                    }
-
-                    while (pIdxCurrent < pIdxEnd)
-                    {
-                        Vector128<float> x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent));
-                        result = Sse.AddScalar(result, x);
+                    Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
 
-                        pIdxCurrent++;
-                        pMatCurrent++;
-                    }
+                    dstVector = Sse.Multiply(scaleVector, dstVector);
+                    Sse.Store(pDstCurrent, dstVector);
 
-                    result = VectorSum(result);
+                    pDstCurrent += 4;
+                }
 
-                    // Add the bias.
-                    result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatCurrent));
+                while (pDstCurrent < pEnd) // scalar tail: the remaining (fewer than four) elements
+                {
+                    Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
 
-                    if (add)
-                    {
-                        result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent));
-                    }
-                    Sse.StoreScalar(pDstCurrent, result);
+                    dstVector = Sse.MultiplyScalar(scaleVector, dstVector);
+                    Sse.StoreScalar(pDstCurrent, dstVector);
 
                     pDstCurrent++;
                 }
             }
         }
 
-        // Padded convolution.
-        internal static unsafe void MatMulDU(bool add, int[] mprowiv, int[] mprowcol, int[] mprowrun,
-            int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int crow)
+        internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst) // dst[i] = scale * src[i]; src itself is not modified.
         {
-            Contracts.Assert(Compat(src));
-            Contracts.Assert(Compat(dst));
+            Vector128<float> scaleVector = Sse.SetAllVector128(scale); // scale replicated into all four lanes
 
-            fixed (int* pmprowiv = &mprowiv[0])
-            fixed (int* pmprowcol = &mprowcol[0])
-            fixed (int* pmprowrun = &mprowrun[0])
-            fixed (int* pruns = &runs[0])
-            fixed (float* pcoefs = &coefs[0])
-            fixed (float* pSrcStart = &src.Items[0])
-            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
             {
-                float* psrc = Ptr(src, pSrcStart);
-                float* pdst = Ptr(dst, pDstStart);
-
-                int* piv = pmprowiv;
-                int* pcol = pmprowcol;
-                float* pDstEnd = pdst + crow;
-                int kernelSize = pruns[1];
-
-                int* pirun = pmprowrun;
+                float* pDstEnd = pdst + dst.Length; // only dst.Length bounds the loops; assumes src.Length >= dst.Length (TODO confirm at callers)
+                float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
 
-                while (pDstCurrent < pDstEnd)
+                while (pDstCurrent + 4 <= pDstEnd) // main loop: four floats per iteration
                 {
-                    float* pMatCurrent = pcoefs + *piv;
-                    piv++;
-                    float* pMatBias = pMatCurrent + kernelSize;
-                    float* pSrcCurrent = psrc + *pcol;
-                    pcol++;
-                    int irun = *pirun;
-                    pirun++;
-
-                    int* pIdxCurrent = pruns + 2 + irun;
-                    int* pIdxEnd = pIdxCurrent + pIdxCurrent[-1];
-
-                    Vector128<float> result = Sse.SetZeroVector128();
-
-                    if (irun == 0)
-                    {
-                        // No masking needed.
-                        while (pIdxCurrent + 4 <= pIdxEnd)
-                        {
-                            Vector128<float> x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.LoadVector128(pMatCurrent));
-                            result = Sse.Add(result, x);
-
-                            pIdxCurrent += 4;
-                            pMatCurrent += 4;
-                        }
-
-                        while (pIdxCurrent < pIdxEnd)
-                        {
-                            Vector128<float> x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.SetScalarVector128(*pMatCurrent));
-                            result = Sse.AddScalar(result, x);
-
-                            pIdxCurrent++;
-                            pMatCurrent++;
-                        }
-                    }
-                    else
-                    {
-                        // Need masking.
-                        pMatCurrent += pIdxCurrent[-2];
-                        // REVIEW NEEDED: Is it the correct translation from: "const float * pmask = reinterpret_cast<const float *>(piLim);"?
-                        float* pmask = (float*)pIdxEnd;
-
-                        while (pIdxCurrent + 4 <= pIdxEnd)
-                        {
-                            Vector128<float> x = Sse.Multiply(Load4(pSrcCurrent, pIdxCurrent), Sse.And(Sse.LoadVector128(pmask), Sse.LoadVector128(pMatCurrent)));
-                            result = Sse.Add(result, x);
-
-                            pIdxCurrent += 4;
-                            pMatCurrent += 4;
-                            pmask += 4;
-                        }
-
-                        while (pIdxCurrent < pIdxEnd)
-                        {
-                            Vector128<float> x = Sse.MultiplyScalar(Load1(pSrcCurrent, pIdxCurrent), Sse.And(Sse.SetScalarVector128(*pmask), Sse.SetScalarVector128(*pMatCurrent)));
-                            result = Sse.AddScalar(result, x);
-
-                            pIdxCurrent++;
-                            pMatCurrent++;
-                            pmask++;
-                        }
-                    }
-
-                    result = VectorSum(result);
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    srcVector = Sse.Multiply(srcVector, scaleVector);
+                    Sse.Store(pDstCurrent, srcVector);
 
-                    // Add the bias.
-                    result = Sse.AddScalar(result, Sse.SetScalarVector128(*pMatBias));
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                }
 
-                    if (add)
-                    {
-                        result = Sse.AddScalar(result, Sse.SetScalarVector128(*pDstCurrent));
-                    }
-                    Sse.StoreScalar(pDstCurrent, result);
+                while (pDstCurrent < pDstEnd) // scalar tail: the remaining (fewer than four) elements
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    srcVector = Sse.MultiplyScalar(srcVector, scaleVector);
+                    Sse.StoreScalar(pDstCurrent, srcVector);
 
+                    pSrcCurrent++;
                     pDstCurrent++;
                 }
             }
         }
 
-        internal static unsafe void ScaleU(float scale, Span<float> dst)
+        // dst[i] = a * (dst[i] + b)
+        internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
         {
-            Vector128<float> scaleVector = Sse.SetAllVector128(scale);
+            Vector128<float> x1 = Sse.SetAllVector128(a);
+            Vector128<float> x2 = Sse.SetAllVector128(b);
 
             fixed (float* pdst = dst)
             {
+                float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
-                float* pEnd = pdst + dst.Length;
 
-                while (pDstCurrent + 4 <= pEnd)
+                while (pDstCurrent + 4 <= pDstEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
-
-                    dstVector = Sse.Multiply(scaleVector, dstVector);
+                    dstVector = Sse.Add(dstVector, x2);
+                    dstVector = Sse.Multiply(dstVector, x1);
                     Sse.Store(pDstCurrent, dstVector);
 
                     pDstCurrent += 4;
                 }
 
-                while (pDstCurrent < pEnd)
+                while (pDstCurrent < pDstEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadScalarVector128(pDstCurrent);
-
-                    dstVector = Sse.MultiplyScalar(scaleVector, dstVector);
+                    dstVector = Sse.AddScalar(dstVector, x2);
+                    dstVector = Sse.MultiplyScalar(dstVector, x1);
                     Sse.StoreScalar(pDstCurrent, dstVector);
 
                     pDstCurrent++;
@@ -685,6 +549,47 @@ internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float>
             }
         }
 
+        internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result) // result[i] = dst[i] + scale * src[i]; src and dst are only read.
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (float* pres = result)
+            {
+                float* pResEnd = pres + result.Length; // only result.Length bounds the loops; assumes src and dst are at least that long (TODO confirm at callers)
+                float* pSrcCurrent = psrc;
+                float* pDstCurrent = pdst;
+                float* pResCurrent = pres;
+
+                Vector128<float> x1 = Sse.SetAllVector128(scale); // scale replicated into all four lanes
+
+                while (pResCurrent + 4 <= pResEnd) // main loop: four floats per iteration
+                {
+                    Vector128<float> x2 = Sse.LoadVector128(pSrcCurrent);
+                    Vector128<float> x3 = Sse.LoadVector128(pDstCurrent);
+                    x2 = Sse.Multiply(x2, x1);
+                    x3 = Sse.Add(x3, x2);
+                    Sse.Store(pResCurrent, x3);
+
+                    pSrcCurrent += 4;
+                    pDstCurrent += 4;
+                    pResCurrent += 4;
+                }
+
+                while (pResCurrent < pResEnd) // scalar tail: the remaining (fewer than four) elements
+                {
+                    Vector128<float> x2 = Sse.LoadScalarVector128(pSrcCurrent);
+                    Vector128<float> x3 = Sse.LoadScalarVector128(pDstCurrent);
+                    x2 = Sse.MultiplyScalar(x2, x1);
+                    x3 = Sse.AddScalar(x3, x2);
+                    Sse.StoreScalar(pResCurrent, x3);
+
+                    pSrcCurrent++;
+                    pDstCurrent++;
+                    pResCurrent++;
+                }
+            }
+        }
+
         internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
         {
             Vector128<float> scaleVector = Sse.SetAllVector128(scale);
@@ -826,6 +731,33 @@ internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2,
             }
         }
 
+        internal static unsafe float SumU(Span<float> src) // Returns the sum of all elements of src (unaligned, unpadded input).
+        {
+            fixed (float* psrc = src)
+            {
+                float* pSrcEnd = psrc + src.Length;
+                float* pSrcCurrent = psrc;
+
+                Vector128<float> result = Sse.SetZeroVector128();
+
+                while (pSrcCurrent + 4 <= pSrcEnd) // '<=' (not '<') so a final full group of four is vectorized, matching the sibling kernels
+                {
+                    result = Sse.Add(result, Sse.LoadVector128(pSrcCurrent));
+                    pSrcCurrent += 4;
+                }
+
+                result = VectorSum(result); // reduce the lanes with VectorSum before finishing with scalar adds
+
+                while (pSrcCurrent < pSrcEnd) // scalar tail: at most three leftover elements
+                {
+                    result = Sse.AddScalar(result, Sse.LoadScalarVector128(pSrcCurrent));
+                    pSrcCurrent++;
+                }
+
+                return Sse.ConvertToSingle(result);
+            }
+        }
+
         internal static unsafe float SumSqU(Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
@@ -843,7 +775,7 @@ internal static unsafe float SumSqU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum(result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -857,6 +789,40 @@ internal static unsafe float SumSqU(Span<float> src)
             return Sse.ConvertToSingle(result);
         }
 
+        internal static unsafe float SumSqDiffU(float mean, Span<float> src) // Returns the sum of (src[i] - mean)^2 over all elements of src.
+        {
+            fixed (float* psrc = src)
+            {
+                float* pSrcEnd = psrc + src.Length;
+                float* pSrcCurrent = psrc;
+
+                Vector128<float> result = Sse.SetZeroVector128();
+                Vector128<float> meanVector = Sse.SetAllVector128(mean); // mean replicated into all four lanes
+
+                while (pSrcCurrent + 4 <= pSrcEnd) // main loop: four floats per iteration
+                {
+                    Vector128<float> x = Sse.LoadVector128(pSrcCurrent);
+                    x = Sse.Subtract(x, meanVector);
+                    result = Sse.Add(result, Sse.Multiply(x, x));
+
+                    pSrcCurrent += 4;
+                }
+
+                result = VectorSum(result); // reduce the lanes with VectorSum before finishing with scalar adds
+
+                while (pSrcCurrent < pSrcEnd) // scalar tail: the remaining (fewer than four) elements
+                {
+                    Vector128<float> x = Sse.LoadScalarVector128(pSrcCurrent);
+                    x = Sse.SubtractScalar(x, meanVector);
+                    result = Sse.AddScalar(result, Sse.MultiplyScalar(x, x));
+
+                    pSrcCurrent++;
+                }
+
+                return Sse.ConvertToSingle(result);
+            }
+        }
+
         internal static unsafe float SumAbsU(Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
@@ -884,13 +850,148 @@ internal static unsafe float SumAbsU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum(result);
 
                 while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        // Returns the sum of |x - mean| over every element x of src.
+        internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+            Vector128<float> meanVector = Sse.SetAllVector128(mean);
+            Vector128<float> mask;
+            // 0x7FFFFFFF has every bit set except the float sign bit, so And-ing with mask yields the absolute value.
+            if (Sse2.IsSupported)
+            {
+                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
+            }
+            else
+            {
+                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+            }
+
+            fixed (float* psrc = src)
+            {
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+                // Vector loop: accumulate |x - mean| for four elements per iteration, one partial sum per lane.
+                while (pSrcCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    srcVector = Sse.Subtract(srcVector, meanVector);
                     result = Sse.Add(result, Sse.And(srcVector, mask));
 
+                    pSrcCurrent += 4;
+                }
+
+                result = VectorSum(result); // defined elsewhere in this file; presumably folds the four lanes into lane 0
+                // Tail loop: leftover elements, one per iteration, using scalar operations on the lowest lane.
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    srcVector = Sse.SubtractScalar(srcVector, meanVector);
+                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+                    pSrcCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        // Returns the largest absolute value among the elements of src. The running maximum starts at
+        // zero. Full groups of four elements go through packed SSE operations; leftover elements go
+        // through scalar SSE operations.
+        internal static unsafe float MaxAbsU(Span<float> src)
+        {
+            // 0x7FFFFFFF has every bit set except the float sign bit, so And-ing with it yields |x| per lane.
+            // The mask comes from an integer vector when SSE2 is available, else from a reinterpreted float.
+            Vector128<float> absMask = Sse2.IsSupported
+                ? Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF))
+                : Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+
+            Vector128<float> best = Sse.SetZeroVector128();
+
+            fixed (float* pValues = src)
+            {
+                float* pCursor = pValues;
+                float* pLimit = pValues + src.Length;
+
+                // Vector loop: lane-wise maximum of |x| over four elements per iteration.
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> magnitudes = Sse.And(Sse.LoadVector128(pCursor), absMask);
+                    best = Sse.Max(best, magnitudes);
+                }
+
+                // Horizontal maximum. Control 0xB1 swaps the two lanes of each pair, so after Max both
+                // lanes of a pair hold that pair's maximum. Control 0x02 then moves lane 2 into lane 0,
+                // and MaxScalar leaves the maximum of all four lanes in lane 0.
+                Vector128<float> swapped = Sse.Shuffle(best, best, 0xB1);
+                best = Sse.Max(best, swapped);
+                swapped = Sse.Shuffle(best, best, 0x02);
+                best = Sse.MaxScalar(best, swapped);
+
+                // Tail loop: fold the leftover elements into lane 0 one at a time.
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> magnitude = Sse.And(Sse.LoadScalarVector128(pCursor), absMask);
+                    best = Sse.MaxScalar(best, magnitude);
+                }
+            }
+
+            return Sse.ConvertToSingle(best);
+        }
+
+        internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+            Vector128<float> meanVector = Sse.SetAllVector128(mean);
+            Vector128<float> mask;
+
+            if (Sse2.IsSupported)
+            {
+                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
+            }
+            else
+            {
+                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+            }
+
+            fixed (float* psrc = src)
+            {
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    srcVector = Sse.Subtract(srcVector, meanVector);
+                    result = Sse.Max(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent += 4;
+                }
+
+                Vector128<float> x1 = Sse.Shuffle(result, result, 0xB1);
+                result = Sse.Max(result, x1);
+                x1 = Sse.Shuffle(result, result, 0x02);
+                result = Sse.MaxScalar(result, x1);
+
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    srcVector = Sse.SubtractScalar(srcVector, meanVector);
+                    result = Sse.MaxScalar(result, Sse.And(srcVector, mask));
+
                     pSrcCurrent++;
                 }
             }
@@ -920,7 +1021,7 @@ internal static unsafe float DotU(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum(result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -961,7 +1062,7 @@ internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> i
                     pDstCurrent += 4;
                 }
 
-                result = VectorSum(in result);
+                result = VectorSum(result);
 
                 while (pIdxCurrent < pEnd)
                 {
@@ -1000,7 +1101,7 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                sqDistanceVector = VectorSum(in sqDistanceVector);
+                sqDistanceVector = VectorSum(sqDistanceVector);
 
                 float norm = Sse.ConvertToSingle(sqDistanceVector);
                 while (pSrcCurrent < pEnd)
@@ -1072,5 +1173,102 @@ internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int cco
                 }
             }
         }
+
+        // Dense SDCA L1 update. For each i below src.Length: v[i] += primalUpdate * src[i], and then
+        // w[i] is set to v[i] - threshold (v[i] > 0) or v[i] + threshold (otherwise) when
+        // |v[i]| > threshold, else to 0. The sign-bit trick in the vector loop agrees with this for a
+        // non-negative threshold. Only src.Length bounds the loops, so v and w are assumed to be at
+        // least as long as src.
+        internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
+        {
+            fixed (float* pSrcBase = src)
+            fixed (float* pVBase = v)
+            fixed (float* pWBase = w)
+            {
+                float* pSrc = pSrcBase;
+                float* pSrcLimit = pSrcBase + src.Length;
+                float* pV = pVBase;
+                float* pW = pWBase;
+
+                Vector128<float> primalVector = Sse.SetAllVector128(primalUpdate);
+                Vector128<float> thresholdVector = Sse.SetAllVector128(threshold);
+                // -0.0f has only the sign bit set, which makes it a per-lane sign mask.
+                Vector128<float> signBit = Sse.SetAllVector128(-0.0f);
+
+                // Vector loop: four elements per iteration, updating v in place and writing the
+                // thresholded values to w.
+                for (; pSrc + 4 <= pSrcLimit; pSrc += 4, pV += 4, pW += 4)
+                {
+                    Vector128<float> product = Sse.Multiply(Sse.LoadVector128(pSrc), primalVector);
+                    Vector128<float> updated = Sse.Add(Sse.LoadVector128(pV), product);
+                    Sse.Store(pV, updated);
+
+                    Vector128<float> sign = Sse.And(updated, signBit); // sign bit of each updated lane
+                    Vector128<float> magnitude = Sse.Xor(updated, sign); // |updated|
+                    Vector128<float> exceeds = Sse.CompareGreaterThan(magnitude, thresholdVector); // all ones where true
+                    Vector128<float> signedThreshold = Sse.Xor(sign, thresholdVector); // threshold, sign bit flipped where updated is negative
+                    Vector128<float> shrunk = Sse.And(Sse.Subtract(updated, signedThreshold), exceeds);
+                    Sse.Store(pW, shrunk);
+                }
+
+                // Scalar tail: the same update for the leftover elements, one per iteration.
+                for (; pSrc < pSrcLimit; pSrc++, pV++, pW++)
+                {
+                    *pV += (*pSrc) * primalUpdate;
+                    float updated = *pV;
+                    float shrunk = updated > 0 ? updated - threshold : updated + threshold;
+                    *pW = Math.Abs(updated) > threshold ? shrunk : 0;
+                }
+            }
+        }
+
+        // Sparse SDCA L1 update. For each k below indices.Length, with i = indices[k]:
+        // v[i] += primalUpdate * src[k], and then w[i] is set to v[i] - threshold (v[i] > 0) or
+        // v[i] + threshold (otherwise) when |v[i]| > threshold, else to 0. The sign-bit trick in the
+        // vector loop agrees with this for a non-negative threshold. src is assumed to hold at least
+        // indices.Length elements, and every index is assumed to be valid for both v and w.
+        internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
+        {
+            fixed (float* pSrcBase = src)
+            fixed (int* pIdxBase = indices)
+            fixed (float* pV = v)
+            fixed (float* pW = w)
+            {
+                float* pSrc = pSrcBase;
+                int* pIdx = pIdxBase;
+                int* pIdxLimit = pIdxBase + indices.Length;
+
+                Vector128<float> primalVector = Sse.SetAllVector128(primalUpdate);
+                Vector128<float> thresholdVector = Sse.SetAllVector128(threshold);
+                Vector128<float> signBit = Sse.SetAllVector128(-0.0f); // only the sign bit set in each lane
+
+                // Vector loop: four indices per iteration. Load4 and Store4 are defined elsewhere in this
+                // file; presumably they gather from and scatter to the four positions pIdx[0..3].
+                for (; pIdx + 4 <= pIdxLimit; pIdx += 4, pSrc += 4)
+                {
+                    Vector128<float> product = Sse.Multiply(Sse.LoadVector128(pSrc), primalVector);
+                    Vector128<float> updated = Sse.Add(Load4(pV, pIdx), product);
+
+                    Vector128<float> sign = Sse.And(updated, signBit); // sign bit of each updated lane
+                    Vector128<float> magnitude = Sse.Xor(updated, sign); // |updated|
+                    Vector128<float> exceeds = Sse.CompareGreaterThan(magnitude, thresholdVector); // all ones where true
+                    Vector128<float> signedThreshold = Sse.Xor(sign, thresholdVector); // threshold, sign bit flipped where updated is negative
+                    Vector128<float> shrunk = Sse.And(Sse.Subtract(updated, signedThreshold), exceeds);
+
+                    Store4(updated, pV, pIdx);
+                    Store4(shrunk, pW, pIdx);
+                }
+
+                // Scalar tail: the same update for the leftover indices, one per iteration.
+                for (; pIdx < pIdxLimit; pIdx++, pSrc++)
+                {
+                    int index = *pIdx;
+                    pV[index] += (*pSrc) * primalUpdate;
+                    float updated = pV[index];
+                    float shrunk = updated > 0 ? updated - threshold : updated + threshold;
+                    pW[index] = Math.Abs(updated) > threshold ? shrunk : 0;
+                }
+            }
+        }
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
index 2528fbe0f4..92227abe78 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -2,6 +2,17 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+// The exported function names need to be unique (can't be disambiguated based on signature), hence
+// we introduce suffix letters to indicate the general patterns used.
+// * A suffix means aligned and padded for SSE operations.
+// * U suffix means unaligned and unpadded.
+// * S suffix means sparse (unaligned) vector.
+// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector.
+// * R suffix means sparse matrix.
+// * C suffix means convolution matrix.
+// * D suffix means convolution matrix, with implicit source padding.
+// * Tran means the matrix is transposed.
+
 using System.Runtime.InteropServices;
 using System.Security;
 
@@ -23,26 +34,26 @@ internal static extern unsafe void MatMulPA(bool add, /*_In_ const*/ float* pmat
         internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc,
             int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow);
 
-        [DllImport("CpuMathNative", EntryPoint = "MatMulRU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MatMulRU(bool add, /*_In_ const*/ int* pstarts, /*_In_ const*/ int* pindices, /*_In_ const*/ float* pcoefs,
-            /*_In_ const*/ float* ps, /*_Inout_*/ float* pdst, int crow);
+        [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "MatMulCU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MatMulCU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol,
-            /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow);
+        [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "MatMulDU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MatMulDU(bool add, /*_In_ const*/ int* pmprowiv, /*_In_ const*/ int* pmprowcol, /*_In_ const*/ int* pmprowrun,
-            /*_In_ const*/ int* pruns, /*_In_ const*/ float* pcoefs, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow);
+        [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c);
+        [DllImport("CpuMathNative", EntryPoint = "ScaleAddU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleAddU(float a, float b, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c);
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float SumSqU(/*const*/ float* ps, int c);
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleCopyU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleCopyU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ float* pd, /*_Inout_*/ float* pr, int c);
 
         [DllImport("CpuMathNative", EntryPoint = "AddU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void AddU(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
@@ -50,28 +61,49 @@ internal static extern unsafe void MatMulDU(bool add, /*_In_ const*/ int* pmprow
         [DllImport("CpuMathNative", EntryPoint = "AddSU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void AddSU(/*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumU(/*const*/ float* ps, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumSqU(/*const*/ float* ps, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumSqDiffU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumSqDiffU(float mean, /*const*/ float* ps, int c);
 
         [DllImport("CpuMathNative", EntryPoint = "SumAbsU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe float SumAbsU(/*const*/ float* ps, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumAbsDiffU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumAbsDiffU(float mean, /*const*/ float* ps, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "MaxAbsU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float MaxAbsU(/*const*/ float* ps, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "MaxAbsDiffU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float MaxAbsDiffU(float mean, /*const*/ float* ps, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c);
 
         [DllImport("CpuMathNative", EntryPoint = "ZeroItemsU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void ZeroItemsU(/*_Inout_*/ float* pd, int c, /*_In_ const*/ int* pindices, int cindices);
 
         [DllImport("CpuMathNative", EntryPoint = "ZeroMatrixItemsCore"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void ZeroMatrixItemsCore(/*_Inout_*/ float* pd, int c, int ccol, int cfltRow, /*_In_ const*/ int* pindices, int cindices);
+
+        [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void SdcaL1UpdateU(float primalUpdate, /*_In_ const*/ float* ps, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index f560fcd048..42dec27378 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -19,11 +19,14 @@ public class SsePerformanceTests
         private const int EXP_RANGE = EXP_MAX / 2;
         private const int DEFAULT_SEED = 253421;
         private const float DEFAULT_SCALE = 1.11f;
-        private const int DEFAULT_CROW = 1000;
-        private const int DEFAULT_CCOL = 1000;
+        private const int DEFAULT_CROW = 500;
+        private const int DEFAULT_CCOL = 2000;
         private const bool ADD = true;
 
-        private float[] src, dst, original, src1, src2, mat;
+        // Naming follows from SseIntrinsics.
+        private const int CbAlign = 16;
+
+        private float[] src, dst, original, src1, src2, result;
         private int[] idx;
         private int seed = DEFAULT_SEED;
 
@@ -68,8 +71,8 @@ public void Setup()
             src1 = new float[LEN];
             src2 = new float[LEN];
             original = new float[LEN];
+            result = new float[LEN];
             idx = new int[IDXLEN];
-            mat = new float[DEFAULT_CROW * DEFAULT_CCOL];
 
             seed = GetSeed();
             Random rand = new Random(seed);
@@ -79,6 +82,7 @@ public void Setup()
                 src[i] = NextFloat(rand, EXP_RANGE);
                 dst[i] = NextFloat(rand, EXP_RANGE);
                 original[i] = dst[i];
+                result[i] = dst[i];
                 src1[i] = NextFloat(rand, EXP_RANGE);
                 src2[i] = NextFloat(rand, EXP_RANGE);
             }
@@ -87,68 +91,104 @@ public void Setup()
             {
                 idx[i] = rand.Next(0, LEN);
             }
-
-            for (int i = 0; i < mat.Length; i++)
-            {
-                mat[i] = NextFloat(rand, EXP_RANGE);
-            }
         }
 
         [GlobalCleanup]
         public void GlobalCleanup()
         {
             original.CopyTo(dst, 0);
+            original.CopyTo(result, 0);
+        }
+
+        [Benchmark]
+        public unsafe float NativeAddScalarUPerf()
+        {
+            fixed (float* pdst = dst)
+            {
+                return CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN);
+            }
         }
 
         [Benchmark]
-        public unsafe void NativeMatMulAPerf()
+        public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeScaleUPerf()
+        {
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeScaleSrcUPerf()
         {
-            fixed (float* pmat = mat)
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
-                CpuMathNativeUtils.MatMulA(ADD, pmat, psrc, pdst, DEFAULT_CROW, DEFAULT_CCOL);
+                CpuMathNativeUtils.ScaleSrcU(DEFAULT_SCALE, psrc, pdst, LEN);
             }
         }
 
         [Benchmark]
-        public unsafe float NativeDotUPerf()
+        public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeScaleAddUPerf()
+        {
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeAddScaleUPerf()
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
-                return CpuMathNativeUtils.DotU(psrc, pdst, LEN);
+                CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN);
             }
         }
 
         [Benchmark]
-        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
+        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
 
         [Benchmark]
-        public unsafe float NativeDotSUPerf()
+        public unsafe void NativeAddScaleSUPerf()
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             fixed (int* pidx = idx)
             {
-                return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN);
+                CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN);
             }
         }
 
         [Benchmark]
-        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
+        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
 
         [Benchmark]
-        public unsafe float NativeSumSqUPerf()
+        public unsafe void NativeAddScaleCopyUPerf()
         {
             fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (float* pres = result)
             {
-                return CpuMathNativeUtils.SumSqU(psrc, LEN);
+                CpuMathNativeUtils.AddScaleCopyU(DEFAULT_SCALE, psrc, pdst, pres, LEN);
             }
         }
 
         [Benchmark]
-        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+        public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN);
 
         [Benchmark]
         public unsafe void NativeAddUPerf()
@@ -177,44 +217,132 @@ public unsafe void NativeAddSUPerf()
         [Benchmark]
         public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN);
 
+
         [Benchmark]
-        public unsafe void NativeAddScaleUPerf()
+        public unsafe void NativeMulElementWiseUPerf()
         {
-            fixed (float* psrc = src)
+            fixed (float* psrc1 = src1)
+            fixed (float* psrc2 = src2)
             fixed (float* pdst = dst)
             {
-                CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN);
+                CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN);
             }
         }
 
         [Benchmark]
-        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
 
         [Benchmark]
-        public unsafe void NativeAddScaleSUPerf()
+        public unsafe float NativeSumUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumSqUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumSqU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumSqDiffUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumSqDiffU(DEFAULT_SCALE, psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumAbsUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumAbsU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumAbsDiffUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumAbsDiffU(DEFAULT_SCALE, psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN);
+
+        [Benchmark]
+        public unsafe float NativeMaxAbsUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.MaxAbsU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeMaxAbsDiffUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.MaxAbsDiffU(DEFAULT_SCALE, psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN);
+        // TODO: MaxAbsU!!!
+
+        [Benchmark]
+        public unsafe float NativeDotUPerf()
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
-            fixed (int* pidx = idx)
             {
-                CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN);
+                return CpuMathNativeUtils.DotU(psrc, pdst, LEN);
             }
         }
 
         [Benchmark]
-        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
+        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
 
         [Benchmark]
-        public unsafe void NativeScaleUPerf()
+        public unsafe float NativeDotSUPerf()
         {
+            fixed (float* psrc = src)
             fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
             {
-                CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN);
+                return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN);
             }
         }
 
         [Benchmark]
-        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
 
         [Benchmark]
         public unsafe float NativeDist2Perf()
@@ -230,29 +358,32 @@ public unsafe float NativeDist2Perf()
         public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN);
 
         [Benchmark]
-        public unsafe float NativeSumAbsUPerf()
+        public unsafe void NativeSdcaL1UpdateUPerf()
         {
             fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (float* pres = result)
             {
-                return CpuMathNativeUtils.SumAbsU(psrc, LEN);
+                CpuMathNativeUtils.SdcaL1UpdateU(DEFAULT_SCALE, psrc, DEFAULT_SCALE, pdst, pres, LEN);
             }
         }
 
         [Benchmark]
-        public float ManagedSumAbsqUPerf() => CpuMathUtils.SumAbs(src, LEN);
+        public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result);
 
         [Benchmark]
-        public unsafe void NativeMulElementWiseUPerf()
+        public unsafe void NativeSdcaL1UpdateSUPerf()
         {
-            fixed (float* psrc1 = src1)
-            fixed (float* psrc2 = src2)
+            fixed (float* psrc = src)
             fixed (float* pdst = dst)
+            fixed (float* pres = result)
+            fixed (int* pidx = idx)
             {
-                CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN);
+                CpuMathNativeUtils.SdcaL1UpdateSU(DEFAULT_SCALE, psrc, pidx, DEFAULT_SCALE, pdst, pres, IDXLEN);
             }
         }
 
         [Benchmark]
-        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
+        public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
index 6fc2596ef7..6d6a68bd32 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -13,7 +13,10 @@ public class CpuMathUtilsUnitTests
     {
         private readonly float[][] testArrays;
         private readonly int[] testIndexArray;
+        private readonly AlignedArray[] testMatrices;
+        private readonly AlignedArray[] testSrcVectors;
         private const float DEFAULT_SCALE = 1.7f;
+        private const int SseCbAlign = 16;
         private FloatEqualityComparer comparer;
 
         public CpuMathUtilsUnitTests()
@@ -25,75 +28,211 @@ public CpuMathUtilsUnitTests()
             testArrays = new float[][] { testArray1, testArray2 };
             testIndexArray = new int[4] { 0, 2, 5, 6 };
             comparer = new FloatEqualityComparer();
+
+            // Padded matrices whose dimensions are multiples of 4
+            float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
+            float[] testMatrix2 = new float[4 * 8];
+
+            for (int i = 0; i < testMatrix2.Length; i++)
+            {
+                testMatrix2[i] = i + 1;
+            }
+
+            AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, SseCbAlign);
+            AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, SseCbAlign);
+            testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length);
+            testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length);
+
+            testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 };
+
+            // Padded source vectors whose dimensions are multiples of 4
+            float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f };
+            float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f };
+
+            AlignedArray testSrcVectorAligned1 = new AlignedArray(4, SseCbAlign);
+            AlignedArray testSrcVectorAligned2 = new AlignedArray(8, SseCbAlign);
+            testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length);
+            testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length);
+
+            testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
         }
 
         [Theory]
-        [InlineData(0, 13306.0376f)]
-        [InlineData(1, 13291.9235f)]
-        public void DotUTest(int test, float expected)
+        [InlineData(0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
+        [InlineData(1, new float[] { 204f, 492f, 780f, 1068f })]
+        public void MatMulATest(int test, float[] expected)
         {
-            float[] src = (float[]) testArrays[test].Clone();
-            float[] dst = (float[]) src.Clone();
-            
-            for (int i = 0; i < dst.Length; i++)
+            AlignedArray mat = testMatrices[test];
+            AlignedArray src = testSrcVectors[test];
+            AlignedArray dst = new AlignedArray(4, SseCbAlign);
+
+            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
+        [InlineData(1, new float[] { 204f, 493f, 782f, 1071f })]
+        public void MatMulAAddTest(int test, float[] expected)
+        {
+            AlignedArray mat = testMatrices[test];
+            AlignedArray src = testSrcVectors[test];
+            AlignedArray dst = new AlignedArray(4, SseCbAlign);
+
+            for (int i = 0; i < dst.Size; i++)
             {
-                dst[i] += 1;
+                dst[i] = i;
             }
 
-            var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length);
-            Assert.Equal(expected, actual, 2);
+            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 736.7352f)]
-        [InlineData(1, 736.7352f)]
-        public void DotSUTest(int test, float expected)
+        [InlineData(0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
+        [InlineData(1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
+        public void MatMulTranATest(int test, float[] expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
+            AlignedArray mat = testMatrices[test];
+            AlignedArray src = testSrcVectors[0];
+            AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign);
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        // Transposed dense MatTimesSrc (tran = true, add = true) on a dst pre-filled with its own indices.
+        [Theory]
+        [InlineData(0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
+        [InlineData(1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
+        public void MatMulTranAAddTest(int test, float[] expected)
+        {
+            AlignedArray matrix = testMatrices[test];
+            AlignedArray source = testSrcVectors[0];
+            AlignedArray output = new AlignedArray(4 + 4 * test, SseCbAlign);
+            for (int k = 0; k != output.Size; ++k)
+            {
+                output[k] = k;
+            }
+
+            CpuMathUtils.MatTimesSrc(true, true, matrix, source, output, source.Size);
+            float[] copied = new float[output.Size];
+            output.CopyTo(copied, 0, copied.Length);
+            Assert.Equal(expected, copied, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
+        [InlineData(1, new float[] { 95f, 231f, 367f, 503f })]
+        public void MatMulPATest(int test, float[] expected)
+        {
+            AlignedArray mat = testMatrices[test];
+            AlignedArray src = testSrcVectors[test];
+            AlignedArray dst = new AlignedArray(4, SseCbAlign);
             int[] idx = testIndexArray;
 
-            // Ensures src and dst are different arrays
-            for (int i = 0; i < dst.Length; i++)
+            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * test, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
+        [InlineData(1, new float[] { 95f, 232f, 369f, 506f })]
+        public void MatMulPAAddTest(int test, float[] expected)
+        {
+            AlignedArray mat = testMatrices[test];
+            AlignedArray src = testSrcVectors[test];
+            AlignedArray dst = new AlignedArray(4, SseCbAlign);
+            int[] idx = testIndexArray;
+
+            for (int i = 0; i < dst.Size; i++)
             {
-                dst[i] += 1;
+                dst[i] = i;
             }
 
-            var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length);
-            Assert.Equal(expected, actual, 4);
+            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * test, dst, dst.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 13399.9376f)]
-        [InlineData(1, 13389.1135f)]
-        public void SumSqUTest(int test, float expected)
+        [InlineData(0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
+        [InlineData(1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
+        public void MatMulTranPATest(int test, float[] expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
-            var actual = CpuMathUtils.SumSq(src, src.Length);
-            Assert.Equal(expected, actual, 2);
+            AlignedArray mat = testMatrices[test];
+            AlignedArray src = testSrcVectors[0];
+            AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign);
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2, dst, src.Size);
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        // Sparse transposed MatTimesSrc (tran = true, add = true) using positions [0, 2) of testIndexArray.
+        [Theory]
+        [InlineData(0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
+        [InlineData(1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
+        public void MatMulTranPAAddTest(int test, float[] expected)
+        {
+            AlignedArray matrix = testMatrices[test];
+            AlignedArray source = testSrcVectors[0];
+            AlignedArray output = new AlignedArray(4 + 4 * test, SseCbAlign);
+
+            for (int k = 0; k != output.Size; ++k)
+            {
+                output[k] = k;
+            }
+
+            CpuMathUtils.MatTimesSrc(true, true, matrix, testIndexArray, source, 0, 0, 2, output, source.Size);
+            float[] copied = new float[output.Size];
+            output.CopyTo(copied, 0, copied.Length);
+            Assert.Equal(expected, copied, comparer);
         }
 
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void AddUTest(int test)
+        public void AddScalarUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            float[] expected = (float[])src.Clone();
+            float[] dst = (float[])testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
 
-            // Ensures src and dst are different arrays
-            for (int i = 0; i < dst.Length; i++)
+            for (int i = 0; i < expected.Length; i++)
             {
-                dst[i] += 1;
+                expected[i] += DEFAULT_SCALE;
             }
 
+            CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleUTest(int test)
+        {
+            float[] dst = (float[])testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
+
             for (int i = 0; i < expected.Length; i++)
             {
-                expected[i] = 2 * expected[i] + 1;
+                expected[i] *= DEFAULT_SCALE;
             }
 
-            CpuMathUtils.Add(src, dst, dst.Length);
+            CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length);
             var actual = dst;
             Assert.Equal(expected, actual, comparer);
         }
@@ -101,19 +240,36 @@ public void AddUTest(int test)
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void AddSUTest(int test)
+        public void ScaleSrcUTest(int test)
         {
             float[] src = (float[])testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
-            int[] idx = testIndexArray;
             float[] expected = (float[])dst.Clone();
 
-            expected[0] = 3.92f;
-            expected[2] = -12.14f;
-            expected[5] = -36.69f;
-            expected[6] = 46.29f;
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] *= DEFAULT_SCALE;
+            }
 
-            CpuMathUtils.Add(src, idx, dst, idx.Length);
+            CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleAddUTest(int test)
+        {
+            float[] dst = (float[])testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE);
+            }
+
+            CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length);
             var actual = dst;
             Assert.Equal(expected, actual, comparer);
         }
@@ -160,28 +316,31 @@ public void AddScaleSUTest(int test)
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void ScaleUTest(int test)
+        public void AddScaleCopyUTest(int test)
         {
-            float[] dst = (float[])testArrays[test].Clone();
-            float[] expectedOutput = (float[])dst.Clone();
+            float[] src = (float[])testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            float[] result = (float[])dst.Clone();
+            float[] expected = (float[])dst.Clone();
 
-            for (int i = 0; i < expectedOutput.Length; i++)
+            for (int i = 0; i < expected.Length; i++)
             {
-                expectedOutput[i] *= DEFAULT_SCALE;
+                expected[i] *= (1 + DEFAULT_SCALE);
             }
 
-            CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length);
-            var managedOutput = dst;
-            Assert.Equal(expectedOutput, managedOutput, comparer);
+            CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length);
+            var actual = result;
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 8.0f)]
-        [InlineData(1, 7.0f)]
-        public void Dist2Test(int test, float expected)
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddUTest(int test)
         {
             float[] src = (float[])testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
+            float[] expected = (float[])src.Clone();
 
             // Ensures src and dst are different arrays
             for (int i = 0; i < dst.Length; i++)
@@ -189,18 +348,34 @@ public void Dist2Test(int test, float expected)
                 dst[i] += 1;
             }
 
-            var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length);
-            Assert.Equal(expected, actual, 0);
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = 2 * expected[i] + 1;
+            }
+
+            CpuMathUtils.Add(src, dst, dst.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 196.98f)]
-        [InlineData(1, 193.69f)]
-        public void SumAbsUTest(int test, float expected)
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddSUTest(int test)
         {
             float[] src = (float[])testArrays[test].Clone();
-            var actual = CpuMathUtils.SumAbs(src, src.Length);
-            Assert.Equal(expected, actual, 2);
+            float[] dst = (float[])src.Clone();
+            int[] idx = testIndexArray;
+            float[] expected = (float[])dst.Clone();
+
+            expected[0] = 3.92f;
+            expected[2] = -12.14f;
+            expected[5] = -36.69f;
+            expected[6] = 46.29f;
+
+            CpuMathUtils.Add(src, idx, dst, idx.Length);
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
@@ -229,6 +404,202 @@ public void MulElementWiseUTest(int test)
             var actual = dst;
             Assert.Equal(expected, actual, comparer);
         }
+
+        // Checks CpuMathUtils.Sum over a clone of testArrays[test]; must match to 2 decimal places.
+        [Theory]
+        [InlineData(0, -93.9f)]
+        [InlineData(1, -97.19f)]
+        public void SumUTest(int test, float expected)
+        {
+            float[] values = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.Sum(values, values.Length), 2);
+        }
+
+        // Checks CpuMathUtils.SumSq over a clone of testArrays[test]; must match to 2 decimal places.
+        [Theory]
+        [InlineData(0, 13399.9376f)]
+        [InlineData(1, 13389.1135f)]
+        public void SumSqUTest(int test, float expected)
+        {
+            float[] values = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.SumSq(values, values.Length), 2);
+        }
+
+        // Checks the four-argument SumSq overload; NOTE(review): DEFAULT_SCALE looks like the value subtracted per element - confirm.
+        [Theory]
+        [InlineData(0, 13742.3176f)]
+        [InlineData(1, 13739.7895f)]
+        public void SumSqDiffUTest(int test, float expected)
+        {
+            float[] values = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.SumSq(DEFAULT_SCALE, values, 0, values.Length), 2);
+        }
+
+        // Checks CpuMathUtils.SumAbs over a clone of testArrays[test]; must match to 2 decimal places.
+        [Theory]
+        [InlineData(0, 196.98f)]
+        [InlineData(1, 193.69f)]
+        public void SumAbsUTest(int test, float expected)
+        {
+            float[] values = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.SumAbs(values, values.Length), 2);
+        }
+
+        // Checks the four-argument SumAbs overload; NOTE(review): DEFAULT_SCALE looks like the value subtracted per element - confirm.
+        [Theory]
+        [InlineData(0, 196.98f)]
+        [InlineData(1, 195.39f)]
+        public void SumAbsDiffUTest(int test, float expected)
+        {
+            float[] values = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.SumAbs(DEFAULT_SCALE, values, 0, values.Length), 2);
+        }
+
+        // Checks CpuMathUtils.MaxAbs over a clone of testArrays[test]; must match to 2 decimal places.
+        [Theory]
+        [InlineData(0, 106.37f)]
+        [InlineData(1, 106.37f)]
+        public void MaxAbsUTest(int test, float expected)
+        {
+            float[] values = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.MaxAbs(values, values.Length), 2);
+        }
+
+        // Checks CpuMathUtils.MaxAbsDiff with DEFAULT_SCALE as its first argument; must match to 2 decimal places.
+        [Theory]
+        [InlineData(0, 108.07f)]
+        [InlineData(1, 108.07f)]
+        public void MaxAbsDiffUTest(int test, float expected)
+        {
+            float[] values = (float[])testArrays[test].Clone();
+            Assert.Equal(expected, CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, values, values.Length), 2);
+        }
+
+        // Dense dot product of a test array with a copy of itself whose elements are each raised by one.
+        [Theory]
+        [InlineData(0, 13306.0376f)]
+        [InlineData(1, 13291.9235f)]
+        public void DotUTest(int test, float expected)
+        {
+            float[] left = (float[])testArrays[test].Clone();
+            float[] right = (float[])left.Clone();
+            for (int k = 0; k != right.Length; ++k)
+            {
+                right[k] = right[k] + 1;
+            }
+
+            var result = CpuMathUtils.DotProductDense(left, right, right.Length);
+            Assert.Equal(expected, result, 2);
+        }
+
+        // Sparse dot product driven by testIndexArray; the result must match to 4 decimal places.
+        [Theory]
+        [InlineData(0, 736.7352f)]
+        [InlineData(1, 736.7352f)]
+        public void DotSUTest(int test, float expected)
+        {
+            int[] positions = testIndexArray;
+            float[] left = (float[])testArrays[test].Clone();
+            float[] right = (float[])left.Clone();
+            // Raise every element by one so the two operands hold different values.
+            for (int k = 0; k != right.Length; ++k)
+            {
+                right[k] = right[k] + 1;
+            }
+
+            var result = CpuMathUtils.DotProductSparse(left, right, positions, positions.Length);
+            Assert.Equal(expected, result, 4);
+        }
+
+        // L2DistSquared between a test array and a copy offset by one per element; compared at 0 decimal places.
+        [Theory]
+        [InlineData(0, 8.0f)]
+        [InlineData(1, 7.0f)]
+        public void Dist2Test(int test, float expected)
+        {
+            float[] left = (float[])testArrays[test].Clone();
+            float[] right = (float[])left.Clone();
+            // Raise every element by one so the two operands hold different values.
+            for (int k = 0; k != right.Length; ++k)
+            {
+                right[k] = right[k] + 1;
+            }
+
+            var result = CpuMathUtils.L2DistSquared(left, right, right.Length);
+            Assert.Equal(expected, result, 0);
+        }
+
+        // Calls ZeroMatrixItems with ccol == cfltRow (both the vector size) and checks which entries were zeroed.
+        [Theory]
+        [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })]
+        [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })]
+        public void ZeroItemsUTest(int test, int[] idx, float[] expected)
+        {
+            AlignedArray vector = new AlignedArray(4 + 4 * test, SseCbAlign);
+            vector.CopyFrom(testSrcVectors[test]);
+            CpuMathUtils.ZeroMatrixItems(vector, vector.Size, vector.Size, idx);
+            float[] copied = new float[vector.Size];
+            vector.CopyTo(copied, 0, copied.Length);
+            Assert.Equal(expected, copied, comparer);
+        }
+
+        // Calls ZeroMatrixItems with ccol = Size / 2 - 1 and cfltRow = Size / 2, i.e. ccol < cfltRow.
+        [Theory]
+        [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })]
+        [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })]
+        public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
+        {
+            AlignedArray vector = new AlignedArray(4 + 4 * test, SseCbAlign);
+            vector.CopyFrom(testSrcVectors[test]);
+            CpuMathUtils.ZeroMatrixItems(vector, vector.Size / 2 - 1, vector.Size / 2, idx);
+            float[] copied = new float[vector.Size];
+            vector.CopyTo(copied, 0, copied.Length);
+            Assert.Equal(expected, copied, comparer);
+        }
+
+        // Expected: input * (1 + DEFAULT_SCALE), moved toward zero by DEFAULT_SCALE, or 0 when within that threshold.
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void SdcaL1UpdateUTest(int test)
+        {
+            float[] input = (float[])testArrays[test].Clone();
+            float[] d1 = (float[])input.Clone();
+            float[] d2 = (float[])input.Clone();
+            float[] expected = (float[])d2.Clone();
+            for (int k = 0; k != expected.Length; ++k)
+            {
+                float value = input[k] * (1 + DEFAULT_SCALE);
+                float shifted = value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE;
+                expected[k] = Math.Abs(value) > DEFAULT_SCALE ? shifted : 0;
+            }
+
+            CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, input.Length, input, DEFAULT_SCALE, d1, d2);
+            Assert.Equal(expected, d2, comparer);
+        }
+
+        // Sparse variant: only the slots named by testIndexArray are recomputed; all others keep their cloned value.
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void SdcaL1UpdateSUTest(int test)
+        {
+            int[] positions = testIndexArray;
+            float[] input = (float[])testArrays[test].Clone();
+            float[] d1 = (float[])input.Clone();
+            float[] d2 = (float[])input.Clone();
+            float[] expected = (float[])d2.Clone();
+            for (int k = 0; k != positions.Length; ++k)
+            {
+                int slot = positions[k];
+                float value = d1[slot] + input[k] * DEFAULT_SCALE;
+                float shifted = value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE;
+                expected[slot] = Math.Abs(value) > DEFAULT_SCALE ? shifted : 0;
+            }
+
+            CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, input.Length, input, positions, positions.Length, DEFAULT_SCALE, d1, d2);
+            Assert.Equal(expected, d2, comparer);
+        }
     }
 
     internal class FloatEqualityComparer : IEqualityComparer<float>

From 81d0c29f86eda1ededcd090f6bf37170a5997a40 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Wed, 8 Aug 2018 14:52:59 -0700
Subject: [PATCH 4/8] Minor clean-up before submitting PR

---
 .../CpuMathUtils.netcoreapp.cs                |  2 +-
 .../CpuMathNativeUtils.cs                     | 20 -------------------
 .../UnitTests.cs                              |  3 ++-
 3 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 8adff83a2f..fbcbb2c192 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -811,7 +811,7 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[
             Contracts.Assert(0 < ccol && ccol <= cfltRow);
 
             // REVIEW NEEDED: Since the two methods below do not involve any SSE hardware intrinsics, no software fallback is needed.
-            // REVIEW NEEDED; Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files
+            // REVIEW NEEDED: Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files
             if (Sse.IsSupported)
             {
                 if (ccol == cfltRow)
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
index 92227abe78..8df3352556 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -20,20 +20,6 @@ namespace Microsoft.ML.CpuMath.PerformanceTests
 {
     internal static class CpuMathNativeUtils
     {
-        [DllImport("CpuMathNative", EntryPoint = "MatMulA"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MatMulA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol);
-
-        [DllImport("CpuMathNative", EntryPoint = "MatMulPA"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MatMulPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc,
-            int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow, int ccol);
-
-        [DllImport("CpuMathNative", EntryPoint = "MatMulTranA"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MatMulTranA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ float* psrc, /*_Inout_*/ float* pdst, int crow, int ccol);
-
-        [DllImport("CpuMathNative", EntryPoint = "MatMulTranPA"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float* pmat, /*_In_ const*/ int* pposSrc, /*_In_ const*/ float* psrc,
-            int posMin, int iposMin, int iposLim, /*_Inout_*/ float* pdst, int crow);
-
         [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c);
 
@@ -94,12 +80,6 @@ internal static extern unsafe void MatMulTranPA(bool add, /*_In_ const*/ float*
         [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "ZeroItemsU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void ZeroItemsU(/*_Inout_*/ float* pd, int c, /*_In_ const*/ int* pindices, int cindices);
-
-        [DllImport("CpuMathNative", EntryPoint = "ZeroMatrixItemsCore"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void ZeroMatrixItemsCore(/*_Inout_*/ float* pd, int c, int ccol, int cfltRow, /*_In_ const*/ int* pindices, int cindices);
-
         [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void SdcaL1UpdateU(float primalUpdate, /*_In_ const*/ float* ps, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c);
 
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
index 6d6a68bd32..1d4b668f55 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -30,7 +30,8 @@ public CpuMathUtilsUnitTests()
             comparer = new FloatEqualityComparer();
 
             // Padded matrices whose dimensions are multiples of 4
-            float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
+            float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f };
             float[] testMatrix2 = new float[4 * 8];
 
             for (int i = 0; i < testMatrix2.Length; i++)

From 02bfbe6c0f2f8d12ef7ef37006557d4f0616a054 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Wed, 8 Aug 2018 17:30:54 -0700
Subject: [PATCH 5/8] Minor changes

---
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index a57382a4d7..24a4bcbf3a 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -78,7 +78,7 @@ private static unsafe void Store4(Vector128<float> x, float* dst, int* idx)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> VectorSum(Vector128<float> vector)
+        private static Vector128<float> VectorSum(in Vector128<float> vector)
         {
             if (Sse3.IsSupported)
             {

From be3281d58112b4eeb853efebd0542d081ddc0176 Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 9 Aug 2018 15:08:19 -0700
Subject: [PATCH 6/8] Respond to PR feedback, except for implementing new unit
 tests (coming soon)

---
 .../CpuMathUtils.netcoreapp.cs                | 266 +++++++++++-------
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     | 252 ++++++-----------
 .../SsePerformanceTests.cs                    |   3 -
 3 files changed, 251 insertions(+), 270 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index fbcbb2c192..e17019ffa7 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -17,12 +17,14 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
             {
                 if (!tran)
                 {
-                    Contracts.Assert(0 <= crun && crun <= dst.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= dst.Size);
                     SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size);
                 }
                 else
                 {
-                    Contracts.Assert(0 <= crun && crun <= src.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= src.Size);
                     SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun);
                 }
             }
@@ -30,7 +32,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
             {
                 if (!tran)
                 {
-                    Contracts.Assert(0 <= crun && crun <= dst.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= dst.Size);
                     for (int i = 0; i < crun; i++)
                     {
                         float dotProduct = 0;
@@ -51,7 +54,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
                 }
                 else
                 {
-                    Contracts.Assert(0 <= crun && crun <= src.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= src.Size);
                     for (int i = 0; i < dst.Size; i++)
                     {
                         float dotProduct = 0;
@@ -77,7 +81,9 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             int posMin, int iposMin, int iposLim, AlignedArray dst, int crun)
         {
             Contracts.AssertValue(rgposSrc);
-            Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length);
+            Contracts.Assert(iposMin >= 0);
+            Contracts.Assert(iposMin <= iposLim);
+            Contracts.Assert(iposLim <= rgposSrc.Length);
             Contracts.Assert(mat.Size == dst.Size * srcValues.Size);
 
             if (iposMin >= iposLim)
@@ -93,12 +99,14 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             {
                 if (!tran)
                 {
-                    Contracts.Assert(0 <= crun && crun <= dst.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= dst.Size);
                     SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size);
                 }
                 else
                 {
-                    Contracts.Assert(0 <= crun && crun <= srcValues.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= srcValues.Size);
                     SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
                 }
             }
@@ -106,7 +114,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             {
                 if (!tran)
                 {
-                    Contracts.Assert(0 <= crun && crun <= dst.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= dst.Size);
                     for (int i = 0; i < crun; i++)
                     {
                         float dotProduct = 0;
@@ -128,7 +137,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
                 }
                 else
                 {
-                    Contracts.Assert(0 <= crun && crun <= srcValues.Size);
+                    Contracts.Assert(crun >= 0);
+                    Contracts.Assert(crun <= srcValues.Size);
                     for (int i = 0; i < dst.Size; i++)
                     {
                         float dotProduct = 0;
@@ -155,8 +165,8 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
         public static void Add(float a, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 < count && count <= dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= dst.Length);
 
             Add(a, new Span<float>(dst, 0, count));
         }
@@ -180,7 +190,8 @@ private static void Add(float a, Span<float> dst)
         public static void Scale(float a, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 < count && count <= dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= dst.Length);
 
             Scale(a, new Span<float>(dst, 0, count));
         }
@@ -188,8 +199,9 @@ public static void Scale(float a, float[] dst, int count)
         public static void Scale(float a, float[] dst, int offset, int count)
         {
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset < dst.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset < (dst.Length - count));
 
             Scale(a, new Span<float>(dst, offset, count));
         }
@@ -213,8 +225,9 @@ private static void Scale(float a, Span<float> dst)
         public static void Scale(float a, float[] src, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
             Contracts.Assert(count <= dst.Length);
 
             Scale(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
@@ -239,8 +252,8 @@ private static void Scale(float a, Span<float> src, Span<float> dst)
         public static void ScaleAdd(float a, float b, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 < count && count <= dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= dst.Length);
 
             ScaleAdd(a, b, new Span<float>(dst, 0, count));
         }
@@ -263,8 +276,9 @@ private static void ScaleAdd(float a, float b, Span<float> dst)
         public static void AddScale(float a, float[] src, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
             Contracts.Assert(count <= dst.Length);
 
             AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
@@ -273,10 +287,12 @@ public static void AddScale(float a, float[] src, float[] dst, int count)
         public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(count <= src.Length);
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
-            Contracts.Assert(0 < count && count <= dst.Length - dstOffset);
+            Contracts.Assert(dstOffset >= 0);
+            Contracts.Assert(dstOffset < dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= (dst.Length - dstOffset));
 
             AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, dstOffset, count));
         }
@@ -299,10 +315,11 @@ private static void AddScale(float a, Span<float> src, Span<float> dst)
         public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
             Contracts.Assert(count < dst.Length);
 
             AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
@@ -311,12 +328,14 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in
         public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
-            Contracts.Assert(count < dst.Length - dstOffset);
+            Contracts.Assert(dstOffset >= 0);
+            Contracts.Assert(dstOffset < dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.Assert(count < (dst.Length - dstOffset));
 
             AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count),
                     new Span<float>(dst, dstOffset, dst.Length - dstOffset));
@@ -340,11 +359,12 @@ private static void AddScale(float a, Span<float> src, Span<int> indices, Span<f
 
         public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, int count)
         {
-            Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 < count && count <= dst.Length);
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(count <= src.Length);
+            Contracts.AssertNonEmpty(dst);
             Contracts.AssertNonEmpty(res);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= dst.Length);
             Contracts.Assert(count <= res.Length);
 
             AddScaleCopy(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count), new Span<float>(res, 0, count));
@@ -368,8 +388,9 @@ private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span
         public static void Add(float[] src, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
             Contracts.Assert(count <= dst.Length);
 
             Add(new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
@@ -393,10 +414,11 @@ private static void Add(Span<float> src, Span<float> dst)
         public static void Add(float[] src, int[] indices, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
             Contracts.Assert(count < dst.Length);
 
             Add(new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
@@ -405,12 +427,14 @@ public static void Add(float[] src, int[] indices, float[] dst, int count)
         public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
-            Contracts.Assert(count <= dst.Length - dstOffset);
+            Contracts.Assert(dstOffset >= 0);
+            Contracts.Assert(dstOffset < dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.Assert(count <= (dst.Length - dstOffset));
 
             Add(new Span<float>(src), new Span<int>(indices, 0, count),
                 new Span<float>(dst, dstOffset, dst.Length - dstOffset));
@@ -435,10 +459,11 @@ private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
         public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src1);
-            Contracts.Assert(0 < count && count <= src1.Length);
             Contracts.AssertNonEmpty(src2);
-            Contracts.Assert(0 < count && count <= src2.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src1.Length);
+            Contracts.Assert(count <= src2.Length);
 
             MulElementWise(new Span<float>(src1, 0, count), new Span<float>(src2, 0, count),
                             new Span<float>(dst, 0, count));
@@ -462,7 +487,8 @@ private static void MulElementWise(Span<float> src1, Span<float> src2, Span<floa
         public static float Sum(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
 
             return Sum(new Span<float>(src, 0, count));
         }
@@ -470,8 +496,9 @@ public static float Sum(float[] src, int count)
         public static float Sum(float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return Sum(new Span<float>(src, offset, count));
         }
@@ -496,7 +523,8 @@ private static float Sum(Span<float> src)
         public static float SumSq(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
 
             return SumSq(new Span<float>(src, 0, count));
         }
@@ -504,8 +532,9 @@ public static float SumSq(float[] src, int count)
         public static float SumSq(float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return SumSq(new Span<float>(src, offset, count));
         }
@@ -530,8 +559,9 @@ private static float SumSq(Span<float> src)
         public static float SumSq(float mean, float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return SumSq(mean, new Span<float>(src, offset, count));
         }
@@ -540,14 +570,7 @@ private static float SumSq(float mean, Span<float> src)
         {
             if (Sse.IsSupported)
             {
-                if (mean == 0)
-                {
-                    return SseIntrinsics.SumSqU(src);
-                }
-                else
-                {
-                    return SseIntrinsics.SumSqDiffU(mean, src);
-                }
+                return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src);
             }
             else
             {
@@ -563,7 +586,8 @@ private static float SumSq(float mean, Span<float> src)
         public static float SumAbs(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
 
             return SumAbs(new Span<float>(src, 0, count));
         }
@@ -571,8 +595,9 @@ public static float SumAbs(float[] src, int count)
         public static float SumAbs(float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return SumAbs(new Span<float>(src, offset, count));
         }
@@ -597,8 +622,9 @@ private static float SumAbs(Span<float> src)
         public static float SumAbs(float mean, float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return SumAbs(mean, new Span<float>(src, offset, count));
         }
@@ -607,14 +633,7 @@ private static float SumAbs(float mean, Span<float> src)
         {
             if (Sse.IsSupported)
             {
-                if (mean == 0)
-                {
-                    return SseIntrinsics.SumAbsU(src);
-                }
-                else
-                {
-                    return SseIntrinsics.SumAbsDiffU(mean, src);
-                }
+                return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src);
             }
             else
             {
@@ -630,7 +649,8 @@ private static float SumAbs(float mean, Span<float> src)
         public static float MaxAbs(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
 
             return MaxAbs(new Span<float>(src, 0, count));
         }
@@ -638,8 +658,9 @@ public static float MaxAbs(float[] src, int count)
         public static float MaxAbs(float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return MaxAbs(new Span<float>(src, offset, count));
         }
@@ -668,7 +689,8 @@ private static float MaxAbs(Span<float> src)
         public static float MaxAbsDiff(float mean, float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
 
             return MaxAbsDiff(mean, new Span<float>(src, 0, count));
         }
@@ -698,7 +720,7 @@ public static float DotProductDense(float[] a, float[] b, int count)
         {
             Contracts.AssertNonEmpty(a);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(0 < count);
+            Contracts.Assert(count > 0);
             Contracts.Assert(a.Length >= count);
             Contracts.Assert(b.Length >= count);
 
@@ -708,10 +730,11 @@ public static float DotProductDense(float[] a, float[] b, int count)
         public static float DotProductDense(float[] a, int offset, float[] b, int count)
         {
             Contracts.AssertNonEmpty(a);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= a.Length - count);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(b.Length >= count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= b.Length);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (a.Length - count));
 
             return DotProductDense(new Span<float>(a, offset, count), new Span<float>(b, 0, count));
         }
@@ -737,7 +760,8 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co
         {
             Contracts.AssertNonEmpty(a);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(0 < count);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count > 0);
             Contracts.Assert(count < a.Length);
             Contracts.Assert(count <= b.Length);
             Contracts.Assert(count <= indices.Length);
@@ -749,12 +773,14 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co
         public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count)
         {
             Contracts.AssertNonEmpty(a);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset < a.Length);
-            Contracts.Assert(a.Length - offset > count);
             Contracts.AssertNonEmpty(b);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count < (a.Length - offset));
             Contracts.Assert(count <= b.Length);
             Contracts.Assert(count <= indices.Length);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset < a.Length);
 
             return DotProductSparse(new Span<float>(a, offset, a.Length - offset),
                                     new Span<float>(b), new Span<int>(indices, 0, count));
@@ -782,7 +808,8 @@ public static float L2DistSquared(float[] a, float[] b, int count)
         {
             Contracts.AssertNonEmpty(a);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(0 < count && count <= a.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= a.Length);
             Contracts.Assert(count <= b.Length);
 
             return L2DistSquared(new Span<float>(a, 0, count), new Span<float>(b, 0, count));
@@ -808,19 +835,62 @@ private static float L2DistSquared(Span<float> a, Span<float> b)
 
         public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices)
         {
-            Contracts.Assert(0 < ccol && ccol <= cfltRow);
+            Contracts.Assert(ccol > 0);
+            Contracts.Assert(ccol <= cfltRow);
 
-            // REVIEW NEEDED: Since the two methods below do not involve any SSE hardware intrinsics, no software fallback is needed.
-            // REVIEW NEEDED: Keeping the check for SSE support so that we don't miss these two methods in case of any conditional compilation of files
-            if (Sse.IsSupported)
+            if (ccol == cfltRow)
             {
-                if (ccol == cfltRow)
+                ZeroItemsU(dst, dst.Size, indices, indices.Length);
+            }
+            else
+            {
+                ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length);
+            }
+        }
+
+        private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices)
+        {
+            fixed (float* pdst = &dst.Items[0])
+            fixed (int* pidx = &indices[0])
+            {
+                for (int i = 0; i < cindices; ++i)
                 {
-                    SseIntrinsics.ZeroItemsU(dst, dst.Size, indices, indices.Length);
+                    int index = pidx[i];
+                    Contracts.Assert(0 <= index && index < c);
+                    pdst[index] = 0;
                 }
-                else
+            }
+        }
+
+        private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices)
+        {
+            fixed (float* pdst = &dst.Items[0])
+            fixed (int* pidx = &indices[0])
+            {
+                int ivLogMin = 0;
+                int ivLogLim = ccol;
+                int ivPhyMin = 0;
+
+                for (int i = 0; i < cindices; ++i)
                 {
-                    SseIntrinsics.ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length);
+                    int index = pidx[i];
+                    Contracts.Assert(0 <= index && index < c);
+
+                    int col = index - ivLogMin;
+                    if ((uint)col >= (uint)ccol)
+                    {
+                        Contracts.Assert(ivLogMin > index || index >= ivLogLim);
+
+                        int row = index / ccol;
+                        ivLogMin = row * ccol;
+                        ivLogLim = ivLogMin + ccol;
+                        ivPhyMin = row * cfltRow;
+
+                        Contracts.Assert(ivLogMin <= index && index < ivLogLim);
+                        col = index - ivLogMin;
+                    }
+
+                    pdst[ivPhyMin + col] = 0;
                 }
             }
         }
@@ -828,12 +898,12 @@ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[
         public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(length <= src.Length);
             Contracts.AssertNonEmpty(v);
-            Contracts.Assert(length <= v.Length);
             Contracts.AssertNonEmpty(w);
-            Contracts.Assert(length <= w.Length);
             Contracts.Assert(length > 0);
+            Contracts.Assert(length <= src.Length);
+            Contracts.Assert(length <= v.Length);
+            Contracts.Assert(length <= w.Length);
 
             SdcaL1UpdateDense(primalUpdate, new Span<float>(src, 0, length), threshold, new Span<float>(v, 0, length), new Span<float>(w, 0, length));
         }
@@ -859,15 +929,15 @@ private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float
         public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
-            Contracts.AssertNonEmpty(w);
-            Contracts.Assert(length <= w.Length);
             Contracts.AssertNonEmpty(v);
-            Contracts.Assert(length <= v.Length);
-            Contracts.Assert(0 < count);
+            Contracts.AssertNonEmpty(w);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
             Contracts.Assert(count < length);
+            Contracts.Assert(length <= v.Length);
+            Contracts.Assert(length <= w.Length);
 
             SdcaL1UpdateSparse(primalUpdate, new Span<float>(src, 0, count), new Span<int>(indices, 0, count), threshold, new Span<float>(v), new Span<float>(w));
         }
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 24a4bcbf3a..659e5d3e90 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -52,29 +52,22 @@ private static unsafe Vector128<float> Load4(float* src, int* idx)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> Rotate(Vector128<float> x)
+        private static Vector128<float> Rotate(in Vector128<float> x)
         {
             // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
             return Sse.Shuffle(x, x, 0x39);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> RotateReverse(Vector128<float> x)
-        {
-            // The control byte shuffles the four 32-bit floats of x: ABCD -> DABC.
-            return Sse.Shuffle(x, x, 0x93);
-        }
-
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void Store4(Vector128<float> x, float* dst, int* idx)
+        private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
         {
             Sse.StoreScalar(dst + idx[0], x);
-            x = Rotate(x);
-            Sse.StoreScalar(dst + idx[1], x);
-            x = Rotate(x);
-            Sse.StoreScalar(dst + idx[2], x);
-            x = Rotate(x);
-            Sse.StoreScalar(dst + idx[3], x);
+            Vector128<float> rotated = Rotate(in x);
+            Sse.StoreScalar(dst + idx[1], rotated);
+            rotated = Rotate(in rotated);
+            Sse.StoreScalar(dst + idx[2], rotated);
+            rotated = Rotate(in rotated);
+            Sse.StoreScalar(dst + idx[3], rotated);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
@@ -82,18 +75,44 @@ private static Vector128<float> VectorSum(in Vector128<float> vector)
         {
             if (Sse3.IsSupported)
             {
-                Vector128<float> tmp = Sse3.HorizontalAdd(vector, vector);
-                return Sse3.HorizontalAdd(tmp, tmp);
+                Vector128<float> partialSum = Sse3.HorizontalAdd(vector, vector);
+                return Sse3.HorizontalAdd(partialSum, partialSum);
             }
             else
             {
-                // SSE3 is not supported.
-                Vector128<float> tmp = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
-                // The control byte shuffles the four 32-bit floats of tmp: ABCD -> BADC.
-                return Sse.Add(tmp, Sse.Shuffle(tmp, tmp, 0xb1));
+                Vector128<float> partialSum = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
+                // The control byte shuffles the four 32-bit floats of partialSum: ABCD -> BADC.
+                return Sse.Add(partialSum, Sse.Shuffle(partialSum, partialSum, 0xB1));
             }
         }
 
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> VectorMax(in Vector128<float> vector)
+        {
+            Vector128<float> x1 = Sse.Shuffle(vector, vector, 0xB1);
+            Vector128<float> partialMax = Sse.Max(vector, x1);
+            x1 = Sse.Shuffle(partialMax, partialMax, 0x02);
+            return Sse.MaxScalar(partialMax, x1);
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> GetAbsMask()
+        {
+            return (Sse2.IsSupported) ?
+                Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) :
+                Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+        }
+
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> GetNewDst(in Vector128<float> xDst1, in Vector128<float> signMask, in Vector128<float> xThreshold)
+        {
+            Vector128<float> xSign = Sse.And(xDst1, signMask); // result = 0x8000 0000 if xDst1 is negative or 0x0000 0000 otherwise
+            Vector128<float> xDst1Abs = Sse.Xor(xDst1, xSign);
+            Vector128<float> xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // result = 0xFFFF FFFF if true
+            Vector128<float> x2 = Sse.Xor(xSign, xThreshold); // -xThreshold if xDst1 is negative and +xThreshold otherwise
+            return Sse.And(Sse.Subtract(xDst1, x2), xCond);
+        }
+
         // Multiply matrix times vector into vector.
         internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
         {
@@ -111,7 +130,6 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src
 
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
-                float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
                 float* pMatCurrent = pmat;
 
@@ -122,6 +140,8 @@ internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src
                     Vector128<float> res2 = res0;
                     Vector128<float> res3 = res0;
 
+                    float* pSrcCurrent = psrc;
+
                     while (pSrcCurrent < pSrcEnd)
                     {
                         float* pMatTemp = pMatCurrent;
@@ -189,7 +209,7 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc,
                     float* pm1 = pm0 + ccol;
                     float* pm2 = pm1 + ccol;
                     float* pm3 = pm2 + ccol;
-                    Vector128<float> res = Sse.SetZeroVector128();
+                    Vector128<float> result = Sse.SetZeroVector128();
 
                     int* ppos = pposMin;
 
@@ -199,16 +219,16 @@ internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc,
                         Vector128<float> x1 = Sse.SetVector128(pm3[col], pm2[col], pm1[col], pm0[col]);
                         Vector128<float> x2 = Sse.SetAllVector128(pSrcCurrent[col]);
                         x2 = Sse.Multiply(x2, x1);
-                        res = Sse.Add(res, x2);
+                        result = Sse.Add(result, x2);
 
                         ppos++;
                     }
 
                     if (add)
                     {
-                        res = Sse.Add(res, Sse.LoadAlignedVector128(pDstCurrent));
+                        result = Sse.Add(result, Sse.LoadAlignedVector128(pDstCurrent));
                     }
-                    Sse.StoreAligned(pDstCurrent, res);
+                    Sse.StoreAligned(pDstCurrent, result);
 
                     pDstCurrent += 4;
                     pm0 += 4 * ccol;
@@ -233,20 +253,21 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray
                 float* pSrcEnd = psrc + ccol;
                 float* pDstEnd = pdst + crow;
                 float* pSrcCurrent = psrc;
-                float* pDstCurrent = pdst;
                 float* pMatCurrent = pmat;
 
                 if (!add)
                 {
                     Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent);
-                    // Replicate each slot of x01 into its own register.
-                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55);
-                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA);
-                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF);
-                    x01 = Sse.Shuffle(x01, x01, 0x00);
+                    // Replicate each 32-bit slot of x01 (ABCD) into its own register.
+                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55); // B
+                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA); // C
+                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF); // D
+                    x01 = Sse.Shuffle(x01, x01, 0x00); // A
 
                     pSrcCurrent += 4;
 
+                    float* pDstCurrent = pdst;
+
                     while (pDstCurrent < pDstEnd)
                     {
                         float* pMatTemp = pMatCurrent;
@@ -276,13 +297,13 @@ internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray
                 while (pSrcCurrent < pSrcEnd)
                 {
                     Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent);
-                    // Replicate each slot of x01 into its own register.
-                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55);
-                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA);
-                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF);
-                    x01 = Sse.Shuffle(x01, x01, 0x00);
+                    // Replicate each 32-bit slot of x01 (ABCD) into its own register.
+                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55); // B
+                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA); // C
+                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF); // D
+                    x01 = Sse.Shuffle(x01, x01, 0x00); // A
 
-                    pDstCurrent = pdst;
+                    float* pDstCurrent = pdst;
 
                     while (pDstCurrent < pDstEnd)
                     {
@@ -610,7 +631,7 @@ internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> i
 
                     srcVector = Sse.Multiply(srcVector, scaleVector);
                     dstVector = Sse.Add(dstVector, srcVector);
-                    Store4(dstVector, pDstCurrent, pIdxCurrent);
+                    Store4(in dstVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
@@ -678,7 +699,7 @@ internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> ds
                     Vector128<float> dstVector = Sse.LoadVector128(pSrcCurrent);
 
                     srcVector = Sse.Add(srcVector, dstVector);
-                    Store4(srcVector, pDstCurrent, pIdxCurrent);
+                    Store4(in srcVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
@@ -746,7 +767,7 @@ internal static unsafe float SumU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(result);
+                result = VectorSum(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -775,7 +796,7 @@ internal static unsafe float SumSqU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(result);
+                result = VectorSum(in result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -808,7 +829,7 @@ internal static unsafe float SumSqDiffU(float mean, Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(result);
+                result = VectorSum(in result);
 
                 while (pSrcCurrent < pSrcEnd)
                 {
@@ -826,16 +847,7 @@ internal static unsafe float SumSqDiffU(float mean, Span<float> src)
         internal static unsafe float SumAbsU(Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
-            Vector128<float> mask;
-
-            if (Sse2.IsSupported)
-            {
-                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
-            }
-            else
-            {
-                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
-            }
+            Vector128<float> mask = GetAbsMask();
 
             fixed (float* psrc = src)
             {
@@ -850,7 +862,7 @@ internal static unsafe float SumAbsU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(result);
+                result = VectorSum(in result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -868,16 +880,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
             Vector128<float> meanVector = Sse.SetAllVector128(mean);
-            Vector128<float> mask;
-
-            if (Sse2.IsSupported)
-            {
-                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
-            }
-            else
-            {
-                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
-            }
+            Vector128<float> mask = GetAbsMask();
 
             fixed (float* psrc = src)
             {
@@ -893,7 +896,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                result = VectorSum(result);
+                result = VectorSum(in result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -911,16 +914,7 @@ internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
         internal static unsafe float MaxAbsU(Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
-            Vector128<float> mask;
-
-            if (Sse2.IsSupported)
-            {
-                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
-            }
-            else
-            {
-                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
-            }
+            Vector128<float> mask = GetAbsMask();
 
             fixed (float* psrc = src)
             {
@@ -935,10 +929,7 @@ internal static unsafe float MaxAbsU(Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                Vector128<float> x1 = Sse.Shuffle(result, result, 0xB1);
-                result = Sse.Max(result, x1);
-                x1 = Sse.Shuffle(result, result, 0x02);
-                result = Sse.MaxScalar(result, x1);
+                result = VectorMax(in result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -956,16 +947,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
             Vector128<float> meanVector = Sse.SetAllVector128(mean);
-            Vector128<float> mask;
-
-            if (Sse2.IsSupported)
-            {
-                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
-            }
-            else
-            {
-                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
-            }
+            Vector128<float> mask = GetAbsMask();
 
             fixed (float* psrc = src)
             {
@@ -981,10 +963,7 @@ internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
                     pSrcCurrent += 4;
                 }
 
-                Vector128<float> x1 = Sse.Shuffle(result, result, 0xB1);
-                result = Sse.Max(result, x1);
-                x1 = Sse.Shuffle(result, result, 0x02);
-                result = Sse.MaxScalar(result, x1);
+                result = VectorMax(in result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -1021,7 +1000,7 @@ internal static unsafe float DotU(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                result = VectorSum(result);
+                result = VectorSum(in result);
 
                 while (pSrcCurrent < pEnd)
                 {
@@ -1062,7 +1041,7 @@ internal static unsafe float DotSU(Span<float> src, Span<float> dst, Span<int> i
                     pDstCurrent += 4;
                 }
 
-                result = VectorSum(result);
+                result = VectorSum(in result);
 
                 while (pIdxCurrent < pEnd)
                 {
@@ -1101,7 +1080,7 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
                     pDstCurrent += 4;
                 }
 
-                sqDistanceVector = VectorSum(sqDistanceVector);
+                sqDistanceVector = VectorSum(in sqDistanceVector);
 
                 float norm = Sse.ConvertToSingle(sqDistanceVector);
                 while (pSrcCurrent < pEnd)
@@ -1117,63 +1096,6 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
             }
         }
 
-        internal static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices)
-        {
-            fixed (float* pDstStart = &dst.Items[0])
-            fixed (int* pidx = &indices[0])
-            {
-                float* pdst = Ptr(dst, pDstStart);
-
-                // REVIEW NEEDED: This line expands to (void)(c); but is it necessary?
-                // DEBUG_ONLY(c);
-
-                for (int i = 0; i < cindices; ++i)
-                {
-                    int index = pidx[i];
-                    Contracts.Assert(0 <= index && index < c);
-                    pdst[index] = 0;
-                }
-            }
-        }
-
-        internal static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices)
-        {
-            fixed (float* pDstStart = &dst.Items[0])
-            fixed (int* pidx = &indices[0])
-            {
-                float* pdst = Ptr(dst, pDstStart);
-
-                // REVIEW NEEDED: This line expands to (void)(c); but is it necessary?
-                // DEBUG_ONLY(c);
-
-                int ivLogMin = 0;
-                int ivLogLim = ccol;
-                int ivPhyMin = 0;
-
-                for (int i = 0; i < cindices; ++i)
-                {
-                    int index = pidx[i];
-                    Contracts.Assert(0 <= index && index < c);
-
-                    int col = index - ivLogMin;
-                    if ((uint)col >= (uint)ccol)
-                    {
-                        Contracts.Assert(ivLogMin > index || index >= ivLogLim);
-
-                        int row = index / ccol;
-                        ivLogMin = row * ccol;
-                        ivLogLim = ivLogMin + ccol;
-                        ivPhyMin = row * cfltRow;
-
-                        Contracts.Assert(ivLogMin <= index && index < ivLogLim);
-                        col = index - ivLogMin;
-                    }
-
-                    pdst[ivPhyMin + col] = 0;
-                }
-            }
-        }
-
         internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
         {
             fixed (float* psrc = src)
@@ -1187,21 +1109,18 @@ internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, f
 
                 Vector128<float> xPrimal = Sse.SetAllVector128(primalUpdate);
 
-                Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 1000 0000 ...
+                Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000
                 Vector128<float> xThreshold = Sse.SetAllVector128(threshold);
 
                 while (pSrcCurrent + 4 <= pSrcEnd)
                 {
                     Vector128<float> xSrc = Sse.LoadVector128(pSrcCurrent);
+
                     Vector128<float> xDst1 = Sse.LoadVector128(pDst1Current);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal));
-                    Sse.Store(pDst1Current, xDst1);
+                    Vector128<float> xDst2 = GetNewDst(xDst1, signMask, xThreshold);
 
-                    Vector128<float> xSign = Sse.And(xDst1, signMask); // result = 10000... if xDst1 is negative or 00000 otherwise
-                    Vector128<float> xDst1Abs = Sse.Xor(xDst1, xSign);
-                    Vector128<float> xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // all 1's if true
-                    Vector128<float> x2 = Sse.Xor(xSign, xThreshold); // -threshold if xDst1 is negative and +threshold otherwise
-                    Vector128<float> xDst2 = Sse.And(Sse.Subtract(xDst1, x2), xCond);
+                    Sse.Store(pDst1Current, xDst1);
                     Sse.Store(pDst2Current, xDst2);
 
                     pSrcCurrent += 4;
@@ -1235,7 +1154,7 @@ internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src,
 
                 Vector128<float> xPrimal = Sse.SetAllVector128(primalUpdate);
 
-                Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 1000 0000 ...
+                Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000
                 Vector128<float> xThreshold = Sse.SetAllVector128(threshold);
 
                 while (pIdxCurrent + 4 <= pIdxEnd)
@@ -1244,15 +1163,10 @@ internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src,
 
                     Vector128<float> xDst1 = Load4(pdst1, pIdxCurrent);
                     xDst1 = Sse.Add(xDst1, Sse.Multiply(xSrc, xPrimal));
+                    Vector128<float> xDst2 = GetNewDst(xDst1, signMask, xThreshold);
 
-                    Vector128<float> xSign = Sse.And(xDst1, signMask); // result = 10000... if xDst1 is negative or 00000 otherwise
-                    Vector128<float> xDst1Abs = Sse.Xor(xDst1, xSign);
-                    Vector128<float> xCond = Sse.CompareGreaterThan(xDst1Abs, xThreshold); // all 1's if true
-                    Vector128<float> x2 = Sse.Xor(xSign, xThreshold); // -threshold if xDst1 is negative and +threshold otherwise
-                    Vector128<float> xDst2 = Sse.And(Sse.Subtract(xDst1, x2), xCond);
-
-                    Store4(xDst1, pdst1, pIdxCurrent);
-                    Store4(xDst2, pdst2, pIdxCurrent);
+                    Store4(in xDst1, pdst1, pIdxCurrent);
+                    Store4(in xDst2, pdst2, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index 42dec27378..ade2ea6a0e 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -23,9 +23,6 @@ public class SsePerformanceTests
         private const int DEFAULT_CCOL = 2000;
         private const bool ADD = true;
 
-        // Naming follows from SseIntrinsics.
-        private const int CbAlign = 16;
-
         private float[] src, dst, original, src1, src2, result;
         private int[] idx;
         private int seed = DEFAULT_SEED;

From 17b2f79c1ec8b92768512776b9e695ecd1210c9a Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Thu, 9 Aug 2018 15:41:55 -0700
Subject: [PATCH 7/8] Respond to PR feedback: Implemented new unit tests

---
 .../UnitTests.cs                              | 144 +++++++++---------
 1 file changed, 72 insertions(+), 72 deletions(-)

diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
index 1d4b668f55..d1d5955a8e 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -15,6 +15,7 @@ public class CpuMathUtilsUnitTests
         private readonly int[] testIndexArray;
         private readonly AlignedArray[] testMatrices;
         private readonly AlignedArray[] testSrcVectors;
+        private readonly AlignedArray[] testDstVectors;
         private const float DEFAULT_SCALE = 1.7f;
         private const int SseCbAlign = 16;
         private FloatEqualityComparer comparer;
@@ -56,16 +57,28 @@ public CpuMathUtilsUnitTests()
             testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length);
 
             testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
+
+            // Padded destination vectors whose dimensions are multiples of 4
+            float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f };
+            float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f };
+
+            AlignedArray testDstVectorAligned1 = new AlignedArray(4, SseCbAlign);
+            AlignedArray testDstVectorAligned2 = new AlignedArray(8, SseCbAlign);
+            testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length);
+            testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length);
+
+            testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
         }
 
         [Theory]
-        [InlineData(0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
-        [InlineData(1, new float[] { 204f, 492f, 780f, 1068f })]
-        public void MatMulATest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })]
+        [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })]
+        [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })]
+        public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[test];
-            AlignedArray dst = new AlignedArray(4, SseCbAlign);
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
             float[] actual = new float[dst.Size];
@@ -74,18 +87,14 @@ public void MatMulATest(int test, float[] expected)
         }
 
         [Theory]
-        [InlineData(0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
-        [InlineData(1, new float[] { 204f, 493f, 782f, 1071f })]
-        public void MatMulAAddTest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
+        [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })]
+        [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })]
+        public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[test];
-            AlignedArray dst = new AlignedArray(4, SseCbAlign);
-
-            for (int i = 0; i < dst.Size; i++)
-            {
-                dst[i] = i;
-            }
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
             float[] actual = new float[dst.Size];
@@ -94,13 +103,14 @@ public void MatMulAAddTest(int test, float[] expected)
         }
 
         [Theory]
-        [InlineData(0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
-        [InlineData(1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
-        public void MatMulTranATest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
+        [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })]
+        [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })]
+        public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[0];
-            AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign);
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
             float[] actual = new float[dst.Size];
@@ -109,18 +119,14 @@ public void MatMulTranATest(int test, float[] expected)
         }
 
         [Theory]
-        [InlineData(0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
-        [InlineData(1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
-        public void MatMulTranAAddTest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
+        [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
+        [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })]
+        public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[0];
-            AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign);
-
-            for (int i = 0; i < dst.Size; i++)
-            {
-                dst[i] = i;
-            }
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
 
             CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
             float[] actual = new float[dst.Size];
@@ -129,74 +135,68 @@ public void MatMulTranAAddTest(int test, float[] expected)
         }
 
         [Theory]
-        [InlineData(0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
-        [InlineData(1, new float[] { 95f, 231f, 367f, 503f })]
-        public void MatMulPATest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
+        [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })]
+        [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })]
+        public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[test];
-            AlignedArray dst = new AlignedArray(4, SseCbAlign);
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
             int[] idx = testIndexArray;
 
-            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * test, dst, dst.Size);
+            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
             Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
-        [InlineData(1, new float[] { 95f, 232f, 369f, 506f })]
-        public void MatMulPAAddTest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
+        [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })]
+        [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })]
+        public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[test];
-            AlignedArray dst = new AlignedArray(4, SseCbAlign);
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
             int[] idx = testIndexArray;
 
-            for (int i = 0; i < dst.Size; i++)
-            {
-                dst[i] = i;
-            }
-
-            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * test, dst, dst.Size);
+            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
             Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
-        [InlineData(1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
-        public void MatMulTranPATest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
+        [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })]
+        [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })]
+        public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[0];
-            AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign);
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
             int[] idx = testIndexArray;
 
-            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2, dst, src.Size);
+            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
             Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
-        [InlineData(1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
-        public void MatMulTranPAAddTest(int test, float[] expected)
+        [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
+        [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
+        [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })]
+        public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = testMatrices[test];
-            AlignedArray src = testSrcVectors[0];
-            AlignedArray dst = new AlignedArray(4 + 4 * test, SseCbAlign);
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
             int[] idx = testIndexArray;
 
-            for (int i = 0; i < dst.Size; i++)
-            {
-                dst[i] = i;
-            }
-
-            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2, dst, src.Size);
+            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size);
             float[] actual = new float[dst.Size];
             dst.CopyTo(actual, 0, dst.Size);
             Assert.Equal(expected, actual, comparer);

From ef979b1abc12954ed16b6422e3b0c9d0b1b9755b Mon Sep 17 00:00:00 2001
From: Brian Lui <t-brlui@microsoft.com>
Date: Fri, 10 Aug 2018 14:49:15 -0700
Subject: [PATCH 8/8] Respond to PR feedback: Style changes

---
 .../CpuMathUtils.netcoreapp.cs                | 20 ++++++++-----------
 src/Microsoft.ML.CpuMath/SseIntrinsics.cs     |  2 +-
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index e17019ffa7..b238d602b0 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -12,18 +12,17 @@ public static partial class CpuMathUtils
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
         {
             Contracts.Assert(mat.Size == dst.Size * src.Size);
+            Contracts.Assert(crun >= 0);
 
             if (Sse.IsSupported)
             {
                 if (!tran)
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= dst.Size);
                     SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size);
                 }
                 else
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= src.Size);
                     SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun);
                 }
@@ -32,7 +31,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
             {
                 if (!tran)
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= dst.Size);
                     for (int i = 0; i < crun; i++)
                     {
@@ -54,7 +52,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
                 }
                 else
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= src.Size);
                     for (int i = 0; i < dst.Size; i++)
                     {
@@ -94,18 +91,17 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             }
 
             Contracts.AssertNonEmpty(rgposSrc);
+            Contracts.Assert(crun >= 0);
 
             if (Sse.IsSupported)
             {
                 if (!tran)
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= dst.Size);
                     SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size);
                 }
                 else
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= srcValues.Size);
                     SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
                 }
@@ -114,7 +110,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             {
                 if (!tran)
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= dst.Size);
                     for (int i = 0; i < crun; i++)
                     {
@@ -137,7 +132,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
                 }
                 else
                 {
-                    Contracts.Assert(crun >= 0);
                     Contracts.Assert(crun <= srcValues.Size);
                     for (int i = 0; i < dst.Size; i++)
                     {
@@ -171,7 +165,6 @@ public static void Add(float a, float[] dst, int count)
             Add(a, new Span<float>(dst, 0, count));
         }
 
-        // dst += a
         private static void Add(float a, Span<float> dst)
         {
             if (Sse.IsSupported)
@@ -856,7 +849,8 @@ private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, in
                 for (int i = 0; i < cindices; ++i)
                 {
                     int index = pidx[i];
-                    Contracts.Assert(0 <= index && index < c);
+                    Contracts.Assert(index >= 0);
+                    Contracts.Assert(index < c);
                     pdst[index] = 0;
                 }
             }
@@ -874,7 +868,8 @@ private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol
                 for (int i = 0; i < cindices; ++i)
                 {
                     int index = pidx[i];
-                    Contracts.Assert(0 <= index && index < c);
+                    Contracts.Assert(index >= 0);
+                    Contracts.Assert(index < c);
 
                     int col = index - ivLogMin;
                     if ((uint)col >= (uint)ccol)
@@ -886,7 +881,8 @@ private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol
                         ivLogLim = ivLogMin + ccol;
                         ivPhyMin = row * cfltRow;
 
-                        Contracts.Assert(ivLogMin <= index && index < ivLogLim);
+                        Contracts.Assert(index >= ivLogMin);
+                        Contracts.Assert(index < ivLogLim);
                         col = index - ivLogMin;
                     }
 
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 659e5d3e90..2ac1f56f14 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -98,7 +98,7 @@ private static Vector128<float> VectorMax(in Vector128<float> vector)
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> GetAbsMask()
         {
-            return (Sse2.IsSupported) ?
+            return Sse2.IsSupported ?
                 Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) :
                 Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
         }