diff --git a/src/Microsoft.ML.CpuMath/AlignedArray.cs b/src/Microsoft.ML.CpuMath/AlignedArray.cs deleted file mode 100644 index 87583a8ef6..0000000000 --- a/src/Microsoft.ML.CpuMath/AlignedArray.cs +++ /dev/null @@ -1,214 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; - -namespace Microsoft.ML.Runtime.Internal.CpuMath -{ - using Float = System.Single; - - /// - /// This implements a logical array of Floats that is automatically aligned for SSE/AVX operations. - /// To pin and force alignment, call the GetPin method, typically wrapped in a using (since it - /// returns a Pin struct that is IDisposable). From the pin, you can get the IntPtr to pass to - /// native code. - /// - /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). - /// - public sealed class AlignedArray - { - // Items includes "head" items filled with NaN, followed by _size entries, followed by "tail" - // items, also filled with NaN. Note that _size * sizeof(Float) is divisible by _cbAlign. - // It is illegal to access any slot outside [_base, _base + _size). This is internal so clients - // can easily pin it. - internal Float[] Items; - - private readonly int _size; // Must be divisible by (_cbAlign / sizeof(Float)). - private readonly int _cbAlign; // The alignment in bytes, a power of two, divisible by sizeof(Float). - private int _base; // Where the values start in Items (changes to ensure alignment). - - private object _lock; // Used to make sure only one thread can re-align the values. - - /// - /// Allocate an aligned vector with the given alignment (in bytes). - /// The alignment must be a power of two and at least sizeof(Float). - /// - public AlignedArray(int size, int cbAlign) - { - Contracts.Assert(0 < size); - // cbAlign should be a power of two. - Contracts.Assert(sizeof(Float) <= cbAlign); - Contracts.Assert((cbAlign & (cbAlign - 1)) == 0); - // cbAlign / sizeof(Float) should divide size. - Contracts.Assert((size * sizeof(Float)) % cbAlign == 0); - - Items = new Float[size + cbAlign / sizeof(Float)]; - _size = size; - _cbAlign = cbAlign; - _lock = new object(); - } - - internal unsafe int GetBase(long addr) - { -#if DEBUG - fixed (Float* pv = Items) - Contracts.Assert((Float*)addr == pv); -#endif - - int cbLow = (int)(addr & (_cbAlign - 1)); - int ibMin = cbLow == 0 ? 0 : _cbAlign - cbLow; - Contracts.Assert(ibMin % sizeof(Float) == 0); - - int ifltMin = ibMin / sizeof(Float); - if (ifltMin == _base) - return _base; - - MoveData(ifltMin); -#if DEBUG - // Anything outside [_base, _base + _size) should not be accessed, so - // set those items to NaN, for debug validation. - for (int i = 0; i < _base; i++) - Items[i] = Float.NaN; - for (int i = _base + _size; i < Items.Length; i++) - Items[i] = Float.NaN; -#endif - return _base; - }
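
For intuition, the base computation in GetBase can be sketched in isolation as follows. This is a minimal, hypothetical example (not part of the original file); Console output stands in for the Contracts asserts.

    using System;

    static class AlignmentDemo
    {
        // Given the pinned address of Items[0] and the alignment in bytes (a power
        // of two), compute how many leading floats to skip so that the first
        // logical element lands on an aligned boundary. Mirrors AlignedArray.GetBase.
        static int ComputeBase(long addr, int cbAlign)
        {
            // How many bytes the address sits past the previous aligned boundary.
            int cbLow = (int)(addr & (cbAlign - 1));
            // Bytes to skip forward to the next aligned boundary (0 if already aligned).
            int ibMin = cbLow == 0 ? 0 : cbAlign - cbLow;
            // Convert the byte offset to a float index.
            return ibMin / sizeof(float);
        }

        static void Main()
        {
            // An address 20 bytes past a 32-byte boundary needs 12 more bytes,
            // i.e. a base offset of 12 / 4 = 3 floats.
            Console.WriteLine(ComputeBase(0x1000 + 20, 32)); // prints 3
        }
    }
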
 - - private void MoveData(int newBase) - { - lock (_lock) - { - // Since the array is already pinned, addr and ifltMin in GetBase() cannot change, - // so all we need is to make sure the array is moved only once. - if (_base != newBase) - { - Array.Copy(Items, _base, Items, newBase, _size); - _base = newBase; - } - } - } - - public int Size { get { return _size; } } - - public int CbAlign { get { return _cbAlign; } } - - public Float this[int index] - { - get - { - Contracts.Assert(0 <= index && index < _size); - return Items[index + _base]; - } - set - { - Contracts.Assert(0 <= index && index < _size); - Items[index + _base] = value; - } - } - - public void CopyTo(Float[] dst, int index, int count) - { - Contracts.Assert(0 <= count && count <= _size); - Contracts.Assert(dst != null); - Contracts.Assert(0 <= index && index <= dst.Length - count); - Array.Copy(Items, _base, dst, index, count); - } - - public void CopyTo(int start, Float[] dst, int index, int count) - { - Contracts.Assert(0 <= count); - Contracts.Assert(0 <= start && start <= _size - count); - Contracts.Assert(dst != null); - Contracts.Assert(0 <= index && index <= dst.Length - count); - Array.Copy(Items, start + _base, dst, index, count); - } - - public void CopyFrom(Float[] src, int index, int count) - { - Contracts.Assert(0 <= count && count <= _size); - Contracts.Assert(src != null); - Contracts.Assert(0 <= index && index <= src.Length - count); - Array.Copy(src, index, Items, _base, count); - } - - public void CopyFrom(int start, Float[] src, int index, int count) - { - Contracts.Assert(0 <= count); - Contracts.Assert(0 <= start && start <= _size - count); - Contracts.Assert(src != null); - Contracts.Assert(0 <= index && index <= src.Length - count); - Array.Copy(src, index, Items, start + _base, count); - } - - // Copies values from a sparse vector. - // valuesSrc contains only the non-zero entries. Those are copied into their logical positions in the dense array. - // rgposSrc contains the logical positions (offset by posMin) of the non-zero entries in the dense array. - // rgposSrc runs parallel to the valuesSrc array. - public void CopyFrom(int[] rgposSrc, Float[] valuesSrc, int posMin, int iposMin, int iposLim, bool zeroItems) - { - Contracts.Assert(rgposSrc != null); - Contracts.Assert(valuesSrc != null); - Contracts.Assert(rgposSrc.Length <= valuesSrc.Length); - Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); - - // Zeroing out and setting the values in one pass does not seem to give any perf benefit, - // so we explicitly zero and then set the values. - if (zeroItems) - ZeroItems(); - - for (int ipos = iposMin; ipos < iposLim; ++ipos) - { - Contracts.Assert(posMin <= rgposSrc[ipos]); - int iv = _base + rgposSrc[ipos] - posMin; - Contracts.Assert(iv < _size + _base); - Items[iv] = valuesSrc[ipos]; - } - } - - public void CopyFrom(AlignedArray src) - { - Contracts.Assert(src != null); - Contracts.Assert(src._size == _size); - Contracts.Assert(src._cbAlign == _cbAlign); - Array.Copy(src.Items, src._base, Items, _base, _size); - } - - public void ZeroItems() - { - Array.Clear(Items, _base, _size); - }
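
The sparse CopyFrom above and the sparse ZeroItems that follows share one convention: rgposSrc holds logical positions offset by posMin, and valuesSrc runs parallel to it. A small self-contained sketch with assumed values (plain arrays stand in for the aligned buffer):

    // Scatter the non-zero entries into their logical slots, as CopyFrom does.
    float[] dense = new float[8];           // stands in for Items[_base ..)
    int[] rgposSrc = { 10, 12, 15 };        // logical positions, to be offset by posMin
    float[] valuesSrc = { 1f, 2f, 3f };     // the non-zero entries, parallel to rgposSrc
    int posMin = 10;
    for (int ipos = 0; ipos < rgposSrc.Length; ipos++)
        dense[rgposSrc[ipos] - posMin] = valuesSrc[ipos];
    // dense is now { 1, 0, 2, 0, 0, 3, 0, 0 }.
    // The sparse ZeroItems below does the complement: it zeroes every slot
    // except the ones listed in rgposSrc.
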
 - - public void ZeroItems(int[] rgposSrc, int posMin, int iposMin, int iposLim) - { - Contracts.Assert(rgposSrc != null); - Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); - Contracts.Assert(iposLim - iposMin <= _size); - - int ivCur = 0; - for (int ipos = iposMin; ipos < iposLim; ++ipos) - { - int ivNextNonZero = rgposSrc[ipos] - posMin; - Contracts.Assert(ivCur <= ivNextNonZero && ivNextNonZero < _size); - while (ivCur < ivNextNonZero) - Items[_base + ivCur++] = 0; - Contracts.Assert(ivCur == ivNextNonZero); - // Skip the non-zero element at ivNextNonZero. - ivCur++; - } - - while (ivCur < _size) - Items[_base + ivCur++] = 0; - } - - // REVIEW: This is hackish and slightly dangerous. Perhaps we should wrap this in an - // IDisposable that "locks" this, prohibiting GetBase from being called, while the buffer - // is "checked out". - public void GetRawBuffer(out Float[] items, out int offset) - { - items = Items; - offset = _base; - } - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/AlignedMatrix.cs b/src/Microsoft.ML.CpuMath/AlignedMatrix.cs deleted file mode 100644 index f76ec7815d..0000000000 --- a/src/Microsoft.ML.CpuMath/AlignedMatrix.cs +++ /dev/null @@ -1,675 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Float = System.Single; - -using System; -using System.Collections; -using System.Collections.Generic; - -namespace Microsoft.ML.Runtime.Internal.CpuMath -{ - using Conditional = System.Diagnostics.ConditionalAttribute; - - /// - /// This implements a logical array of Floats that is automatically aligned for SSE/AVX operations. - /// This is a thin wrapper around the AlignedArray type. It couples the AlignedArray with a - /// logical size, which does not include padding, while the AlignedArray size does include padding. - /// - public sealed class CpuAlignedVector : ICpuVector - { - private readonly AlignedArray _items; - private readonly int _size; // The logical size. - - /// - /// The value count. - /// - public int ValueCount { get { return _size; } } - - /// - /// The logical size of the vector. - /// - public int VectorSize { get { return _size; } } - - // Round cflt up to a multiple of cfltAlign. - private static int RoundUp(int cflt, int cfltAlign) - { - Contracts.Assert(0 < cflt); - // cfltAlign should be a power of two. - Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); - - // Determine the number of "blobs" of size cfltAlign. - int cblob = (cflt + cfltAlign - 1) / cfltAlign; - return cblob * cfltAlign; - } - - /// - /// Allocate an aligned vector with the given alignment (in bytes). - /// The alignment must be a power of two and at least sizeof(Float). - /// - public CpuAlignedVector(int size, int cbAlign) - { - Contracts.Assert(0 < size); - // cbAlign should be a power of two. - Contracts.Assert(sizeof(Float) <= cbAlign); - Contracts.Assert((cbAlign & (cbAlign - 1)) == 0); - - int cfltAlign = cbAlign / sizeof(Float); - int cflt = RoundUp(size, cfltAlign); - _items = new AlignedArray(cflt, cbAlign); - _size = size; - AssertValid(); - } - - public void Dispose() - { - } - - [Conditional("DEBUG")] - private void AssertValid() - { -#if DEBUG - Contracts.Assert(0 < _size && _size <= _items.Size); - - // The padding, [_size, _items.Size), should contain zeros. - for (int i = _size; i < _items.Size; i++) - Contracts.Assert(_items[i] == 0); -#endif - } - - /// - /// The physical AlignedArray items. - /// - public AlignedArray Items { get { return _items; } } - - /// - /// The alignment. - /// - public int CbAlign - { - get { return _items.CbAlign; } - }
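
As a worked example of the padding computed by the ctor and RoundUp above (illustrative numbers): with size = 100 and cbAlign = 32, cfltAlign = 32 / sizeof(Float) = 8, so the vector is backed by RoundUp(100, 8) physical floats.

    // RoundUp(100, 8), with cfltAlign a power of two:
    int cblob = (100 + 8 - 1) / 8;   // = 13 "blobs" of 8 floats
    int cflt = cblob * 8;            // = 104 physical floats; slots [100, 104) stay zero
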
 - - /// - /// Set and get the value of the vector at the given index. - /// - /// The index - /// The value at the given index - public Float this[int index] - { - get - { - Contracts.Assert(0 <= index && index < _size); - return _items[index]; - } - set - { - Contracts.Assert(0 <= index && index < _size); - _items[index] = value; - } - } - - /// - /// Get the value of the vector at the given index. - /// - /// The index - /// The value at the given index - public Float GetValue(int i) - { - Contracts.Assert(0 <= i && i < _size); - return _items[i]; - } - - /// - /// Assign randomized values to the vector elements via the input function. - /// - /// The input random function that takes no arguments and returns a float value - public void Randomize(Func<Float> rand) - { - Contracts.AssertValue(rand); - for (int i = 0; i < _size; i++) - _items[i] = rand(); - } - - /// - /// Assign zeros to the vector elements. - /// - public void Zero() - { - _items.ZeroItems(); - } - - /// - /// Copy the values into dst, starting at slot ivDst and advancing ivDst. - /// - /// The destination array - /// The starting index in the destination array - public void CopyTo(Float[] dst, ref int ivDst) - { - Contracts.AssertValue(dst); - Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - _size); - _items.CopyTo(dst, ivDst, _size); - ivDst += _size; - } - - /// - /// Copy the values from this vector starting at slot ivSrc into dst, starting at slot ivDst. - /// The number of values that are copied is determined by count. - /// - /// The starting index in this vector - /// The destination array - /// The starting index in the destination array - /// The number of elements to be copied - public void CopyTo(int ivSrc, Float[] dst, int ivDst, int count) - { - Contracts.AssertValue(dst); - Contracts.Assert(0 <= count && count <= dst.Length); - Contracts.Assert(0 <= ivSrc && ivSrc <= _size - count); - Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - count); - _items.CopyTo(ivSrc, dst, ivDst, count); - } - - /// - /// Copy the values from src, starting at slot index and advancing index, into this vector. - /// - /// The source array - /// The starting index in the source array - public void CopyFrom(Float[] src, ref int index) - { - Contracts.AssertValue(src); - Contracts.Assert(0 <= index && index <= src.Length - _size); - _items.CopyFrom(src, index, _size); - index += _size; - } - - /// - /// Copy the values from src, starting at slot ivSrc, into this vector, starting at slot ivDst. - /// The number of values that are copied is determined by count. - /// - /// The starting index in this vector - /// The source array - /// The starting index in the source array - /// The number of elements to be copied - public void CopyFrom(int ivDst, Float[] src, int ivSrc, int count) - { - Contracts.AssertValue(src); - Contracts.Assert(0 <= count && count <= src.Length); - Contracts.Assert(0 <= ivDst && ivDst <= _size - count); - Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - count); - _items.CopyFrom(ivDst, src, ivSrc, count); - } - - /// - /// Copy the values of src vector into this vector. The src vector must have the same size as this vector. - /// - /// The source vector - public void CopyFrom(CpuAlignedVector src) - { - Contracts.AssertValue(src); - Contracts.Assert(src._size == _size); - _items.CopyFrom(src._items); - }
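
A short usage sketch of the ref-advancing copies above (hypothetical sizes): the ref parameter lets callers pack several vectors back to back into one flat buffer without tracking offsets by hand.

    // Each CopyTo call advances ivDst by the vector's logical size, so the
    // next call continues where the previous one stopped.
    var v1 = new CpuAlignedVector(8, 32);
    var v2 = new CpuAlignedVector(8, 32);
    float[] flat = new float[16];
    int ivDst = 0;
    v1.CopyTo(flat, ref ivDst);   // ivDst is now 8
    v2.CopyTo(flat, ref ivDst);   // ivDst is now 16
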
 - - /// - /// Get the underlying AlignedArray as IEnumerator&lt;Float&gt;. - /// - public IEnumerator<Float> GetEnumerator() - { - for (int i = 0; i < _size; i++) - yield return _items[i]; - } - - IEnumerator IEnumerable.GetEnumerator() - { - return GetEnumerator(); - } - } - - /// - /// This implements a logical matrix of Floats that is automatically aligned for SSE/AVX operations. - /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). - /// - public abstract class CpuAlignedMatrixBase - { - // _items includes "head" items filled with NaN, followed by RunLenPhy * RunCntPhy entries, followed by - // "tail" items, also filled with NaN. Note that RunLenPhy and RunCntPhy are divisible by the alignment - // specified in the ctor and are >= RunLen and RunCnt, respectively. It is illegal to access any slot - // outside [_base, _base + RunLenPhy * RunCntPhy). The padding should all be zero (and maintained as such). - // The items are arranged in "runs" of length RunLen. There are RunCnt such runs. Each run ends with - // (RunLenPhy - RunLen) padding slots. There are an additional (RunCntPhy - RunCnt) padding runs of length - // RunLenPhy, which are entirely zero. Any native code should be able to assume and should maintain - // these invariants. - public AlignedArray Items { get; } - - protected readonly int FloatAlign; // The alignment. - - // Since FloatAlign is a power of two, shifting by Shift = log_2(FloatAlign) is the same as multiplying/dividing by FloatAlign. - protected readonly int Shift; - // Since FloatAlign is a power of two, bitwise AND with Mask = FloatAlign - 1 is the same as taking the remainder mod FloatAlign. - protected readonly int Mask; - - // Logical length of runs (RunLen) and number of runs (RunCnt). - public readonly int RunLen; - public readonly int RunCnt; - - // Physical (padded) length and number of runs. - public readonly int RunLenPhy; - public readonly int RunCntPhy; - - /// - /// The logical number of values in the matrix - /// - public int ValueCount => RunLen * RunCnt; - - /// - /// The logical number of rows - /// - public abstract int RowCount { get; } - - /// - /// The logical number of columns - /// - public abstract int ColCount { get; } - - /// - /// The physical number of rows - /// - public abstract int RowCountPhy { get; } - - /// - /// The physical number of columns - /// - public abstract int ColCountPhy { get; } - - // Round cflt up to a multiple of cfltAlign. - protected static int RoundUp(int cflt, int cfltAlign) - { - Contracts.Assert(0 < cflt); - // cfltAlign should be a power of two. - Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); - - // Determine the number of "blobs" of size cfltAlign. - int cblob = (cflt + cfltAlign - 1) / cfltAlign; - return cblob * cfltAlign; - }
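
Because FloatAlign is a power of two, the Shift and Mask fields above let callers trade division and modulus for bit operations, and RoundUp places each run at an aligned offset. A minimal illustration with assumed values:

    // With cbAlign = 32 bytes: FloatAlign = 8, Shift = 3, Mask = 7.
    int i = 29;
    int run = i >> 3;         // 3 == 29 / 8
    int posInRun = i & 7;     // 5 == 29 % 8
    // A logical element (run, k) lives at physical index run * RunLenPhy + k;
    // the (RunLenPhy - RunLen) slots at the end of each run are zero padding.
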
 - - /// - /// Allocate an aligned matrix with the given alignment (in bytes). - /// - protected CpuAlignedMatrixBase(int runLen, int runCnt, int cbAlign) - { - Contracts.Assert(0 < runLen); - Contracts.Assert(0 < runCnt); - // cbAlign should be a power of two. - Contracts.Assert(sizeof(Float) <= cbAlign); - Contracts.Assert((cbAlign & (cbAlign - 1)) == 0); - - RunLen = runLen; - RunCnt = runCnt; - - FloatAlign = cbAlign / sizeof(Float); - Shift = GeneralUtils.CbitLowZero((uint)FloatAlign); - Mask = FloatAlign - 1; - - RunLenPhy = RoundUp(runLen, FloatAlign); - RunCntPhy = RoundUp(runCnt, FloatAlign); - Items = new AlignedArray(RunLenPhy * RunCntPhy, cbAlign); - - AssertValid(); - } - - [Conditional("DEBUG")] - protected void AssertValid() - { -#if DEBUG - Contracts.Assert(0 < RunLen && RunLen <= RunLenPhy); - Contracts.Assert(0 < RunCnt && RunCnt <= RunCntPhy); - Contracts.Assert(RunLenPhy * RunCntPhy == Items.Size); - - // Assert that the padding at the end of each run contains zeros. - for (int i = 0; i < RunCnt; i++) - { - for (int j = RunLen; j < RunLenPhy; j++) - Contracts.Assert(Items[i * RunLenPhy + j] == 0); - } - - // Assert that the padding runs contain zeros. - for (int i = RunCnt; i < RunCntPhy; i++) - { - for (int j = 0; j < RunLenPhy; j++) - Contracts.Assert(Items[i * RunLenPhy + j] == 0); - } -#endif - } - - public void Dispose() - { - } - - /// - /// Assign randomized values to the matrix elements via the input function. - /// - /// The input random function that takes no arguments and returns a float value - public void Randomize(Func<Float> rand) - { - Contracts.AssertValue(rand); - for (int i = 0, k = 0; i < RunCnt; i++) - { - Contracts.Assert(k == i * RunLenPhy); - for (int j = 0; j < RunLen; j++) - Items[k + j] = rand(); - k += RunLenPhy; - } - } - - /// - /// Assign zeros to the matrix elements. - /// - public void Zero() - { - Items.ZeroItems(); - } - - /// - /// Copy the values of src matrix into this matrix. The src matrix must have the same physical and logical size as this matrix. - /// - /// The source matrix - public void CopyFrom(CpuAlignedMatrixBase src) - { - AssertValid(); - Contracts.AssertValue(src); - src.AssertValid(); - Contracts.Assert(src.RunLen == RunLen); - Contracts.Assert(src.RunCnt == RunCnt); - Contracts.Assert(src.RunLenPhy == RunLenPhy); - Contracts.Assert(src.RunCntPhy == RunCntPhy); - Items.CopyFrom(src.Items); - } - } - - /// - /// This implements a logical row-major matrix of Floats that is automatically aligned for SSE/AVX operations. - /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). - /// - public abstract class CpuAlignedMatrixRowBase : CpuAlignedMatrixBase, ICpuBuffer<Float> - { - protected CpuAlignedMatrixRowBase(int crow, int ccol, int cbAlign) - : base(ccol, crow, cbAlign) - { - } - - /// - /// The logical number of rows - /// - public override int RowCount => RunCnt; - - /// - /// The logical number of columns - /// - public override int ColCount { get { return RunLen; } } - - /// - /// The physical number of rows - /// - public override int RowCountPhy { get { return RunCntPhy; } } - - /// - /// The physical number of columns - /// - public override int ColCountPhy { get { return RunLenPhy; } } - - /// - /// Copy the values into dst, starting at slot ivDst and advancing ivDst. - /// - /// The destination array - /// The starting index in the destination array - public void CopyTo(Float[] dst, ref int ivDst) - { - Contracts.AssertValue(dst); - Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ValueCount); - - if (ColCount == ColCountPhy) - { - // Can copy all at once. - Items.CopyTo(0, dst, ivDst, ValueCount); - ivDst += ValueCount; - } - else - { - // Copy each row.
 - int ivSrc = 0; - for (int row = 0; row < RowCount; row++) - { - Items.CopyTo(ivSrc, dst, ivDst, ColCount); - ivSrc += ColCountPhy; - ivDst += ColCount; - } - } - } - - /// - /// Copy the values from src, starting at slot ivSrc and advancing ivSrc. - /// - /// The source array - /// The starting index in the source array - public void CopyFrom(Float[] src, ref int ivSrc) - { - Contracts.AssertValue(src); - Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - ValueCount); - - if (ColCount == ColCountPhy) - { - Items.CopyFrom(src, ivSrc, ValueCount); - ivSrc += ValueCount; - } - else - { - for (int row = 0; row < RowCount; row++) - { - Items.CopyFrom(row * ColCountPhy, src, ivSrc, ColCount); - ivSrc += ColCount; - } - } - } - - /// - /// Get the underlying AlignedArray as IEnumerator&lt;Float&gt;. - /// - public IEnumerator<Float> GetEnumerator() - { - for (int row = 0; row < RowCount; row++) - { - int ivBase = row * ColCountPhy; - for (int col = 0; col < ColCount; col++) - yield return Items[ivBase + col]; - } - } - - IEnumerator IEnumerable.GetEnumerator() - { - return GetEnumerator(); - } - } - - /// - /// This implements a row-major matrix of Floats that is automatically aligned for SSE/AVX operations. - /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). - /// - public sealed class CpuAlignedMatrixRow : CpuAlignedMatrixRowBase, ICpuFullMatrix - { - public CpuAlignedMatrixRow(int crow, int ccol, int cbAlign) - : base(crow, ccol, cbAlign) - { - } - - /// - /// The logical number of rows - /// - public override int RowCount { get { return RunCnt; } } - - /// - /// The logical number of columns - /// - public override int ColCount { get { return RunLen; } } - - /// - /// The physical number of rows - /// - public override int RowCountPhy { get { return RunCntPhy; } } - - /// - /// The physical number of columns - /// - public override int ColCountPhy { get { return RunLenPhy; } } - - /// - /// Copy the values from the given row of this matrix into dst, starting at slot ivDst and advancing ivDst. - /// - /// The starting row in this matrix - /// The destination array - /// The starting index in the destination array - public void CopyTo(int row, Float[] dst, ref int ivDst) - { - Contracts.AssertValue(dst); - Contracts.Assert(0 <= row && row < RowCount); - Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ColCount); - - Items.CopyTo(row * ColCountPhy, dst, ivDst, ColCount); - ivDst += ColCount; - } - - /// - /// Assign zeros to the values at the indices - /// - /// The indices - public void ZeroItems(int[] indices) - { - Contracts.AssertValue(indices); - - // REVIEW: Ideally, we'd adjust the indices once so we wouldn't need to - // repeatedly deal with padding adjustments. - CpuMathUtils.ZeroMatrixItems(Items, ColCount, ColCountPhy, indices); - } - }
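
The two concrete classes differ only in how a logical (row, col) pair maps into the padded physical buffer; a summary sketch (padded lengths assumed for a 3x5 matrix with FloatAlign = 8):

    // CpuAlignedMatrixRow: runs are rows,    physical index = row * ColCountPhy + col
    // CpuAlignedMatrixCol: runs are columns, physical index = row + col * RowCountPhy
    // For a 3x5 matrix, each run is padded to length 8, so element (2, 4) sits at
    //   row-major:    2 * 8 + 4 = 20
    //   column-major: 2 + 4 * 8 = 34
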
 - - /// - /// This implements a logical matrix of Floats that is automatically aligned for SSE/AVX operations. - /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). - /// - public sealed class CpuAlignedMatrixCol : CpuAlignedMatrixBase, ICpuFullMatrix - { - /// - /// Allocate an aligned matrix with the given alignment (in bytes). - /// - public CpuAlignedMatrixCol(int crow, int ccol, int cbAlign) - : base(crow, ccol, cbAlign) - { - } - - /// - /// The logical number of rows - /// - public override int RowCount { get { return RunLen; } } - - /// - /// The logical number of columns - /// - public override int ColCount { get { return RunCnt; } } - - /// - /// The physical number of rows - /// - public override int RowCountPhy { get { return RunLenPhy; } } - - /// - /// The physical number of columns - /// - public override int ColCountPhy { get { return RunCntPhy; } } - - /// - /// Copy the values into dst, starting at slot ivDst and advancing ivDst. - /// - /// The destination array - /// The starting index in the destination array - public void CopyTo(Float[] dst, ref int ivDst) - { - Contracts.AssertValue(dst); - Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ValueCount); - - for (int row = 0; row < RowCount; row++) - { - for (int col = 0; col < ColCount; col++) - dst[ivDst++] = Items[row + col * RowCountPhy]; - } - } - - /// - /// Copy the values from the given row of this matrix into dst, starting at slot ivDst and advancing ivDst. - /// - /// The starting row in this matrix - /// The destination array - /// The starting index in the destination array - public void CopyTo(int row, Float[] dst, ref int ivDst) - { - Contracts.AssertValue(dst); - Contracts.Assert(0 <= row && row < RowCount); - Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ColCount); - - for (int col = 0; col < ColCount; col++) - dst[ivDst++] = Items[row + col * RowCountPhy]; - } - - /// - /// Copy the values from src, starting at slot ivSrc and advancing ivSrc. - /// - /// The source array - /// The starting index in the source array - public void CopyFrom(Float[] src, ref int ivSrc) - { - Contracts.AssertValue(src); - Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - ValueCount); - for (int row = 0; row < RowCount; row++) - { - for (int col = 0; col < ColCount; col++) - Items[row + col * RowCountPhy] = src[ivSrc++]; - } - } - - /// - /// Assign zeros to the values at the indices - /// - /// The indices - public void ZeroItems(int[] indices) - { - Contracts.AssertValue(indices); - - // REVIEW: Ideally, we'd adjust the indices once so we wouldn't need to - // repeatedly deal with padding adjustments. - foreach (int iv in indices) - { - int row = iv / ColCount; - int col = iv % ColCount; - Items[row + col * RowCountPhy] = 0; - } - } - - /// - /// Get the underlying AlignedArray as IEnumerator&lt;Float&gt;. - /// - public IEnumerator<Float> GetEnumerator() - { - for (int row = 0; row < RowCount; row++) - { - for (int col = 0; col < ColCount; col++) - yield return Items[row + col * RowCountPhy]; - } - } - - IEnumerator IEnumerable.GetEnumerator() - { - return GetEnumerator(); - } - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/Avx.cs b/src/Microsoft.ML.CpuMath/Avx.cs deleted file mode 100644 index 6dcf898b6f..0000000000 --- a/src/Microsoft.ML.CpuMath/Avx.cs +++ /dev/null @@ -1,1174 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; - -namespace Microsoft.ML.Runtime.Internal.CpuMath -{ - /// - /// Keep Avx.cs in sync with Sse.cs. When making changes to one, use BeyondCompare or a similar tool - /// to view diffs and propagate appropriate changes to the other.
- /// - public static class AvxUtils - { - public const int CbAlign = 32; - - private static bool Compat(AlignedArray a) - { - Contracts.AssertValue(a); - Contracts.Assert(a.Size > 0); - return a.CbAlign == CbAlign; - } - - private static unsafe float* Ptr(AlignedArray a, float* p) - { - Contracts.AssertValue(a); - float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (CbAlign - 1)) == 0); - return q; - } - - public static bool CheckAvx() - { - return Thunk.ChkAvx(); - } - - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) - { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(mat.Size == dst.Size * src.Size); - - unsafe - { - fixed (float* pmat = &mat.Items[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - if (!tran) - { - Contracts.Assert(0 <= crun && crun <= dst.Size); - Thunk.MatMulX(add, Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), crun, src.Size); - } - else - { - Contracts.Assert(0 <= crun && crun <= src.Size); - Thunk.MatMulTranX(add, Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), dst.Size, crun); - } - } - } - } - - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, - int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) - { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(srcValues)); - Contracts.Assert(Compat(dst)); - Contracts.AssertValue(rgposSrc); - Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); - Contracts.Assert(mat.Size == dst.Size * srcValues.Size); - - if (iposMin >= iposLim) - { - if (!add) - dst.ZeroItems(); - return; - } - Contracts.AssertNonEmpty(rgposSrc); - unsafe - { - fixed (float* pdst = &dst.Items[0]) - fixed (float* pmat = &mat.Items[0]) - fixed (float* psrc = &srcValues.Items[0]) - fixed (int* ppossrc = &rgposSrc[0]) - { - if (!tran) - { - Contracts.Assert(0 <= crun && crun <= dst.Size); - Thunk.MatMulPX(add, Ptr(mat, pmat), ppossrc, Ptr(srcValues, psrc), posMin, iposMin, iposLim, Ptr(dst, pdst), crun, srcValues.Size); - } - else - { - Contracts.Assert(0 <= crun && crun <= srcValues.Size); - Thunk.MatMulTranPX(add, Ptr(mat, pmat), ppossrc, Ptr(srcValues, psrc), posMin, iposMin, iposLim, Ptr(dst, pdst), dst.Size); - } - } - } - } - - public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == crow + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - Contracts.Assert(crow * src.Size >= coefs.Length); - - unsafe - { - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.MatMulRX(add, pstarts, pindices, pcoefs, Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - - public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, - int[] mprowrun, int[] runs, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowiv); - Contracts.Assert(mprowiv.Length == crow); 
- Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowrun == null || mprowrun.Length == crow); - Contracts.AssertNonEmpty(runs); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowiv = &mprowiv[0]) - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - if (mprowrun == null) - { - Thunk.MatMulCX(add, pmprowiv, pmprowcol, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), crow); - } - else - { - fixed (int* pmprowrun = &mprowrun[0]) - { - Thunk.MatMulDX(add, pmprowiv, pmprowcol, pmprowrun, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - } - } - } - - public static void MeanOfSrc(bool add, int[] mprowcol, int[] mprowindices, - int[] indices, AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowindices == null || mprowindices.Length == crow); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pmprowindices = mprowindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - // REVIEW: Implement using AVX - Thunk.MeanU(add, pmprowcol, pmprowindices, pindices, Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - } - - public static void MaxOfSrc(bool add, int[] mprowcol, int[] mprowindices, - int[] indices, AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowindices == null || mprowindices.Length == crow); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pmprowindices = mprowindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - // REVIEW: Implement using AVX - Thunk.MaxU(add, pmprowcol, pmprowindices, pindices, Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - } - - public static void RespNormOfSrc(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - int[] mprowcol, int[] mprowindices, int[] indices, - AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowindices == null || mprowindices.Length == crow); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pmprowindices = mprowindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - // REVIEW: Implement using AVX - Thunk.RespNormU(add, alpha, beta, avgOverFullKernel, offset, pmprowcol, pmprowindices, pindices, - Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - } - - public static void MatTranTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, - AlignedArray src, 
AlignedArray dst, int ccol) - { - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == ccol + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[ccol] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < ccol && ccol <= src.Size); - Contracts.Assert(dst.Size * ccol >= coefs.Length); - - unsafe - { - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.MatMulTranRX(add, pstarts, pindices, pcoefs, Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - } - - public static void MatTranTimesSrc(bool add, int[] mpcoliv, int[] mpcolrow, int[] mpcolrun, - int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int ccol) - { - Contracts.AssertNonEmpty(mpcoliv); - Contracts.Assert(mpcoliv.Length == ccol); - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.AssertNonEmpty(runs); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(mpcolrun == null || mpcolrun.Length == ccol); - Contracts.Assert(0 < ccol && ccol <= src.Size); - - unsafe - { - fixed (int* pmpcoliv = &mpcoliv[0]) - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - if (mpcolrun == null) - { - Thunk.MatMulTranCX(add, pmpcoliv, pmpcolrow, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - else - { - fixed (int* pmpcolrun = &mpcolrun[0]) - { - Thunk.MatMulTranDX(add, pmpcoliv, pmpcolrow, pmpcolrun, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - } - } - } - } - - public static void MeanBackOfSrc(bool add, int[] mpcolrow, int[] mpcolindices, - int[] indices, AlignedArray src, AlignedArray dst, int ccol) - { - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.Assert(mpcolindices == null || mpcolindices.Length == ccol); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < ccol && ccol <= src.Size); - - unsafe - { - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pmpcolindices = mpcolindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - // REVIEW: Implement using AVX - Thunk.MeanBackU(add, pmpcolrow, pmpcolindices, pindices, Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - } - } - - public static void MaxBackOfSrc(bool add, int[] mpcolrow, int[] mpcolindices, - int[] indices, AlignedArray src, AlignedArray dst, AlignedArray val, int ccol) - { - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.Assert(mpcolindices == null || mpcolindices.Length == ccol); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(Compat(val)); - Contracts.Assert(0 < ccol && ccol <= src.Size); - Contracts.Assert(dst.Size == val.Size); - - unsafe - { - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pmpcolindices = mpcolindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* 
pdst = &dst.Items[0]) - fixed (float* pval = &val.Items[0]) - { - // REVIEW: Implement using AVX - Thunk.MaxBackU(add, pmpcolrow, pmpcolindices, pindices, Ptr(src, psrc), Ptr(dst, pdst), Ptr(val, pval), dst.Size, ccol); - } - } - } - - public static void RespNormBackOfSrc(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - int[] mpcolrow, int[] mpcolindices, int[] indices, - AlignedArray errors, AlignedArray errorsPrev, AlignedArray valuesPrev, int ccol) - { - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.Assert(mpcolindices == null || mpcolindices.Length == ccol); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(errors)); - Contracts.Assert(Compat(errorsPrev)); - Contracts.Assert(Compat(valuesPrev)); - Contracts.Assert(0 < ccol && ccol <= errors.Size); - Contracts.Assert(errorsPrev.Size == valuesPrev.Size); - - unsafe - { - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pmpcolindices = mpcolindices) - fixed (int* pindices = &indices[0]) - fixed (float* perr = &errors.Items[0]) - fixed (float* perrPrev = &errorsPrev.Items[0]) - fixed (float* pvalPrev = &valuesPrev.Items[0]) - { - // REVIEW: Implement using AVX - Thunk.RespNormBackU(add, alpha, beta, avgOverFullKernel, offset, pmpcolrow, pmpcolindices, pindices, - Ptr(errors, perr), Ptr(errorsPrev, perrPrev), Ptr(valuesPrev, pvalPrev), errorsPrev.Size, ccol); - } - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, AlignedArray mat, int crow, float decay) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * y.Size == mat.Size); - Contracts.Assert(decay >= 0); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (float* pmat = &mat.Items[0]) - Thunk.AddXYTranX(a, Ptr(x, px), Ptr(y, py), Ptr(mat, pmat), crow, y.Size, decay); - } - } - - public static void AddXYTran(float a, AlignedArray x, int[] rgposY, AlignedArray valuesY, - int posMinY, int iposMinY, int iposLimY, AlignedArray mat, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(valuesY)); - Contracts.Assert(Compat(mat)); - Contracts.AssertNonEmpty(rgposY); - Contracts.Assert(0 <= iposMinY && iposMinY <= iposLimY && iposLimY <= rgposY.Length); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * valuesY.Size == mat.Size); - - if (iposMinY >= iposLimY) - return; - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &valuesY.Items[0]) - fixed (int* pposy = &rgposY[0]) - fixed (float* pmat = &mat.Items[0]) - { - Thunk.AddXYTranPX(a, Ptr(x, px), pposy, Ptr(valuesY, py), posMinY, iposMinY, iposLimY, Ptr(mat, pmat), - crow, valuesY.Size); - } - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, - int[] starts, int[] indices, float[] coefs, int crow, float decay) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == crow + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(crow * y.Size >= coefs.Length); - Contracts.Assert(decay >= 0); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed 
(int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - Thunk.AddXYTranRX(a, Ptr(x, px), Ptr(y, py), pstarts, pindices, pcoefs, crow, decay); - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, int[] mprowiv, - int[] mprowcol, int[] mprowrun, int[] runs, float[] coefs, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.AssertNonEmpty(mprowiv); - Contracts.Assert(mprowiv.Length == crow); - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowrun == null || mprowrun.Length == crow); - Contracts.AssertNonEmpty(runs); - Contracts.AssertNonEmpty(coefs); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (int* pmprowiv = &mprowiv[0]) - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - { - if (mprowrun == null) - Thunk.AddXYTranCX(a, Ptr(x, px), Ptr(y, py), pmprowiv, pmprowcol, pruns, pcoefs, crow); - else - { - fixed (int* pmprowrun = mprowrun) - Thunk.AddXYTranDX(a, Ptr(x, px), Ptr(y, py), pmprowiv, pmprowcol, pmprowrun, pruns, pcoefs, crow); - } - } - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, AlignedArray mat, float momentum, AlignedArray delta, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(delta)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * y.Size == mat.Size); - Contracts.Assert(mat.Size == delta.Size); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (float* pmat = &mat.Items[0]) - fixed (float* pdel = &delta.Items[0]) - Thunk.AddXYTranMomX(a, Ptr(x, px), Ptr(y, py), Ptr(mat, pmat), momentum, Ptr(delta, pdel), crow, y.Size); - } - } - - public static void AddXYTran(AlignedArray x, AlignedArray y, AlignedArray mat, AlignedArray accGrads, AlignedArray accUpdates, - float decay, float cond, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(accGrads)); - Contracts.Assert(Compat(accUpdates)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * y.Size == mat.Size); - Contracts.Assert(mat.Size == accGrads.Size); - Contracts.Assert(mat.Size == accUpdates.Size); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (float* pmat = &mat.Items[0]) - fixed (float* pag = &accGrads.Items[0]) - fixed (float* pau = &accUpdates.Items[0]) - Thunk.AddXYTranGradX(Ptr(x, px), Ptr(y, py), Ptr(mat, pmat), Ptr(accGrads, pag), Ptr(accUpdates, pau), decay, cond, crow, y.Size); - } - } - - public static void AddXYTran(AlignedArray x, AlignedArray y, int[] starts, int[] indices, - float[] coefs, float[] accGrads, float[] accUpdates, float decay, float cond, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == crow + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(crow * y.Size >= coefs.Length); - Contracts.AssertNonEmpty(accGrads); - 
Contracts.Assert(coefs.Length == accGrads.Length); - Contracts.AssertNonEmpty(accUpdates); - Contracts.Assert(coefs.Length == accUpdates.Length); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* pag = &accGrads[0]) - fixed (float* pau = &accUpdates[0]) - Thunk.AddXYTranGradRX(Ptr(x, px), Ptr(y, py), pstarts, pindices, pcoefs, pag, pau, decay, cond, crow); - } - } - - public static void AddXYTran(AlignedArray x, int[] rgposY, AlignedArray valuesY, - int posMinY, int iposMinY, int iposLimY, AlignedArray mat, - AlignedArray accGrads, AlignedArray accUpdates, float decay, float cond, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.AssertNonEmpty(rgposY); - Contracts.Assert(Compat(valuesY)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(0 <= iposMinY && iposMinY <= iposLimY && iposLimY <= rgposY.Length); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * valuesY.Size == mat.Size); - Contracts.Assert(mat.Size == accGrads.Size); - Contracts.Assert(mat.Size == accUpdates.Size); - - if (iposMinY >= iposLimY) - return; - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &valuesY.Items[0]) - fixed (int* pposy = &rgposY[0]) - fixed (float* pmat = &mat.Items[0]) - fixed (float* pag = &accGrads.Items[0]) - fixed (float* pau = &accUpdates.Items[0]) - { - Thunk.AddXYTranGradPX(Ptr(x, px), pposy, Ptr(valuesY, py), posMinY, iposMinY, iposLimY, Ptr(mat, pmat), - Ptr(accGrads, pag), Ptr(accUpdates, pau), decay, cond, crow, valuesY.Size); - } - } - } - - public static void Scale(float a, AlignedArray dst) - { - Contracts.Assert(Compat(dst)); - - unsafe - { - fixed (float* pdst = &dst.Items[0]) - Thunk.ScaleX(a, Ptr(dst, pdst), dst.Size); - } - } - - public static void Scale(float a, float[] dst, int count) - { - Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count && count <= dst.Length); - - unsafe - { - fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); - } - } - - public static void ScaleConvWeights(float a, int kernelSize, float[] dst) - { - Contracts.AssertValue(dst); - - // REVIEW: implement in SSE/AVX. - for (int istart = 0; istart < dst.Length; istart += kernelSize + 1) - { - for (int i = 0; i < kernelSize; i++) - dst[istart + i] *= a; - } - } - - public static void ScaleMaxNorm(bool tran, float maxNorm, AlignedArray mat, int crun, int runLenPhy) - { - // Called only with Avx alignment. 
- Contracts.Assert(Compat(mat)); - - unsafe - { - fixed (float* pmat = &mat.Items[0]) - { - if (!tran) - Thunk.ScaleMaxNormX(maxNorm, Ptr(mat, pmat), crun, runLenPhy); - else - Thunk.ScaleMaxNormTranU(maxNorm, Ptr(mat, pmat), crun, runLenPhy); - } - } - } - - public static void ScaleMaxNorm(float maxNorm, int[] starts, int[] indices, float[] mat) - { - Contracts.AssertNonEmpty(starts); - - int crow = starts.Length - 1; - Contracts.Assert(starts[0] == 0); - Contracts.AssertValue(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(mat); - - unsafe - { - fixed (int* pstarts = &starts[0]) - fixed (float* pmat = &mat[0]) - Thunk.ScaleMaxNormRU(maxNorm, pstarts, pmat, crow); - } - } - - public static void ScaleMaxNorm(float maxNorm, int kernCount, int kernSize, float[] mat) - { - Contracts.AssertNonEmpty(mat); - - unsafe - { - fixed (float* pmat = &mat[0]) - Thunk.ScaleMaxNormCU(maxNorm, kernCount, kernSize, pmat); - } - } - - public static void AddScale(float a, AlignedArray src, AlignedArray dst) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.AddScaleX(a, Ptr(src, psrc), Ptr(dst, pdst), dst.Size); - } - } - - public static void AddScale(float a, AlignedArray src, AlignedArray dst, float momentum, AlignedArray delta) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(Compat(delta)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(src.Size == delta.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - fixed (float* pdel = &delta.Items[0]) - Thunk.AddScaleMomX(a, Ptr(src, psrc), Ptr(dst, pdst), momentum, Ptr(delta, pdel), dst.Size); - } - } - - public static void AddScale(float a, float[] src, float[] dst, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - Contracts.AssertNonEmpty(dst); - Contracts.Assert(count <= dst.Length); - - unsafe - { - fixed (float* psrc = &src[0]) - fixed (float* pdst = &dst[0]) - Thunk.AddScaleU(a, psrc, pdst, count); - } - } - - public static void AddScale(AlignedArray src, AlignedArray dst, - AlignedArray accGrads, AlignedArray accUpdates, float decay, float cond) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(Compat(accGrads)); - Contracts.Assert(Compat(accUpdates)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(src.Size == accGrads.Size); - Contracts.Assert(src.Size == accUpdates.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - fixed (float* pag = &accGrads.Items[0]) - fixed (float* pau = &accUpdates.Items[0]) - Thunk.AddScaleGradX(Ptr(src, psrc), Ptr(dst, pdst), Ptr(accGrads, pag), Ptr(accUpdates, pau), decay, cond, dst.Size); - } - } - - public static void Add(AlignedArray src, AlignedArray dst) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.AddX(Ptr(src, psrc), Ptr(dst, pdst), dst.Size); - } - } - - public static void Add(float[] src, float[] dst, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - Contracts.AssertNonEmpty(dst); - Contracts.Assert(count <= dst.Length); - - unsafe - { - fixed (float* 
ps = &src[0]) - fixed (float* pd = &dst[0]) - Thunk.AddU(ps, pd, count); - } - } - - public static float Sum(AlignedArray src) - { - Contracts.Assert(Compat(src)); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - return Thunk.SumX(Ptr(src, psrc), src.Size); - } - } - - public static float Sum(float[] src, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - - unsafe - { - fixed (float* psrc = &src[0]) - return Thunk.SumU(psrc, count); - } - } - - public static float SumSq(float[] src, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - - unsafe - { - fixed (float* psrc = &src[0]) - return Thunk.SumSqU(psrc, count); - } - } - - public static float SumAbs(float[] src, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - - unsafe - { - fixed (float* psrc = &src[0]) - return Thunk.SumAbsU(psrc, count); - } - } - - public static float MaxAbs(float[] src, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - - unsafe - { - fixed (float* psrc = &src[0]) - return Thunk.MaxAbsU(psrc, count); - } - } - - public static float DotProductSparse(float[] a, float[] b, int[] indices, int count) - { - Contracts.AssertNonEmpty(a); - Contracts.AssertNonEmpty(b); - Contracts.Assert(0 < count); - Contracts.Assert(count < a.Length); - Contracts.Assert(count <= b.Length); - Contracts.Assert(count <= indices.Length); - - unsafe - { - fixed (float* pa = &a[0]) - fixed (float* pb = &b[0]) - fixed (int* pi = &indices[0]) - return Thunk.DotSU(pa, pb, pi, count); - } - } - - public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) - { - Contracts.AssertNonEmpty(a); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset < a.Length); - Contracts.Assert(a.Length - offset > count); - Contracts.AssertNonEmpty(b); - Contracts.Assert(count <= b.Length); - Contracts.Assert(count <= indices.Length); - - unsafe - { - fixed (float* pa = &a[offset]) - fixed (float* pb = &b[0]) - fixed (int* pi = &indices[0]) - return Thunk.DotSU(pa, pb, pi, count); - } - } - - public static void ApplySigmoid(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySigmoidX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySoftMax(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySoftMaxX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyRectifiedLinear(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyRectifiedLinearX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySquare(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - 
Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySquareX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySqrt(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySqrtX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySoftRectifiedLinear(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySoftRectifiedLinearX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyAbs(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyAbsX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyTanh(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyTanhX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyBoundedRectifiedLinear(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 <= c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyBoundedRectifiedLinearX(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySigmoidDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplySigmoidDerivativeX(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ApplyRectifiedLinearDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplyRectifiedLinearDerivativeX(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ApplySquareDerivative(AlignedArray input, AlignedArray output, AlignedArray grad, bool drop) - { - Contracts.Assert(Compat(input)); - Contracts.Assert(Compat(output)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(output.Size == input.Size); - Contracts.Assert(output.Size == grad.Size); - - unsafe - { - fixed (float* px = &input.Items[0]) - fixed (float* py = &output.Items[0]) - fixed (float* pg = &grad.Items[0]) - Thunk.ApplySquareDerivativeX(Ptr(input, 
px), Ptr(output, py), Ptr(grad, pg), grad.Size, drop); - } - } - - public static void ApplySqrtDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplySqrtDerivativeX(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ApplySoftRectifiedLinearDerivative(AlignedArray input, AlignedArray output, AlignedArray grad) - { - Contracts.Assert(Compat(input)); - Contracts.Assert(Compat(output)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(output.Size == input.Size); - Contracts.Assert(output.Size == grad.Size); - - unsafe - { - fixed (float* px = &input.Items[0]) - fixed (float* py = &output.Items[0]) - fixed (float* pg = &grad.Items[0]) - Thunk.ApplySoftRectifiedLinearDerivativeX(Ptr(input, px), Ptr(output, py), Ptr(grad, pg), grad.Size); - } - } - - public static void ApplyAbsDerivative(AlignedArray input, AlignedArray output, AlignedArray grad, bool drop) - { - Contracts.Assert(Compat(input)); - Contracts.Assert(Compat(output)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(output.Size == input.Size); - Contracts.Assert(output.Size == grad.Size); - - unsafe - { - fixed (float* px = &input.Items[0]) - fixed (float* py = &output.Items[0]) - fixed (float* pg = &grad.Items[0]) - Thunk.ApplyAbsDerivativeX(Ptr(input, px), Ptr(output, py), Ptr(grad, pg), grad.Size, drop); - } - } - - public static void ApplyTanhDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplyTanhDerivativeX(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ApplyBoundedRectifiedLinearDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplyBoundedRectifiedLinearDerivativeX(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) - { - Contracts.Assert(0 < ccol && ccol <= cfltRow); - - unsafe - { - fixed (float* pdst = &dst.Items[0]) - fixed (int* pi = &indices[0]) - { - if (ccol == cfltRow) - Thunk.ZeroItemsU(Ptr(dst, pdst), dst.Size, pi, indices.Length); - else - Thunk.ZeroMatrixItemsCore(Ptr(dst, pdst), dst.Size, ccol, cfltRow, pi, indices.Length); - } - } - } - - public static void ScaleAdadelta(float[] mat, float[] accGrads, float[] accUpdates, float decay, float cond, float[] grads) - { - Contracts.AssertNonEmpty(mat); - Contracts.AssertNonEmpty(accGrads); - Contracts.AssertNonEmpty(accUpdates); - Contracts.Assert(mat.Length == accGrads.Length); - Contracts.Assert(mat.Length == accUpdates.Length); - Contracts.Assert(mat.Length <= grads.Length); - - unsafe - { - fixed (float* pm = &mat[0]) - fixed (float* pag = &accGrads[0]) - fixed (float* pau = &accUpdates[0]) - fixed (float* pg = &grads[0]) - Thunk.ScaleAdadeltaX(pm, pag, pau, decay, cond, pg, mat.Length); - } - } - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 
b31a427139..fd04fb63ea 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -20,25 +20,6 @@ internal static class AvxIntrinsics
 {
        private static readonly Vector256<float> _absMask256 = Avx.StaticCast<int, float>(Avx.SetAllVector256(0x7FFFFFFF));

-        private const int Vector256Alignment = 32;
-
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static bool HasCompatibleAlignment(AlignedArray alignedArray)
-        {
-            Contracts.AssertValue(alignedArray);
-            Contracts.Assert(alignedArray.Size > 0);
-            return (alignedArray.CbAlign % Vector256Alignment) == 0;
-        }
-
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase)
-        {
-            Contracts.AssertValue(alignedArray);
-            float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase);
-            Contracts.Assert(((long)alignedBase % Vector256Alignment) == 0);
-            return alignedBase;
-        }
-
        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
        private static Vector128<float> GetHigh(in Vector256<float> x)
            => Avx.ExtractVector128(x, 1);
@@ -106,19 +87,15 @@ private static Vector256<float> GetNewDst256(in Vector256<float> xDst1, in Vecto
        }

        // Multiply matrix times vector into vector.
-        public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        public static unsafe void MatMulX(bool add, float[] mat, float[] src, float[] dst, int crow, int ccol)
        {
-            Contracts.Assert(HasCompatibleAlignment(mat));
-            Contracts.Assert(HasCompatibleAlignment(src));
-            Contracts.Assert(HasCompatibleAlignment(dst));
-
-            fixed (float* pSrcStart = &src.Items[0])
-            fixed (float* pDstStart = &dst.Items[0])
-            fixed (float* pMatStart = &mat.Items[0])
+            fixed (float* pSrcStart = &src[0])
+            fixed (float* pDstStart = &dst[0])
+            fixed (float* pMatStart = &mat[0])
            {
-                float* psrc = GetAlignedBase(src, pSrcStart);
-                float* pdst = GetAlignedBase(dst, pDstStart);
-                float* pmat = GetAlignedBase(mat, pMatStart);
+                float* psrc = pSrcStart;
+                float* pdst = pDstStart;
+                float* pmat = pMatStart;

                float* pSrcEnd = psrc + ccol;
                float* pDstEnd = pdst + crow;
@@ -138,11 +115,11 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src,
                    {
                        float* pMatTemp = pMatCurrent;

-                        Vector256<float> x01 = Avx.LoadAlignedVector256(pMatTemp);
-                        Vector256<float> x11 = Avx.LoadAlignedVector256(pMatTemp += ccol);
-                        Vector256<float> x21 = Avx.LoadAlignedVector256(pMatTemp += ccol);
-                        Vector256<float> x31 = Avx.LoadAlignedVector256(pMatTemp += ccol);
-                        Vector256<float> x02 = Avx.LoadAlignedVector256(pSrcCurrent);
+                        Vector256<float> x01 = Avx.LoadVector256(pMatTemp);
+                        Vector256<float> x11 = Avx.LoadVector256(pMatTemp += ccol);
+                        Vector256<float> x21 = Avx.LoadVector256(pMatTemp += ccol);
+                        Vector256<float> x31 = Avx.LoadVector256(pMatTemp += ccol);
+                        Vector256<float> x02 = Avx.LoadVector256(pSrcCurrent);

                        res0 = Avx.Add(res0, Avx.Multiply(x01, x02));
                        res1 = Avx.Add(res1, Avx.Multiply(x11, x02));
@@ -161,7 +138,7 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src,
                    Vector128<float> sum = Sse.Add(Avx.GetLowerHalf(res0), GetHigh(in res0));
                    if (add)
                    {
-                        sum = Sse.Add(sum, Sse.LoadAlignedVector128(pDstCurrent));
+                        sum = Sse.Add(sum, Sse.LoadVector128(pDstCurrent));
                    }
                    Sse.StoreAligned(pDstCurrent, sum);
@@ -172,23 +149,19 @@ public static unsafe void MatMulX(bool add, AlignedArray mat, AlignedArray src,
        }

        // Partial sparse source vector.
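// Before the sparse variants below, a minimal scalar sketch of what MatMulX above computes. It
// mirrors the managed fallback that appears in CpuMathUtils.netcoreapp.cs later in this patch;
// the helper name MatMulScalar is illustrative, not part of this change.
static void MatMulScalar(bool add, float[] mat, float[] src, float[] dst, int crow, int ccol)
{
    // mat is row-major with shape crow x ccol; dst[i] gets the dot product of row i with src.
    for (int i = 0; i < crow; i++)
    {
        float dot = 0;
        for (int j = 0; j < ccol; j++)
            dot += mat[i * ccol + j] * src[j];
        // The add flag selects accumulate-into versus overwrite, as in the intrinsics path.
        dst[i] = add ? dst[i] + dot : dot;
    }
}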
-        public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
-            int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
+        public static unsafe void MatMulPX(bool add, float[] mat, int[] rgposSrc, float[] src,
+            int posMin, int iposMin, int iposEnd, float[] dst, int crow, int ccol)
        {
-            Contracts.Assert(HasCompatibleAlignment(mat));
-            Contracts.Assert(HasCompatibleAlignment(src));
-            Contracts.Assert(HasCompatibleAlignment(dst));
-
            // REVIEW: For extremely sparse inputs, interchanging the loops would
            // likely be more efficient.
-            fixed (float* pSrcStart = &src.Items[0])
-            fixed (float* pDstStart = &dst.Items[0])
-            fixed (float* pMatStart = &mat.Items[0])
+            fixed (float* pSrcStart = &src[0])
+            fixed (float* pDstStart = &dst[0])
+            fixed (float* pMatStart = &mat[0])
            fixed (int* pposSrc = &rgposSrc[0])
            {
-                float* psrc = GetAlignedBase(src, pSrcStart);
-                float* pdst = GetAlignedBase(dst, pDstStart);
-                float* pmat = GetAlignedBase(mat, pMatStart);
+                float* psrc = pSrcStart;
+                float* pdst = pDstStart;
+                float* pmat = pMatStart;

                int* pposMin = pposSrc + iposMin;
                int* pposEnd = pposSrc + iposEnd;
@@ -221,7 +194,7 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A

                    if (add)
                    {
-                        result = Avx.Add(result, Avx.LoadAlignedVector256(pDstCurrent));
+                        result = Avx.Add(result, Avx.LoadVector256(pDstCurrent));
                    }
                    Avx.StoreAligned(pDstCurrent, result);
@@ -231,19 +204,15 @@ public static unsafe void MatMulPX(bool add, AlignedArray mat, int[] rgposSrc, A
            }
        }

-        public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        public static unsafe void MatMulTranX(bool add, float[] mat, float[] src, float[] dst, int crow, int ccol)
        {
-            Contracts.Assert(HasCompatibleAlignment(mat));
-            Contracts.Assert(HasCompatibleAlignment(src));
-            Contracts.Assert(HasCompatibleAlignment(dst));
-
-            fixed (float* pSrcStart = &src.Items[0])
-            fixed (float* pDstStart = &dst.Items[0])
-            fixed (float* pMatStart = &mat.Items[0])
+            fixed (float* pSrcStart = &src[0])
+            fixed (float* pDstStart = &dst[0])
+            fixed (float* pMatStart = &mat[0])
            {
-                float* psrc = GetAlignedBase(src, pSrcStart);
-                float* pdst = GetAlignedBase(dst, pDstStart);
-                float* pmat = GetAlignedBase(mat, pMatStart);
+                float* psrc = pSrcStart;
+                float* pdst = pDstStart;
+                float* pmat = pMatStart;

                float* pSrcEnd = psrc + ccol;
                float* pDstEnd = pdst + crow;
@@ -253,7 +222,7 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
                // We do 4-way unrolling
                if (!add)
                {
-                    Vector128<float> h01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    Vector128<float> h01 = Sse.LoadVector128(pSrcCurrent);
                    // Replicate each slot of h01 (ABCD) into its own register.
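// On the Shuffle calls below: Sse.Shuffle(v, v, control) with both operands equal broadcasts one
// lane of v. Each 2-bit field of the control byte picks a source lane, so 0x00 = 0b00_00_00_00
// replicates lane A, 0x55 = 0b01_01_01_01 lane B, 0xAA = 0b10_10_10_10 lane C, and 0xFF would
// replicate lane D (the A and D shuffles fall outside the hunk context shown here). This is how
// the four slots of h01 (ABCD) each end up in their own register for the 4-way unrolled multiply.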
                    Vector128<float> h11 = Sse.Shuffle(h01, h01, 0x55); // B
                    Vector128<float> h21 = Sse.Shuffle(h01, h01, 0xAA); // C
@@ -272,10 +241,10 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
                    while (pDstCurrent < pDstEnd)
                    {
                        float* pMatTemp = pMatCurrent;
-                        Vector256<float> x02 = Avx.LoadAlignedVector256(pMatTemp);
-                        Vector256<float> x12 = Avx.LoadAlignedVector256(pMatTemp += crow);
-                        Vector256<float> x22 = Avx.LoadAlignedVector256(pMatTemp += crow);
-                        Vector256<float> x32 = Avx.LoadAlignedVector256(pMatTemp += crow);
+                        Vector256<float> x02 = Avx.LoadVector256(pMatTemp);
+                        Vector256<float> x12 = Avx.LoadVector256(pMatTemp += crow);
+                        Vector256<float> x22 = Avx.LoadVector256(pMatTemp += crow);
+                        Vector256<float> x32 = Avx.LoadVector256(pMatTemp += crow);

                        x02 = Avx.Multiply(x01, x02);
                        x12 = Avx.Multiply(x11, x12);
@@ -297,7 +266,7 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s

                while (pSrcCurrent < pSrcEnd)
                {
-                    Vector128<float> h01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    Vector128<float> h01 = Sse.LoadVector128(pSrcCurrent);
                    // Replicate each slot of h01 (ABCD) into its own register.
                    Vector128<float> h11 = Sse.Shuffle(h01, h01, 0x55); // B
                    Vector128<float> h21 = Sse.Shuffle(h01, h01, 0xAA); // C
@@ -315,11 +284,11 @@
                    {
                        float* pMatTemp = pMatCurrent;

-                        Vector256<float> x02 = Avx.LoadAlignedVector256(pMatTemp);
-                        Vector256<float> x12 = Avx.LoadAlignedVector256(pMatTemp += crow);
-                        Vector256<float> x22 = Avx.LoadAlignedVector256(pMatTemp += crow);
-                        Vector256<float> x32 = Avx.LoadAlignedVector256(pMatTemp += crow);
-                        Vector256<float> x3 = Avx.LoadAlignedVector256(pDstCurrent);
+                        Vector256<float> x02 = Avx.LoadVector256(pMatTemp);
+                        Vector256<float> x12 = Avx.LoadVector256(pMatTemp += crow);
+                        Vector256<float> x22 = Avx.LoadVector256(pMatTemp += crow);
+                        Vector256<float> x32 = Avx.LoadVector256(pMatTemp += crow);
+                        Vector256<float> x3 = Avx.LoadVector256(pDstCurrent);

                        x02 = Avx.Multiply(x01, x02);
                        x12 = Avx.Multiply(x11, x12);
@@ -344,21 +313,17 @@ public static unsafe void MatMulTranX(bool add, AlignedArray mat, AlignedArray s
        }

        // Partial sparse source vector.
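// A minimal scalar sketch of the "partial sparse source vector" contract used by MatMulPX above
// and MatMulTranPX below, mirroring the managed fallback in CpuMathUtils.netcoreapp.cs later in
// this patch: rgposSrc[iposMin] through rgposSrc[iposEnd - 1] hold the positions of the non-zero
// entries, offset by posMin, and srcValues is the dense buffer those positions index into. The
// helper name MatMulSparseScalar is illustrative, not part of this change.
static void MatMulSparseScalar(bool add, float[] mat, int[] rgposSrc, float[] srcValues,
    int posMin, int iposMin, int iposEnd, float[] dst, int crow, int ccol)
{
    for (int i = 0; i < crow; i++)
    {
        float dot = 0;
        for (int ipos = iposMin; ipos < iposEnd; ipos++)
        {
            // Translate the stored position into a column of the dense source buffer.
            int col = rgposSrc[ipos] - posMin;
            dot += mat[i * ccol + col] * srcValues[col];
        }
        dst[i] = add ? dst[i] + dot : dot;
    }
}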
-        public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
-            int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
+        public static unsafe void MatMulTranPX(bool add, float[] mat, int[] rgposSrc, float[] src,
+            int posMin, int iposMin, int iposEnd, float[] dst, int crow)
        {
-            Contracts.Assert(HasCompatibleAlignment(mat));
-            Contracts.Assert(HasCompatibleAlignment(src));
-            Contracts.Assert(HasCompatibleAlignment(dst));
-
-            fixed (float* pSrcStart = &src.Items[0])
-            fixed (float* pDstStart = &dst.Items[0])
-            fixed (float* pMatStart = &mat.Items[0])
+            fixed (float* pSrcStart = &src[0])
+            fixed (float* pDstStart = &dst[0])
+            fixed (float* pMatStart = &mat[0])
            fixed (int* pposSrc = &rgposSrc[0])
            {
-                float* psrc = GetAlignedBase(src, pSrcStart);
-                float* pdst = GetAlignedBase(dst, pDstStart);
-                float* pmat = GetAlignedBase(mat, pMatStart);
+                float* psrc = pSrcStart;
+                float* pdst = pDstStart;
+                float* pmat = pMatStart;

                int* ppos = pposSrc + iposMin;
                int* pposEnd = pposSrc + iposEnd;
@@ -375,7 +340,7 @@ public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSr

                while (pDstCurrent < pDstEnd)
                {
-                    Vector256<float> x1 = Avx.LoadAlignedVector256(pMatCurrent);
+                    Vector256<float> x1 = Avx.LoadVector256(pMatCurrent);
                    x1 = Avx.Multiply(x1, x0);
                    Avx.StoreAligned(pDstCurrent, x1);
@@ -395,8 +360,8 @@ public static unsafe void MatMulTranPX(bool add, AlignedArray mat, int[] rgposSr

                while (pDstCurrent < pDstEnd)
                {
-                    Vector256<float> x1 = Avx.LoadAlignedVector256(pMatCurrent);
-                    Vector256<float> x2 = Avx.LoadAlignedVector256(pDstCurrent);
+                    Vector256<float> x1 = Avx.LoadVector256(pMatCurrent);
+                    Vector256<float> x2 = Avx.LoadVector256(pDstCurrent);
                    x1 = Avx.Multiply(x1, x0);
                    x2 = Avx.Add(x2, x1);
                    Avx.StoreAligned(pDstCurrent, x2);
diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
deleted file mode 100644
index 30308f219d..0000000000
--- a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
+++ /dev/null
@@ -1,153 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-namespace Microsoft.ML.Runtime.Internal.CpuMath
-{
-    public static class CpuAligenedMathUtils<TMatrix>
-        where TMatrix : CpuAlignedMatrixBase, ICpuFullMatrix
-    {
-        /// <summary>
-        /// Assert the compatibility of the underlying AlignedArray for the input matrix in terms of alignment amount.
-        /// </summary>
-        /// <param name="values">The input matrix</param>
-        public static void AssertCompatible(ICpuFullMatrix values)
-        {
-#if DEBUG
-            var mat = values as TMatrix;
-            Contracts.AssertValue(mat);
-            Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
-#endif
-        }
-
-        /// <summary>
-        /// Assert the compatibility of the underlying AlignedArray for the input vector in terms of alignment amount.
-        /// </summary>
-        /// <param name="values">The input vector</param>
-        public static void AssertCompatible(ICpuVector values)
-        {
-#if DEBUG
-            CpuAlignedVector vec = values as CpuAlignedVector;
-            Contracts.AssertValue(vec);
-            Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
-#endif
-        }
-
-        private static TMatrix A(ICpuFullMatrix x)
-        {
-            AssertCompatible(x);
-            return (TMatrix)x;
-        }
-
-        private static CpuAlignedVector A(ICpuVector x)
-        {
-            AssertCompatible(x);
-            return (CpuAlignedVector)x;
-        }
-
-        private static void AssertCompatibleCore(ICpuMatrix mat, ICpuVector src, ICpuVector dst)
-        {
-            AssertCompatible(src);
-            AssertCompatible(dst);
-            Contracts.Assert(mat.ColCount == src.VectorSize);
-            Contracts.Assert(mat.RowCount == dst.VectorSize);
-        }
-
-        /// <summary>
-        /// Asserts the following:
-        /// 1. The compatibility of the underlying AlignedArray for mat in terms of alignment amount.
-        /// 2. The compatibility of the underlying AlignedArray for src in terms of alignment amount.
-        /// 3. The compatibility of the underlying AlignedArray for dst in terms of alignment amount.
-        /// 4. The compatibility of the matrix-vector multiplication mat * src = dst.
-        /// </summary>
-        /// <param name="mat"></param>
-        /// <param name="src"></param>
-        /// <param name="dst"></param>
-        public static void AssertCompatible(ICpuFullMatrix mat, ICpuVector src, ICpuVector dst)
-        {
-            // Also check the physical sizes.
-            AssertCompatible(mat);
-            AssertCompatibleCore(mat, src, dst);
-            var m = A(mat);
-            Contracts.Assert(m.ColCountPhy == A(src).Items.Size);
-            Contracts.Assert(m.RowCountPhy == A(dst).Items.Size);
-        }
-
-        /// <summary>
-        /// Matrix multiplication:
-        /// if (add)
-        ///     dst += mat * src
-        /// else
-        ///     dst = mat * src
-        /// </summary>
-        /// <param name="add">The addition flag</param>
-        /// <param name="mat">The multiplier matrix</param>
-        /// <param name="src">The source vector</param>
-        /// <param name="dst">The destination vector</param>
-        public static void MatTimesSrc(bool add, ICpuFullMatrix mat, ICpuVector src, ICpuVector dst)
-        {
-            bool colMajor = typeof(TMatrix) == typeof(CpuAlignedMatrixCol);
-            AssertCompatible(mat, src, dst);
-            var m = A(mat);
-            CpuMathUtils.MatTimesSrc(colMajor, add, m.Items, A(src).Items, A(dst).Items, m.RunCnt);
-        }
-
-        /// <summary>
-        /// Matrix transpose multiplication:
-        /// if (add)
-        ///     dst += mat' * src
-        /// else
-        ///     dst = mat' * src
-        /// </summary>
-        /// <param name="add">The addition flag</param>
-        /// <param name="mat">The multiplier matrix</param>
-        /// <param name="src">The source vector</param>
-        /// <param name="dst">The destination vector</param>
-        public static void MatTranTimesSrc(bool add, ICpuFullMatrix mat, ICpuVector src, ICpuVector dst)
-        {
-            bool colMajor = typeof(TMatrix) == typeof(CpuAlignedMatrixCol);
-            AssertCompatible(mat, dst, src);
-            var m = A(mat);
-            CpuMathUtils.MatTimesSrc(!colMajor, add, m.Items, A(src).Items, A(dst).Items, m.RunCnt);
-        }
-    }
-
-    public static class GeneralUtils
-    {
-        /// <summary>
-        /// Count the number of zero bits in the longest string of zeros starting at the least significant bit of the input integer.
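// The helper documented here is a branch-halving trailing-zero count: each step tests the low half
// of the remaining bits and skips past it when it is all zeros. A quick sanity check of the
// contract (on .NET Core 3.0+, System.Numerics.BitOperations.TrailingZeroCount gives the same
// results, including 32 for an input of 0):
//
//     GeneralUtils.CbitLowZero(0b1000u) == 3   // three zero bits below the lowest set bit
//     GeneralUtils.CbitLowZero(1u) == 0        // lowest bit already set
//     GeneralUtils.CbitLowZero(0u) == 32       // no set bit at all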
-        /// </summary>
-        /// <param name="u">The input integer</param>
-        /// <returns></returns>
-        public static int CbitLowZero(uint u)
-        {
-            if (u == 0)
-                return 32;
-
-            int cbit = 0;
-            if ((u & 0x0000FFFF) == 0)
-            {
-                cbit += 16;
-                u >>= 16;
-            }
-            if ((u & 0x000000FF) == 0)
-            {
-                cbit += 8;
-                u >>= 8;
-            }
-            if ((u & 0x0000000F) == 0)
-            {
-                cbit += 4;
-                u >>= 4;
-            }
-            if ((u & 0x00000003) == 0)
-            {
-                cbit += 2;
-                u >>= 2;
-            }
-            if ((u & 0x00000001) == 0)
-                cbit += 1;
-            return cbit;
-        }
-    }
-}
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index f15f5c3938..236847f415 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -10,13 +10,13 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
    public static partial class CpuMathUtils
    {
-        // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray
+        // The count of bytes in Vector128, corresponding to _cbAlign in float[]
        private const int Vector128Alignment = 16;

-        // The count of bytes in Vector256, corresponding to _cbAlign in AlignedArray
+        // The count of bytes in Vector256, corresponding to _cbAlign in float[]
        private const int Vector256Alignment = 32;

-        // The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray
+        // The count of bytes in a 32-bit float, corresponding to _cbAlign in float[]
        private const int FloatAlignment = 4;

        // If neither AVX nor SSE is supported, return basic alignment for a 4-byte float.
@@ -24,48 +24,48 @@ public static partial class CpuMathUtils
        public static int GetVectorAlignment()
            => Avx.IsSupported ? Vector256Alignment : (Sse.IsSupported ? Vector128Alignment : FloatAlignment);

-        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
+        public static void MatTimesSrc(bool tran, bool add, float[] mat, float[] src, float[] dst, int crun)
        {
-            Contracts.Assert(mat.Size == dst.Size * src.Size);
+            Contracts.Assert(mat.Length == dst.Length * src.Length);
            Contracts.Assert(crun >= 0);

            if (Avx.IsSupported)
            {
                if (!tran)
                {
-                    Contracts.Assert(crun <= dst.Size);
-                    AvxIntrinsics.MatMulX(add, mat, src, dst, crun, src.Size);
+                    Contracts.Assert(crun <= dst.Length);
+                    AvxIntrinsics.MatMulX(add, mat, src, dst, crun, src.Length);
                }
                else
                {
-                    Contracts.Assert(crun <= src.Size);
-                    AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun);
+                    Contracts.Assert(crun <= src.Length);
+                    AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Length, crun);
                }
            }
            else if (Sse.IsSupported)
            {
                if (!tran)
                {
-                    Contracts.Assert(crun <= dst.Size);
-                    SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size);
+                    Contracts.Assert(crun <= dst.Length);
+                    SseIntrinsics.MatMul(add, mat, src, dst, crun, src.Length);
                }
                else
                {
-                    Contracts.Assert(crun <= src.Size);
-                    SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun);
+                    Contracts.Assert(crun <= src.Length);
+                    SseIntrinsics.MatMulTran(add, mat, src, dst, dst.Length, crun);
                }
            }
            else
            {
                if (!tran)
                {
-                    Contracts.Assert(crun <= dst.Size);
+                    Contracts.Assert(crun <= dst.Length);
                    for (int i = 0; i < crun; i++)
                    {
                        float dotProduct = 0;
-                        for (int j = 0; j < src.Size; j++)
+                        for (int j = 0; j < src.Length; j++)
                        {
-                            dotProduct += mat[i * src.Size + j] * src[j];
+                            dotProduct += mat[i * src.Length + j] * src[j];
                        }

                        if (add)
@@ -80,13 +80,13 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr
                }
                else
                {
-                    Contracts.Assert(crun <= src.Size);
-                    for (int i = 0; i < dst.Size; i++)
+                    Contracts.Assert(crun <= 
src.Length); + for (int i = 0; i < dst.Length; i++) { float dotProduct = 0; for (int j = 0; j < crun; j++) { - dotProduct += mat[j * src.Size + i] * src[j]; + dotProduct += mat[j * src.Length + i] * src[j]; } if (add) @@ -102,19 +102,19 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArr } } - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, - int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) + public static void MatTimesSrc(bool tran, bool add, float[] mat, int[] rgposSrc, float[] srcValues, + int posMin, int iposMin, int iposLim, float[] dst, int crun) { Contracts.AssertValue(rgposSrc); Contracts.Assert(iposMin >= 0); Contracts.Assert(iposMin <= iposLim); Contracts.Assert(iposLim <= rgposSrc.Length); - Contracts.Assert(mat.Size == dst.Size * srcValues.Size); + Contracts.Assert(mat.Length == dst.Length * srcValues.Length); if (iposMin >= iposLim) { if (!add) - dst.ZeroItems(); + Array.Clear(dst, 0, dst.Length); return; } @@ -125,40 +125,40 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo { if (!tran) { - Contracts.Assert(crun <= dst.Size); - AvxIntrinsics.MatMulPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); + Contracts.Assert(crun <= dst.Length); + AvxIntrinsics.MatMulPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Length); } else { - Contracts.Assert(crun <= srcValues.Size); - AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); + Contracts.Assert(crun <= srcValues.Length); + AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Length); } } else if (Sse.IsSupported) { if (!tran) { - Contracts.Assert(crun <= dst.Size); - SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); + Contracts.Assert(crun <= dst.Length); + SseIntrinsics.MatMulP(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Length); } else { - Contracts.Assert(crun <= srcValues.Size); - SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); + Contracts.Assert(crun <= srcValues.Length); + SseIntrinsics.MatMulTranP(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Length); } } else { if (!tran) { - Contracts.Assert(crun <= dst.Size); + Contracts.Assert(crun <= dst.Length); for (int i = 0; i < crun; i++) { float dotProduct = 0; for (int j = iposMin; j < iposLim; j++) { int col = rgposSrc[j] - posMin; - dotProduct += mat[i * srcValues.Size + col] * srcValues[col]; + dotProduct += mat[i * srcValues.Length + col] * srcValues[col]; } if (add) @@ -173,14 +173,14 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo } else { - Contracts.Assert(crun <= srcValues.Size); - for (int i = 0; i < dst.Size; i++) + Contracts.Assert(crun <= srcValues.Length); + for (int i = 0; i < dst.Length; i++) { float dotProduct = 0; for (int j = iposMin; j < iposLim; j++) { int col = rgposSrc[j] - posMin; - dotProduct += mat[col * dst.Size + i] * srcValues[col]; + dotProduct += mat[col * dst.Length + i] * srcValues[col]; } if (add) @@ -947,24 +947,24 @@ private static float L2DistSquared(Span a, Span b) } } - public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) + public static void ZeroMatrixItems(float[] dst, int ccol, int cfltRow, int[] indices) { 
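// A reading of the two parameters here, inferred from the call paths below rather than stated in
// the diff: ccol is the number of logical columns and cfltRow the physical stride of a row in
// floats. With the old aligned layout rows could be padded out to the alignment boundary, so a
// matrix with ccol == cfltRow can be zeroed through the flat ZeroItemsU path, while a padded one
// needs ZeroMatrixItemsCore to translate logical indices into physical positions.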
Contracts.Assert(ccol > 0); Contracts.Assert(ccol <= cfltRow); if (ccol == cfltRow) { - ZeroItemsU(dst, dst.Size, indices, indices.Length); + ZeroItemsU(dst, dst.Length, indices, indices.Length); } else { - ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length); + ZeroMatrixItemsCore(dst, dst.Length, ccol, cfltRow, indices, indices.Length); } } - private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices) + private static unsafe void ZeroItemsU(float[] dst, int c, int[] indices, int cindices) { - fixed (float* pdst = &dst.Items[0]) + fixed (float* pdst = &dst[0]) fixed (int* pidx = &indices[0]) { for (int i = 0; i < cindices; ++i) @@ -977,9 +977,9 @@ private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, in } } - private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices) + private static unsafe void ZeroMatrixItemsCore(float[] dst, int c, int ccol, int cfltRow, int[] indices, int cindices) { - fixed (float* pdst = &dst.Items[0]) + fixed (float* pdst = &dst[0]) fixed (int* pidx = &indices[0]) { int ivLogMin = 0; diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index b35f171388..b78a06452b 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -8,24 +8,15 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { - // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray - private const int Vector128Alignment = 16; + public static void MatTimesSrc(bool tran, bool add, float[] mat, float[] src, float[] dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - public static int GetVectorAlignment() - => Vector128Alignment; - - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); - - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, - int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun); + public static void MatTimesSrc(bool tran, bool add, float[] mat, int[] rgposSrc, float[] srcValues, + int posMin, int iposMin, int iposLim, float[] dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun); public static void Add(float a, float[] dst, int count) => SseUtils.Add(a, dst, count); public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count); - public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count); - public static void Scale(float a, float[] src, float[] dst, int count) => SseUtils.Scale(a, src, dst, count); public static void ScaleAdd(float a, float b, float[] dst, int count) => SseUtils.ScaleAdd(a, b, dst, count); @@ -44,8 +35,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo public static void Add(float[] src, int[] indices, float[] dst, int count) => SseUtils.Add(src, indices, dst, count); - public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count) => SseUtils.Add(src, indices, dst, 
dstOffset, count); - public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) => SseUtils.MulElementWise(src1, src2, dst, count); public static float Sum(float[] src, int count) => SseUtils.Sum(src, count); @@ -58,14 +47,10 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo public static float SumSq(float mean, float[] src, int offset, int count) => SseUtils.SumSq(mean, src, offset, count); - public static float SumAbs(float[] src, int count) => SseUtils.SumAbs(src, count); - public static float SumAbs(float[] src, int offset, int count) => SseUtils.SumAbs(src, offset, count); public static float SumAbs(float mean, float[] src, int offset, int count) => SseUtils.SumAbs(mean, src, offset, count); - public static float MaxAbs(float[] src, int count) => SseUtils.MaxAbs(src, count); - public static float MaxAbs(float[] src, int offset, int count) => SseUtils.MaxAbs(src, offset, count); public static float MaxAbsDiff(float mean, float[] src, int count) => SseUtils.MaxAbsDiff(mean, src, count); @@ -80,8 +65,6 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count); - public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices); - public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) => SseUtils.SdcaL1UpdateDense(primalUpdate, length, src, threshold, v, w); diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index 13de22dd5b..bed31376dc 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -2,6 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
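// With this change the public SseUtils entry points take plain float[] buffers rather than
// AlignedArray wrappers. A minimal illustrative call of the dense overload kept below (buffer
// names and sizes are assumed, not taken from the diff):
//
//     int crow = 2, ccol = 4;
//     float[] mat = new float[crow * ccol];   // row-major, crow x ccol
//     float[] src = new float[ccol];
//     float[] dst = new float[crow];
//     SseUtils.MatTimesSrc(false, false, mat, src, dst, crow);   // dst = mat * src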
+using System; + namespace Microsoft.ML.Runtime.Internal.CpuMath { /// @@ -10,581 +12,65 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath /// public static class SseUtils { - public const int CbAlign = 16; - - private static bool Compat(AlignedArray a) - { - Contracts.AssertValue(a); - Contracts.Assert(a.Size > 0); - return a.CbAlign == CbAlign; - } - - private static unsafe float* Ptr(AlignedArray a, float* p) - { - Contracts.AssertValue(a); - float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (CbAlign - 1)) == 0); - return q; - } - - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) + public static void MatTimesSrc(bool tran, bool add, float[] mat, float[] src, float[] dst, int crun) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(mat.Size == dst.Size * src.Size); + Contracts.Assert(mat.Length == dst.Length * src.Length); unsafe { - fixed (float* pmat = &mat.Items[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) + fixed (float* pmat = &mat[0]) + fixed (float* psrc = &src[0]) + fixed (float* pdst = &dst[0]) { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Size); - Thunk.MatMulA(add, Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), crun, src.Size); + Contracts.Assert(0 <= crun && crun <= dst.Length); + Thunk.MatMul(add, pmat, psrc, pdst, crun, src.Length); } else { - Contracts.Assert(0 <= crun && crun <= src.Size); - Thunk.MatMulTranA(add, Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), dst.Size, crun); + Contracts.Assert(0 <= crun && crun <= src.Length); + Thunk.MatMulTran(add, pmat, psrc, pdst, dst.Length, crun); } } } } - public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, - int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) + public static void MatTimesSrc(bool tran, bool add, float[] mat, int[] rgposSrc, float[] srcValues, + int posMin, int iposMin, int iposLim, float[] dst, int crun) { - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(srcValues)); - Contracts.Assert(Compat(dst)); Contracts.AssertValue(rgposSrc); Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); - Contracts.Assert(mat.Size == dst.Size * srcValues.Size); + Contracts.Assert(mat.Length == dst.Length * srcValues.Length); if (iposMin >= iposLim) { if (!add) - dst.ZeroItems(); + Array.Clear(dst, 0, dst.Length); return; } Contracts.AssertNonEmpty(rgposSrc); unsafe { - fixed (float* pdst = &dst.Items[0]) - fixed (float* pmat = &mat.Items[0]) - fixed (float* psrc = &srcValues.Items[0]) + fixed (float* pdst = &dst[0]) + fixed (float* pmat = &mat[0]) + fixed (float* psrc = &srcValues[0]) fixed (int* ppossrc = &rgposSrc[0]) { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Size); - Thunk.MatMulPA(add, Ptr(mat, pmat), ppossrc, Ptr(srcValues, psrc), posMin, iposMin, iposLim, Ptr(dst, pdst), crun, srcValues.Size); - } - else - { - Contracts.Assert(0 <= crun && crun <= srcValues.Size); - Thunk.MatMulTranPA(add, Ptr(mat, pmat), ppossrc, Ptr(srcValues, psrc), posMin, iposMin, iposLim, Ptr(dst, pdst), dst.Size); - } - } - } - } - - public static void MatTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == crow + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - 
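// The starts/indices/coefs triple asserted here is a compressed-sparse-row (CSR) matrix: row r owns
// the entries coefs[starts[r] .. starts[r+1]), and indices[] gives each entry's column, which is why
// starts must have crow + 1 elements with starts[0] == 0 and starts[crow] == indices.Length.
// For example, the 2x3 matrix {{5,0,0},{0,7,9}} would be stored as
// starts = {0, 1, 3}, indices = {0, 1, 2}, coefs = {5, 7, 9}.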
Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - Contracts.Assert(crow * src.Size >= coefs.Length); - - unsafe - { - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.MatMulRU(add, pstarts, pindices, pcoefs, Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - - public static void MatTimesSrc(bool add, int[] mprowiv, int[] mprowcol, - int[] mprowrun, int[] runs, float[] coefs, - AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowiv); - Contracts.Assert(mprowiv.Length == crow); - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowrun == null || mprowrun.Length == crow); - Contracts.AssertNonEmpty(runs); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowiv = &mprowiv[0]) - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - if (mprowrun == null) - { - Thunk.MatMulCU(add, pmprowiv, pmprowcol, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), crow); + Contracts.Assert(0 <= crun && crun <= dst.Length); + Thunk.MatMulP(add, pmat, ppossrc, psrc, posMin, iposMin, iposLim, pdst, crun, srcValues.Length); } else { - fixed (int* pmprowrun = &mprowrun[0]) - { - Thunk.MatMulDU(add, pmprowiv, pmprowcol, pmprowrun, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), crow); - } + Contracts.Assert(0 <= crun && crun <= srcValues.Length); + Thunk.MatMulTranP(add, pmat, ppossrc, psrc, posMin, iposMin, iposLim, pdst, dst.Length); } } } } - public static void MeanOfSrc(bool add, int[] mprowcol, int[] mprowindices, - int[] indices, AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowindices == null || mprowindices.Length == crow); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pmprowindices = mprowindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.MeanU(add, pmprowcol, pmprowindices, pindices, Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - - public static void MaxOfSrc(bool add, int[] mprowcol, int[] mprowindices, - int[] indices, AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowindices == null || mprowindices.Length == crow); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pmprowindices = mprowindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.MaxU(add, pmprowcol, pmprowindices, pindices, Ptr(src, psrc), Ptr(dst, 
pdst), crow); - } - } - - public static void RespNormOfSrc(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - int[] mprowcol, int[] mprowindices, int[] indices, - AlignedArray src, AlignedArray dst, int crow) - { - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowindices == null || mprowindices.Length == crow); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < crow && crow <= dst.Size); - - unsafe - { - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pmprowindices = mprowindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - Thunk.RespNormU(add, alpha, beta, avgOverFullKernel, offset, pmprowcol, pmprowindices, pindices, - Ptr(src, psrc), Ptr(dst, pdst), crow); - } - } - } - - public static void MatTranTimesSrc(bool add, int[] starts, int[] indices, float[] coefs, - AlignedArray src, AlignedArray dst, int ccol) - { - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == ccol + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[ccol] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < ccol && ccol <= src.Size); - Contracts.Assert(dst.Size * ccol >= coefs.Length); - - unsafe - { - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.MatMulTranRU(add, pstarts, pindices, pcoefs, Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - } - - public static void MatTranTimesSrc(bool add, int[] mpcoliv, int[] mpcolrow, int[] mpcolrun, - int[] runs, float[] coefs, AlignedArray src, AlignedArray dst, int ccol) - { - Contracts.AssertNonEmpty(mpcoliv); - Contracts.Assert(mpcoliv.Length == ccol); - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.AssertNonEmpty(runs); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(mpcolrun == null || mpcolrun.Length == ccol); - Contracts.Assert(0 < ccol && ccol <= src.Size); - - unsafe - { - fixed (int* pmpcoliv = &mpcoliv[0]) - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - { - if (mpcolrun == null) - { - Thunk.MatMulTranCU(add, pmpcoliv, pmpcolrow, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - else - { - fixed (int* pmpcolrun = &mpcolrun[0]) - { - Thunk.MatMulTranDU(add, pmpcoliv, pmpcolrow, pmpcolrun, pruns, pcoefs, - Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - } - } - } - } - - public static void MeanBackOfSrc(bool add, int[] mpcolrow, int[] mpcolindices, - int[] indices, AlignedArray src, AlignedArray dst, int ccol) - { - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.Assert(mpcolindices == null || mpcolindices.Length == ccol); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(0 < ccol && ccol <= src.Size); - - unsafe - { - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pmpcolindices = 
mpcolindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.MeanBackU(add, pmpcolrow, pmpcolindices, pindices, Ptr(src, psrc), Ptr(dst, pdst), dst.Size, ccol); - } - } - - public static void MaxBackOfSrc(bool add, int[] mpcolrow, int[] mpcolindices, - int[] indices, AlignedArray src, AlignedArray dst, AlignedArray val, int ccol) - { - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.Assert(mpcolindices == null || mpcolindices.Length == ccol); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(Compat(val)); - Contracts.Assert(0 < ccol && ccol <= src.Size); - Contracts.Assert(dst.Size == val.Size); - - unsafe - { - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pmpcolindices = mpcolindices) - fixed (int* pindices = &indices[0]) - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - fixed (float* pval = &val.Items[0]) - Thunk.MaxBackU(add, pmpcolrow, pmpcolindices, pindices, Ptr(src, psrc), Ptr(dst, pdst), Ptr(val, pval), dst.Size, ccol); - } - } - - public static void RespNormBackOfSrc(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - int[] mpcolrow, int[] mpcolindices, int[] indices, - AlignedArray errors, AlignedArray errorsPrev, AlignedArray valuesPrev, int ccol) - { - Contracts.AssertNonEmpty(mpcolrow); - Contracts.Assert(mpcolrow.Length == ccol); - Contracts.Assert(mpcolindices == null || mpcolindices.Length == ccol); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(Compat(errors)); - Contracts.Assert(Compat(errorsPrev)); - Contracts.Assert(Compat(valuesPrev)); - Contracts.Assert(0 < ccol && ccol <= errors.Size); - Contracts.Assert(errorsPrev.Size == valuesPrev.Size); - - unsafe - { - fixed (int* pmpcolrow = &mpcolrow[0]) - fixed (int* pmpcolindices = mpcolindices) - fixed (int* pindices = &indices[0]) - fixed (float* perr = &errors.Items[0]) - fixed (float* perrPrev = &errorsPrev.Items[0]) - fixed (float* pvalPrev = &valuesPrev.Items[0]) - { - Thunk.RespNormBackU(add, alpha, beta, avgOverFullKernel, offset, pmpcolrow, pmpcolindices, pindices, - Ptr(errors, perr), Ptr(errorsPrev, perrPrev), Ptr(valuesPrev, pvalPrev), errorsPrev.Size, ccol); - } - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, AlignedArray mat, int crow, float decay) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * y.Size == mat.Size); - Contracts.Assert(decay >= 0); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (float* pmat = &mat.Items[0]) - Thunk.AddXYTranA(a, Ptr(x, px), Ptr(y, py), Ptr(mat, pmat), crow, y.Size, decay); - } - } - - public static void AddXYTran(float a, AlignedArray x, int[] rgposY, AlignedArray valuesY, - int posMinY, int iposMinY, int iposLimY, AlignedArray mat, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(valuesY)); - Contracts.Assert(Compat(mat)); - Contracts.AssertNonEmpty(rgposY); - Contracts.Assert(0 <= iposMinY && iposMinY <= iposLimY && iposLimY <= rgposY.Length); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * valuesY.Size == mat.Size); - - if (iposMinY >= iposLimY) - return; - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &valuesY.Items[0]) - fixed (int* pposy = 
&rgposY[0]) - fixed (float* pmat = &mat.Items[0]) - { - Thunk.AddXYTranPA(a, Ptr(x, px), pposy, Ptr(valuesY, py), posMinY, iposMinY, iposLimY, Ptr(mat, pmat), - crow, valuesY.Size); - } - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, - int[] starts, int[] indices, float[] coefs, int crow, float decay) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == crow + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(crow * y.Size >= coefs.Length); - Contracts.Assert(decay >= 0); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - Thunk.AddXYTranRU(a, Ptr(x, px), Ptr(y, py), pstarts, pindices, pcoefs, crow, decay); - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, int[] mprowiv, - int[] mprowcol, int[] mprowrun, int[] runs, float[] coefs, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.AssertNonEmpty(mprowiv); - Contracts.Assert(mprowiv.Length == crow); - Contracts.AssertNonEmpty(mprowcol); - Contracts.Assert(mprowcol.Length == crow); - Contracts.Assert(mprowrun == null || mprowrun.Length == crow); - Contracts.AssertNonEmpty(runs); - Contracts.AssertNonEmpty(coefs); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (int* pmprowiv = &mprowiv[0]) - fixed (int* pmprowcol = &mprowcol[0]) - fixed (int* pruns = &runs[0]) - fixed (float* pcoefs = &coefs[0]) - { - if (mprowrun == null) - Thunk.AddXYTranCU(a, Ptr(x, px), Ptr(y, py), pmprowiv, pmprowcol, pruns, pcoefs, crow); - else - { - fixed (int* pmprowrun = &mprowrun[0]) - Thunk.AddXYTranDU(a, Ptr(x, px), Ptr(y, py), pmprowiv, pmprowcol, pmprowrun, pruns, pcoefs, crow); - } - } - } - } - - public static void AddXYTran(float a, AlignedArray x, AlignedArray y, AlignedArray mat, float momentum, AlignedArray delta, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(delta)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * y.Size == mat.Size); - Contracts.Assert(mat.Size == delta.Size); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (float* pmat = &mat.Items[0]) - fixed (float* pdel = &delta.Items[0]) - Thunk.AddXYTranMomA(a, Ptr(x, px), Ptr(y, py), Ptr(mat, pmat), momentum, Ptr(delta, pdel), crow, y.Size); - } - } - - public static void AddXYTran(AlignedArray x, AlignedArray y, AlignedArray mat, AlignedArray accGrads, AlignedArray accUpdates, - float decay, float cond, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(Compat(accGrads)); - Contracts.Assert(Compat(accUpdates)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * y.Size == mat.Size); - Contracts.Assert(mat.Size == accGrads.Size); - Contracts.Assert(mat.Size == accUpdates.Size); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (float* pmat = &mat.Items[0]) 
- fixed (float* pag = &accGrads.Items[0]) - fixed (float* pau = &accUpdates.Items[0]) - Thunk.AddXYTranGradA(Ptr(x, px), Ptr(y, py), Ptr(mat, pmat), Ptr(accGrads, pag), Ptr(accUpdates, pau), decay, cond, crow, y.Size); - } - } - - public static void AddXYTran(AlignedArray x, AlignedArray y, int[] starts, int[] indices, - float[] coefs, float[] accGrads, float[] accUpdates, float decay, float cond, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.Assert(Compat(y)); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.AssertNonEmpty(starts); - Contracts.Assert(starts.Length == crow + 1); - Contracts.Assert(starts[0] == 0); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(coefs); - Contracts.Assert(indices.Length == coefs.Length); - Contracts.Assert(crow * y.Size >= coefs.Length); - Contracts.AssertNonEmpty(accGrads); - Contracts.Assert(coefs.Length == accGrads.Length); - Contracts.AssertNonEmpty(accUpdates); - Contracts.Assert(coefs.Length == accUpdates.Length); - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &y.Items[0]) - fixed (int* pstarts = &starts[0]) - fixed (int* pindices = &indices[0]) - fixed (float* pcoefs = &coefs[0]) - fixed (float* pag = &accGrads[0]) - fixed (float* pau = &accUpdates[0]) - Thunk.AddXYTranGradRU(Ptr(x, px), Ptr(y, py), pstarts, pindices, pcoefs, pag, pau, decay, cond, crow); - } - } - - public static void AddXYTran(AlignedArray x, int[] rgposY, AlignedArray valuesY, - int posMinY, int iposMinY, int iposLimY, AlignedArray mat, - AlignedArray accGrads, AlignedArray accUpdates, float decay, float cond, int crow) - { - Contracts.Assert(Compat(x)); - Contracts.AssertNonEmpty(rgposY); - Contracts.Assert(Compat(valuesY)); - Contracts.Assert(Compat(mat)); - Contracts.Assert(0 <= iposMinY && iposMinY <= iposLimY && iposLimY <= rgposY.Length); - Contracts.Assert(0 < crow && crow <= x.Size); - Contracts.Assert(x.Size * valuesY.Size == mat.Size); - Contracts.Assert(mat.Size == accGrads.Size); - Contracts.Assert(mat.Size == accUpdates.Size); - - if (iposMinY >= iposLimY) - return; - - unsafe - { - fixed (float* px = &x.Items[0]) - fixed (float* py = &valuesY.Items[0]) - fixed (int* pposy = &rgposY[0]) - fixed (float* pmat = &mat.Items[0]) - fixed (float* pag = &accGrads.Items[0]) - fixed (float* pau = &accUpdates.Items[0]) - { - Thunk.AddXYTranGradPA(Ptr(x, px), pposy, Ptr(valuesY, py), posMinY, iposMinY, iposLimY, Ptr(mat, pmat), - Ptr(accGrads, pag), Ptr(accUpdates, pau), decay, cond, crow, valuesY.Size); - } - } - } - // dst += a public static void Add(float a, float[] dst, int count) { @@ -599,17 +85,6 @@ public static void Add(float a, float[] dst, int count) } } - public static void Scale(float a, AlignedArray dst) - { - Contracts.Assert(Compat(dst)); - - unsafe - { - fixed (float* pdst = &dst.Items[0]) - Thunk.ScaleA(a, Ptr(dst, pdst), dst.Size); - } - } - public static void Scale(float a, float[] dst, int count) { Contracts.AssertNonEmpty(dst); @@ -622,19 +97,6 @@ public static void Scale(float a, float[] dst, int count) } } - public static void Scale(float a, float[] dst, int offset, int count) - { - Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset < dst.Length - count); - - unsafe - { - fixed (float* pd = &dst[offset]) - Thunk.ScaleU(a, pd, count); - } - } - // dst = a * src public static void Scale(float a, float[] src, float[] dst, int count) { @@ -667,98 +129,6 @@ public static void ScaleAdd(float a, float 
b, float[] dst, int count) } } - public static void ScaleConvWeights(float a, int kernelSize, float[] dst) - { - Contracts.AssertValue(dst); - - // REVIEW: implement in SSE/AVX. - for (int istart = 0; istart < dst.Length; istart += kernelSize + 1) - { - for (int i = 0; i < kernelSize; i++) - dst[istart + i] *= a; - } - } - - public static void ScaleMaxNorm(bool tran, float maxNorm, AlignedArray mat, int crun, int runLenPhy) - { - // Called also by MklMath which uses Avx alignment, which is a multiple of Sse alignment. - // Hence, Compat(mat) cannot be asserted here since it checks for exact Sse alignment (mat.CbAlign == CbAlign). - Contracts.AssertValue(mat); - Contracts.Assert(mat.Size > 0); - Contracts.Assert((mat.CbAlign % CbAlign) == 0); - - unsafe - { - fixed (float* pmat = &mat.Items[0]) - { - if (!tran) - Thunk.ScaleMaxNormA(maxNorm, Ptr(mat, pmat), crun, runLenPhy); - else - Thunk.ScaleMaxNormTranU(maxNorm, Ptr(mat, pmat), crun, runLenPhy); - } - } - } - - public static void ScaleMaxNorm(float maxNorm, int[] starts, int[] indices, float[] mat) - { - Contracts.AssertNonEmpty(starts); - - int crow = starts.Length - 1; - Contracts.Assert(starts[0] == 0); - Contracts.AssertValue(indices); - Contracts.Assert(starts[crow] == indices.Length); - Contracts.AssertNonEmpty(mat); - - unsafe - { - fixed (int* pstarts = &starts[0]) - fixed (float* pmat = &mat[0]) - Thunk.ScaleMaxNormRU(maxNorm, pstarts, pmat, crow); - } - } - - public static void ScaleMaxNorm(float maxNorm, int kernCount, int kernSize, float[] mat) - { - Contracts.AssertNonEmpty(mat); - - unsafe - { - fixed (float* pmat = &mat[0]) - Thunk.ScaleMaxNormCU(maxNorm, kernCount, kernSize, pmat); - } - } - - public static void AddScale(float a, AlignedArray src, AlignedArray dst) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.AddScaleA(a, Ptr(src, psrc), Ptr(dst, pdst), dst.Size); - } - } - - public static void AddScale(float a, AlignedArray src, AlignedArray dst, float momentum, AlignedArray delta) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(Compat(delta)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(src.Size == delta.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - fixed (float* pdel = &delta.Items[0]) - Thunk.AddScaleMomA(a, Ptr(src, psrc), Ptr(dst, pdst), momentum, Ptr(delta, pdel), dst.Size); - } - } - public static void AddScale(float a, float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); @@ -846,41 +216,6 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, } } - public static void AddScale(AlignedArray src, AlignedArray dst, - AlignedArray accGrads, AlignedArray accUpdates, float decay, float cond) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(Compat(accGrads)); - Contracts.Assert(Compat(accUpdates)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(src.Size == accGrads.Size); - Contracts.Assert(src.Size == accUpdates.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - fixed (float* pag = &accGrads.Items[0]) - fixed (float* pau = &accUpdates.Items[0]) - Thunk.AddScaleGradA(Ptr(src, psrc), Ptr(dst, pdst), Ptr(accGrads, pag), Ptr(accUpdates, pau), decay, cond, dst.Size); - } - } - - public static void 
Add(AlignedArray src, AlignedArray dst) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.AddA(Ptr(src, psrc), Ptr(dst, pdst), dst.Size); - } - } - public static void Add(float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); @@ -914,25 +249,6 @@ public static void Add(float[] src, int[] indices, float[] dst, int count) } } - public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); - Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length); - Contracts.Assert(count <= dst.Length - dstOffset); - - unsafe - { - fixed (float* ps = &src[0]) - fixed (int* pi = &indices[0]) - fixed (float* pd = &dst[dstOffset]) - Thunk.AddSU(ps, pi, pd, count); - } - } - public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) { Contracts.AssertNonEmpty(src1); @@ -949,36 +265,6 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c } } - public static void MulElementWise(float[] src1, float[] src2, int[] indices, float[] dst, int count) - { - Contracts.AssertNonEmpty(src1); - Contracts.Assert(0 < count && count <= src1.Length); - Contracts.AssertNonEmpty(src2); - Contracts.Assert(0 < count && count <= src2.Length); - Contracts.AssertNonEmpty(dst); - Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); - unsafe - { - fixed (float* ps1 = &src1[0]) - fixed (float* ps2 = &src2[0]) - fixed (int* pi = &indices[0]) - fixed (float* pd = &dst[0]) - Thunk.MulElementWiseSU(ps1, ps2, pi, pd, count); - } - } - - public static float Sum(AlignedArray src) - { - Contracts.Assert(Compat(src)); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - return Thunk.SumA(Ptr(src, psrc), src.Size); - } - } - public static float Sum(float[] src, int count) { Contracts.AssertNonEmpty(src); @@ -1042,18 +328,6 @@ public static float SumSq(float mean, float[] src, int offset, int count) } } - public static float SumAbs(float[] src, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - - unsafe - { - fixed (float* psrc = &src[0]) - return Thunk.SumAbsU(psrc, count); - } - } - public static float SumAbs(float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); @@ -1080,18 +354,6 @@ public static float SumAbs(float mean, float[] src, int offset, int count) } } - public static float MaxAbs(float[] src, int count) - { - Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); - - unsafe - { - fixed (float* psrc = &src[0]) - return Thunk.MaxAbsU(psrc, src.Length); - } - } - public static float MaxAbsDiff(float mean, float[] src, int count) { Contracts.AssertNonEmpty(src); @@ -1201,279 +463,6 @@ public static float L2DistSquared(float[] a, float[] b, int count) } } - public static void ApplySigmoid(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySigmoidA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static 
void ApplySoftMax(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySoftMaxA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyRectifiedLinear(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyRectifiedLinearA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySquare(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySquareA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySqrt(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySqrtA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySoftRectifiedLinear(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplySoftRectifiedLinearA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyAbs(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyAbsA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyTanh(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 < c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyTanhA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplyBoundedRectifiedLinear(AlignedArray src, AlignedArray dst, int c) - { - Contracts.Assert(Compat(src)); - Contracts.Assert(Compat(dst)); - Contracts.Assert(src.Size == dst.Size); - Contracts.Assert(0 <= c && c <= dst.Size); - - unsafe - { - fixed (float* psrc = &src.Items[0]) - fixed (float* pdst = &dst.Items[0]) - Thunk.ApplyBoundedRectifiedLinearA(Ptr(src, psrc), Ptr(dst, pdst), c); - } - } - - public static void ApplySigmoidDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplySigmoidDerivativeA(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - 
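A side note on the derivative appliers being removed here: they take the saved forward-pass activations rather than the pre-activations, because for these functions the derivative is cheapest to express in terms of the output. A scalar sketch of the sigmoid case (hypothetical managed helper, assuming the same (value, grad) in-place convention as ApplySigmoidDerivativeA):

for (int i = 0; i < count; i++)
{
    float v = value[i];       // sigmoid output saved from the forward pass
    grad[i] *= v * (1f - v);  // d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
}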
} - - public static void ApplyRectifiedLinearDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplyRectifiedLinearDerivativeA(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ApplySquareDerivative(AlignedArray input, AlignedArray output, AlignedArray grad, bool drop) - { - Contracts.Assert(Compat(input)); - Contracts.Assert(Compat(output)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(output.Size == input.Size); - Contracts.Assert(output.Size == grad.Size); - - unsafe - { - fixed (float* px = &input.Items[0]) - fixed (float* py = &output.Items[0]) - fixed (float* pg = &grad.Items[0]) - Thunk.ApplySquareDerivativeA(Ptr(input, px), Ptr(output, py), Ptr(grad, pg), grad.Size, drop); - } - } - - public static void ApplySqrtDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplySqrtDerivativeA(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ApplySoftRectifiedLinearDerivative(AlignedArray input, AlignedArray output, AlignedArray grad) - { - Contracts.Assert(Compat(input)); - Contracts.Assert(Compat(output)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(output.Size == input.Size); - Contracts.Assert(output.Size == grad.Size); - - unsafe - { - fixed (float* px = &input.Items[0]) - fixed (float* py = &output.Items[0]) - fixed (float* pg = &grad.Items[0]) - Thunk.ApplySoftRectifiedLinearDerivativeA(Ptr(input, px), Ptr(output, py), Ptr(grad, pg), grad.Size); - } - } - - public static void ApplyAbsDerivative(AlignedArray input, AlignedArray output, AlignedArray grad, bool drop) - { - Contracts.Assert(Compat(input)); - Contracts.Assert(Compat(output)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(output.Size == input.Size); - Contracts.Assert(output.Size == grad.Size); - - unsafe - { - fixed (float* px = &input.Items[0]) - fixed (float* py = &output.Items[0]) - fixed (float* pg = &grad.Items[0]) - Thunk.ApplyAbsDerivativeA(Ptr(input, px), Ptr(output, py), Ptr(grad, pg), grad.Size, drop); - } - } - - public static void ApplyTanhDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplyTanhDerivativeA(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ApplyBoundedRectifiedLinearDerivative(AlignedArray value, AlignedArray grad) - { - Contracts.Assert(Compat(value)); - Contracts.Assert(Compat(grad)); - Contracts.Assert(value.Size == grad.Size); - - unsafe - { - fixed (float* pvalue = &value.Items[0]) - fixed (float* pgrad = &grad.Items[0]) - Thunk.ApplyBoundedRectifiedLinearDerivativeA(Ptr(value, pvalue), Ptr(grad, pgrad), grad.Size); - } - } - - public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) - { - Contracts.Assert(0 < ccol && ccol <= cfltRow); - - unsafe - { - fixed (float* pdst = &dst.Items[0]) - fixed (int* pi = &indices[0]) - { - if (ccol == cfltRow) - Thunk.ZeroItemsU(Ptr(dst, pdst), 
dst.Size, pi, indices.Length); - else - Thunk.ZeroMatrixItemsCore(Ptr(dst, pdst), dst.Size, ccol, cfltRow, pi, indices.Length); - } - } - } - public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) { Contracts.AssertNonEmpty(src); @@ -1515,24 +504,5 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr Thunk.SdcaL1UpdateSU(primalUpdate, psrc, pi, threshold, pd1, pd2, count); } } - - public static void ScaleAdadelta(float[] mat, float[] accGrads, float[] accUpdates, float decay, float cond, float[] grads) - { - Contracts.AssertNonEmpty(mat); - Contracts.AssertNonEmpty(accGrads); - Contracts.AssertNonEmpty(accUpdates); - Contracts.Assert(mat.Length == accGrads.Length); - Contracts.Assert(mat.Length == accUpdates.Length); - Contracts.Assert(mat.Length <= grads.Length); - - unsafe - { - fixed (float* pm = &mat[0]) - fixed (float* pag = &accGrads[0]) - fixed (float* pau = &accUpdates[0]) - fixed (float* pg = &grads[0]) - Thunk.ScaleAdadeltaU(pm, pag, pau, decay, cond, pg, mat.Length); - } - } } } \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 4c36d0094e..9f5ff8cf27 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -26,26 +26,6 @@ internal static class SseIntrinsics Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF)) : Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); - // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray - private const int Vector128Alignment = 16; - - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static bool HasCompatibleAlignment(AlignedArray alignedArray) - { - Contracts.AssertValue(alignedArray); - Contracts.Assert(alignedArray.Size > 0); - return (alignedArray.CbAlign % Vector128Alignment) == 0; - } - - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - private static unsafe float* GetAlignedBase(AlignedArray alignedArray, float* unalignedBase) - { - Contracts.AssertValue(alignedArray); - float* alignedBase = unalignedBase + alignedArray.GetBase((long)unalignedBase); - Contracts.Assert(((long)alignedBase & (Vector128Alignment - 1)) == 0); - return alignedBase; - } - [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128<float> Load1(float* src, int* idx) => Sse.SetScalarVector128(src[idx[0]]); @@ -72,7 +52,7 @@ internal static unsafe void Store4(in Vector128<float> x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - internal static Vector128<float> VectorSum128(in Vector128<float> vector) + internal static Vector128<float> VectorSum128(in Vector128<float> vector) { if (Sse3.IsSupported) { @@ -117,19 +97,15 @@ internal static Vector128<float> GetNewDst128(in Vector128<float> xDst1, in Vect } // Multiply matrix times vector into vector.
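The substantive change in this file follows: with AlignedArray gone, the kernels may no longer assume 16-byte-aligned pointers, so each Sse.LoadAlignedVector128 (movaps) below becomes Sse.LoadVector128 (movups). On post-Nehalem x64 an unaligned load from an address that happens to be aligned generally costs the same as an aligned load, so the trade is essentially giving up a hard fault on misaligned input in exchange for correct execution at any address; a minimal sketch of the two forms:

Vector128<float> a = Sse.LoadAlignedVector128(p); // faults unless ((long)p & 15) == 0
Vector128<float> u = Sse.LoadVector128(p);        // accepts any address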
- public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMul(bool add, float[] mat, float[] src, float[] dst, int crow, int ccol) { - Contracts.Assert(HasCompatibleAlignment(mat)); - Contracts.Assert(HasCompatibleAlignment(src)); - Contracts.Assert(HasCompatibleAlignment(dst)); - - fixed (float* pSrcStart = &src.Items[0]) - fixed (float* pDstStart = &dst.Items[0]) - fixed (float* pMatStart = &mat.Items[0]) + fixed (float* pSrcStart = &src[0]) + fixed (float* pDstStart = &dst[0]) + fixed (float* pMatStart = &mat[0]) { - float* psrc = GetAlignedBase(src, pSrcStart); - float* pdst = GetAlignedBase(dst, pDstStart); - float* pmat = GetAlignedBase(mat, pMatStart); + float* psrc = pSrcStart; + float* pdst = pDstStart; + float* pmat = pMatStart; float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -149,11 +125,11 @@ public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, { float* pMatTemp = pMatCurrent; - Vector128<float> x01 = Sse.LoadAlignedVector128(pMatTemp); - Vector128<float> x11 = Sse.LoadAlignedVector128(pMatTemp += ccol); - Vector128<float> x21 = Sse.LoadAlignedVector128(pMatTemp += ccol); - Vector128<float> x31 = Sse.LoadAlignedVector128(pMatTemp += ccol); - Vector128<float> x02 = Sse.LoadAlignedVector128(pSrcCurrent); + Vector128<float> x01 = Sse.LoadVector128(pMatTemp); + Vector128<float> x11 = Sse.LoadVector128(pMatTemp += ccol); + Vector128<float> x21 = Sse.LoadVector128(pMatTemp += ccol); + Vector128<float> x31 = Sse.LoadVector128(pMatTemp += ccol); + Vector128<float> x02 = Sse.LoadVector128(pSrcCurrent); res0 = Sse.Add(res0, Sse.Multiply(x01, x02)); res1 = Sse.Add(res1, Sse.Multiply(x11, x02)); @@ -171,7 +147,7 @@ public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, if (add) { - res0 = Sse.Add(res0, Sse.LoadAlignedVector128(pDstCurrent)); + res0 = Sse.Add(res0, Sse.LoadVector128(pDstCurrent)); } Sse.StoreAligned(pDstCurrent, res0); @@ -182,23 +158,19 @@ public static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, } // Partial sparse source vector. - public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, - int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulP(bool add, float[] mat, int[] rgposSrc, float[] src, + int posMin, int iposMin, int iposEnd, float[] dst, int crow, int ccol) { - Contracts.Assert(HasCompatibleAlignment(mat)); - Contracts.Assert(HasCompatibleAlignment(src)); - Contracts.Assert(HasCompatibleAlignment(dst)); - // REVIEW: For extremely sparse inputs, interchanging the loops would // likely be more efficient.
- fixed (float* pSrcStart = &src.Items[0]) - fixed (float* pDstStart = &dst.Items[0]) - fixed (float* pMatStart = &mat.Items[0]) + fixed (float* pSrcStart = &src[0]) + fixed (float* pDstStart = &dst[0]) + fixed (float* pMatStart = &mat[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = GetAlignedBase(src, pSrcStart); - float* pdst = GetAlignedBase(dst, pDstStart); - float* pmat = GetAlignedBase(mat, pMatStart); + float* psrc = pSrcStart; + float* pdst = pDstStart; + float* pmat = pMatStart; int* pposMin = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -229,7 +201,7 @@ public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, A if (add) { - result = Sse.Add(result, Sse.LoadAlignedVector128(pDstCurrent)); + result = Sse.Add(result, Sse.LoadVector128(pDstCurrent)); } Sse.StoreAligned(pDstCurrent, result); @@ -239,19 +211,15 @@ public static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, A } } - public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + public static unsafe void MatMulTran(bool add, float[] mat, float[] src, float[] dst, int crow, int ccol) { - Contracts.Assert(HasCompatibleAlignment(mat)); - Contracts.Assert(HasCompatibleAlignment(src)); - Contracts.Assert(HasCompatibleAlignment(dst)); - - fixed (float* pSrcStart = &src.Items[0]) - fixed (float* pDstStart = &dst.Items[0]) - fixed (float* pMatStart = &mat.Items[0]) + fixed (float* pSrcStart = &src[0]) + fixed (float* pDstStart = &dst[0]) + fixed (float* pMatStart = &mat[0]) { - float* psrc = GetAlignedBase(src, pSrcStart); - float* pdst = GetAlignedBase(dst, pDstStart); - float* pmat = GetAlignedBase(mat, pMatStart); + float* psrc = pSrcStart; + float* pdst = pDstStart; + float* pmat = pMatStart; float* pSrcEnd = psrc + ccol; float* pDstEnd = pdst + crow; @@ -260,7 +228,7 @@ public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray s if (!add) { - Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent); + Vector128<float> x01 = Sse.LoadVector128(pSrcCurrent); // Replicate each 32-bit slot of x01 (ABCD) into its own register. Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55); // B Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA); // C @@ -274,10 +242,10 @@ public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray s while (pDstCurrent < pDstEnd) { float* pMatTemp = pMatCurrent; - Vector128<float> x02 = Sse.LoadAlignedVector128(pMatTemp); - Vector128<float> x12 = Sse.LoadAlignedVector128(pMatTemp += crow); - Vector128<float> x22 = Sse.LoadAlignedVector128(pMatTemp += crow); - Vector128<float> x32 = Sse.LoadAlignedVector128(pMatTemp += crow); + Vector128<float> x02 = Sse.LoadVector128(pMatTemp); + Vector128<float> x12 = Sse.LoadVector128(pMatTemp += crow); + Vector128<float> x22 = Sse.LoadVector128(pMatTemp += crow); + Vector128<float> x32 = Sse.LoadVector128(pMatTemp += crow); x02 = Sse.Multiply(x01, x02); x12 = Sse.Multiply(x11, x12); @@ -299,7 +267,7 @@ public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray s while (pSrcCurrent < pSrcEnd) { - Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent); + Vector128<float> x01 = Sse.LoadVector128(pSrcCurrent); // Replicate each 32-bit slot of x01 (ABCD) into its own register.
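// (Sse.Shuffle's control byte picks one source lane per 2 bits: 0x55 = 0b01_01_01_01
// broadcasts lane 1 (B), 0xAA = 0b10_10_10_10 broadcasts lane 2 (C), 0xFF = 0b11_11_11_11
// broadcasts lane 3 (D), and 0x00 would broadcast lane 0 (A).)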
Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55); // B Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA); // C @@ -312,11 +280,11 @@ public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray s { float* pMatTemp = pMatCurrent; - Vector128<float> x02 = Sse.LoadAlignedVector128(pMatTemp); - Vector128<float> x12 = Sse.LoadAlignedVector128(pMatTemp += crow); - Vector128<float> x22 = Sse.LoadAlignedVector128(pMatTemp += crow); - Vector128<float> x32 = Sse.LoadAlignedVector128(pMatTemp += crow); - Vector128<float> x3 = Sse.LoadAlignedVector128(pDstCurrent); + Vector128<float> x02 = Sse.LoadVector128(pMatTemp); + Vector128<float> x12 = Sse.LoadVector128(pMatTemp += crow); + Vector128<float> x22 = Sse.LoadVector128(pMatTemp += crow); + Vector128<float> x32 = Sse.LoadVector128(pMatTemp += crow); + Vector128<float> x3 = Sse.LoadVector128(pDstCurrent); x02 = Sse.Multiply(x01, x02); x12 = Sse.Multiply(x11, x12); @@ -341,21 +309,17 @@ public static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray s } // Partial sparse source vector. - public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src, - int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow) + public static unsafe void MatMulTranP(bool add, float[] mat, int[] rgposSrc, float[] src, + int posMin, int iposMin, int iposEnd, float[] dst, int crow) { - Contracts.Assert(HasCompatibleAlignment(mat)); - Contracts.Assert(HasCompatibleAlignment(src)); - Contracts.Assert(HasCompatibleAlignment(dst)); - - fixed (float* pSrcStart = &src.Items[0]) - fixed (float* pDstStart = &dst.Items[0]) - fixed (float* pMatStart = &mat.Items[0]) + fixed (float* pSrcStart = &src[0]) + fixed (float* pDstStart = &dst[0]) + fixed (float* pMatStart = &mat[0]) fixed (int* pposSrc = &rgposSrc[0]) { - float* psrc = GetAlignedBase(src, pSrcStart); - float* pdst = GetAlignedBase(dst, pDstStart); - float* pmat = GetAlignedBase(mat, pMatStart); + float* psrc = pSrcStart; + float* pdst = pDstStart; + float* pmat = pMatStart; int* ppos = pposSrc + iposMin; int* pposEnd = pposSrc + iposEnd; @@ -372,7 +336,7 @@ public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSr while (pDstCurrent < pDstEnd) { - Vector128<float> x1 = Sse.LoadAlignedVector128(pMatCurrent); + Vector128<float> x1 = Sse.LoadVector128(pMatCurrent); x1 = Sse.Multiply(x1, x0); Sse.StoreAligned(pDstCurrent, x1); @@ -392,8 +356,8 @@ public static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSr while (pDstCurrent < pDstEnd) { - Vector128<float> x1 = Sse.LoadAlignedVector128(pMatCurrent); - Vector128<float> x2 = Sse.LoadAlignedVector128(pDstCurrent); + Vector128<float> x1 = Sse.LoadVector128(pMatCurrent); + Vector128<float> x2 = Sse.LoadVector128(pDstCurrent); x1 = Sse.Multiply(x1, x0); x2 = Sse.Add(x2, x1); Sse.StoreAligned(pDstCurrent, x2); diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index 1053f75b75..4bfa770b0b 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information.
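The Thunk rewrite below collapses the P/Invoke surface to the U-suffixed kernels; going by the naming convention visible in this file, U marks the unaligned-tolerant entry points, while the A (SSE-aligned) and X (AVX) variants disappear along with AlignedArray. Every surviving import follows one pattern, sketched here with a hypothetical export name:

using System.Runtime.InteropServices;
using System.Security;

internal static unsafe class ThunkSketch
{
    private const string NativePath = "CpuMathNative";

    // SuppressUnmanagedCodeSecurity elides the per-call security stack walk,
    // which matters for kernels this small and this hot.
    [DllImport(NativePath), SuppressUnmanagedCodeSecurity]
    public static extern void FooU(/*const*/ float* ps, float* pd, int c);
}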
using System.Runtime.InteropServices; -using System.Runtime.CompilerServices; using System.Security; namespace Microsoft.ML.Runtime.Internal.CpuMath @@ -13,423 +12,87 @@ internal static unsafe class Thunk internal const string NativePath = "CpuMathNative"; [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern bool ChkAvx(); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulA(bool add, /*const*/ float* pmat, /*const*/ float* psrc, float* pdst, int crow, int ccol); + public static extern void AddScaleU(float a, /*const*/ float* ps, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulX(bool add, /*const*/ float* pmat, /*const*/ float* psrc, float* pdst, int crow, int ccol); + public static extern void AddScaleSU(float a, /*const*/ float* ps, /*const*/ int* pi, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulPA(bool add, /*const*/ float* pmat, /*const*/ int* pposSrc, /*const*/ float* psrc, - int posMin, int iposMin, int iposLim, float* pdst, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulPX(bool add, /*const*/ float* pmat, /*const*/ int* pposSrc, /*const*/ float* psrc, - int posMin, int iposMin, int iposLim, float* pdst, int crow, int ccol); + public static extern void AddScaleCopyU(float a, /*const*/ float* ps, /*const*/ float* pd, float* pr, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulRU(bool add, /*const*/ int* pstarts, /*const*/ int* pindices, /*const*/ float* pcoefs, - /*const*/ float* psrc, float* pdst, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulRX(bool add, /*const*/ int* pstarts, /*const*/ int* pindices, /*const*/ float* pcoefs, - /*const*/ float* psrc, float* pdst, int crow); + public static extern void AddScalarU(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulCU(bool add, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulDU(bool add, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, /*const*/ int* pmprowrun, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulCX(bool add, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulDX(bool add, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, /*const*/ int* pmprowrun, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow); + public static extern void AddU(/*const*/ float* ps, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MeanU(bool add, /*const*/ int* pmprowcol, /*const*/ int* pmprowindices, /*const*/ int* pindices, - /*const*/ float* psrc, float* pdst, int crow); + public static extern void AddSU(/*const*/ float* ps, /*const*/ int* pi, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static 
extern void MaxU(bool add, /*const*/ int* pmprowcol, /*const*/ int* pmprowindices, /*const*/ int* pindices, - /*const*/ float* psrc, float* pdst, int crow); + public static extern void MatMul(bool add, /*const*/ float* pmat, /*const*/ float* psrc, float* pdst, int crow, int ccol); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void RespNormU(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - /*const*/ int* pmprowcol, /*const*/ int* pmprowindices, /*const*/ int* pindices, - /*const*/ float* psrc, float* pdst, int crow); + public static extern void MatMulP(bool add, /*const*/ float* pmat, /*const*/ int* pposSrc, /*const*/ float* psrc, + int posMin, int iposMin, int iposLim, float* pdst, int crow, int ccol); // These treat pmat as if it is stored in column-major order. Thus, crow and ccol are the numbers of rows // and columns from that perspective. Alternatively, crow is the number of rows in the transpose of pmat // (thought of as row-major order). [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranA(bool add, /*const*/ float* pmat, /*const*/ float* psrc, float* pdst, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranX(bool add, /*const*/ float* pmat, /*const*/ float* psrc, float* pdst, int crow, int ccol); + public static extern void MatMulTran(bool add, /*const*/ float* pmat, /*const*/ float* psrc, float* pdst, int crow, int ccol); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranPA(bool add, /*const*/ float* pmat, /*const*/ int* pposSrc, /*const*/ float* psrc, - int posMin, int iposMin, int iposLim, float* pdst, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranPX(bool add, /*const*/ float* pmat, /*const*/ int* pposSrc, /*const*/ float* psrc, + public static extern void MatMulTranP(bool add, /*const*/ float* pmat, /*const*/ int* pposSrc, /*const*/ float* psrc, int posMin, int iposMin, int iposLim, float* pdst, int crow); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranRU(bool add, /*const*/ int* pstarts, /*const*/ int* pindices, /*const*/ float* pcoefs, - /*const*/ float* psrc, float* pdst, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranRX(bool add, /*const*/ int* pstarts, /*const*/ int* pindices, /*const*/ float* pcoefs, - /*const*/ float* psrc, float* pdst, int crow, int ccol); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranCU(bool add, /*const*/ int* pmpcoliv, /*const*/ int* pmpcolrow, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranDU(bool add, /*const*/ int* pmpcoliv, /*const*/ int* pmpcolrow, /*const*/ int* pmpcolrun, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranCX(bool add, /*const*/ int* pmpcoliv, /*const*/ int* pmpcolrow, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MatMulTranDX(bool add, /*const*/ int* pmpcoliv, 
/*const*/ int* pmpcolrow, /*const*/ int* pmpcolrun, - /*const*/ int* pruns, /*const*/ float* pcoefs, /*const*/ float* psrc, float* pdst, int crow, int ccol); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MeanBackU(bool add, /*const*/ int* pmpcolrow, /*const*/ int* pmpcolindices, /*const*/ int* pindices, - /*const*/ float* psrc, float* pdst, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void MaxBackU(bool add, /*const*/ int* pmpcolrow, /*const*/ int* pmpcolindices, /*const*/ int* pindices, - /*const*/ float* psrc, float* pdst, /*const*/ float* pval, int crow, int ccol); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void RespNormBackU(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - /*const*/ int* pmpcolrow, /*const*/ int* pmpcolindices, /*const*/ int* pindices, - /*const*/ float* perrors, float* perrorsPrev, /*const*/ float* pvaluesPrev, int crow, int ccol); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranA(float a, /*const*/ float* px, /*const*/ float* py, float* pmat, int crow, int ccol, float decay); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranX(float a, /*const*/ float* px, /*const*/ float* py, float* pmat, int crow, int ccol, float decay); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranPA(float a, /*const*/ float* px, /*const*/ int* pposY, /*const*/ float* pvaluesY, - int posMinY, int iposMinY, int iposLimY, float* pmat, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranPX(float a, /*const*/ float* px, /*const*/ int* pposY, /*const*/ float* pvaluesY, - int posMinY, int iposMinY, int iposLimY, float* pmat, int crow, int ccol); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranRU(float a, /*const*/ float* px, /*const*/ float* py, - /*const*/ int* pstarts, /*const*/ int* pindices, float* pcoefs, int crow, float decay); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranRX(float a, /*const*/ float* px, /*const*/ float* py, - /*const*/ int* pstarts, /*const*/ int* pindices, float* pcoefs, int crow, float decay); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranCU(float a, /*const*/ float* px, /*const*/ float* py, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, - /*const*/ int* pruns, float* pcoefs, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranDU(float a, /*const*/ float* px, /*const*/ float* py, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, - /*const*/ int* pmprowrun, /*const*/ int* pruns, float* pcoefs, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranCX(float a, /*const*/ float* px, /*const*/ float* py, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, - /*const*/ int* pruns, float* pcoefs, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranDX(float a, /*const*/ float* px, /*const*/ float* py, /*const*/ int* pmprowiv, /*const*/ int* pmprowcol, - /*const*/ int* pmprowrun, /*const*/ int* pruns, float* pcoefs, int crow); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void 
AddXYTranMomA(float a, /*const*/ float* px, /*const*/ float* py, float* pmat, float momentum, float* pdel, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranMomX(float a, /*const*/ float* px, /*const*/ float* py, float* pmat, float momentum, float* pdel, int crow, int ccol); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranGradA(/*const*/ float* px, /*const*/ float* py, float* pmat, float* paccGrads, float* paccUpdates, - float decay, float cond, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranGradX(/*const*/ float* px, /*const*/ float* py, float* pmat, float* paccGrads, float* paccUpdates, - float decay, float cond, int crow, int ccol); + public static extern float MulElementWiseU(/*const*/ float* ps1, /*const*/float* ps2, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranGradRU(/*const*/ float* px, /*const*/ float* py, /*const*/ int* pstarts, /*const*/ int* pindices, - float* pcoefs, float* paccGrads, float* paccUpdates, float decay, float cond, int crow); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranGradRX(/*const*/ float* px, /*const*/ float* py, /*const*/ int* pstarts, /*const*/ int* pindices, - float* pcoefs, float* paccGrads, float* paccUpdates, float decay, float cond, int crow); + public static extern float MaxAbsU(/*const*/ float* ps, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranGradPA(/*const*/ float* px, /*const*/ int* pposY, /*const*/ float* pvaluesY, - int posMinY, int iposMinY, int iposLimY, float* pmat, float* paccGrads, float* paccUpdates, - float decay, float cond, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddXYTranGradPX(/*const*/ float* px, /*const*/ int* pposY, /*const*/ float* pvaluesY, - int posMinY, int iposMinY, int iposLimY, float* pmat, float* paccGrads, float* paccUpdates, - float decay, float cond, int crow, int ccol); + public static extern float MaxAbsDiffU(float mean, /*const*/ float* ps, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleU(float a, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleA(float a, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleX(float a, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleSrcU(float a, /*const*/ float* ps, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleAddU(float a, float b, float* pd, int c); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleMaxNormA(float maxNorm, float* pmat, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleMaxNormX(float maxNorm, float* pmat, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleMaxNormTranU(float maxNorm, float* pmat, int crow, int ccol); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleMaxNormRU(float maxNorm, /*const*/ int* pstarts, float* pmat, int crow); - [DllImport(NativePath), 
SuppressUnmanagedCodeSecurity] - public static extern void ScaleMaxNormCU(float maxNorm, int kernCount, int kernSize, float* pmat); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleA(float a, /*const*/ float* ps, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleU(float a, /*const*/ float* ps, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleX(float a, /*const*/ float* ps, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleSU(float a, /*const*/ float* ps, /*const*/ int* pi, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleCopyU(float a, /*const*/ float* ps, /*const*/ float* pd, float* pr, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleMomA(float a, /*const*/ float* ps, float* pd, float momentum, float* pdel, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleMomX(float a, /*const*/ float* ps, float* pd, float momentum, float* pdel, int c); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleGradA(/*const*/ float* ps, float* pd, float* paccGrads, float* paccUpdates, - float decay, float cond, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleGradX(/*const*/ float* ps, float* pd, float* paccGrads, float* paccUpdates, - float decay, float cond, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScaleMultiA(int count, /*const*/ float* ps, float* pd, float* paccGrads, - float* paccUpdates, float decay, float cond, int size); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddScalarU(float a, float* pd, int c); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddU(/*const*/ float* ps, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddA(/*const*/ float* ps, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddX(/*const*/ float* ps, float* pd, int c); + public static extern void ScaleSrcU(float a, /*const*/ float* ps, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void AddSU(/*const*/ float* ps, /*const*/ int* pi, float* pd, int c); + public static extern void ScaleAddU(float a, float b, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float SumA(/*const*/ float* ps, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float SumU(/*const*/ float* ps, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float SumX(/*const*/ float* ps, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float SumSqU(/*const*/ float* ps, int c); + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float SumSqDiffU(float mean, /*const*/ float* ps, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float SumAbsU(/*const*/ float* ps, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float SumAbsDiffU(float mean, /*const*/ 
float* ps, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float MulElementWiseU(/*const*/ float* ps1, /*const*/float* ps2, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float MulElementWiseSU(/*const*/ float* ps1, /*const*/float* ps2, /*const*/ int* pi, float* pd, int c); + public static extern float SumAbsU(/*const*/ float* ps, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float MaxAbsU(/*const*/ float* ps, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern float MaxAbsDiffU(float mean, /*const*/ float* ps, int c); + public static extern float SumAbsDiffU(float mean, /*const*/ float* ps, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float DotU(/*const*/ float* pa, /*const*/ float* pb, int c); + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float Dist2(/*const*/ float* px, /*const*/ float* py, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySigmoidA(/*const*/ float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySigmoidX(/*const*/ float* ps, float* pd, int c) - { - ApplySigmoidA(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySoftMaxU(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySoftMaxA(float* ps, float* pd, int c) - { - ApplySoftMaxU(ps, pd, c); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySoftMaxX(float* ps, float* pd, int c) - { - ApplySoftMaxU(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplyRectifiedLinearA(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyRectifiedLinearX(float* ps, float* pd, int c) - { - ApplyRectifiedLinearA(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySquareA(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySquareX(float* ps, float* pd, int c) - { - ApplySquareA(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySqrtA(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySqrtX(float* ps, float* pd, int c) - { - ApplySqrtA(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySoftRectifiedLinearU(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySoftRectifiedLinearA(float* ps, float* pd, int c) - { - ApplySoftRectifiedLinearU(ps, pd, c); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySoftRectifiedLinearX(float* ps, float* pd, int c) - { - ApplySoftRectifiedLinearU(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplyAbsA(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyAbsX(float* ps, float* pd, int c) - { - 
ApplyAbsA(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplyTanhA(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyTanhX(float* ps, float* pd, int c) - { - ApplyTanhA(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplyBoundedRectifiedLinearA(float* ps, float* pd, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyBoundedRectifiedLinearX(float* ps, float* pd, int c) - { - ApplyBoundedRectifiedLinearA(ps, pd, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySigmoidDerivativeA(/*const*/ float* pv, float* pg, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySigmoidDerivativeX(/*const*/ float* pv, float* pg, int c) - { - ApplySigmoidDerivativeA(pv, pg, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplyRectifiedLinearDerivativeA(/*const*/ float* pv, float* pg, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyRectifiedLinearDerivativeX(/*const*/ float* pv, float* pg, int c) - { - ApplyRectifiedLinearDerivativeA(pv, pg, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySquareDerivativeA(/*const*/ float* px, /*const*/ float* py, float* pg, int c, bool drop); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySquareDerivativeX(/*const*/ float* px, /*const*/ float* py, float* pg, int c, bool drop) - { - ApplySquareDerivativeA(px, py, pg, c, drop); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySqrtDerivativeA(/*const*/ float* pv, float* pg, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySqrtDerivativeX(/*const*/ float* pv, float* pg, int c) - { - ApplySqrtDerivativeA(pv, pg, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplySoftRectifiedLinearDerivativeU(/*const*/ float* px, /*const*/ float* py, float* pg, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySoftRectifiedLinearDerivativeA(/*const*/ float* px, /*const*/ float* py, float* pg, int c) - { - ApplySoftRectifiedLinearDerivativeU(px, py, pg, c); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplySoftRectifiedLinearDerivativeX(/*const*/ float* px, /*const*/ float* py, float* pg, int c) - { - ApplySoftRectifiedLinearDerivativeU(px, py, pg, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplyAbsDerivativeA(/*const*/ float* px, /*const*/ float* py, float* pg, int c, bool drop); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyAbsDerivativeX(/*const*/ float* px, /*const*/ float* py, float* pg, int c, bool drop) - { - ApplyAbsDerivativeA(px, py, pg, c, drop); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ApplyTanhDerivativeA(/*const*/ float* pv, float* pg, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyTanhDerivativeX(/*const*/ float* pv, float* pg, int c) - { - ApplyTanhDerivativeA(pv, pg, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void 
ApplyBoundedRectifiedLinearDerivativeA(/*const*/ float* pv, float* pg, int c); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void ApplyBoundedRectifiedLinearDerivativeX(/*const*/ float* pv, float* pg, int c) - { - ApplyBoundedRectifiedLinearDerivativeA(pv, pg, c); - } - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ZeroItemsU(float* pd, int c, /*const*/ int* pindices, int cindices); - - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ZeroMatrixItemsCore(float* pd, int c, int ccol, int cfltRow, /*const*/ int* pindices, int cindices); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void SdcaL1UpdateU(float primalUpdate, /*const*/ float* ps, float threshold, float* pd1, float* pd2, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void SdcaL1UpdateSU(float primalUpdate, /*const*/ float* ps, /*const*/ int* pi, float threshold, float* pd1, float* pd2, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleAdadeltaU(float* mat, float* accGrads, float* accUpdates, float decay, float cond, float* grads, int size); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleAdadeltaA(float* mat, float* accGrads, float* accUpdates, float decay, float cond, float* grads, int size); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleAdadeltaX(float* mat, float* accGrads, float* accUpdates, float decay, float cond, float* grads, int size); - + public static extern void SdcaL1UpdateSU(float primalUpdate, /*const*/ float* ps, /*const*/ int* pi, float threshold, float* pd1, float* pd2, int c); #if !CORECLR // In CoreCLR we use Buffer.MemoryCopy directly instead of // plumbing our own version. diff --git a/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs b/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs index 5b2df5b486..bd307cdc76 100644 --- a/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs +++ b/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Internal.CpuMath; using Microsoft.ML.Runtime.Internal.Utilities; diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineInterface.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineInterface.cs index a4a2b79787..969964beff 100644 --- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineInterface.cs +++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineInterface.cs @@ -2,10 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
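The FactorizationMachineNative interop below carried its own copy of the pin-and-realign idiom (a Compat alignment check plus a Ptr helper), removed in this change along with the rest. In isolation, the deleted idiom amounts to the following sketch, where SomeKernel stands in for any native entry point and GetBase shifts the array contents so the returned pointer is 16-byte aligned:

fixed (float* p = &a.Items[0])
{
    float* q = p + a.GetBase((long)p); // q is now 16-byte aligned
    Contracts.Assert(((long)q & 15) == 0);
    SomeKernel(q, a.Size);
}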
-using Microsoft.ML.Runtime.Internal.CpuMath; -using Microsoft.ML.Runtime.Internal.Utilities; using System.Runtime.InteropServices; - using System.Security; namespace Microsoft.ML.Runtime.FactorizationMachine @@ -13,22 +10,6 @@ namespace Microsoft.ML.Runtime.FactorizationMachine internal static unsafe class FieldAwareFactorizationMachineInterface { internal const string NativePath = "FactorizationMachineNative"; - public const int CbAlign = 16; - - private static bool Compat(AlignedArray a) - { - Contracts.AssertValue(a); - Contracts.Assert(a.Size > 0); - return a.CbAlign == CbAlign; - } - - private static unsafe float* Ptr(AlignedArray a, float* p) - { - Contracts.AssertValue(a); - float* q = p + a.GetBase((long)p); - Contracts.Assert(((long)q & (CbAlign - 1)) == 0); - return q; - } [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void CalculateIntermediateVariablesNative(int fieldCount, int latentDim, int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices, @@ -40,14 +21,12 @@ public static extern void CalculateGradientAndUpdateNative(float lambdaLinear, f float* linearWeights, float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads); public static void CalculateIntermediateVariables(int fieldCount, int latentDim, int count, int[] fieldIndices, int[] featureIndices, float[] featureValues, - float[] linearWeights, AlignedArray latentWeights, AlignedArray latentSum, ref float response) + float[] linearWeights, float[] latentWeights, float[] latentSum, ref float response) { Contracts.AssertNonEmpty(fieldIndices); Contracts.AssertNonEmpty(featureValues); Contracts.AssertNonEmpty(featureIndices); Contracts.AssertNonEmpty(linearWeights); - Contracts.Assert(Compat(latentWeights)); - Contracts.Assert(Compat(latentSum)); unsafe { @@ -55,38 +34,35 @@ public static void CalculateIntermediateVariables(int fieldCount, int latentDim, fixed (int* pi = &featureIndices[0]) fixed (float* px = &featureValues[0]) fixed (float* pw = &linearWeights[0]) - fixed (float* pv = &latentWeights.Items[0]) - fixed (float* pq = &latentSum.Items[0]) + fixed (float* pv = &latentWeights[0]) + fixed (float* pq = &latentSum[0]) fixed (float* pr = &response) - CalculateIntermediateVariablesNative(fieldCount, latentDim, count, pf, pi, px, pw, Ptr(latentWeights, pv), Ptr(latentSum, pq), pr); + CalculateIntermediateVariablesNative(fieldCount, latentDim, count, pf, pi, px, pw, pv, pq, pr); } } public static void CalculateGradientAndUpdate(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, - float weight, int count, int[] fieldIndices, int[] featureIndices, float[] featureValues, AlignedArray latentSum, float slope, - float[] linearWeights, AlignedArray latentWeights, float[] linearAccumulatedSquaredGrads, AlignedArray latentAccumulatedSquaredGrads) + float weight, int count, int[] fieldIndices, int[] featureIndices, float[] featureValues, float[] latentSum, float slope, + float[] linearWeights, float[] latentWeights, float[] linearAccumulatedSquaredGrads, float[] latentAccumulatedSquaredGrads) { Contracts.AssertNonEmpty(fieldIndices); Contracts.AssertNonEmpty(featureIndices); Contracts.AssertNonEmpty(featureValues); - Contracts.Assert(Compat(latentSum)); Contracts.AssertNonEmpty(linearWeights); - Contracts.Assert(Compat(latentWeights)); Contracts.AssertNonEmpty(linearAccumulatedSquaredGrads); - Contracts.Assert(Compat(latentAccumulatedSquaredGrads)); unsafe { fixed (int* pf = &fieldIndices[0]) fixed (int* pi 
= &featureIndices[0]) fixed (float* px = &featureValues[0]) - fixed (float* pq = &latentSum.Items[0]) + fixed (float* pq = &latentSum[0]) fixed (float* pw = &linearWeights[0]) - fixed (float* pv = &latentWeights.Items[0]) + fixed (float* pv = &latentWeights[0]) fixed (float* phw = &linearAccumulatedSquaredGrads[0]) - fixed (float* phv = &latentAccumulatedSquaredGrads.Items[0]) + fixed (float* phv = &latentAccumulatedSquaredGrads[0]) CalculateGradientAndUpdateNative(lambdaLinear, lambdaLatent, learningRate, fieldCount, latentDim, weight, count, pf, pi, px, - Ptr(latentSum, pq), slope, pw, Ptr(latentWeights, pv), phw, Ptr(latentAccumulatedSquaredGrads, phv)); + pq, slope, pw, pv, phw, phv); } } diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs index 2f2161fa12..d5bf118981 100644 --- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs +++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs @@ -12,7 +12,6 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.FactorizationMachine; -using Microsoft.ML.Runtime.Internal.CpuMath; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Training; @@ -186,12 +185,12 @@ private void Initialize(IHostEnvironment env, Arguments args) } private void InitializeTrainingState(int fieldCount, int featureCount, FieldAwareFactorizationMachinePredictor predictor, out float[] linearWeights, - out AlignedArray latentWeightsAligned, out float[] linearAccumulatedSquaredGrads, out AlignedArray latentAccumulatedSquaredGradsAligned) + out float[] latentWeightsAligned, out float[] linearAccumulatedSquaredGrads, out float[] latentAccumulatedSquaredGradsAligned) { linearWeights = new float[featureCount]; - latentWeightsAligned = new AlignedArray(featureCount * fieldCount * _latentDimAligned, 16); + latentWeightsAligned = new float[featureCount * fieldCount * _latentDimAligned]; linearAccumulatedSquaredGrads = new float[featureCount]; - latentAccumulatedSquaredGradsAligned = new AlignedArray(featureCount * fieldCount * _latentDimAligned, 16); + latentAccumulatedSquaredGradsAligned = new float[featureCount * fieldCount * _latentDimAligned]; if (predictor == null) { @@ -247,8 +246,8 @@ private static float CalculateLossSlope(float label, float modelResponse) return -sign * MathUtils.Sigmoid(-margin); } - private static double CalculateAvgLoss(IChannel ch, RoleMappedData data, bool norm, float[] linearWeights, AlignedArray latentWeightsAligned, - int latentDimAligned, AlignedArray latentSum, int[] featureFieldBuffer, int[] featureIndexBuffer, float[] featureValueBuffer, VBuffer<float> buffer, ref long badExampleCount) + private static double CalculateAvgLoss(IChannel ch, RoleMappedData data, bool norm, float[] linearWeights, float[] latentWeightsAligned, + int latentDimAligned, float[] latentSum, int[] featureFieldBuffer, int[] featureIndexBuffer, float[] featureValueBuffer, VBuffer<float> buffer, ref long badExampleCount) { var featureColumns = data.Schema.GetColumns(RoleMappedSchema.ColumnRole.Feature); Func<int, bool> pred = c => featureColumns.Select(ci => ci.Index).Contains(c) || c == data.Schema.Label.Index || (data.Schema.Weight != null && c == data.Schema.Weight.Index); @@ -339,7 +338,7 @@ private FieldAwareFactorizationMachinePredictor TrainCore(IChannel ch, IProgress var featureValueBuffer = new float[totalFeatureCount];
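// Note on _latentDimAligned, used in the allocations around here: each (feature, field)
// latent vector stays padded to a multiple of 4 floats so the native kernels can operate
// on whole 128-bit lanes even though the managed buffers are now plain float[]. A sketch
// of the rounding, assuming the 16-byte (4-float) quantum used elsewhere in this change:
//     int GetAlignedVectorLength(int latentDim) => (latentDim + 3) & ~3; // 20 -> 20, 22 -> 24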
var featureIndexBuffer = new int[totalFeatureCount]; var featureFieldBuffer = new int[totalFeatureCount]; - var latentSum = new AlignedArray(fieldCount * fieldCount * _latentDimAligned, 16); + var latentSum = new float[fieldCount * fieldCount * _latentDimAligned]; var metricNames = new List<string>() { "Training-loss" }; if (validData != null) metricNames.Add("Validation-loss"); @@ -356,7 +355,7 @@ private FieldAwareFactorizationMachinePredictor TrainCore(IChannel ch, IProgress }); Func<int, bool> pred = c => fieldColumnIndexes.Contains(c) || c == data.Schema.Label.Index || (data.Schema.Weight != null && c == data.Schema.Weight.Index); InitializeTrainingState(fieldCount, totalFeatureCount, predictor, out float[] linearWeights, - out AlignedArray latentWeightsAligned, out float[] linearAccSqGrads, out AlignedArray latentAccSqGradsAligned); + out float[] latentWeightsAligned, out float[] linearAccSqGrads, out float[] latentAccSqGradsAligned); // refer to Algorithm 3 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf while (iter++ < _numIterations) diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachinePredictor.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachinePredictor.cs index 62f987901a..5c477b58e0 100644 --- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachinePredictor.cs +++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachinePredictor.cs @@ -31,7 +31,7 @@ public sealed class FieldAwareFactorizationMachinePredictor : PredictorBase<float> Host.Assert(fieldCount > 0); Host.Assert(featureCount > 0); Host.Assert(latentDim > 0); Host.Assert(Utils.Size(linearWeights) == featureCount); LatentDimAligned = FieldAwareFactorizationMachineUtils.GetAlignedVectorLength(latentDim); - Host.Assert(latentWeightsAligned.Size == checked(featureCount * fieldCount * LatentDimAligned)); + Host.Assert(latentWeightsAligned.Length == checked(featureCount * fieldCount * LatentDimAligned)); _norm = norm; FieldCount = fieldCount; @@ -93,7 +93,7 @@ private FieldAwareFactorizationMachinePredictor(IHostEnvironment env, ModelLoadC FeatureCount = featureCount; LatentDim = latentDim; _linearWeights = linearWeights; - _latentWeightsAligned = new AlignedArray(FeatureCount * FieldCount * LatentDimAligned, 16); + _latentWeightsAligned = new float[FeatureCount * FieldCount * LatentDimAligned]; for (int j = 0; j < FeatureCount; j++) { for (int f = 0; f < FieldCount; f++) @@ -139,7 +139,7 @@ protected override void SaveCore(ModelSaveContext ctx) Host.Assert(FeatureCount > 0); Host.Assert(LatentDim > 0); Host.Assert(Utils.Size(_linearWeights) == FeatureCount); - Host.Assert(_latentWeightsAligned.Size == FeatureCount * FieldCount * LatentDimAligned); + Host.Assert(_latentWeightsAligned.Length == FeatureCount * FieldCount * LatentDimAligned); ctx.Writer.Write(_norm); ctx.Writer.Write(FieldCount); @@ -161,7 +161,7 @@ protected override void SaveCore(ModelSaveContext ctx) } internal float CalculateResponse(ValueGetter<VBuffer<float>>[] getters, VBuffer<float> featureBuffer, - int[] featureFieldBuffer, int[] featureIndexBuffer, float[] featureValueBuffer, AlignedArray latentSum) + int[] featureFieldBuffer, int[] featureIndexBuffer, float[] featureValueBuffer, float[] latentSum) { int count = 0; float modelResponse = 0; @@ -182,11 +182,11 @@ internal void CopyLinearWeightsTo(float[] linearWeights) Array.Copy(_linearWeights, linearWeights, _linearWeights.Length); } - internal void CopyLatentWeightsTo(AlignedArray latentWeights) + internal void
CopyLatentWeightsTo(float[] latentWeights) { Host.AssertValue(_latentWeightsAligned); Host.AssertValue(latentWeights); - latentWeights.CopyFrom(_latentWeightsAligned); + Array.Copy(_latentWeightsAligned, 0, latentWeights, 0, latentWeights.Length); } } diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachineUtils.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachineUtils.cs index d432f10228..6a24111b7b 100644 --- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachineUtils.cs +++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FieldAwareFactorizationMachineUtils.cs @@ -96,7 +96,7 @@ public FieldAwareFactorizationMachineScalarRowMapper(IHostEnvironment env, RoleM public IRow GetRow(IRow input, Func<int, bool> predicate, out Action action) { - var latentSum = new AlignedArray(_pred.FieldCount * _pred.FieldCount * _pred.LatentDimAligned, 16); + var latentSum = new float[_pred.FieldCount * _pred.FieldCount * _pred.LatentDimAligned]; var featureBuffer = new VBuffer<float>(); var featureFieldBuffer = new int[_pred.FeatureCount]; var featureIndexBuffer = new int[_pred.FeatureCount]; diff --git a/src/Microsoft.ML.Transforms/RffTransform.cs b/src/Microsoft.ML.Transforms/RffTransform.cs index 866683bb99..5cfa53c67e 100644 --- a/src/Microsoft.ML.Transforms/RffTransform.cs +++ b/src/Microsoft.ML.Transforms/RffTransform.cs @@ -94,10 +94,10 @@ private sealed class TransformInfo public readonly int SrcDim; // the matrix containing the random fourier vectors - public readonly AlignedArray RndFourierVectors; + public readonly float[] RndFourierVectors; // the random rotations - public readonly AlignedArray RotationTerms; + public readonly float[] RotationTerms; private readonly IFourierDistributionSampler _matrixGenerator; private readonly bool _useSin; @@ -121,12 +121,10 @@ public TransformInfo(IHost host, Column item, Arguments args, int d, Float avgDi generator = args.MatrixGenerator; _matrixGenerator = generator.CreateComponent(host, avgDist); - int roundedUpD = RoundUp(NewDim, _cfltAlign); - int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); - RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); - RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); + RndFourierVectors = new float[NewDim * SrcDim]; + RotationTerms = _useSin ? null : new float[NewDim]; - InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); + InitializeFourierCoefficients(SrcDim, NewDim); } public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, int colValueCount, string directoryName) @@ -157,11 +155,9 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, int colValueCou ctx.LoadModelOrNull(env, out _matrixGenerator, directoryName)); // initialize the transform matrix - int roundedUpD = RoundUp(NewDim, _cfltAlign); - int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); - RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); - RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); - InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); + RndFourierVectors = new float[NewDim * SrcDim]; + RotationTerms = _useSin ?
null : new float[NewDim]; + InitializeFourierCoefficients(SrcDim, NewDim); } public void Save(ModelSaveContext ctx, string directoryName) @@ -229,8 +225,6 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; private const string RegistrationName = "Rff"; - private static readonly int _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); - private static string TestColumnType(ColumnType type) { if (type.ItemType == NumberType.Float && type.ValueCount > 0) @@ -335,18 +329,6 @@ public override void Save(ModelSaveContext ctx) _transformInfos[i].Save(ctx, string.Format("MatrixGenerator{0}", i)); } - // Round cflt up to a multiple of cfltAlign. - private static int RoundUp(int cflt, int cfltAlign) - { - Contracts.Assert(0 < cflt); - // cfltAlign should be a power of two. - Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); - - // Determine the number of "blobs" of size cfltAlign. - int cblob = (cflt + cfltAlign - 1) / cfltAlign; - return cblob * cfltAlign; - } - private static Float[] Train(IHost host, ColInfo[] infos, Arguments args, IDataView trainingData) { Contracts.AssertValue(host, "host"); @@ -499,8 +481,8 @@ private ValueGetter<VBuffer<Float>> GetterFromVectorType(IRow input, int iinfo) var getSrc = GetSrcGetter<VBuffer<Float>>(input, iinfo); var src = default(VBuffer<Float>); - var featuresAligned = new AlignedArray(RoundUp(Infos[iinfo].TypeSrc.ValueCount, _cfltAlign), CpuMathUtils.GetVectorAlignment()); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var featuresAligned = new float[Infos[iinfo].TypeSrc.ValueCount]; + var productAligned = new float[_transformInfos[iinfo].NewDim]; return (ref VBuffer<Float> dst) => @@ -515,8 +497,8 @@ private ValueGetter<VBuffer<Float>> GetterFromFloatType(IRow input, int iinfo) var getSrc = GetSrcGetter<Float>(input, iinfo); var src = default(Float); - var featuresAligned = new AlignedArray(RoundUp(1, _cfltAlign), CpuMathUtils.GetVectorAlignment()); - var productAligned = new AlignedArray(RoundUp(_transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var featuresAligned = new float[1]; + var productAligned = new float[_transformInfos[iinfo].NewDim]; var oneDimensionalVector = new VBuffer<Float>(1, new Float[] { 0 }); @@ -530,7 +512,7 @@ private ValueGetter<VBuffer<Float>> GetterFromFloatType(IRow input, int iinfo) } private static void TransformFeatures(IHost host, ref VBuffer<Float> src, ref VBuffer<Float> dst, TransformInfo transformInfo, - AlignedArray featuresAligned, AlignedArray productAligned) + float[] featuresAligned, float[] productAligned) { Contracts.AssertValue(host, "host"); host.Check(src.Length == transformInfo.SrcDim, "column does not have the expected dimensionality."); @@ -552,7 +534,7 @@ private static void TransformFeatures(IHost host, ref VBuffer<Float> src, ref VB if (src.IsDense) { - featuresAligned.CopyFrom(src.Values, 0, src.Length); + Array.Copy(src.Values, 0, featuresAligned, 0, src.Length); CpuMathUtils.MatTimesSrc(false, false, transformInfo.RndFourierVectors, featuresAligned, productAligned, transformInfo.NewDim); } @@ -560,7 +542,12 @@ private static void TransformFeatures(IHost host, ref VBuffer<Float> src, ref VB { // This overload of MatTimesSrc ignores the values in slots that are not in src.Indices, so there is // no need to zero them out.
- featuresAligned.CopyFrom(src.Indices, src.Values, 0, 0, src.Count, zeroItems: false); + for (int ipos = 0; ipos < src.Count; ++ipos) + { + int iv = src.Indices[ipos]; + featuresAligned[iv] = src.Values[ipos]; + } + CpuMathUtils.MatTimesSrc(false, false, transformInfo.RndFourierVectors, src.Indices, featuresAligned, 0, 0, src.Count, productAligned, transformInfo.NewDim); } diff --git a/src/Native/CpuMathNative/Avx.cpp b/src/Native/CpuMathNative/Avx.cpp index fd2e78ed5a..bac2c99b9c 100644 --- a/src/Native/CpuMathNative/Avx.cpp +++ b/src/Native/CpuMathNative/Avx.cpp @@ -103,8 +103,8 @@ EXPORT_API(void) MatMulX(bool add, _In_ const float * pmat, _In_ const float * p __m128 sum = _mm_add_ps(_get_lo(res0), _get_hi(res0)); if (add) - sum = _mm_add_ps(sum, _mm_load_ps(pd)); - _mm_store_ps(pd, sum); + sum = _mm_add_ps(sum, _mm_loadu_ps(pd)); + _mm_storeu_ps(pd, sum); } _vleave(); @@ -145,173 +145,6 @@ EXPORT_API(void) MatMulPX(bool add, _In_ const float * pmat, _In_ const int * pp _vleave(); } -// Sparse matrix. -EXPORT_API(void) MatMulRX(bool add, _In_ const int * pstarts, _In_ const int * pindices, _In_ const float * pcoefs, - _In_ const float * ps, _Inout_ float * pdst, int crow) -{ - const int * pii = pstarts + 1; - const int * pi = pindices; - const float * pm = pcoefs; - const float * pdLim = pdst + crow; - for (float * pd = pdst; pd < pdLim; pd++) - { - const int * piLim = pindices + *pii++; - - __m256 res2 = _mm256_setzero_ps(); - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x = _mm256_mul_ps(_load8(ps, pi), _mm256_loadu_ps(pm)); - res2 = _mm256_add_ps(res2, x); - } - __m128 res = _mm_add_ps(_get_lo(res2), _get_hi(res2)); - if (pi + 4 <= piLim) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_loadu_ps(pm)); - res = _mm_add_ps(res, x); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_set_ss(*pm)); - res = _mm_add_ss(res, x); - } - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } - - _vleave(); -} - -// Unpadded convolution. -EXPORT_API(void) MatMulCX(bool add, _In_ const int * pmprowiv, _In_ const int * pmprowcol, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow) -{ - int size = pruns[1]; - const int * psupport = pruns + 2; - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const int * piLim = psupport + size; - const float * pdLim = pdst + crow; - - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * pm = pcoefs + *piv++; - const float * ps = psrc + *pcol++; - const int * pi = psupport; - - __m256 res2 = _mm256_setzero_ps(); - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x = _mm256_mul_ps(_load8(ps, pi), _mm256_loadu_ps(pm)); - res2 = _mm256_add_ps(res2, x); - } - __m128 res = _mm_add_ps(_get_lo(res2), _get_hi(res2)); - if (pi + 4 <= piLim) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_loadu_ps(pm)); - res = _mm_add_ps(res, x); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_set_ss(*pm)); - res = _mm_add_ss(res, x); - } - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - // Add the bias. - res = _mm_add_ss(res, _mm_set_ss(*pm)); - - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } - - _vleave(); -} - -// Padded convolution. 
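A note on the recurring load/store change in these hunks: _mm_load_ps and _mm_store_ps require a 16-byte-aligned address and can fault on anything else, which is why the old code routed every buffer through AlignedArray. _mm_loadu_ps and _mm_storeu_ps accept any address, so a plain float[] suffices, and on modern hardware the unaligned forms cost essentially nothing extra when the data happens to be aligned anyway. A minimal illustrative sketch (not code from this PR):

    #include <immintrin.h>

    // Horizontal sum of the four floats starting at p. The unaligned load
    // is safe for any p; the aligned form would require p % 16 == 0.
    float SumFour(const float * p)
    {
        __m128 x = _mm_loadu_ps(p);
        x = _mm_hadd_ps(x, x); // (a+b, c+d, a+b, c+d)
        x = _mm_hadd_ps(x, x); // every lane now holds a+b+c+d
        return _mm_cvtss_f32(x);
    }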
-EXPORT_API(void) MatMulDX(bool add, _In_ const int * pmprowiv, _In_ const int * pmprowcol, _In_ const int * pmprowrun, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow) -{ - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const float * pdLim = pdst + crow; - int kernelSize = pruns[1]; - - const int * pirun = pmprowrun; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * pm = pcoefs + *piv++; - const float * pmBias = pm + kernelSize; - const float * ps = psrc + *pcol++; - int irun = *pirun++; - - const int * pi = pruns + 2 + irun; - const int * piLim = pi + pi[-1]; - __m256 res2 = _mm256_setzero_ps(); - __m128 res; - if (irun == 0) - { - // No masking needed. - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x = _mm256_mul_ps(_load8(ps, pi), _mm256_loadu_ps(pm)); - res2 = _mm256_add_ps(res2, x); - } - res = _mm_add_ps(_get_lo(res2), _get_hi(res2)); - if (pi + 4 <= piLim) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_loadu_ps(pm)); - res = _mm_add_ps(res, x); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_set_ss(*pm)); - res = _mm_add_ss(res, x); - } - } - else - { - // Need masking. - pm += pi[-2]; - const float * pmask = reinterpret_cast(piLim); - for (; pi + 8 <= piLim; pi += 8, pm += 8, pmask += 8) - { - __m256 x = _mm256_mul_ps(_load8(ps, pi), _mm256_and_ps(_mm256_loadu_ps(pmask), _mm256_loadu_ps(pm))); - res2 = _mm256_add_ps(res2, x); - } - res = _mm_add_ps(_get_lo(res2), _get_hi(res2)); - if (pi + 4 <= piLim) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_and_ps(_mm_loadu_ps(pmask), _mm_loadu_ps(pm))); - res = _mm_add_ps(res, x); - pi += 4; pm += 4; pmask += 4; - } - for (; pi < piLim; pi++, pm++, pmask++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_and_ps(_mm_set_ss(*pmask), _mm_set_ss(*pm))); - res = _mm_add_ss(res, x); - } - } - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - res = _mm_add_ss(res, _mm_set_ss(*pmBias)); - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } - - _vleave(); -} - EXPORT_API(void) MatMulTranX(bool add, _In_ const float * pmat, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) { const float * psLim = psrc + ccol; @@ -322,7 +155,7 @@ EXPORT_API(void) MatMulTranX(bool add, _In_ const float * pmat, _In_ const float // We do 4-way unrolling if (!add) { - __m128 h01 = _mm_load_ps(ps); + __m128 h01 = _mm_loadu_ps(ps); // Replicate each slot of x01 into its own register. __m128 h11 = _mm_shuffle_ps(h01, h01, 0x55); __m128 h21 = _mm_shuffle_ps(h01, h01, 0xAA); @@ -357,7 +190,7 @@ EXPORT_API(void) MatMulTranX(bool add, _In_ const float * pmat, _In_ const float for (; ps < psLim; ps += 4) { - __m128 h01 = _mm_load_ps(ps); + __m128 h01 = _mm_loadu_ps(ps); // Replicate each slot of x01 into its own register. __m128 h11 = _mm_shuffle_ps(h01, h01, 0x55); __m128 h21 = _mm_shuffle_ps(h01, h01, 0xAA); @@ -432,894 +265,4 @@ EXPORT_API(void) MatMulTranPX(bool add, _In_ const float * pmat, _In_ const int } _vleave(); -} - -// Sparse matrix. 
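The sparse kernels being deleted here (MatMulRX above, and the transposed variant MatMulTranRX that follows) share a CSR-style layout: pstarts[r + 1] marks where row r's entries end in the parallel pindices/pcoefs arrays, and pindices holds the source slot of each coefficient. A scalar reference for the forward product, as a sketch inferred from the removed code and assuming pstarts[0] == 0 as in standard CSR:

    // dst[r] = sum over row r's nonzeros of src[pindices[k]] * pcoefs[k]
    void MatMulSparseRef(bool add, const int * pstarts, const int * pindices,
                         const float * pcoefs, const float * psrc,
                         float * pdst, int crow)
    {
        for (int r = 0; r < crow; r++)
        {
            float sum = 0;
            for (int k = pstarts[r]; k < pstarts[r + 1]; k++)
                sum += psrc[pindices[k]] * pcoefs[k];
            pdst[r] = add ? pdst[r] + sum : sum;
        }
    }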
-EXPORT_API(void) MatMulTranRX(bool add, _In_ const int * pstarts, _In_ const int * pindices, _In_ const float * pcoefs, - _In_ const float * psrc, _Inout_ float * pd, int crow, int ccol) -{ - if (!add) - memset(pd, 0, crow * sizeof(float)); - - const int * pii = pstarts + 1; - const int * pi = pindices; - const float * pm = pcoefs; - const float * psLim = psrc + ccol; - for (const float * ps = psrc; ps < psLim; ps++) - { - float x = *ps; - const int * piLim = pindices + *pii++; - - __m128 x0 = _mm_set1_ps(x); - __m256 x1 = _mm256_set_m128(x0, x0); - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x2 = _mm256_mul_ps(x1, _mm256_loadu_ps(pm)); - x2 = _mm256_add_ps(x2, _load8(pd, pi)); - _store8(x2, pd, pi); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(x0, _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - { - __m128 x2 = _mm_mul_ss(x0, _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - - _vleave(); -} - -// Unpadded convolution. -EXPORT_API(void) MatMulTranCX(bool add, _In_ const int * pmpcoliv, _In_ const int * pmpcolrow, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) -{ - if (!add) - memset(pdst, 0, crow * sizeof(float)); - - int size = pruns[1]; - const int * psupport = pruns + 2; - const int * piv = pmpcoliv; - const int * prow = pmpcolrow; - const int * piLim = psupport + size; - const float * psLim = psrc + ccol; - for (const float * ps = psrc; ps < psLim; ps++) - { - const float * pm = pcoefs + *piv++; - float * pd = pdst + *prow++; - const int * pi = psupport; - - float x = *ps; - __m128 x0 = _mm_set1_ps(x); - __m256 x1 = _mm256_set_m128(x0, x0); - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x2 = _mm256_mul_ps(x1, _mm256_loadu_ps(pm)); - x2 = _mm256_add_ps(x2, _load8(pd, pi)); - _store8(x2, pd, pi); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(x0, _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - { - __m128 x2 = _mm_mul_ss(x0, _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - - _vleave(); -} - -// Padded convolution. -EXPORT_API(void) MatMulTranDX(bool add, _In_ const int * pmpcoliv, _In_ const int * pmpcolrow, _In_ const int * pmpcolrun, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) -{ - if (!add) - memset(pdst, 0, crow * sizeof(float)); - - const int * piv = pmpcoliv; - const int * prow = pmpcolrow; - const float * psLim = psrc + ccol; - int kernelSize = pruns[1]; - - const int * pirun = pmpcolrun; - for (const float * ps = psrc; ps < psLim; ps++) - { - const float * pm = pcoefs + *piv++; - float * pd = pdst + *prow++; - int irun = *pirun++; - const int * pi = pruns + 2 + irun; - const int * piLim = pi + pi[-1]; - - float x = *ps; - __m128 x0 = _mm_set1_ps(x); - __m256 x1 = _mm256_set_m128(x0, x0); - if (irun == 0) - { - // No masking needed. 
- for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x2 = _mm256_mul_ps(x1, _mm256_loadu_ps(pm)); - x2 = _mm256_add_ps(x2, _load8(pd, pi)); - _store8(x2, pd, pi); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(x0, _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - { - __m128 x2 = _mm_mul_ss(x0, _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - else - { - // Need masking. - pm += pi[-2]; - const float * pmask = reinterpret_cast<const float *>(piLim); - for (; pi + 8 <= piLim; pi += 8, pm += 8, pmask += 8) - { - __m256 x2 = _mm256_mul_ps(_mm256_and_ps(_mm256_loadu_ps(pmask), x1), _mm256_loadu_ps(pm)); - x2 = _mm256_add_ps(x2, _load8(pd, pi)); - _store8(x2, pd, pi); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(_mm_and_ps(_mm_loadu_ps(pmask), x0), _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - pi += 4; pm += 4; pmask += 4; - } - for (; pi < piLim; pi++, pm++, pmask++) - { - __m128 x2 = _mm_mul_ss(_mm_and_ps(_mm_set_ss(*pmask), x0), _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - } - - _vleave(); -} - -template <bool useDecay> -void AddXYTranXCore(float a, _In_ const float * px, _In_ const float * py, _Inout_ float * pmat, int crow, int ccol, float decay) -{ - const float * pyBase = py; - const float * pxLim = px + crow; - const float * pyLim = py + ccol; - float * pm = pmat; - __m256 wd; - if (useDecay) - wd = _mm256_set1_ps(1 - decay); - for (; px < pxLim; px++) - { - float r = a * *px; - py = pyBase; - - __m256 x1 = _mm256_set1_ps(r); - for (; py + 32 <= pyLim; py += 32, pm += 32) - { - __m256 x02 = _mm256_load_ps(py); - __m256 x12 = _mm256_load_ps(py + 8); - __m256 x22 = _mm256_load_ps(py + 16); - __m256 x32 = _mm256_load_ps(py + 24); - __m256 x03 = _mm256_load_ps(pm); - __m256 x13 = _mm256_load_ps(pm + 8); - __m256 x23 = _mm256_load_ps(pm + 16); - __m256 x33 = _mm256_load_ps(pm + 24); - x02 = _mm256_mul_ps(x1, x02); - x12 = _mm256_mul_ps(x1, x12); - x22 = _mm256_mul_ps(x1, x22); - x32 = _mm256_mul_ps(x1, x32); - if (useDecay) - { - x03 = _mm256_mul_ps(wd, x03); - x13 = _mm256_mul_ps(wd, x13); - x23 = _mm256_mul_ps(wd, x23); - x33 = _mm256_mul_ps(wd, x33); - } - x03 = _mm256_add_ps(x02, x03); - x13 = _mm256_add_ps(x12, x13); - x23 = _mm256_add_ps(x22, x23); - x33 = _mm256_add_ps(x32, x33); - _mm256_store_ps(pm, x03); - _mm256_store_ps(pm + 8, x13); - _mm256_store_ps(pm + 16, x23); - _mm256_store_ps(pm + 24, x33); - } - for (; py < pyLim; py += 8, pm += 8) - { - __m256 x02 = _mm256_load_ps(py); - __m256 x03 = _mm256_load_ps(pm); - x02 = _mm256_mul_ps(x1, x02); - if (useDecay) - x03 = _mm256_mul_ps(wd, x03); - x03 = _mm256_add_ps(x02, x03); - _mm256_store_ps(pm, x03); - } - } - - _vleave(); -} - -EXPORT_API(void) AddXYTranX(float a, _In_ const float * px, _In_ const float * py, _Inout_ float * pmat, int crow, int ccol, float decay) -{ - if (decay == 0) - AddXYTranXCore<false>(a, px, py, pmat, crow, ccol, decay); - else - AddXYTranXCore<true>(a, px, py, pmat, crow, ccol, decay); -} - -// Partial sparse source vector.
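Stripped of vectorization, AddXYTranXCore above is a rank-one (outer-product) update M += a * x * y^T, with an optional multiplicative weight decay applied to the existing entries first; the useDecay template flag just hoists the decay == 0 test out of the inner loop. A scalar equivalent, as a sketch rather than code from the repository:

    // M[i][j] = (1 - decay) * M[i][j] + a * x[i] * y[j], M stored row-major
    void AddXYTranRef(float a, const float * px, const float * py,
                      float * pmat, int crow, int ccol, float decay)
    {
        float wd = 1 - decay;
        for (int i = 0; i < crow; i++)
            for (int j = 0; j < ccol; j++)
                pmat[i * ccol + j] = wd * pmat[i * ccol + j] + a * px[i] * py[j];
    }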
-EXPORT_API(void) AddXYTranPX(float a, _In_ const float * px, _In_ const int * pposY, _In_ const float * pvaluesY, - int posMinY, int iposMinY, int iposLimY, _Inout_ float * pmat, int crow, int ccol) -{ - const int * pposMin = pposY + iposMinY; - const int * pposLim = pposY + iposLimY; - const float * pxLim = px + crow; - float * pm0 = pmat - posMinY; - const float * py = pvaluesY - posMinY; - - __m256 x0 = _mm256_set1_ps(a); - for (; px < pxLim; px += 8, pm0 += 8 * ccol) - { - float * pm1 = pm0 + ccol; - float * pm2 = pm1 + ccol; - float * pm3 = pm2 + ccol; - - __m256 x1 = _mm256_load_ps(px); - x1 = _mm256_mul_ps(x1, x0); - - for (const int * ppos = pposMin; ppos < pposLim; ppos++) - { - int col1 = *ppos; - int col2 = col1 + 4 * ccol; - __m256 x2 = _mm256_set1_ps(py[col1]); - __m256 x3 = _mm256_setr_ps( - pm0[col1], pm1[col1], pm2[col1], pm3[col1], - pm0[col2], pm1[col2], pm2[col2], pm3[col2]); - x2 = _mm256_mul_ps(x2, x1); - x3 = _mm256_add_ps(x3, x2); - - __m128 t1 = _get_lo(x3); - __m128 t2 = _get_hi(x3); - _mm_store_ss(pm0 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pm1 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pm2 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pm3 + col1, t1); - _mm_store_ss(pm0 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pm1 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pm2 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pm3 + col2, t2); - } - } - - _vleave(); -} - -template -void AddXYTranRXCore(float a, _In_ const float * px, _In_ const float * py, - _In_ const int * pstarts, _In_ const int * pindices, _Inout_ float * pcoefs, int crow, float decay) -{ - const int * pii = pstarts + 1; - const int * pi = pindices; - float * pm = pcoefs; - const float * pxLim = px + crow; - __m128 wd0; - __m256 wd1; - if (useDecay) - { - wd0 = _mm_set1_ps(1 - decay); - wd1 = _mm256_set_m128(wd0, wd0); - } - for (; px < pxLim; px++) - { - const int * piLim = pindices + *pii++; - float r = a * *px; - - __m128 x0 = _mm_set1_ps(r); - __m256 x1 = _mm256_set_m128(x0, x0); - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x2 = _mm256_mul_ps(x1, _load8(py, pi)); - __m256 x3 = _mm256_loadu_ps(pm); - if (useDecay) - x3 = _mm256_mul_ps(x3, wd1); - x2 = _mm256_add_ps(x2, x3); - _mm256_storeu_ps(pm, x2); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(x0, _load4(py, pi)); - __m128 x3 = _mm_loadu_ps(pm); - if (useDecay) - x3 = _mm_mul_ps(x3, wd0); - x2 = _mm_add_ps(x2, x3); - _mm_storeu_ps(pm, x2); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - *pm = (useDecay ? (*pm * (1 - decay)) : *pm) + py[*pi] * r; - } - - _vleave(); -} - -// Sparse matrix. -EXPORT_API(void) AddXYTranRX(float a, _In_ const float * px, _In_ const float * py, - _In_ const int * pstarts, _In_ const int * pindices, _Inout_ float * pcoefs, int crow, float decay) -{ - if (decay == 0) - AddXYTranRXCore(a, px, py, pstarts, pindices, pcoefs, crow, decay); - else - AddXYTranRXCore(a, px, py, pstarts, pindices, pcoefs, crow, decay); -} - -// Unpadded convolution. 
-EXPORT_API(void) AddXYTranCX(float a, _In_ const float * px, _In_ const float * py, _In_ const int * pmprowiv, _In_ const int * pmprowcol, - _In_ const int * pruns, _Inout_ float * pcoefs, int crow) -{ - int size = pruns[1]; - const int * psupport = pruns + 2; - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const float * pxLim = px + crow; - const int * piLim = psupport + size; - - for (; px < pxLim; px++) - { - float * pm = pcoefs + *piv++; - const float * ps = py + *pcol++; - const int * pi = psupport; - float r = a * *px; - - __m128 x0 = _mm_set1_ps(r); - __m256 x1 = _mm256_set_m128(x0, x0); - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x2 = _mm256_mul_ps(x1, _load8(ps, pi)); - x2 = _mm256_add_ps(x2, _mm256_loadu_ps(pm)); - _mm256_storeu_ps(pm, x2); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(x0, _load4(ps, pi)); - x2 = _mm_add_ps(x2, _mm_loadu_ps(pm)); - _mm_storeu_ps(pm, x2); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - *pm += ps[*pi] * r; - // Update the bias. - *pm += r; - } - - _vleave(); -} - -// Padded convolution. -EXPORT_API(void) AddXYTranDX(float a, _In_ const float * px, _In_ const float * py, _In_ const int * pmprowiv, _In_ const int * pmprowcol, - _In_ const int * pmprowrun, _In_ const int * pruns, _Inout_ float * pcoefs, int crow) -{ - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const float * pxLim = px + crow; - int kernelSize = pruns[1]; - - const int * pirun = pmprowrun; - for (; px < pxLim; px++) - { - float * pm = pcoefs + *piv++; - const float * ps = py + *pcol++; - int irun = *pirun++; - const int * pi = pruns + 2 + irun; - const int * piLim = pi + pi[-1]; - - float r = a * *px; - - // Update the bias. - pm[kernelSize] += r; - - __m128 x0 = _mm_set1_ps(r); - __m256 x1 = _mm256_set_m128(x0, x0); - if (irun == 0) - { - // No masking needed. - for (; pi + 8 <= piLim; pi += 8, pm += 8) - { - __m256 x2 = _mm256_mul_ps(x1, _load8(ps, pi)); - x2 = _mm256_add_ps(x2, _mm256_loadu_ps(pm)); - _mm256_storeu_ps(pm, x2); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(x0, _load4(ps, pi)); - x2 = _mm_add_ps(x2, _mm_loadu_ps(pm)); - _mm_storeu_ps(pm, x2); - pi += 4; pm += 4; - } - for (; pi < piLim; pi++, pm++) - *pm += ps[*pi] * r; - } - else - { - // Need masking. - pm += pi[-2]; - const float * pmask = reinterpret_cast(piLim); - for (; pi + 8 <= piLim; pi += 8, pm += 8, pmask += 8) - { - __m256 x2 = _mm256_mul_ps(_mm256_and_ps(_mm256_loadu_ps(pmask), x1), _load8(ps, pi)); - x2 = _mm256_add_ps(x2, _mm256_loadu_ps(pm)); - _mm256_storeu_ps(pm, x2); - } - if (pi + 4 <= piLim) - { - __m128 x2 = _mm_mul_ps(_mm_and_ps(_mm_loadu_ps(pmask), x0), _load4(ps, pi)); - x2 = _mm_add_ps(x2, _mm_loadu_ps(pm)); - _mm_storeu_ps(pm, x2); - pi += 4; pm += 4; pmask += 4; - } - for (; pi < piLim; pi++, pm++, pmask++) - { - __m128 x2 = _mm_mul_ss(_mm_and_ps(_mm_set_ss(*pmask), x0), _load1(ps, pi)); - x2 = _mm_add_ss(x2, _mm_set_ss(*pm)); - _mm_store_ss(pm, x2); - } - } - } - - _vleave(); -} - -// With momentum. 
-EXPORT_API(void) AddXYTranMomX(float a, _In_ const float * px, _In_ const float * py, _Inout_ float * pmat, float momentum, _Inout_ float * pdel, int crow, int ccol) -{ - const float * pyBase = py; - const float * pxLim = px + crow; - const float * pyLim = py + ccol; - float * pm = pmat; - float * pd = pdel; - - __m256 x0 = _mm256_set1_ps(momentum); - for (; px < pxLim; px++) - { - float r = a * *px; - - __m256 x1 = _mm256_set1_ps(r); - for (py = pyBase; py < pyLim; pm += 8, pd += 8, py += 8) - { - __m256 x2 = _mm256_load_ps(py); - __m256 x3 = _mm256_load_ps(pd); - __m256 x4 = _mm256_load_ps(pm); - x2 = _mm256_mul_ps(x1, x2); - x3 = _mm256_mul_ps(x0, x3); - x3 = _mm256_add_ps(x2, x3); - x4 = _mm256_add_ps(x3, x4); - - _mm256_store_ps(pd, x3); - _mm256_store_ps(pm, x4); - } - } - - _vleave(); -} - -// coef: coefs to update, ag: accumulated grads, au: accumulated updates, g: cur grads. -// Note: parameters coef, ag, au and g will be updated, do not reuse parameter g in calling code. -__forceinline void UpdateAdadelta(__m256& coef, __m256& ag, __m256& au, __m256& g, const __m256& dec, const __m256& decc, const __m256& c) -{ - __m256 x4 = _mm256_mul_ps(g, g); // x4 == g * g - x4 = _mm256_mul_ps(decc, x4); // x4 == (1 - decay) * g * g - ag = _mm256_mul_ps(dec, ag); // ag == decay * accG - ag = _mm256_add_ps(ag, x4); // ag == decay * accG + (1 - decay) * g * g - __m256 x41 = _mm256_add_ps(ag, c); // x41 == ag + cond - __m256 x51 = _mm256_add_ps(au, c); // x51 == accU + cond -#if 0 - // naive version: - x51 = _mm256_div_ps(x51, x41); - x41 = _mm256_sqrt_ps(x51); // x41 == rate -#else - // faster (approximate) version: - x41 = _mm256_rsqrt_ps(x41); - __m256 x52 = _mm256_rsqrt_ps(x51); - x51 = _mm256_mul_ps(x51, x52); - x41 = _mm256_mul_ps(x41, x51); // x41 == rate -#endif - g = _mm256_mul_ps(g, x41); // g - current update - coef = _mm256_add_ps(coef, g); - - g = _mm256_mul_ps(g, g); // g == newU * newU - g = _mm256_mul_ps(decc, g); // g == (1 - decay) * newU * newU - au = _mm256_mul_ps(dec, au); // au == decay * accU - au = _mm256_add_ps(au, g); // au == decay * accU + (1 - decay) * newU * newU -} - -// For Adadelta. -EXPORT_API(void) AddXYTranGradX(_In_ const float * px, _In_ const float * py, _Inout_ float * pmat, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, - float decay, float cond, int crow, int ccol) -{ - const float * pyBase = py; - const float * pxLim = px + crow; - const float * pyLim = py + ccol; - float * pm = pmat; - float * pag = paccGrads; - float * pau = paccUpdates; - - __m256 dec = _mm256_set1_ps(decay); - __m256 decc = _mm256_set1_ps(1 - decay); - __m256 c = _mm256_set1_ps(cond); - for (; px < pxLim; px++) - { - float r = *px; - - __m256 x1 = _mm256_set1_ps(r); - for (py = pyBase; py < pyLim; pm += 8, pag += 8, pau += 8, py += 8) - { - __m256 x2 = _mm256_load_ps(py); - __m256 ag = _mm256_load_ps(pag); - __m256 au = _mm256_load_ps(pau); - __m256 coef = _mm256_load_ps(pm); - x2 = _mm256_mul_ps(x1, x2); // x2 == g - - UpdateAdadelta(coef, ag, au, x2, dec, decc, c); - - _mm256_store_ps(pm, coef); - _mm256_store_ps(pag, ag); - _mm256_store_ps(pau, au); - } - } - - _vleave(); -} - -// For Adadelta, sparse matrix. 
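UpdateAdadelta above is the vectorized form of the standard Adadelta step; the scalar remainder loop in the sparse kernel below spells out the same arithmetic. For reference, the per-weight update (this mirrors that scalar tail exactly):

    #include <math.h>

    // One Adadelta step for a single weight. accGrad/accUpd are the
    // exponentially decayed accumulators of squared gradients and squared
    // updates; cond is the small conditioning constant added to both.
    void UpdateAdadeltaScalar(float & coef, float & accGrad, float & accUpd,
                              float g, float decay, float cond)
    {
        accGrad = decay * accGrad + (1 - decay) * g * g;
        float newUpd = sqrtf((accUpd + cond) / (accGrad + cond)) * g;
        coef += newUpd;
        accUpd = decay * accUpd + (1 - decay) * newUpd * newUpd;
    }

Note that the live path in UpdateAdadelta approximates the sqrt-of-ratio with two _mm256_rsqrt_ps calls, while the disabled #if 0 branch is the exact div/sqrt version, so the vector results can differ slightly from this scalar form.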
-EXPORT_API(void) AddXYTranGradRX(_In_ const float * px, _In_ const float * py, _In_ const int * pstarts, _In_ const int * pindices, - _Inout_ float * pcoefs, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, float decay, float cond, int crow) -{ - const int * pii = pstarts + 1; - const int * pi = pindices; - float * pm = pcoefs; - const float * pxLim = px + crow; - float * pag = paccGrads; - float * pau = paccUpdates; - - __m256 dec = _mm256_set1_ps(decay); - __m256 decc = _mm256_set1_ps(1 - decay); - __m256 c = _mm256_set1_ps(cond); - - for (; px < pxLim; px++) - { - const int * piLim = pindices + *pii++; - float r = *px; - - __m256 x1 = _mm256_set1_ps(r); - for (; pi + 8 <= piLim; pi += 8, pm += 8, pag += 8, pau += 8) - { - __m256 g = _mm256_mul_ps(x1, _load8(py, pi)); - __m256 ag = _mm256_loadu_ps(pag); - __m256 au = _mm256_loadu_ps(pau); - __m256 coef = _mm256_loadu_ps(pm); - - UpdateAdadelta(coef, ag, au, g, dec, decc, c); - - _mm256_storeu_ps(pm, coef); - _mm256_storeu_ps(pag, ag); - _mm256_storeu_ps(pau, au); - } - - // REVIEW: Why is this so different than the SSE version? - for (; pi < piLim; pi++, pm++, pag++, pau++) - { - float g = py[*pi] * r; - float accGrad = decay * *pag + (1 - decay) * g * g; - float accUpd = *pau; - float newUpd = sqrtf((accUpd + cond) / (accGrad + cond)) * g; - *pm += newUpd; - *pag = accGrad; - *pau = decay * accUpd + (1 - decay) * newUpd * newUpd; - } - } - - _vleave(); -} - -// For Adadelta, partial sparse source vector. -EXPORT_API(void) AddXYTranGradPX(_In_ const float * px, _In_ const int * pposY, _In_ const float * pvaluesY, - int posMinY, int iposMinY, int iposLimY, _Inout_ float * pmat, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, - float decay, float cond, int crow, int ccol) -{ - const int * pposMin = pposY + iposMinY; - const int * pposLim = pposY + iposLimY; - const float * pxLim = px + crow; - const float * py = pvaluesY - posMinY; - float * pm0 = pmat - posMinY; - float * pag0 = paccGrads - posMinY; - float * pau0 = paccUpdates - posMinY; - - __m256 dec = _mm256_set1_ps(decay); - __m256 decc = _mm256_set1_ps(1 - decay); - __m256 c = _mm256_set1_ps(cond); - for (; px < pxLim; px += 8, pm0 += 8 * ccol, pag0 += 8 * ccol, pau0 += 8 * ccol) - { - float * pm1 = pm0 + ccol; - float * pm2 = pm1 + ccol; - float * pm3 = pm2 + ccol; - - float * pag1 = pag0 + ccol; - float * pag2 = pag1 + ccol; - float * pag3 = pag2 + ccol; - - float * pau1 = pau0 + ccol; - float * pau2 = pau1 + ccol; - float * pau3 = pau2 + ccol; - - __m256 x1 = _mm256_load_ps(px); - - for (const int * ppos = pposMin; ppos < pposLim; ppos++) - { - int col1 = *ppos; - int col2 = col1 + 4 * ccol; - __m256 x2 = _mm256_set1_ps(py[col1]); - __m256 ag = _mm256_setr_ps( - pag0[col1], pag1[col1], pag2[col1], pag3[col1], - pag0[col2], pag1[col2], pag2[col2], pag3[col2]); - __m256 au = _mm256_setr_ps( - pau0[col1], pau1[col1], pau2[col1], pau3[col1], - pau0[col2], pau1[col2], pau2[col2], pau3[col2]); - __m256 coef = _mm256_setr_ps( - pm0[col1], pm1[col1], pm2[col1], pm3[col1], - pm0[col2], pm1[col2], pm2[col2], pm3[col2]); - x2 = _mm256_mul_ps(x2, x1); - - UpdateAdadelta(coef, ag, au, x2, dec, decc, c); - - __m128 t1 = _get_lo(coef); - __m128 t2 = _get_hi(coef); - _mm_store_ss(pm0 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pm1 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pm2 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pm3 + col1, t1); - _mm_store_ss(pm0 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pm1 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pm2 + col2, t2); t2 = 
_rotate(t2); - _mm_store_ss(pm3 + col2, t2); - - t1 = _get_lo(ag); - t2 = _get_hi(ag); - _mm_store_ss(pag0 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pag1 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pag2 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pag3 + col1, t1); - _mm_store_ss(pag0 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pag1 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pag2 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pag3 + col2, t2); - - t1 = _get_lo(au); - t2 = _get_hi(au); - _mm_store_ss(pau0 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pau1 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pau2 + col1, t1); t1 = _rotate(t1); - _mm_store_ss(pau3 + col1, t1); - _mm_store_ss(pau0 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pau1 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pau2 + col2, t2); t2 = _rotate(t2); - _mm_store_ss(pau3 + col2, t2); - } - } - - _vleave(); -} - -EXPORT_API(void) ScaleX(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m256 x1 = _mm256_set1_ps(a); - for (; pd < pdLim; pd += 8) - { - __m256 x2 = _mm256_load_ps(pd); - x2 = _mm256_mul_ps(x1, x2); - _mm256_store_ps(pd, x2); - } - - _vleave(); -} - -EXPORT_API(void) ScaleMaxNormX(float maxNorm, _Inout_ float * pmat, int crow, int ccol) -{ - float * pm = pmat; - float maxNormSq = maxNorm * maxNorm; - __m256 m = _mm256_set1_ps(maxNorm); - for (int irow = 0; irow < crow; irow++) - { - __m256 rowNorm = _mm256_set1_ps(0); - float * pms = pm; - float * pmLim = pm + ccol; - for (; pm < pmLim; pm += 8) - { - __m256 x1 = _mm256_load_ps(pm); - x1 = _mm256_mul_ps(x1, x1); - rowNorm = _mm256_add_ps(x1, rowNorm); - } - rowNorm = _mm256_hadd_ps(rowNorm, rowNorm); - rowNorm = _mm256_hadd_ps(rowNorm, rowNorm); - float rowNormRes = _mm_cvtss_f32(_mm_add_ss(_get_lo(rowNorm), _get_hi(rowNorm))); - if (rowNormRes > maxNormSq) - { - __m256 scale = _mm256_set1_ps(rowNormRes); -#if 0 - // REVIEW: this is faster but it uses approximation so results differ significantly from CLR. 
- scale = _mm256_rsqrt_ps(scale); - scale = _mm256_mul_ps(scale, m); -#else - scale = _mm256_sqrt_ps(scale); - scale = _mm256_div_ps(m, scale); -#endif - for (pm = pms; pm < pmLim; pm += 8) - { - __m256 x1 = _mm256_load_ps(pm); - x1 = _mm256_mul_ps(x1, scale); - _mm256_store_ps(pm, x1); - } - } - } - - _vleave(); -} - -EXPORT_API(void) AddScaleX(float a, _In_ const float * ps, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m256 x1 = _mm256_set1_ps(a); - for (; pd < pdLim; pd += 8, ps += 8) - { - __m256 x2 = _mm256_load_ps(ps); - __m256 x3 = _mm256_load_ps(pd); - x2 = _mm256_mul_ps(x1, x2); - x3 = _mm256_add_ps(x2, x3); - _mm256_store_ps(pd, x3); - } - - _vleave(); -} - -EXPORT_API(void) AddScaleMomX(float a, _In_ const float * ps, _Inout_ float * pd, float momentum, _Inout_ float * pe, int c) -{ - float * pdLim = pd + c; - - __m256 x0 = _mm256_set1_ps(momentum); - __m256 x1 = _mm256_set1_ps(a); - for (; pd < pdLim; pd += 8, pe += 8, ps += 8) - { - __m256 x2 = _mm256_load_ps(ps); - __m256 x3 = _mm256_load_ps(pe); - __m256 x4 = _mm256_load_ps(pd); - x2 = _mm256_mul_ps(x1, x2); - x3 = _mm256_mul_ps(x0, x3); - x3 = _mm256_add_ps(x2, x3); - x4 = _mm256_add_ps(x3, x4); - _mm256_store_ps(pe, x3); - _mm256_store_ps(pd, x4); - } - - _vleave(); -} - -EXPORT_API(void) AddScaleGradX(_In_ const float * ps, _Inout_ float * pd, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, - float decay, float cond, int c) -{ - float * pdLim = pd + c; - - __m256 dec = _mm256_set1_ps(decay); - __m256 decc = _mm256_set1_ps(1 - decay); - __m256 cnd = _mm256_set1_ps(cond); - for (; pd < pdLim; pd += 8, ps += 8, paccGrads += 8, paccUpdates += 8) - { - __m256 g = _mm256_load_ps(ps); - __m256 ag = _mm256_load_ps(paccGrads); - __m256 au = _mm256_load_ps(paccUpdates); - __m256 coef = _mm256_load_ps(pd); - - UpdateAdadelta(coef, ag, au, g, dec, decc, cnd); - - _mm256_store_ps(pd, coef); - _mm256_store_ps(paccGrads, ag); - _mm256_store_ps(paccUpdates, au); - } - - _vleave(); -} - -EXPORT_API(void) AddX(_In_ const float * ps, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - for (; pd < pdLim; pd += 8, ps += 8) - { - __m256 x1 = _mm256_load_ps(ps); - __m256 x2 = _mm256_load_ps(pd); - x2 = _mm256_add_ps(x1, x2); - _mm256_store_ps(pd, x2); - } - - _vleave(); -} - -EXPORT_API(float) SumX(const float * ps, int c) -{ - const float * psLim = ps + c; - - __m256 res = _mm256_setzero_ps(); - for (; ps < psLim; ps += 8) - { - __m256 x1 = _mm256_load_ps(ps); - res = _mm256_add_ps(res, x1); - } - res = _mm256_hadd_ps(res, res); - res = _mm256_hadd_ps(res, res); - __m128 r = _mm_add_ss(_get_lo(res), _get_hi(res)); - - float ret = _mm_cvtss_f32(r); - _vleave(); - return ret; -} - -EXPORT_API(void) ScaleAdadeltaX(_Inout_ float * mat, _Inout_ float * accGrads, _Inout_ float * accUpdates, float decay, float cond, _In_ const float * grads, int size) -{ - float * pm = mat; - float * pmLim = pm + size; - float * pag = accGrads; - float * pau = accUpdates; - const float * pg = grads; - - __m256 dec = _mm256_set1_ps(decay); - __m256 decc = _mm256_set1_ps(1 - decay); - __m256 c = _mm256_set1_ps(cond); - - for (; pm + 8 <= pmLim; pm += 8, pag += 8, pau += 8, pg += 8) - { - __m256 g = _mm256_loadu_ps(pg); - __m256 ag = _mm256_loadu_ps(pag); - __m256 au = _mm256_loadu_ps(pau); - __m256 coef = _mm256_loadu_ps(pm); - - UpdateAdadelta(coef, ag, au, g, dec, decc, c); - - _mm256_storeu_ps(pm, coef); - _mm256_storeu_ps(pag, ag); - _mm256_storeu_ps(pau, au); - } - - for (; pm < pmLim; pm++, pag++, pau++, pg++) - { - float g = *pg; 
- float accGrad = decay * *pag + (1 - decay) * g * g; - float accUpd = *pau; - float newUpd = sqrtf((accUpd + cond) / (accGrad + cond)) * g; - *pm += newUpd; - *pag = accGrad; - *pau = decay * accUpd + (1 - decay) * newUpd * newUpd; - } - - _vleave(); -} +} \ No newline at end of file diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index c5221f0020..b87a24d495 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -85,27 +85,8 @@ DWORD xmmYmmStateSupport() #endif -// Test whether Avx is available. -EXPORT_API(bool) ChkAvx() -{ -#ifdef _WIN32 - int cpuInfo[4]; - __cpuid(cpuInfo, 1); - - // 28th bit of second integer of Cpu Info denotes whether the Avx is supported in CPU or not - // Reference https://msdn.microsoft.com/en-us/library/hskdteyh(v=vs.100).aspx - return cpuInfo[2] & (1 << 28) || false; -#else - unsigned char buffer[16]; - (void) getcpuid(1, buffer); - - // taken from https://github.com/dotnet/coreclr/blob/b5f4d2df2e087401f2c3aab2c37021e326707915/src/vm/codeman.cpp#L1381 - return ((buffer[11] & 0x18) == 0x18) && (xmmYmmStateSupport() == 1); -#endif -} - // Multiply matrix times vector into vector. -EXPORT_API(void) MatMulA(bool add, _In_ const float * pmat, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) +EXPORT_API(void) MatMul(bool add, _In_ const float * pmat, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) { const float * psLim = psrc + ccol; const float * pdLim = pdst + crow; @@ -119,11 +100,11 @@ EXPORT_API(void) MatMulA(bool add, _In_ const float * pmat, _In_ const float * p for (const float * ps = psrc; ps < psLim; ps += 4, pm += 4) { const float * pmTmp; - __m128 x01 = _mm_load_ps(pmTmp = pm); - __m128 x11 = _mm_load_ps(pmTmp += ccol); - __m128 x21 = _mm_load_ps(pmTmp += ccol); - __m128 x31 = _mm_load_ps(pmTmp += ccol); - __m128 x02 = _mm_load_ps(ps); + __m128 x01 = _mm_loadu_ps(pmTmp = pm); + __m128 x11 = _mm_loadu_ps(pmTmp += ccol); + __m128 x21 = _mm_loadu_ps(pmTmp += ccol); + __m128 x31 = _mm_loadu_ps(pmTmp += ccol); + __m128 x02 = _mm_loadu_ps(ps); x01 = _mm_mul_ps(x01, x02); x11 = _mm_mul_ps(x11, x02); x21 = _mm_mul_ps(x21, x02); @@ -140,13 +121,13 @@ EXPORT_API(void) MatMulA(bool add, _In_ const float * pmat, _In_ const float * p res0 = _mm_hadd_ps(res0, res2); if (add) - res0 = _mm_add_ps(res0, _mm_load_ps(pd)); - _mm_store_ps(pd, res0); + res0 = _mm_add_ps(res0, _mm_loadu_ps(pd)); + _mm_storeu_ps(pd, res0); } } // Partial sparse source vector. -EXPORT_API(void) MatMulPA(bool add, _In_ const float * pmat, _In_ const int * pposSrc, _In_ const float * psrc, +EXPORT_API(void) MatMulP(bool add, _In_ const float * pmat, _In_ const int * pposSrc, _In_ const float * psrc, int posMin, int iposMin, int iposLim, _Inout_ float * pdst, int crow, int ccol) { // REVIEW: For extremely sparse inputs, interchanging the loops would @@ -172,313 +153,12 @@ EXPORT_API(void) MatMulPA(bool add, _In_ const float * pmat, _In_ const int * pp } if (add) - res = _mm_add_ps(res, _mm_load_ps(pd)); - _mm_store_ps(pd, res); - } -} - -// Sparse matrix. 
-EXPORT_API(void) MatMulRU(bool add, _In_ const int * pstarts, _In_ const int * pindices, _In_ const float * pcoefs, - _In_ const float * ps, _Inout_ float * pdst, int crow) -{ - const int * pii = pstarts + 1; - const int * pi = pindices; - const float * pm = pcoefs; - const float * pdLim = pdst + crow; - for (float * pd = pdst; pd < pdLim; pd++) - { - const int * piLim = pindices + *pii++; - - __m128 res = _mm_setzero_ps(); - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_loadu_ps(pm)); - res = _mm_add_ps(res, x); - } - for (; pi < piLim; pi++, pm++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_set_ss(*pm)); - res = _mm_add_ss(res, x); - } - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } -} - -// Unpadded convolution. -EXPORT_API(void) MatMulCU(bool add, _In_ const int * pmprowiv, _In_ const int * pmprowcol, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow) -{ - int size = pruns[1]; - const int * psupport = pruns + 2; - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const int * piLim = psupport + size; - const float * pdLim = pdst + crow; - - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * pm = pcoefs + *piv++; - const float * ps = psrc + *pcol++; - const int * pi = psupport; - - __m128 res = _mm_setzero_ps(); - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_loadu_ps(pm)); - res = _mm_add_ps(res, x); - } - for (; pi < piLim; pi++, pm++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_set_ss(*pm)); - res = _mm_add_ss(res, x); - } - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - // Add the bias. - res = _mm_add_ss(res, _mm_set_ss(*pm)); - - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } -} - -// Padded convolution. -EXPORT_API(void) MatMulDU(bool add, _In_ const int * pmprowiv, _In_ const int * pmprowcol, _In_ const int * pmprowrun, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow) -{ - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const float * pdLim = pdst + crow; - int kernelSize = pruns[1]; - - const int * pirun = pmprowrun; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * pm = pcoefs + *piv++; - const float * pmBias = pm + kernelSize; - const float * ps = psrc + *pcol++; - int irun = *pirun++; - - const int * pi = pruns + 2 + irun; - const int * piLim = pi + pi[-1]; - __m128 res = _mm_setzero_ps(); - if (irun == 0) - { - // No masking needed. - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_loadu_ps(pm)); - res = _mm_add_ps(res, x); - } - for (; pi < piLim; pi++, pm++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_set_ss(*pm)); - res = _mm_add_ss(res, x); - } - } - else - { - // Need masking. 
- pm += pi[-2]; - const float * pmask = reinterpret_cast(piLim); - for (; pi + 4 <= piLim; pi += 4, pm += 4, pmask += 4) - { - __m128 x = _mm_mul_ps(_load4(ps, pi), _mm_and_ps(_mm_loadu_ps(pmask), _mm_loadu_ps(pm))); - res = _mm_add_ps(res, x); - } - for (; pi < piLim; pi++, pm++, pmask++) - { - __m128 x = _mm_mul_ss(_load1(ps, pi), _mm_and_ps(_mm_set_ss(*pmask), _mm_set_ss(*pm))); - res = _mm_add_ss(res, x); - } - } - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - res = _mm_add_ss(res, _mm_set_ss(*pmBias)); - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } -} - -// Mean pooling. -EXPORT_API(void) MeanU(bool add, _In_ const int * pmprowcol, _In_opt_ const int * pmprowindices, _In_ const int * pindices, - _In_ const float * psrc, _Inout_ float * pdst, int crow) -{ - const int * pcol = pmprowcol; - const float * pdLim = pdst + crow; - - if (pmprowindices == nullptr) - { - int size = pindices[0]; - __m128 x0 = _mm_set_ss((float)size); - const int * piLim = pindices + 1 + size; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * ps = psrc + *pcol++; - const int * pi = pindices + 1; - - __m128 res = _mm_setzero_ps(); - for (; pi + 4 <= piLim; pi += 4) - res = _mm_add_ps(res, _load4(ps, pi)); - for (; pi < piLim; pi++) - res = _mm_add_ss(res, _load1(ps, pi)); - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - res = _mm_div_ss(res, x0); - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } - } - else - { - const int * pii = pmprowindices; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * ps = psrc + *pcol++; - int ii = *pii++; - - const int * pi = pindices + ii; - int size = *pi++; - const int * piLim = pi + size; - __m128 res = _mm_setzero_ps(); - for (; pi + 4 <= piLim; pi += 4) - res = _mm_add_ps(res, _load4(ps, pi)); - for (; pi < piLim; pi++) - res = _mm_add_ss(res, _load1(ps, pi)); - res = _mm_hadd_ps(res, res); - res = _mm_hadd_ps(res, res); - - res = _mm_div_ss(res, _mm_set_ss((float)size)); - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } - } -} - -// Max pooling. 
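MeanU above implements mean pooling: each output averages the source slots selected by pindices, offset by that row's column base from pmprowcol, with the SIMD lanes folded down by the usual hadd pair. A scalar sketch of the shared-indices case (pmprowindices == nullptr), inferred from the removed code:

    // pindices[0] is the pool size; pindices[1..size] are slot offsets.
    void MeanPoolRef(bool add, const int * pmprowcol, const int * pindices,
                     const float * psrc, float * pdst, int crow)
    {
        int size = pindices[0];
        for (int r = 0; r < crow; r++)
        {
            const float * ps = psrc + pmprowcol[r];
            float sum = 0;
            for (int k = 1; k <= size; k++)
                sum += ps[pindices[k]];
            float res = sum / size;
            pdst[r] = add ? pdst[r] + res : res;
        }
    }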
-EXPORT_API(void) MaxU(bool add, _In_ const int * pmprowcol, _In_opt_ const int * pmprowindices, _In_ const int * pindices, - _In_ const float * psrc, _Inout_ float * pdst, int crow) -{ - const int * pcol = pmprowcol; - const float * pdLim = pdst + crow; - __m128 min = _mm_set1_ps(-std::numeric_limits::infinity()); - - if (pmprowindices == nullptr) - { - int size = pindices[0]; - const int * piLim = pindices + 1 + size; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * ps = psrc + *pcol++; - const int * pi = pindices + 1; - - __m128 res = min; - for (; pi + 4 <= piLim; pi += 4) - res = _mm_max_ps(res, _load4(ps, pi)); - for (; pi < piLim; pi++) - res = _mm_max_ss(res, _load1(ps, pi)); - __m128 x1 = _mm_shuffle_ps(res, res, 0xB1); - res = _mm_max_ps(res, x1); - x1 = _mm_shuffle_ps(res, res, 0x02); - res = _mm_max_ss(res, x1); - - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } - } - else - { - const int * pii = pmprowindices; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * ps = psrc + *pcol++; - int ii = *pii++; - - const int * pi = pindices + ii; - int size = *pi++; - const int * piLim = pi + size; - __m128 res = min; - for (; pi + 4 <= piLim; pi += 4) - res = _mm_max_ps(res, _load4(ps, pi)); - for (; pi < piLim; pi++) - res = _mm_max_ss(res, _load1(ps, pi)); - __m128 x1 = _mm_shuffle_ps(res, res, 0xB1); - res = _mm_max_ps(res, x1); - x1 = _mm_shuffle_ps(res, res, 0x02); - res = _mm_max_ss(res, x1); - - if (add) - res = _mm_add_ss(res, _mm_set_ss(*pd)); - _mm_store_ss(pd, res); - } + res = _mm_add_ps(res, _mm_loadu_ps(pd)); + _mm_storeu_ps(pd, res); } } -// REVIEW: Try out SSE/AVX after padding support is added. AVX math platform uses the same code below. -EXPORT_API(void) RespNormU(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - _In_ const int * pmprowcol, _In_opt_ const int * pmprowindices, _In_ const int * pindices, - _In_ const float * psrc, _Inout_ float * pdst, int crow) -{ - const int * pcol = pmprowcol; - const float * pdLim = pdst + crow; - - if (pmprowindices == nullptr) - { - int size = pindices[0]; - float scale = alpha / size; - const int * piLim = pindices + 1 + size; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * ps = psrc + *pcol++; - const int * pi = pindices + 1; - float res = 0; - for (; pi < piLim; pi++) - { - float cur = ps[*pi]; - res += cur * cur; - } - res = ps[0] * powf(offset + scale * res, -beta); - *pd = add ? *pd + res : res; - } - } - else - { - int kernelSize = pindices[0]; - const int * pii = pmprowindices; - for (float * pd = pdst; pd < pdLim; pd++) - { - const float * ps = psrc + *pcol++; - int ii = *pii++; - const int * pi = pindices + ii; - int size = *pi++; - const int * piLim = pi + size; - float res = 0; - for (; pi < piLim; pi++) - { - float cur = ps[*pi]; - res += cur * cur; - } - int avgDenom = avgOverFullKernel ? kernelSize : size; - res = ps[0] * powf(offset + alpha / avgDenom * res, -beta); - *pd = add ? 
*pd + res : res; - } - } -} - -EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) +EXPORT_API(void) MatMulTran(bool add, _In_ const float * pmat, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) { const float * psLim = psrc + ccol; const float * pdLim = pdst + crow; @@ -487,7 +167,7 @@ EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float if (!add) { - __m128 x01 = _mm_load_ps(ps); + __m128 x01 = _mm_loadu_ps(ps); // Replicate each slot of x01 into its own register. __m128 x11 = _mm_shuffle_ps(x01, x01, 0x55); __m128 x21 = _mm_shuffle_ps(x01, x01, 0xAA); @@ -497,10 +177,10 @@ EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float for (float * pd = pdst; pd < pdLim; pd += 4, pm += 4) { const float * pmTmp; - __m128 x02 = _mm_load_ps(pmTmp = pm); - __m128 x12 = _mm_load_ps(pmTmp += crow); - __m128 x22 = _mm_load_ps(pmTmp += crow); - __m128 x32 = _mm_load_ps(pmTmp += crow); + __m128 x02 = _mm_loadu_ps(pmTmp = pm); + __m128 x12 = _mm_loadu_ps(pmTmp += crow); + __m128 x22 = _mm_loadu_ps(pmTmp += crow); + __m128 x32 = _mm_loadu_ps(pmTmp += crow); x02 = _mm_mul_ps(x01, x02); x12 = _mm_mul_ps(x11, x12); x22 = _mm_mul_ps(x21, x22); @@ -508,7 +188,7 @@ EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float x02 = _mm_add_ps(x02, x12); x22 = _mm_add_ps(x22, x32); x02 = _mm_add_ps(x02, x22); - _mm_store_ps(pd, x02); + _mm_storeu_ps(pd, x02); } pm += 3 * crow; @@ -516,7 +196,7 @@ EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float for (; ps < psLim; ps += 4) { - __m128 x01 = _mm_load_ps(ps); + __m128 x01 = _mm_loadu_ps(ps); // Replicate each slot of x01 into its own register. __m128 x11 = _mm_shuffle_ps(x01, x01, 0x55); __m128 x21 = _mm_shuffle_ps(x01, x01, 0xAA); @@ -525,11 +205,11 @@ EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float for (float * pd = pdst; pd < pdLim; pd += 4, pm += 4) { const float * pmTmp; - __m128 x02 = _mm_load_ps(pmTmp = pm); - __m128 x12 = _mm_load_ps(pmTmp += crow); - __m128 x22 = _mm_load_ps(pmTmp += crow); - __m128 x32 = _mm_load_ps(pmTmp += crow); - __m128 x3 = _mm_load_ps(pd); + __m128 x02 = _mm_loadu_ps(pmTmp = pm); + __m128 x12 = _mm_loadu_ps(pmTmp += crow); + __m128 x22 = _mm_loadu_ps(pmTmp += crow); + __m128 x32 = _mm_loadu_ps(pmTmp += crow); + __m128 x3 = _mm_loadu_ps(pd); x02 = _mm_mul_ps(x01, x02); x12 = _mm_mul_ps(x11, x12); x22 = _mm_mul_ps(x21, x22); @@ -538,7 +218,7 @@ EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float x22 = _mm_add_ps(x22, x32); x02 = _mm_add_ps(x02, x22); x3 = _mm_add_ps(x02, x3); - _mm_store_ps(pd, x3); + _mm_storeu_ps(pd, x3); } pm += 3 * crow; @@ -546,7 +226,7 @@ EXPORT_API(void) MatMulTranA(bool add, _In_ const float * pmat, _In_ const float } // Partial sparse source vector. 
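In MatMulTran (renamed from MatMulTranA above), pmat is laid out so that each source column occupies crow consecutive floats, which turns dst = M^T * src into one scaled-vector accumulation per source element; the !add fast path simply uses the first column to initialize dst instead of zeroing it. Reference semantics, as a scalar sketch:

    // pdst[i] (+)= sum over j of pmat[j * crow + i] * psrc[j]
    void MatMulTranRef(bool add, const float * pmat, const float * psrc,
                       float * pdst, int crow, int ccol)
    {
        if (!add)
            for (int i = 0; i < crow; i++)
                pdst[i] = 0;
        for (int j = 0; j < ccol; j++)
            for (int i = 0; i < crow; i++)
                pdst[i] += pmat[j * crow + i] * psrc[j];
    }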
-EXPORT_API(void) MatMulTranPA(bool add, _In_ const float * pmat, _In_ const int * pposSrc, _In_ const float * psrc, +EXPORT_API(void) MatMulTranP(bool add, _In_ const float * pmat, _In_ const int * pposSrc, _In_ const float * psrc, int posMin, int iposMin, int iposLim, _Inout_ float * pdst, int crow) { const int * ppos = pposSrc + iposMin; @@ -560,9 +240,9 @@ EXPORT_API(void) MatMulTranPA(bool add, _In_ const float * pmat, _In_ const int __m128 x0 = _mm_set1_ps(psrc[col]); for (float * pd = pdst; pd < pdLim; pd += 4, pm += 4) { - __m128 x1 = _mm_load_ps(pm); + __m128 x1 = _mm_loadu_ps(pm); x1 = _mm_mul_ps(x1, x0); - _mm_store_ps(pd, x1); + _mm_storeu_ps(pd, x1); } } @@ -574,837 +254,107 @@ EXPORT_API(void) MatMulTranPA(bool add, _In_ const float * pmat, _In_ const int const float * pm = pmat + col * crow; for (float * pd = pdst; pd < pdLim; pd += 4, pm += 4) { - __m128 x1 = _mm_load_ps(pm); - __m128 x2 = _mm_load_ps(pd); + __m128 x1 = _mm_loadu_ps(pm); + __m128 x2 = _mm_loadu_ps(pd); x1 = _mm_mul_ps(x1, x0); x2 = _mm_add_ps(x2, x1); - _mm_store_ps(pd, x2); + _mm_storeu_ps(pd, x2); } } } -// Sparse matrix. -EXPORT_API(void) MatMulTranRU(bool add, _In_ const int * pstarts, _In_ const int * pindices, _In_ const float * pcoefs, - _In_ const float * psrc, _Inout_ float * pd, int crow, int ccol) -{ - if (!add) - memset(pd, 0, crow * sizeof(float)); - - const int * pii = pstarts + 1; - const int * pi = pindices; - const float * pm = pcoefs; - const float * psLim = psrc + ccol; - for (const float * ps = psrc; ps < psLim; ps++) - { - float x = *ps; - const int * piLim = pindices + *pii++; - - __m128 x1 = _mm_set1_ps(x); - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x2 = _mm_mul_ps(x1, _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - } - for (; pi < piLim; pi++, pm++) - { - __m128 x2 = _mm_mul_ss(x1, _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } -} - -// Unpadded convolution. -EXPORT_API(void) MatMulTranCU(bool add, _In_ const int * pmpcoliv, _In_ const int * pmpcolrow, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) -{ - if (!add) - memset(pdst, 0, crow * sizeof(float)); - - int size = pruns[1]; - const int * psupport = pruns + 2; - const int * piv = pmpcoliv; - const int * prow = pmpcolrow; - const int * piLim = psupport + size; - const float * psLim = psrc + ccol; - for (const float * ps = psrc; ps < psLim; ps++) - { - const float * pm = pcoefs + *piv++; - float * pd = pdst + *prow++; - const int * pi = psupport; - - float x = *ps; - __m128 x1 = _mm_set1_ps(x); - for (; pi + 4 <= piLim; pm += 4, pi += 4) - { - __m128 x2 = _mm_mul_ps(x1, _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - } - for (; pi < piLim; pi++, pm++) - { - __m128 x2 = _mm_mul_ss(x1, _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } -} - -// Padded convolution. 
-EXPORT_API(void) MatMulTranDU(bool add, _In_ const int * pmpcoliv, _In_ const int * pmpcolrow, _In_ const int * pmpcolrun, - _In_ const int * pruns, _In_ const float * pcoefs, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) -{ - if (!add) - memset(pdst, 0, crow * sizeof(float)); - - const int * piv = pmpcoliv; - const int * prow = pmpcolrow; - const float * psLim = psrc + ccol; - - const int * pirun = pmpcolrun; - for (const float * ps = psrc; ps < psLim; ps++) - { - const float * pm = pcoefs + *piv++; - float * pd = pdst + *prow++; - int irun = *pirun++; - const int * pi = pruns + 2 + irun; - const int * piLim = pi + pi[-1]; - - float x = *ps; - __m128 x1 = _mm_set1_ps(x); - if (irun == 0) - { - // No masking needed. - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x2 = _mm_mul_ps(x1, _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - } - for (; pi < piLim; pi++, pm++) - { - __m128 x2 = _mm_mul_ss(x1, _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - else - { - // Need masking. - pm += pi[-2]; - const float * pmask = reinterpret_cast(piLim); - for (; pi + 4 <= piLim; pi += 4, pm += 4, pmask += 4) - { - __m128 x2 = _mm_mul_ps(_mm_and_ps(_mm_loadu_ps(pmask), x1), _mm_loadu_ps(pm)); - x2 = _mm_add_ps(x2, _load4(pd, pi)); - _store4(x2, pd, pi); - } - for (; pi < piLim; pi++, pm++, pmask++) - { - __m128 x2 = _mm_mul_ss(_mm_and_ps(_mm_set_ss(*pmask), x1), _mm_set_ss(*pm)); - x2 = _mm_add_ss(x2, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - } -} - -// Mean pooling back prop. -EXPORT_API(void) MeanBackU(bool add, _In_ const int * pmpcolrow, _In_opt_ const int * pmpcolindices, _In_ const int * pindices, - _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) -{ - if (!add) - memset(pdst, 0, crow * sizeof(float)); - - const int * prow = pmpcolrow; - const float * psLim = psrc + ccol; - if (pmpcolindices == nullptr) - { - int size = pindices[0]; - const int * piLim = pindices + 1 + size; - for (const float * ps = psrc; ps < psLim; ps++) - { - float * pd = pdst + *prow++; - const int * pi = pindices + 1; - - float x = *ps / size; - __m128 x1 = _mm_set1_ps(x); - for (; pi + 4 <= piLim; pi += 4) - { - __m128 x2 = _mm_add_ps(x1, _load4(pd, pi)); - _store4(x2, pd, pi); - } - for (; pi < piLim; pi++) - { - __m128 x2 = _mm_add_ss(x1, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - } - else - { - const int * pii = pmpcolindices; - for (const float * ps = psrc; ps < psLim; ps++) - { - float * pd = pdst + *prow++; - int ii = *pii++; - - const int * pi = pindices + ii; - int size = *pi++; - const int * piLim = pi + size; - - float x = *ps / size; - __m128 x1 = _mm_set1_ps(x); - for (; pi + 4 <= piLim; pi += 4) - { - __m128 x2 = _mm_add_ps(x1, _load4(pd, pi)); - _store4(x2, pd, pi); - } - for (; pi < piLim; pi++) - { - __m128 x2 = _mm_add_ss(x1, _load1(pd, pi)); - _store1(x2, pd, pi); - } - } - } -} - -// Max pooling back prop. 
-EXPORT_API(void) MaxBackU(bool add, _In_ const int * pmpcolrow, _In_opt_ const int * pmpcolindices, _In_ const int * pindices, - _In_ const float * psrc, _Inout_ float * pdst, _In_ const float * pval, int crow, int ccol) +EXPORT_API(float) MaxAbsU(const float * ps, int c) { - if (!add) - memset(pdst, 0, crow * sizeof(float)); - - const int * prow = pmpcolrow; - const float * psLim = psrc + ccol; - if (pmpcolindices == nullptr) - { - const int * piLim = pindices + 1 + pindices[0]; - for (const float * ps = psrc; ps < psLim; ps++) - { - int rowBase = *prow++; - float * pd = pdst + rowBase; - const float * pv = pval + rowBase; - const int * pi = pindices + 1; - - int j = *pi++; - float m = pv[j]; - for (; pi < piLim; pi++) - { - if (m < pv[*pi]) - { - j = *pi; - m = pv[j]; - } - } - pd[j] += *ps; - } - } - else - { - const int * pii = pmpcolindices; - for (const float * ps = psrc; ps < psLim; ps++) - { - int rowBase = *prow++; - int ii = *pii++; - float * pd = pdst + rowBase; - const float * pv = pval + rowBase; - const int * pi = pindices + ii + 1; - const int * piLim = pi + pi[-1]; - - int j = *pi++; - float m = pv[j]; - for (; pi < piLim; pi++) - { - if (m < pv[*pi]) - { - j = *pi; - m = pv[j]; - } - } - pd[j] += *ps; - } - } -} + const float * psLim = ps + c; -// REVIEW: Try out SSE/AVX after padding support is added. AVX math platform uses the same code below. -EXPORT_API(void) RespNormBackU(bool add, float alpha, float beta, bool avgOverFullKernel, float offset, - _In_ const int * pmpcolrow, _In_opt_ const int * pmpcolindices, _In_ const int * pindices, - _In_ const float * perrors, _Inout_ float * perrorsPrev, _In_ const float * pvaluesPrev, int crow, int ccol) -{ - if (!add) - memset(perrorsPrev, 0, crow * sizeof(float)); + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); + __m128 res = _mm_setzero_ps(); + for (; ps + 4 <= psLim; ps += 4) + res = _mm_max_ps(res, _mm_and_ps(_mm_loadu_ps(ps), mask)); - const int * prow = pmpcolrow; - const float * psLim = perrors + ccol; - if (pmpcolindices == nullptr) - { - int size = pindices[0]; - float scale = alpha / size; - const int * piMin = pindices + 1; - const int * piLim = piMin + size; - for (const float * ps = perrors; ps < psLim; ps++) - { - int rowBase = *prow++; - // First compute denominator: denom = offset + scale * Sum(Xj^2) - float denom = 0; - const float * pv = pvaluesPrev + rowBase; - - for (const int * pi = piMin; pi < piLim; pi++) - { - float cur = pv[*pi]; - denom += cur * cur; - } - denom = offset + scale * denom; - float denomPow = powf(denom, -beta); - // The output. - float y = pv[0] * denomPow; - - // The update logic: - // srcError(*ps) X the derivative. - // derivative at i wrt center point = powf(denom, -beta) - 2* scale * beta * X[i] * y / denom. - // derivative at i wrt other points = - 2* scale * beta * X[i] * y / denom. - float commonUpdate = *ps * (-2 * scale * beta * y) / denom; - - float * pd = perrorsPrev + rowBase; - for (const int * pi = piMin; pi < piLim; pi++) - pd[*pi] += pv[*pi] * commonUpdate; - - // Additional update for the center point. 
- pd[0] += *ps * denomPow; - } - } - else - { - int kernelSize = pindices[0]; - const int * pii = pmpcolindices; - for (const float * ps = perrors; ps < psLim; ps++) - { - int rowBase = *prow++; - // First compute denominator: denom = 1 + scale * Sum(Xj^2) - float denom = 0; - const float * pv = pvaluesPrev + rowBase; - int ii = *pii++; - - const int * piMin = pindices + ii; - int size = *piMin++; - const int * piLim = piMin + size; - - for (const int * pi = piMin; pi < piLim; pi++) - { - float cur = pv[*pi]; - denom += cur * cur; - } - float scale = alpha / (avgOverFullKernel ? kernelSize : size); - denom = offset + scale * denom; - float denomPow = powf(denom, -beta); - // The output. - float y = pv[0] * denomPow; - - // The update logic: - // srcError(*ps) X the derivative. - // derivative at i wrt center point = powf(denom, -beta) - 2* scale * beta * X[i] * y / denom. - // derivative at i wrt other points = - 2* scale * beta * X[i] * y / denom. - float commonUpdate = *ps * (-2 * scale * beta * y) / denom; - - float * pd = perrorsPrev + rowBase; - for (const int * pi = piMin; pi < piLim; pi++) - pd[*pi] += pv[*pi] * commonUpdate; - - // Additional update for the center point. - pd[0] += *ps * denomPow; - } - } -} + __m128 x1 = _mm_shuffle_ps(res, res, 0xB1); + res = _mm_max_ps(res, x1); + x1 = _mm_shuffle_ps(res, res, 0x02); + res = _mm_max_ss(res, x1); -template -void AddXYTranACore(float a, _In_ const float * px, _In_ const float * py, _Inout_ float * pmat, int crow, int ccol, float decay) -{ - const float * pyBase = py; - const float * pxLim = px + crow; - const float * pyLim = py + ccol; - float * pm = pmat; - __m128 wd; - if (useDecay) - wd = _mm_set1_ps(1 - decay); - for (; px < pxLim; px++) - { - float r = a * *px; - py = pyBase; + for (; ps < psLim; ps++) + res = _mm_max_ss(res, _mm_and_ps(_mm_load_ss(ps), mask)); - __m128 x1 = _mm_set1_ps(r); - for (; py + 16 <= pyLim; py += 16, pm += 16) - { - __m128 x02 = _mm_load_ps(py); - __m128 x12 = _mm_load_ps(py + 4); - __m128 x22 = _mm_load_ps(py + 8); - __m128 x32 = _mm_load_ps(py + 12); - __m128 x03 = _mm_load_ps(pm); - __m128 x13 = _mm_load_ps(pm + 4); - __m128 x23 = _mm_load_ps(pm + 8); - __m128 x33 = _mm_load_ps(pm + 12); - x02 = _mm_mul_ps(x1, x02); - x12 = _mm_mul_ps(x1, x12); - x22 = _mm_mul_ps(x1, x22); - x32 = _mm_mul_ps(x1, x32); - if (useDecay) - { - x03 = _mm_mul_ps(wd, x03); - x13 = _mm_mul_ps(wd, x13); - x23 = _mm_mul_ps(wd, x23); - x33 = _mm_mul_ps(wd, x33); - } - x03 = _mm_add_ps(x02, x03); - x13 = _mm_add_ps(x12, x13); - x23 = _mm_add_ps(x22, x23); - x33 = _mm_add_ps(x32, x33); - _mm_store_ps(pm, x03); - _mm_store_ps(pm + 4, x13); - _mm_store_ps(pm + 8, x23); - _mm_store_ps(pm + 12, x33); - } - for (; py < pyLim; py += 4, pm += 4) - { - __m128 x02 = _mm_load_ps(py); - __m128 x03 = _mm_load_ps(pm); - x02 = _mm_mul_ps(x1, x02); - if (useDecay) - x03 = _mm_mul_ps(wd, x03); - x03 = _mm_add_ps(x02, x03); - _mm_store_ps(pm, x03); - } - } + return _mm_cvtss_f32(res); } -EXPORT_API(void) AddXYTranA(float a, _In_ const float * px, _In_ const float * py, _Inout_ float * pmat, int crow, int ccol, float decay) +EXPORT_API(float) MaxAbsDiffU(float mean, const float * ps, int c) { - if (decay == 0) - AddXYTranACore(a, px, py, pmat, crow, ccol, decay); - else - AddXYTranACore(a, px, py, pmat, crow, ccol, decay); -} + const float * psLim = ps + c; -// Partial sparse source vector. 
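AddXYTranA above computes the rank-1 update M = (1 - decay) * M + a * x * y^T, and the kernel is templated on a bool useDecay flag so that the weight-decay multiply is a compile-time constant branch rather than a per-element test; dispatching on decay == 0 picks the instantiation. A scalar sketch of the same shape, under hypothetical names:

// Compile-time dispatch: two instantiations, no branch in the hot loop.
template <bool useDecay>
void Rank1UpdateRef(float a, const float * px, const float * py,
                    float * pm, int crow, int ccol, float decay)
{
    for (int r = 0; r < crow; r++)
        for (int c = 0; c < ccol; c++, pm++)
            *pm = (useDecay ? *pm * (1 - decay) : *pm) + a * px[r] * py[c];
}

void Rank1Update(float a, const float * px, const float * py,
                 float * pm, int crow, int ccol, float decay)
{
    if (decay == 0)
        Rank1UpdateRef<false>(a, px, py, pm, crow, ccol, decay);
    else
        Rank1UpdateRef<true>(a, px, py, pm, crow, ccol, decay);
}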
-EXPORT_API(void) AddXYTranPA(float a, _In_ const float * px, _In_ const int * pposY, _In_ const float * pvaluesY, - int posMinY, int iposMinY, int iposLimY, _Inout_ float * pmat, int crow, int ccol) -{ -#if 1 - // REVIEW: This is faster for MNIST, but the version below is faster for extremely sparse input. - const int * pposMin = pposY + iposMinY; - const int * pposLim = pposY + iposLimY; - const float * pxLim = px + crow; - float * pm0 = pmat - posMinY; - const float * py = pvaluesY - posMinY; - - __m128 x0 = _mm_set1_ps(a); - for (; px < pxLim; px += 4, pm0 += 4 * ccol) + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); + __m128 res = _mm_setzero_ps(); + __m128 m = _mm_set1_ps(mean); + for (; ps + 4 <= psLim; ps += 4) { - float * pm1 = pm0 + ccol; - float * pm2 = pm1 + ccol; - float * pm3 = pm2 + ccol; - - __m128 x1 = _mm_load_ps(px); - x1 = _mm_mul_ps(x1, x0); - - for (const int * ppos = pposMin; ppos < pposLim; ppos++) - { - int col = *ppos; - __m128 x2 = _mm_set1_ps(py[col]); - __m128 x3 = _mm_setr_ps(pm0[col], pm1[col], pm2[col], pm3[col]); - x2 = _mm_mul_ps(x2, x1); - x3 = _mm_add_ps(x3, x2); - - _mm_store_ss(pm0 + col, x3); x3 = _rotate(x3); - _mm_store_ss(pm1 + col, x3); x3 = _rotate(x3); - _mm_store_ss(pm2 + col, x3); x3 = _rotate(x3); - _mm_store_ss(pm3 + col, x3); - } + __m128 x = _mm_loadu_ps(ps); + x = _mm_sub_ps(x, m); + res = _mm_max_ps(res, _mm_and_ps(x, mask)); } -#else - const int * pposMin = pposY + iposMinY; - const int * pposLim = pposY + iposLimY; - const float * pxLim = px + crow; - float * pm = pmat - posMinY; - const float * py = pvaluesY - posMinY; - - __m128 x0 = _mm_set1_ps(a); - int d1 = 1 * ccol; - int d2 = 2 * ccol; - int d3 = 3 * ccol; - int d4 = 4 * ccol; - for (const int * ppos = pposMin; ppos < pposLim; ppos++) - { - int col = *ppos; - __m128 x2 = _mm_set1_ps(py[col]); - x2 = _mm_mul_ps(x2, x0); - float * pm0 = pm + col; - for (const float * px0 = px; px0 < pxLim; px0 += 4, pm0 += d4) - { - __m128 x1 = _mm_load_ps(px0); - __m128 x3 = _mm_setr_ps(pm0[0], pm0[d1], pm0[d2], pm0[d3]); - x1 = _mm_mul_ps(x1, x2); - x3 = _mm_add_ps(x3, x1); - - _mm_store_ss(pm0, x3); x3 = _rotate(x3); - _mm_store_ss(pm0 + d1, x3); x3 = _rotate(x3); - _mm_store_ss(pm0 + d2, x3); x3 = _rotate(x3); - _mm_store_ss(pm0 + d3, x3); - } - } -#endif -} + __m128 x1 = _mm_shuffle_ps(res, res, 0xB1); + res = _mm_max_ps(res, x1); + x1 = _mm_shuffle_ps(res, res, 0x02); + res = _mm_max_ss(res, x1); -template -void AddXYTranRUCore(float a, _In_ const float * px, _In_ const float * py, - _In_ const int * pstarts, _In_ const int * pindices, _Inout_ float * pcoefs, int crow, float decay) -{ - const int * pii = pstarts + 1; - const int * pi = pindices; - float * pm = pcoefs; - const float * pxLim = px + crow; - __m128 wd; - if (useDecay) - wd = _mm_set1_ps(1 - decay); - for (; px < pxLim; px++) + for (; ps < psLim; ps++) { - const int * piLim = pindices + *pii++; - float r = a * *px; - - __m128 x1 = _mm_set1_ps(r); - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x2 = _mm_mul_ps(x1, _load4(py, pi)); - __m128 x3 = _mm_loadu_ps(pm); - if (useDecay) - x3 = _mm_mul_ps(x3, wd); - x2 = _mm_add_ps(x2, x3); - _mm_storeu_ps(pm, x2); - } - for (; pi < piLim; pi++, pm++) - *pm = (useDecay ? (*pm * (1 - decay)) : *pm) + py[*pi] * r; + __m128 x = _mm_load_ss(ps); + x = _mm_sub_ss(x, m); + res = _mm_max_ss(res, _mm_and_ps(x, mask)); } -} - -// Sparse matrix. 
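AddXYTranRUCore above walks a CSR-style layout: row r's entries occupy positions [pstarts[r], pstarts[r+1]) of pindices/pcoefs (the code starts pi at pindices itself, which implies pstarts[0] == 0), and each stored coefficient absorbs a * x_r * y[index] after the optional decay. A scalar reference under that same layout assumption:

// Scalar equivalent of the decaying sparse rank-1 update over CSR rows.
void SparseRank1Ref(float a, const float * px, const float * py,
                    const int * pstarts, const int * pindices,
                    float * pcoefs, int crow, float decay)
{
    for (int r = 0; r < crow; r++)
        for (int k = pstarts[r]; k < pstarts[r + 1]; k++)
            pcoefs[k] = pcoefs[k] * (1 - decay) + a * px[r] * py[pindices[k]];
}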
-EXPORT_API(void) AddXYTranRU(float a, _In_ const float * px, _In_ const float * py, - _In_ const int * pstarts, _In_ const int * pindices, _Inout_ float * pcoefs, int crow, float decay) -{ - if (decay == 0) - AddXYTranRUCore(a, px, py, pstarts, pindices, pcoefs, crow, decay); - else - AddXYTranRUCore(a, px, py, pstarts, pindices, pcoefs, crow, decay); -} - -// Unpadded convolution. -EXPORT_API(void) AddXYTranCU(float a, _In_ const float * px, _In_ const float * py, _In_ const int * pmprowiv, - _In_ const int * pmprowcol, _In_ const int * pruns, _Inout_ float * pcoefs, int crow) -{ - int size = pruns[1]; - const int * psupport = pruns + 2; - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const float * pxLim = px + crow; - const int * piLim = psupport + size; - - for (; px < pxLim; px++) - { - float * pm = pcoefs + *piv++; - const float * ps = py + *pcol++; - const int * pi = psupport; - float r = a * *px; - __m128 x1 = _mm_set1_ps(r); - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x2 = _mm_mul_ps(x1, _load4(ps, pi)); - x2 = _mm_add_ps(x2, _mm_loadu_ps(pm)); - _mm_storeu_ps(pm, x2); - } - for (; pi < piLim; pi++, pm++) - *pm += ps[*pi] * r; - // Update the bias. - *pm += r; - } + return _mm_cvtss_f32(res); } -// Padded convolution. -EXPORT_API(void) AddXYTranDU(float a, _In_ const float * px, _In_ const float * py, _In_ const int * pmprowiv, - _In_ const int * pmprowcol, _In_ const int * pmprowrun, _In_ const int * pruns, _Inout_ float * pcoefs, int crow) +EXPORT_API(void) MulElementWiseU(_In_ const float * ps1, _In_ const float * ps2, _Inout_ float * pd, int c) { - const int * piv = pmprowiv; - const int * pcol = pmprowcol; - const float * pxLim = px + crow; - int kernelSize = pruns[1]; + float * pdLim = pd + c; - const int * pirun = pmprowrun; - for (; px < pxLim; px++) + for (; pd + 4 <= pdLim; pd += 4, ps1 += 4, ps2 += 4) { - float * pm = pcoefs + *piv++; - const float * ps = py + *pcol++; - int irun = *pirun++; - const int * pi = pruns + 2 + irun; - const int * piLim = pi + pi[-1]; - - float r = a * *px; - - // Update the bias. - pm[kernelSize] += r; - - __m128 x1 = _mm_set1_ps(r); - if (irun == 0) - { - // No masking needed. - for (; pi + 4 <= piLim; pi += 4, pm += 4) - { - __m128 x2 = _mm_mul_ps(x1, _load4(ps, pi)); - x2 = _mm_add_ps(x2, _mm_loadu_ps(pm)); - _mm_storeu_ps(pm, x2); - } - for (; pi < piLim; pi++, pm++) - *pm += ps[*pi] * r; - } - else - { - // Need masking. - pm += pi[-2]; - const float * pmask = reinterpret_cast(piLim); - for (; pi + 4 <= piLim; pi += 4, pm += 4, pmask += 4) - { - __m128 x2 = _mm_mul_ps(_mm_and_ps(_mm_loadu_ps(pmask), x1), _load4(ps, pi)); - x2 = _mm_add_ps(x2, _mm_loadu_ps(pm)); - _mm_storeu_ps(pm, x2); - } - for (; pi < piLim; pi++, pm++, pmask++) - { - __m128 x2 = _mm_mul_ss(_mm_and_ps(_mm_set_ss(*pmask), x1), _load1(ps, pi)); - x2 = _mm_add_ss(x2, _mm_set_ss(*pm)); - _mm_store_ss(pm, x2); - } - } + __m128 x1 = _mm_loadu_ps(ps1); + __m128 x2 = _mm_loadu_ps(ps2); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); } -} -// With momentum. 
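In both convolution gradients above the bias lives directly after a kernel's taps: AddXYTranCU falls out of its index loop and does *pm += r, while AddXYTranDU addresses it explicitly as pm[kernelSize] += r before walking the possibly masked taps. Per output activation the scalar arithmetic is just this (names hypothetical):

// One activation's contribution to a convolution kernel's gradient:
// weight k gets a*x*input[support[k]], the trailing bias slot gets a*x.
void ConvGradStepRef(float a, float x, const float * ps,
                     const int * support, int size, float * pkernel)
{
    float r = a * x;
    for (int k = 0; k < size; k++)
        pkernel[k] += ps[support[k]] * r;
    pkernel[size] += r;  // bias stored right after the taps
}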
-EXPORT_API(void) AddXYTranMomA(float a, _In_ const float * px, _In_ const float * py, _Inout_ float * pmat, float momentum, _Inout_ float * pdel, int crow, int ccol) -{ - const float * pyBase = py; - const float * pxLim = px + crow; - const float * pyLim = py + ccol; - float * pm = pmat; - float * pd = pdel; - - __m128 x0 = _mm_set1_ps(momentum); - for (; px < pxLim; px++) + for (; pd < pdLim; pd++, ps1++, ps2++) { - float r = a * *px; - - __m128 x1 = _mm_set1_ps(r); - for (py = pyBase; py < pyLim; pm += 4, pd += 4, py += 4) - { - __m128 x2 = _mm_load_ps(py); - __m128 x3 = _mm_load_ps(pd); - __m128 x4 = _mm_load_ps(pm); - x2 = _mm_mul_ps(x1, x2); - x3 = _mm_mul_ps(x0, x3); - x3 = _mm_add_ps(x2, x3); - x4 = _mm_add_ps(x3, x4); - - _mm_store_ps(pd, x3); - _mm_store_ps(pm, x4); - } + __m128 x1 = _mm_load_ss(ps1); + __m128 x2 = _mm_load_ss(ps2); + x2 = _mm_mul_ps(x1, x2); + _mm_store_ss(pd, x2); } } -// coef: coefs to update, ag: accumulated grads, au: accumulated updates, g: cur grads. -// Note: parameters coef, ag, au and g will be updated, do not reuse parameter g in calling code. -__forceinline void UpdateAdadelta(__m128& coef, __m128& ag, __m128& au, __m128& g, const __m128& dec, const __m128& decc, const __m128& c) -{ - __m128 x4 = _mm_mul_ps(g, g); // x4 == g * g - x4 = _mm_mul_ps(decc, x4); // x4 == (1 - decay) * g * g - ag = _mm_mul_ps(dec, ag); // ag == decay * accG - ag = _mm_add_ps(ag, x4); // ag == decay * accG + (1 - decay) * g * g - __m128 x41 = _mm_add_ps(ag, c); // x41 == ag + cond - __m128 x51 = _mm_add_ps(au, c); // x51 == accU + cond -#if 0 - // naive version: - x51 = _mm_div_ps(x51, x41); - x41 = _mm_sqrt_ps(x51); // x41 == rate -#else - // faster (approximate) version: - x41 = _mm_rsqrt_ps(x41); - __m128 x52 = _mm_rsqrt_ps(x51); - x51 = _mm_mul_ps(x51, x52); - x41 = _mm_mul_ps(x41, x51); // x41 == rate -#endif - g = _mm_mul_ps(g, x41); // g - current update - coef = _mm_add_ps(coef, g); - - g = _mm_mul_ps(g, g); // g == newU * newU - g = _mm_mul_ps(decc, g); // g == (1 - decay) * newU * newU - au = _mm_mul_ps(dec, au); // au == decay * accU - au = _mm_add_ps(au, g); // au == decay * accU + (1 - decay) * newU * newU -} -// For Adadelta. -EXPORT_API(void) AddXYTranGradA(_In_ const float * px, _In_ const float * py, _Inout_ float * pmat, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, - float decay, float cond, int crow, int ccol) +EXPORT_API(void) AddScaleCopyU(float a, _In_ const float * ps, _In_ const float * pd, _Inout_ float * pr, int c) { - const float * pyBase = py; - const float * pxLim = px + crow; - const float * pyLim = py + ccol; - float * pm = pmat; - float * pag = paccGrads; - float * pau = paccUpdates; - - __m128 dec = _mm_set1_ps(decay); - __m128 decc = _mm_set1_ps(1 - decay); - __m128 c = _mm_set1_ps(cond); - for (; px < pxLim; px++) - { - float r = *px; - - __m128 x1 = _mm_set1_ps(r); - for (py = pyBase; py < pyLim; pm += 4, pag += 4, pau += 4, py += 4) - { - __m128 x2 = _mm_load_ps(py); - __m128 ag = _mm_load_ps(pag); - __m128 au = _mm_load_ps(pau); - __m128 coef = _mm_load_ps(pm); - x2 = _mm_mul_ps(x1, x2); // x2 == g - - UpdateAdadelta(coef, ag, au, x2, dec, decc, c); - - _mm_store_ps(pm, coef); - _mm_store_ps(pag, ag); - _mm_store_ps(pau, au); - } - } -} + float * prLim = pr + c; -// For Adadelta, sparse matrix. 
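UpdateAdadelta above is the four-lane form of the Adadelta rule, and the scalar tail of ScaleAdadeltaU later in this file spells out the same arithmetic element by element. A scalar reference using the exact square root; the SIMD path instead computes sqrt((au + c) / (ag + c)) as rsqrt(ag + c) * (au + c) * rsqrt(au + c) via _mm_rsqrt_ps, trading a little accuracy for speed:

#include <cmath>

// One Adadelta step per element, matching UpdateAdadelta lane for lane.
void AdadeltaStepRef(float & coef, float & accG, float & accU,
                     float g, float decay, float cond)
{
    accG = decay * accG + (1 - decay) * g * g;                 // running E[g^2]
    float upd = std::sqrt((accU + cond) / (accG + cond)) * g;  // scaled update
    coef += upd;
    accU = decay * accU + (1 - decay) * upd * upd;             // running E[upd^2]
}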
-EXPORT_API(void) AddXYTranGradRU(_In_ const float * px, _In_ const float * py, _In_ const int * pstarts, _In_ const int * pindices, - _Inout_ float * pcoefs, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, float decay, float cond, int crow) -{ - const int * pii = pstarts + 1; - const int * pi = pindices; - float * pm = pcoefs; - const float * pxLim = px + crow; - float * pag = paccGrads; - float * pau = paccUpdates; - - __m128 dec = _mm_set1_ps(decay); - __m128 decc = _mm_set1_ps(1 - decay); - __m128 c = _mm_set1_ps(cond); - - for (; px < pxLim; px++) + __m128 x1 = _mm_set1_ps(a); + for (; pr + 4 <= prLim; pr += 4, pd += 4, ps += 4) { - const int * piLim = pindices + *pii++; - float r = *px; - - __m128 x1 = _mm_set1_ps(r); - for (; pi + 4 <= piLim; pi += 4, pm += 4, pag += 4, pau += 4) - { - __m128 g = _mm_mul_ps(x1, _load4(py, pi)); - __m128 ag = _mm_loadu_ps(pag); - __m128 au = _mm_loadu_ps(pau); - __m128 coef = _mm_loadu_ps(pm); - - UpdateAdadelta(coef, ag, au, g, dec, decc, c); - - _mm_storeu_ps(pm, coef); - _mm_storeu_ps(pag, ag); - _mm_storeu_ps(pau, au); - } - - if (pi < piLim) - { - size_t ctail = piLim - pi; - __m128 g = _mm_mul_ss(_load1(py, pi++), x1); - __m128 ag = _mm_load_ss(pag++); - __m128 au = _mm_load_ss(pau++); - __m128 coef = _mm_load_ss(pm++); - for (; pi < piLim; pi++, pm++, pag++, pau++) - { - g = _mm_or_ps(_mm_mul_ss(_load1(py, pi), x1), _rotate(g)); - ag = _mm_or_ps(_mm_load_ss(pag), _rotate(ag)); - au = _mm_or_ps(_mm_load_ss(pau), _rotate(au)); - coef = _mm_or_ps(_mm_load_ss(pm), _rotate(coef)); - } - UpdateAdadelta(coef, ag, au, g, dec, decc, c); - for (int i = 0; i < ctail; i++) - { - _mm_store_ss(pm - i - 1, coef); - coef = _rotate_reverse(coef); - _mm_store_ss(pag - i - 1, ag); - ag = _rotate_reverse(ag); - _mm_store_ss(pau - i - 1, au); - au = _rotate_reverse(au); - } - } + __m128 x2 = _mm_loadu_ps(ps); + __m128 x3 = _mm_loadu_ps(pd); + x2 = _mm_mul_ps(x2, x1); + x3 = _mm_add_ps(x3, x2); + _mm_storeu_ps(pr, x3); } -} -// For Adadelta, partial sparse source vector. 
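One detail of AddXYTranGradRU above: rather than finish the last one to three indices with scalar code, it packs the stragglers into a single __m128 using _rotate, runs UpdateAdadelta once, and unpacks with _rotate_reverse, so each accumulator is still read and written exactly once. Element-wise the tail computes nothing more than this sketch, reusing AdadeltaStepRef from the sketch above:

// What the packed ragged tail amounts to, one element at a time (ctail <= 3).
void AdadeltaTailRef(const float * py, const int * pi, int ctail, float r,
                     float * pm, float * pag, float * pau,
                     float decay, float cond)
{
    for (int k = 0; k < ctail; k++)
        AdadeltaStepRef(pm[k], pag[k], pau[k], py[pi[k]] * r, decay, cond);
}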
-EXPORT_API(void) AddXYTranGradPA(_In_ const float * px, _In_ const int * pposY, _In_ const float * pvaluesY, - int posMinY, int iposMinY, int iposLimY, _Inout_ float * pmat, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, - float decay, float cond, int crow, int ccol) -{ - const int * pposMin = pposY + iposMinY; - const int * pposLim = pposY + iposLimY; - const float * pxLim = px + crow; - const float * py = pvaluesY - posMinY; - float * pm0 = pmat - posMinY; - float * pag0 = paccGrads - posMinY; - float * pau0 = paccUpdates - posMinY; - - __m128 dec = _mm_set1_ps(decay); - __m128 decc = _mm_set1_ps(1 - decay); - __m128 c = _mm_set1_ps(cond); - for (; px < pxLim; px += 4, pm0 += 4 * ccol, pag0 += 4 * ccol, pau0 += 4 * ccol) + for (; pr < prLim; pr++, pd++, ps++) { - float * pm1 = pm0 + ccol; - float * pm2 = pm1 + ccol; - float * pm3 = pm2 + ccol; - - float * pag1 = pag0 + ccol; - float * pag2 = pag1 + ccol; - float * pag3 = pag2 + ccol; - - float * pau1 = pau0 + ccol; - float * pau2 = pau1 + ccol; - float * pau3 = pau2 + ccol; - - __m128 x1 = _mm_load_ps(px); - - for (const int * ppos = pposMin; ppos < pposLim; ppos++) - { - int col = *ppos; - __m128 x2 = _mm_set1_ps(py[col]); - __m128 ag = _mm_setr_ps(pag0[col], pag1[col], pag2[col], pag3[col]); - __m128 au = _mm_setr_ps(pau0[col], pau1[col], pau2[col], pau3[col]); - __m128 coef = _mm_setr_ps(pm0[col], pm1[col], pm2[col], pm3[col]); - x2 = _mm_mul_ps(x2, x1); - - UpdateAdadelta(coef, ag, au, x2, dec, decc, c); - - _mm_store_ss(pm0 + col, coef); coef = _rotate(coef); - _mm_store_ss(pm1 + col, coef); coef = _rotate(coef); - _mm_store_ss(pm2 + col, coef); coef = _rotate(coef); - _mm_store_ss(pm3 + col, coef); - - _mm_store_ss(pag0 + col, ag); ag = _rotate(ag); - _mm_store_ss(pag1 + col, ag); ag = _rotate(ag); - _mm_store_ss(pag2 + col, ag); ag = _rotate(ag); - _mm_store_ss(pag3 + col, ag); - - _mm_store_ss(pau0 + col, au); au = _rotate(au); - _mm_store_ss(pau1 + col, au); au = _rotate(au); - _mm_store_ss(pau2 + col, au); au = _rotate(au); - _mm_store_ss(pau3 + col, au); - } + __m128 x2 = _mm_load_ss(ps); + __m128 x3 = _mm_load_ss(pd); + x2 = _mm_mul_ss(x2, x1); + x3 = _mm_add_ss(x3, x2); + _mm_store_ss(pr, x3); } } @@ -1449,19 +399,6 @@ EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleA(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m128 x1 = _mm_set1_ps(a); - for (; pd < pdLim; pd += 4) - { - __m128 x2 = _mm_load_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_store_ps(pd, x2); - } -} - EXPORT_API(void) ScaleSrcU(float a, _In_ const float * ps, _Inout_ float * pd, int c) { float * pdLim = pd + c; @@ -1506,133 +443,6 @@ EXPORT_API(void) ScaleAddU(float a, float b, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleMaxNormA(float maxNorm, _Inout_ float * pmat, int crow, int ccol) -{ - float * pm = pmat; - float maxNormSq = maxNorm * maxNorm; - __m128 m = _mm_set1_ps(maxNorm); - for (int irow = 0; irow < crow; irow++) - { - __m128 rowNorm = _mm_set1_ps(0); - float * pms = pm; - float * pmLim = pm + ccol; - for (; pm < pmLim; pm += 4) - { - __m128 x1 = _mm_load_ps(pm); - x1 = _mm_mul_ps(x1, x1); - rowNorm = _mm_add_ps(x1, rowNorm); - } - rowNorm = _mm_hadd_ps(rowNorm, rowNorm); - rowNorm = _mm_hadd_ps(rowNorm, rowNorm); - float rowNormRes = _mm_cvtss_f32(rowNorm); - if (rowNormRes > maxNormSq) - { - __m128 scale = _mm_set1_ps(rowNormRes); -#if 0 - // REVIEW: this is faster but it uses approximation so results differ significantly from CLR. 
- scale = _mm_rsqrt_ps(scale); - scale = _mm_mul_ps(scale, m); -#else - scale = _mm_sqrt_ps(scale); - scale = _mm_div_ps(m, scale); -#endif - for (pm = pms; pm < pmLim; pm += 4) - { - __m128 x1 = _mm_load_ps(pm); - x1 = _mm_mul_ps(x1, scale); - _mm_store_ps(pm, x1); - } - } - } -} - -EXPORT_API(void) ScaleMaxNormTranU(float maxNorm, _Inout_ float * pmat, int crow, int ccol) -{ - for (int icol = 0; icol < ccol; icol++) - { - float * pm = pmat + icol; - float rowNorm = 0; - for (int irow = 0; irow < crow; irow++) - { - rowNorm += *pm * *pm; - pm += ccol; - } - if (rowNorm > maxNorm * maxNorm) - { - float scale = maxNorm / sqrtf(rowNorm); - pm = pmat + icol; - for (int irow = 0; irow < crow; irow++) - { - *pm *= scale; - pm += ccol; - } - } - } -} - -// Sparse matrix. -EXPORT_API(void) ScaleMaxNormRU(float maxNorm, _In_ const int * pstarts, _Inout_ float * pmat, int crow) -{ - for (int irow = 0; irow < crow; irow++) - { - float rowNorm = 0; - for (int idx = pstarts[irow]; idx < pstarts[irow + 1]; idx++) - { - rowNorm += pmat[idx] * pmat[idx]; - } - if (rowNorm > maxNorm * maxNorm) - { - float scale = maxNorm / sqrtf(rowNorm); - for (int idx = pstarts[irow]; idx < pstarts[irow + 1]; idx++) - { - pmat[idx] *= scale; - } - } - } -} - -// Convolution. -EXPORT_API(void) ScaleMaxNormCU(float maxNorm, int kernCount, int kernSize, _Inout_ float * pmat) -{ - float * pm = pmat; - for (int irow = 0; irow < kernCount; irow++) - { - float rowNorm = 0; - for (int icol = 0; icol < kernSize; icol++) - { - rowNorm += *pm * *pm; - pm++; - } - if (rowNorm > maxNorm * maxNorm) - { - float scale = maxNorm / sqrtf(rowNorm); - pm -= kernSize; - for (int icol = 0; icol < kernSize; icol++) - { - *pm *= scale; - pm++; - } - } - // Skip bias. - pm++; - } -} - -EXPORT_API(void) AddScaleA(float a, _In_ const float * ps, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m128 x1 = _mm_set1_ps(a); - for (; pd < pdLim; pd += 4, ps += 4) - { - __m128 x2 = _mm_load_ps(ps); - __m128 x3 = _mm_load_ps(pd); - x2 = _mm_mul_ps(x1, x2); - x3 = _mm_add_ps(x2, x3); - _mm_store_ps(pd, x3); - } -} - EXPORT_API(void) AddScaleU(float a, _In_ const float * ps, _Inout_ float * pd, int c) { float * pdLim = pd + c; @@ -1657,30 +467,6 @@ EXPORT_API(void) AddScaleU(float a, _In_ const float * ps, _Inout_ float * pd, i } } -EXPORT_API(void) AddScaleCopyU(float a, _In_ const float * ps, _In_ const float * pd, _Inout_ float * pr, int c) -{ - float * prLim = pr + c; - - __m128 x1 = _mm_set1_ps(a); - for (; pr + 4 <= prLim; pr += 4, pd += 4, ps += 4) - { - __m128 x2 = _mm_loadu_ps(ps); - __m128 x3 = _mm_loadu_ps(pd); - x2 = _mm_mul_ps(x2, x1); - x3 = _mm_add_ps(x3, x2); - _mm_storeu_ps(pr, x3); - } - - for (; pr < prLim; pr++, pd++, ps++) - { - __m128 x2 = _mm_load_ss(ps); - __m128 x3 = _mm_load_ss(pd); - x2 = _mm_mul_ss(x2, x1); - x3 = _mm_add_ss(x3, x2); - _mm_store_ss(pr, x3); - } -} - EXPORT_API(void) AddScaleSU(float a, _In_ const float * ps, _In_ const int * pi, _Inout_ float * pd, int c) { const int * piLim = pi + c; @@ -1699,97 +485,6 @@ EXPORT_API(void) AddScaleSU(float a, _In_ const float * ps, _In_ const int * pi, pd[*pi] += a * *ps; } -EXPORT_API(void) AddScaleMomA(float a, _In_ const float * ps, _Inout_ float * pd, float momentum, _Inout_ float * pe, int c) -{ - float * pdLim = pd + c; - - __m128 x0 = _mm_set1_ps(momentum); - __m128 x1 = _mm_set1_ps(a); - for (; pd < pdLim; pd += 4, pe += 4, ps += 4) - { - __m128 x2 = _mm_load_ps(ps); - __m128 x3 = _mm_load_ps(pe); - __m128 x4 = _mm_load_ps(pd); - x2 = _mm_mul_ps(x1, x2); - 
x3 = _mm_mul_ps(x0, x3); - x3 = _mm_add_ps(x2, x3); - x4 = _mm_add_ps(x3, x4); - _mm_store_ps(pe, x3); - _mm_store_ps(pd, x4); - } -} - -EXPORT_API(void) AddScaleGradA(_In_ const float * ps, _Inout_ float * pd, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, - float decay, float cond, int c) -{ - float * pdLim = pd + c; - - __m128 dec = _mm_set1_ps(decay); - __m128 decc = _mm_set1_ps(1 - decay); - __m128 cnd = _mm_set1_ps(cond); - for (; pd < pdLim; pd += 4, ps += 4, paccGrads += 4, paccUpdates += 4) - { - __m128 g = _mm_load_ps(ps); - __m128 ag = _mm_load_ps(paccGrads); - __m128 au = _mm_load_ps(paccUpdates); - __m128 coef = _mm_load_ps(pd); - - UpdateAdadelta(coef, ag, au, g, dec, decc, cnd); - - _mm_store_ps(pd, coef); - _mm_store_ps(paccGrads, ag); - _mm_store_ps(paccUpdates, au); - } -} - -EXPORT_API(void) AddScaleMultiA(int count, _In_ const float * ps, _Inout_ float * pd, _Inout_ float * paccGrads, _Inout_ float * paccUpdates, - float decay, float cond, int size) -{ - if (1 == count) - AddScaleGradA(ps, pd, paccGrads, paccUpdates, decay, cond, size); - else - { - float * pdLim = pd + size; - - __m128 dec = _mm_set1_ps(decay); - __m128 decc = _mm_set1_ps(1 - decay); - __m128 cnd = _mm_set1_ps(cond); - for (; pd < pdLim; pd += 4, ps += 4, paccGrads += 4, paccUpdates += 4) - { - __m128 g = _mm_set1_ps(0); - const float * ps1 = ps; - // REVIEW: unroll? - for (int i = 0; i < count; i++, ps1 += size) - { - __m128 x1 = _mm_load_ps(ps1); - g = _mm_add_ps(x1, g); - } - __m128 ag = _mm_load_ps(paccGrads); - __m128 au = _mm_load_ps(paccUpdates); - __m128 coef = _mm_load_ps(pd); - - UpdateAdadelta(coef, ag, au, g, dec, decc, cnd); - - _mm_store_ps(pd, coef); - _mm_store_ps(paccGrads, ag); - _mm_store_ps(paccUpdates, au); - } - } -} - -EXPORT_API(void) AddA(_In_ const float * ps, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - for (; pd < pdLim; pd += 4, ps += 4) - { - __m128 x1 = _mm_load_ps(ps); - __m128 x2 = _mm_load_ps(pd); - x2 = _mm_add_ps(x1, x2); - _mm_store_ps(pd, x2); - } -} - EXPORT_API(void) AddU(_In_ const float * ps, _Inout_ float * pd, int c) { float * pdLim = pd + c; @@ -1811,71 +506,20 @@ EXPORT_API(void) AddU(_In_ const float * ps, _Inout_ float * pd, int c) } } -EXPORT_API(void) AddSU(_In_ const float * ps, _In_ const int * pi, _Inout_ float * pd, int c) -{ - const int * piLim = pi + c; - - for (; pi + 4 <= piLim; pi += 4, ps += 4) - { - __m128 x1 = _load4(pd, pi); - __m128 x2 = _mm_loadu_ps(ps); - x1 = _mm_add_ps(x1, x2); - _store4(x1, pd, pi); - } - - for (; pi < piLim; pi++, ps++) - pd[*pi] += *ps; -} - -EXPORT_API(void) MulElementWiseU(_In_ const float * ps1, _In_ const float * ps2, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - for (; pd + 4 <= pdLim; pd += 4, ps1 += 4, ps2 += 4) - { - __m128 x1 = _mm_loadu_ps(ps1); - __m128 x2 = _mm_loadu_ps(ps2); - x2 = _mm_mul_ps(x1, x2); - _mm_storeu_ps(pd, x2); - } - - for (; pd < pdLim; pd++, ps1++, ps2++) - { - __m128 x1 = _mm_load_ss(ps1); - __m128 x2 = _mm_load_ss(ps2); - x2 = _mm_mul_ps(x1, x2); - _mm_store_ss(pd, x2); - } -} - -EXPORT_API(void) MulElementWiseSU(_In_ const float * ps1, _In_ const float * ps2, _In_ const int * pi, _Inout_ float * pd, int c) -{ - const int * piLim = pi + c; - - for (; pi + 4 <= piLim; pi += 4) - { - __m128 x1 = _load4(ps1, pi); - __m128 x2 = _load4(ps2, pi); - x2 = _mm_mul_ps(x1, x2); - _store4(x2, pd, pi); - } - - for (; pi < piLim; pi++) - pd[*pi] = ps1[*pi] * ps2[*pi]; -} - -EXPORT_API(float) SumA(const float * ps, int c) -{ - const float * psLim = ps + c; - - 
__m128 res = _mm_setzero_ps();
-    for (; ps < psLim; ps += 4)
-        res = _mm_add_ps(res, _mm_load_ps(ps));
+EXPORT_API(void) AddSU(_In_ const float * ps, _In_ const int * pi, _Inout_ float * pd, int c)
+{
+    const int * piLim = pi + c;
 
-    res = _mm_hadd_ps(res, res);
-    res = _mm_hadd_ps(res, res);
+    for (; pi + 4 <= piLim; pi += 4, ps += 4)
+    {
+        __m128 x1 = _load4(pd, pi);
+        __m128 x2 = _mm_loadu_ps(ps);
+        x1 = _mm_add_ps(x1, x2);
+        _store4(x1, pd, pi);
+    }
 
-    return _mm_cvtss_f32(res);
+    for (; pi < piLim; pi++, ps++)
+        pd[*pi] += *ps;
 }
 
 EXPORT_API(float) SumU(const float * ps, int c)
@@ -1989,55 +633,6 @@ EXPORT_API(float) SumAbsDiffU(float mean, const float * ps, int c)
     return _mm_cvtss_f32(res);
 }
 
-EXPORT_API(float) MaxAbsU(const float * ps, int c)
-{
-    const float * psLim = ps + c;
-
-    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
-    __m128 res = _mm_setzero_ps();
-    for (; ps + 4 <= psLim; ps += 4)
-        res = _mm_max_ps(res, _mm_and_ps(_mm_loadu_ps(ps), mask));
-
-    __m128 x1 = _mm_shuffle_ps(res, res, 0xB1);
-    res = _mm_max_ps(res, x1);
-    x1 = _mm_shuffle_ps(res, res, 0x02);
-    res = _mm_max_ss(res, x1);
-
-    for (; ps < psLim; ps++)
-        res = _mm_max_ss(res, _mm_and_ps(_mm_load_ss(ps), mask));
-
-    return _mm_cvtss_f32(res);
-}
-
-EXPORT_API(float) MaxAbsDiffU(float mean, const float * ps, int c)
-{
-    const float * psLim = ps + c;
-
-    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
-    __m128 res = _mm_setzero_ps();
-    __m128 m = _mm_set1_ps(mean);
-    for (; ps + 4 <= psLim; ps += 4)
-    {
-        __m128 x = _mm_loadu_ps(ps);
-        x = _mm_sub_ps(x, m);
-        res = _mm_max_ps(res, _mm_and_ps(x, mask));
-    }
-
-    __m128 x1 = _mm_shuffle_ps(res, res, 0xB1);
-    res = _mm_max_ps(res, x1);
-    x1 = _mm_shuffle_ps(res, res, 0x02);
-    res = _mm_max_ss(res, x1);
-
-    for (; ps < psLim; ps++)
-    {
-        __m128 x = _mm_load_ss(ps);
-        x = _mm_sub_ss(x, m);
-        res = _mm_max_ss(res, _mm_and_ps(x, mask));
-    }
-
-    return _mm_cvtss_f32(res);
-}
-
 EXPORT_API(float) DotU(const float * pa, const float * pb, int c)
 {
     const float * paLim = pa + c;
@@ -2100,486 +695,6 @@ EXPORT_API(float) Dist2(const float * px, const float * py, int c)
     return norm2;
 }
 
-// This is modeled after double-based SSE code
-
-// 1 / ln(2).
-const float RecipLn2 = (float)1.44269504088896340735992468100;
-
-// Used for computing a 4th degree polynomial approximation of e^x.
-const float Coef1 = (float)0.013555747234814917704030793;
-const float Coef2 = (float)0.065588116243247810171479524;
-const float Coef3 = (float)0.3069678791803394491901401;
-
-const float ExpInf = 128;
-const int ExpBias = 127;
-const int ExpShift = 23;
-
-float ExpFast(float arg)
-{
-    bool neg = false;
-    if (arg < 0)
-    {
-        arg = -arg;
-        neg = true;
-    }
-
-    arg *= RecipLn2;
-    if (arg >= ExpInf)
-        return neg ? 0.0f : std::numeric_limits<float>::infinity();
-
-    int exp = (int)arg;
-    arg -= exp;
-    exp += ExpBias;
-    exp <<= ExpShift;
-
-    float res = (1 + arg) + (arg - 1) * arg * ((Coef1 * arg + Coef2) * arg + Coef3);
-    res *= *(float *)&exp;
-
-    if (neg)
-        res = 1 / res;
-    return res;
-}
-
-// Implements a fast approximation of sigmoid/tanh.
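ExpFast above evaluates e^x as 2^(x * log2 e): the scaled argument splits into integer part n and fraction f, the polynomial approximates 2^f, and 2^n is manufactured directly by writing n plus the exponent bias into the IEEE-754 exponent field. The bit trick in isolation, with memcpy as the portable spelling of the *(float *)&exp cast used above:

#include <cstring>

// 2^n for integer n in roughly [-126, 127]: put (n + bias) in the exponent
// bits; the mantissa stays zero, so the result is an exact power of two.
float PowerOfTwoRef(int n)
{
    int bits = (n + 127) << 23;  // 127 = exponent bias, 23 = mantissa width
    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}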
-template <bool isTanh>
-void ApplySigmoidCoreA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    float * pdLim = pd + c;
-
-    __m128 cSign = _mm_set1_ps(-0.0f);
-    __m128 cZero = _mm_set1_ps(0.0f);
-    __m128 cOne = _mm_set1_ps(1.0f);
-
-    __m128 cMax = _mm_set1_ps(ExpInf);
-    __m128i cBias = _mm_set1_epi32(ExpBias);
-    __m128 c0 = _mm_set1_ps(RecipLn2);
-    __m128 c1 = _mm_set1_ps(Coef1);
-    __m128 c2 = _mm_set1_ps(Coef2);
-    __m128 c3 = _mm_set1_ps(Coef3);
-
-    if (isTanh)
-        c0 = _mm_add_ps(c0, c0);
-
-    for (; pd < pdLim; ps += 4, pd += 4)
-    {
-        // Get the argument, capture its sign and take its absolute value.
-        __m128 xArg = _mm_load_ps(ps);
-        // maskNaN is set to zero if xArg is not NaN and set equal to xArg otherwise.
-        __m128 maskNaN = _mm_and_ps(_mm_cmpneq_ps(xArg, xArg), xArg);
-        __m128 xSign = _mm_and_ps(xArg, cSign);
-        xArg = _mm_xor_ps(xArg, xSign);
-
-        // Multiply by 1/ln(2) and check for out of bounds.
-        xArg = _mm_mul_ps(xArg, c0);
-        __m128 xGood = _mm_cmplt_ps(xArg, cMax);
-        xArg = _mm_and_ps(xArg, xGood);
-
-        // Get the integer and fractional parts.
-        __m128i xInt = _mm_cvttps_epi32(xArg);
-        xArg = _mm_sub_ps(xArg, _mm_cvtepi32_ps(xInt));
-
-        // Add the exponent bias to xInt, then convert to a floating point
-        // power of two by shifting past the mantissa bits.
-        xInt = _mm_add_epi32(xInt, cBias);
-        xInt = _mm_slli_epi32(xInt, ExpShift);
-
-        // Approximate 2 raised to the fractional part.
-        // (1 + f) + (f - 1) * f * ((c1 * f + c2) * f + c3)
-
-        // x1 = (c1 * f + c2) * f + c3
-        __m128 x1 = _mm_mul_ps(c1, xArg);
-        x1 = _mm_add_ps(x1, c2);
-        x1 = _mm_mul_ps(x1, xArg);
-        x1 = _mm_add_ps(x1, c3);
-
-        // x2 = f * (f - 1)
-        __m128 x2 = _mm_sub_ps(xArg, cOne);
-        x2 = _mm_mul_ps(xArg, x2);
-
-        // Add (1 + f). Note that for tanh, we only add f, so we are approximating
-        // 2^f - 1. This is necessary to preserve precision near zero. In particular,
-        // near zero, tanh(x) ~ x.
-        x1 = _mm_mul_ps(x2, x1);
-        if (!isTanh)
-            xArg = _mm_add_ps(xArg, cOne);
-        x1 = _mm_add_ps(xArg, x1);
-
-        // Multiply by 2^n, where n is the integer part.
-        __m128 x3 = _mm_castsi128_ps(xInt);
-        x1 = _mm_mul_ps(x1, x3);
-
-        if (!isTanh)
-        {
-            // Add 1, and take the reciprocal.
-            x1 = _mm_add_ps(x1, cOne);
-            x1 = _mm_div_ps(cOne, x1);
-
-            // Deal with out of bounds.
-            x1 = _mm_and_ps(x1, xGood);
-            // If the input was NaN, xGood is zero, so x1 is zero. So can simply or in maskNaN.
-            x1 = _mm_or_ps(x1, maskNaN);
-
-            // Deal with the sign. Set:
-            // * x2 = x1 if xSign is -0 (0x80000000)
-            // * x2 = 1 - x1 if xSign is +0 (0x00000000).
-            x1 = _mm_or_ps(x1, xSign);
-            x2 = _mm_or_ps(xSign, cOne);
-            x2 = _mm_max_ps(x2, cZero);
-            x2 = _mm_sub_ps(x2, x1);
-        }
-        else
-        {
-            // [2^n(2^f - 1) + (2^n - 1)] / [2^n(2^f - 1) + (2^n + 1)]
-            x2 = _mm_add_ps(x1, _mm_sub_ps(x3, cOne));
-            x1 = _mm_add_ps(x1, _mm_add_ps(x3, cOne));
-            x2 = _mm_div_ps(x2, x1);
-
-            // Deal with out of bounds: x2 = (x2 & xGood) | ((1 + maskNaN) & ~xGood)
-            x2 = _mm_and_ps(x2, xGood);
-            x1 = _mm_andnot_ps(xGood, _mm_add_ps(maskNaN, cOne));
-            x2 = _mm_or_ps(x2, x1);
-
-            // Deal with the sign.
-            x2 = _mm_or_ps(x2, xSign);
-        }
-
-        _mm_store_ps(pd, x2);
-    }
-
-    // If we overshot, back fill with zero! Since tanh(0) = 0, we only need to do this for sigmoid.
-    if (!isTanh)
-    {
-        while (pd > pdLim)
-            *--pd = 0.0f;
-    }
-}
-
-EXPORT_API(void) ApplySigmoidA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    ApplySigmoidCoreA<false>(ps, pd, c);
-}
-
-EXPORT_API(void) ApplySoftMaxU(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    // REVIEW: Use SSE - do 4 at a time.
-
-    const float * psLim = ps + c;
-
-    // Compute max output.
-    float maxOut = -std::numeric_limits<float>::infinity();
-    for (const float * p = ps; p < psLim; p++)
-    {
-        float v = *p;
-        if (maxOut < v)
-            maxOut = v;
-    }
-
-    // Compute exp and sum.
-    float sum = 0;
-    const float * p = ps;
-    for (float * q = pd; p < psLim; p++, q++)
-    {
-        float v = ExpFast(*p - maxOut);
-        *q = v;
-        sum += v;
-    }
-
-    // Normalize.
-    for (float * q = pd; q < pd + c; q++)
-        *q /= sum;
-}
-
-EXPORT_API(void) ApplyRectifiedLinearA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    const float * psLim = ps + c;
-
-    __m128 cZero = _mm_set1_ps(0.0f);
-    for (; ps < psLim; ps += 4, pd += 4)
-    {
-        __m128 x1 = _mm_load_ps(ps);
-        x1 = _mm_max_ps(x1, cZero);
-        _mm_store_ps(pd, x1);
-    }
-}
-
-EXPORT_API(void) ApplySquareA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    const float * psLim = ps + c;
-
-    for (; ps < psLim; ps += 4, pd += 4)
-    {
-        __m128 x1 = _mm_load_ps(ps);
-        x1 = _mm_mul_ps(x1, x1);
-        _mm_store_ps(pd, x1);
-    }
-}
-
-EXPORT_API(void) ApplySqrtA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    const float * psLim = ps + c;
-
-    __m128 cZero = _mm_set1_ps(0.0f);
-    for (; ps < psLim; ps += 4, pd += 4)
-    {
-        __m128 x1 = _mm_load_ps(ps);
-        x1 = _mm_max_ps(x1, cZero);
-        x1 = _mm_sqrt_ps(x1);
-        _mm_store_ps(pd, x1);
-    }
-}
-
-EXPORT_API(void) ApplySoftRectifiedLinearU(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    const float * psLim = ps + c;
-
-    // Apply: f(x) = log(1 + e^x). To avoid overflow for large x, we use the identity: f(x) = x + f(-x).
-    // REVIEW: Should we implement a "LogFast"?
-    // REVIEW: Do 4 at a time.
-    const float * p = ps;
-    for (float * q = pd; p < psLim; p++, q++)
-    {
-        float x = *p;
-        if (x > 0)
-            *q = x + log(1 + ExpFast(-x));
-        else
-            *q = log(1 + ExpFast(x));
-    }
-}
-
-EXPORT_API(void) ApplyAbsA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    const float * psLim = ps + c;
-
-    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
-    for (; ps < psLim; ps += 4, pd += 4)
-    {
-        __m128 x1 = _mm_load_ps(ps);
-        x1 = _mm_and_ps(x1, mask);
-        _mm_store_ps(pd, x1);
-    }
-}
-
-EXPORT_API(void) ApplyTanhA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    ApplySigmoidCoreA<true>(ps, pd, c);
-}
-
-EXPORT_API(void) ApplyBoundedRectifiedLinearA(_In_ const float * ps, _Inout_ float * pd, int c)
-{
-    const float * psLim = ps + c;
-
-    __m128 cZero = _mm_set1_ps(0.0f);
-    __m128 cOne = _mm_set1_ps(1.0f);
-    for (; ps < psLim; ps += 4, pd += 4)
-    {
-        __m128 x1 = _mm_load_ps(ps);
-        x1 = _mm_max_ps(x1, cZero);
-        x1 = _mm_min_ps(x1, cOne);
-        _mm_store_ps(pd, x1);
-    }
-}
-
-EXPORT_API(void) ApplySigmoidDerivativeA(_In_ const float * pv, _Inout_ float * pg, int c)
-{
-    float * pgLim = pg + c;
-
-    // pg[i] *= pv[i] * (1 - pv[i])
-    __m128 cOne = _mm_set1_ps(1.0f);
-    for (; pg < pgLim; pg += 4, pv += 4)
-    {
-        __m128 x1 = _mm_load_ps(pv);
-        __m128 x2 = _mm_load_ps(pg);
-        __m128 x3 = _mm_sub_ps(cOne, x1);
-        x1 = _mm_mul_ps(x1, x3);
-        x2 = _mm_mul_ps(x2, x1);
-        _mm_store_ps(pg, x2);
-    }
-}
-
-EXPORT_API(void) ApplyRectifiedLinearDerivativeA(_In_ const float * pv, _Inout_ float * pg, int c)
-{
-    float * pgLim = pg + c;
-
-    __m128 cZero = _mm_set1_ps(0.0f);
-    for (; pg < pgLim; pg += 4, pv += 4)
-    {
-        __m128 x1 = _mm_load_ps(pv);
-        __m128 x2 = _mm_load_ps(pg);
-        x1 = _mm_cmpgt_ps(x1, cZero);
-        x2 = _mm_and_ps(x2, x1);
-        _mm_store_ps(pg, x2);
-    }
-}
-
-EXPORT_API(void) ApplySquareDerivativeA(_In_ const float * px, _In_opt_ const float * py, _Inout_ float * pg, int c, bool drop)
-{
-    float * pgLim =
pg + c; - - if (drop) - { - __m128 cZero = _mm_set1_ps(0.0f); - for (; pg < pgLim; pg += 4, px += 4, py += 4) - { - __m128 x0 = _mm_cmpgt_ps(_mm_load_ps(py), cZero); - __m128 x1 = _mm_load_ps(px); - __m128 x2 = _mm_load_ps(pg); - x1 = _mm_add_ps(x1, x1); - x2 = _mm_mul_ps(x2, x1); - x2 = _mm_and_ps(x2, x0); - _mm_store_ps(pg, x2); - } - } - else - { - for (; pg < pgLim; pg += 4, px += 4) - { - __m128 x1 = _mm_load_ps(px); - __m128 x2 = _mm_load_ps(pg); - x1 = _mm_add_ps(x1, x1); - x2 = _mm_mul_ps(x2, x1); - _mm_store_ps(pg, x2); - } - } -} - -EXPORT_API(void) ApplySqrtDerivativeA(_In_ const float * pv, _Inout_ float * pg, int c) -{ - float * pgLim = pg + c; - static const float smallValue = 1e-10F; - - __m128 cZero = _mm_set1_ps(0.0f); - __m128 cSmall = _mm_set1_ps(smallValue); - for (; pg < pgLim; pg += 4, pv += 4) - { - __m128 x1 = _mm_load_ps(pv); - __m128 x2 = _mm_load_ps(pg); - __m128 x3 = _mm_cmpgt_ps(x1, cZero); - x1 = _mm_max_ps(x1, cSmall); - x1 = _mm_add_ps(x1, x1); - x2 = _mm_and_ps(x2, x3); - x2 = _mm_div_ps(x2, x1); - _mm_store_ps(pg, x2); - } -} - -EXPORT_API(void) ApplySoftRectifiedLinearDerivativeU(_In_opt_ const float * px, _In_ const float * py, _Inout_ float * pg, int c) -{ - UNUSED(px); - - float * pgLim = pg + c; - - // Use the identity: y' = 1 - e^(-y). This has a few nice properties: - // * If x is large enough that x == y (after rounding), we'll compute y' as 1. - // * If x is small enough that y == 0 (after rounding), we'll compute y' as 0. - // * If y is zero because of drop out, we'll compute y' as 0. - // REVIEW: Do 4 at a time. - for (; pg < pgLim; pg++, py++) - *pg *= 1 - ExpFast(-*py); -} - -EXPORT_API(void) ApplyAbsDerivativeA(_In_ const float * px, _In_opt_ const float * py, _Inout_ float * pg, int c, bool drop) -{ - float * pgLim = pg + c; - - __m128 cZero = _mm_set1_ps(0.0f); - __m128 cSign = _mm_set1_ps(-0.0f); - if (drop) - { - for (; pg < pgLim; pg += 4, px += 4, py += 4) - { - __m128 x1 = _mm_and_ps(_mm_load_ps(px), cSign); - __m128 x2 = _mm_cmpgt_ps(_mm_load_ps(py), cZero); - __m128 x3 = _mm_load_ps(pg); - x3 = _mm_xor_ps(x3, x1); - x3 = _mm_and_ps(x3, x2); - _mm_store_ps(pg, x3); - } - } - else - { - for (; pg < pgLim; pg += 4, px += 4) - { - __m128 x0 = _mm_load_ps(px); - __m128 x1 = _mm_and_ps(x0, cSign); - __m128 x2 = _mm_cmpneq_ps(x0, cZero); - __m128 x3 = _mm_load_ps(pg); - x3 = _mm_xor_ps(x3, x1); - x3 = _mm_and_ps(x3, x2); - _mm_store_ps(pg, x3); - } - } -} - -EXPORT_API(void) ApplyTanhDerivativeA(_In_ const float * pv, _Inout_ float * pg, int c) -{ - float * pgLim = pg + c; - - // pg[i] *= 1 - pv[i] * pv[i] - __m128 cOne = _mm_set1_ps(1.0f); - for (; pg < pgLim; pg += 4, pv += 4) - { - __m128 x1 = _mm_load_ps(pv); - __m128 x2 = _mm_load_ps(pg); - x1 = _mm_mul_ps(x1, x1); - x1 = _mm_sub_ps(cOne, x1); - x2 = _mm_mul_ps(x2, x1); - _mm_store_ps(pg, x2); - } -} - -EXPORT_API(void) ApplyBoundedRectifiedLinearDerivativeA(_In_ const float * pv, _Inout_ float * pg, int c) -{ - float * pgLim = pg + c; - - __m128 cZero = _mm_set1_ps(0.0f); - __m128 cOne = _mm_set1_ps(1.0f); - for (; pg < pgLim; pg += 4, pv += 4) - { - __m128 x1 = _mm_load_ps(pv); - __m128 x2 = _mm_load_ps(pg); - x2 = _mm_and_ps(x2, _mm_cmpgt_ps(x1, cZero)); - x2 = _mm_and_ps(x2, _mm_cmplt_ps(x1, cOne)); - _mm_store_ps(pg, x2); - } -} - -EXPORT_API(void) ZeroItemsU(_Inout_ float * pd, int c, _In_ const int * pindices, int cindices) -{ - DEBUG_ONLY(c); - for (int i = 0; i < cindices; ++i) - { - int iv = pindices[i]; - assert(0 <= iv && iv < c); - pd[iv] = 0; - } -} - -EXPORT_API(void) 
ZeroMatrixItemsCore(_Inout_ float * pd, int c, int ccol, int cfltRow, _In_ const int * pindices, int cindices) -{ - DEBUG_ONLY(c); - int ivLogMin = 0; - int ivLogLim = ccol; - int ivPhyMin = 0; - for (int i = 0; i < cindices; ++i) - { - int iv = pindices[i]; - assert(0 <= iv && iv < c); - - int col = iv - ivLogMin; - if ((unsigned int)col >= (unsigned int)ccol) - { - assert(ivLogMin > iv || iv >= ivLogLim); - int row = iv / ccol; - ivLogMin = row * ccol; - ivLogLim = ivLogMin + ccol; - ivPhyMin = row * cfltRow; - assert(ivLogMin <= iv && iv < ivLogLim); - col = iv - ivLogMin; - } - pd[ivPhyMin + col] = 0; - } -} - EXPORT_API(void) SdcaL1UpdateU(float primalUpdate, _In_ const float * ps, float threshold, _Inout_ float *pd1, _Inout_ float * pd2, int c) { const float * psLim = ps + c; @@ -2643,70 +758,4 @@ EXPORT_API(void) SdcaL1UpdateSU(float primalUpdate, _In_ const float * ps, _In_ float d1 = pd1[i]; pd2[i] = std::abs(d1) > threshold ? (d1 > 0 ? d1 - threshold : d1 + threshold) : 0; } -} - -EXPORT_API(void) ScaleAdadeltaU(_Inout_ float * mat, _Inout_ float * accGrads, _Inout_ float * accUpdates, float decay, float cond, _In_ const float * grads, int size) -{ - float * pm = mat; - float * pmLim = pm + size; - float * pag = accGrads; - float * pau = accUpdates; - const float * pg = grads; - - __m128 dec = _mm_set1_ps(decay); - __m128 decc = _mm_set1_ps(1 - decay); - __m128 c = _mm_set1_ps(cond); - - for (; pm + 4 <= pmLim; pm += 4, pag += 4, pau += 4, pg += 4) - { - __m128 g = _mm_loadu_ps(pg); - __m128 ag = _mm_loadu_ps(pag); - __m128 au = _mm_loadu_ps(pau); - __m128 coef = _mm_loadu_ps(pm); - - UpdateAdadelta(coef, ag, au, g, dec, decc, c); - - _mm_storeu_ps(pm, coef); - _mm_storeu_ps(pag, ag); - _mm_storeu_ps(pau, au); - } - - for (; pm < pmLim; pm++, pag++, pau++, pg++) - { - float g = *pg; - float accGrad = decay * *pag + (1 - decay) * g * g; - float accUpd = *pau; - - float newUpd = sqrtf((accUpd + cond) / (accGrad + cond)) * g; - *pm += newUpd; - *pag = accGrad; - *pau = decay * accUpd + (1 - decay) * newUpd * newUpd; - } -} - -EXPORT_API(void) ScaleAdadeltaA(_Inout_ float * mat, _Inout_ float * accGrads, _Inout_ float * accUpdates, float decay, float cond, _Inout_ float * grads, int size) -{ - float * pm = mat; - float * pmLim = pm + size; - float * pag = accGrads; - float * pau = accUpdates; - float * pg = grads; - - __m128 dec = _mm_set1_ps(decay); - __m128 decc = _mm_set1_ps(1 - decay); - __m128 c = _mm_set1_ps(cond); - - for (; pm < pmLim; pm += 4, pag += 4, pau += 4, pg += 4) - { - __m128 g = _mm_load_ps(pg); - __m128 ag = _mm_load_ps(pag); - __m128 au = _mm_load_ps(pau); - __m128 coef = _mm_load_ps(pm); - - UpdateAdadelta(coef, ag, au, g, dec, decc, c); - - _mm_store_ps(pm, coef); - _mm_store_ps(pag, ag); - _mm_store_ps(pau, au); - } -} +} \ No newline at end of file diff --git a/src/Native/FactorizationMachineNative/FactorizationMachineCore.cpp b/src/Native/FactorizationMachineNative/FactorizationMachineCore.cpp index 982b266683..e2b1551f73 100644 --- a/src/Native/FactorizationMachineNative/FactorizationMachineCore.cpp +++ b/src/Native/FactorizationMachineNative/FactorizationMachineCore.cpp @@ -48,7 +48,7 @@ EXPORT_API(void) CalculateIntermediateVariablesNative(int fieldCount, int latent for (int k = 0; k + 4 <= d; k += 4) { - const __m128 _v = _mm_load_ps(vjf + k); + const __m128 _v = _mm_loadu_ps(vjf + k); _tmp = _mm_sub_ps(_tmp, _mm_mul_ps(_mm_mul_ps(_v, _v), _xx)); } @@ -62,10 +62,10 @@ EXPORT_API(void) CalculateIntermediateVariablesNative(int fieldCount, int latent // 
q_f,f' += v_j,f' * x for (int k = 0; k + 4 <= d; k += 4) { - const __m128 _v = _mm_load_ps(vjfprime + k); - __m128 _q = _mm_load_ps(qffprime + k); + const __m128 _v = _mm_loadu_ps(vjfprime + k); + __m128 _q = _mm_loadu_ps(qffprime + k); _q = _mm_add_ps(_q, _mm_mul_ps(_v, _x)); - _mm_store_ps(qffprime + k, _q); + _mm_storeu_ps(qffprime + k, _q); } } } @@ -76,7 +76,7 @@ EXPORT_API(void) CalculateIntermediateVariablesNative(int fieldCount, int latent const float * qff = pq + f * m * d + f * d; for (int k = 0; k + 4 <= d; k += 4) { - __m128 _qff = _mm_load_ps(qff + k); + __m128 _qff = _mm_loadu_ps(qff + k); // Intra-field interactions. _tmp = _mm_add_ps(_tmp, _mm_mul_ps(_qff, _qff)); @@ -91,8 +91,8 @@ EXPORT_API(void) CalculateIntermediateVariablesNative(int fieldCount, int latent for (int k = 0; k + 4 <= d; k += 4) { // Inter-field interaction. - __m128 _qffprime = _mm_load_ps(qffprime + k); - __m128 _qfprimef = _mm_load_ps(qfprimef + k); + __m128 _qffprime = _mm_loadu_ps(qffprime + k); + __m128 _qfprimef = _mm_loadu_ps(qfprimef + k); _y = _mm_add_ps(_y, _mm_mul_ps(_qffprime, _qfprimef)); } } @@ -153,8 +153,8 @@ EXPORT_API(void) CalculateGradientAndUpdateNative(float lambdaLinear, float lamb for (int k = 0; k + 4 <= d; k += 4) { - __m128 _v = _mm_load_ps(vjfprime + k); - __m128 _q = _mm_load_ps(qfprimef + k); + __m128 _v = _mm_loadu_ps(vjfprime + k); + __m128 _q = _mm_loadu_ps(qfprimef + k); // Calculate L2-norm regularization's gradient. __m128 _g = _mm_mul_ps(_lambdav, _v); @@ -167,12 +167,12 @@ EXPORT_API(void) CalculateGradientAndUpdateNative(float lambdaLinear, float lamb _g = _mm_mul_ps(_wei, _g); // Accumulate the gradient of latent vectors. - const __m128 _h = _mm_add_ps(_mm_load_ps(hvjfprime + k), _mm_mul_ps(_g, _g)); + const __m128 _h = _mm_add_ps(_mm_loadu_ps(hvjfprime + k), _mm_mul_ps(_g, _g)); // Perform ADAGRAD update rule to adjust latent vector. 
_v = _mm_sub_ps(_v, _mm_mul_ps(_lr, _mm_mul_ps(_mm_rsqrt_ps(_h), _g))); - _mm_store_ps(vjfprime + k, _v); - _mm_store_ps(hvjfprime + k, _h); + _mm_storeu_ps(vjfprime + k, _v); + _mm_storeu_ps(hvjfprime + k, _h); } } } diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs index 7284ce6c1a..651783f12e 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs @@ -14,10 +14,9 @@ public class CpuMathUtilsUnitTests { private readonly float[][] _testArrays; private readonly int[] _testIndexArray; - private readonly AlignedArray[] _testMatrices; - private readonly AlignedArray[] _testSrcVectors; - private readonly AlignedArray[] _testDstVectors; - private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); + private readonly float[][] _testMatrices; + private readonly float[][] _testSrcVectors; + private readonly float[][] _testDstVectors; private readonly FloatEqualityComparer _comparer; private readonly FloatEqualityComparerForMatMul _matMulComparer; @@ -50,49 +49,49 @@ public CpuMathUtilsUnitTests() testMatrix2[i] = i + 1; } - AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, _vectorAlignment); - AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, _vectorAlignment); - testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); - testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); + float[] testMatrixAligned1 = new float[8 * 8]; + float[] testMatrixAligned2 = new float[8 * 16]; + Array.Copy(testMatrix1, 0, testMatrixAligned1, 0, testMatrix1.Length); + Array.Copy(testMatrix2, 0, testMatrixAligned2, 0, testMatrix2.Length); - _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; + _testMatrices = new float[][] { testMatrixAligned1, testMatrixAligned2 }; // Padded source vectors whose dimensions are multiples of 8 float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f }; - AlignedArray testSrcVectorAligned1 = new AlignedArray(8, _vectorAlignment); - AlignedArray testSrcVectorAligned2 = new AlignedArray(16, _vectorAlignment); - testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length); - testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); + float[] testSrcVectorAligned1 = new float[8]; + float[] testSrcVectorAligned2 = new float[16]; + Array.Copy(testSrcVector1, 0, testSrcVectorAligned1, 0, testSrcVector1.Length); + Array.Copy(testSrcVector2, 0, testSrcVectorAligned2, 0, testSrcVector2.Length); - _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; + _testSrcVectors = new float[][] { testSrcVectorAligned1, testSrcVectorAligned2 }; // Padded destination vectors whose dimensions are multiples of 8 float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f }; - AlignedArray testDstVectorAligned1 = new AlignedArray(8, _vectorAlignment); - AlignedArray testDstVectorAligned2 = new AlignedArray(16, _vectorAlignment); - testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length); - testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); + float[] testDstVectorAligned1 = new float[8]; + float[] testDstVectorAligned2 = new 
float[16];
-            _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
+            Array.Copy(testDstVector1, 0, testDstVectorAligned1, 0, testDstVector1.Length);
+            Array.Copy(testDstVector2, 0, testDstVectorAligned2, 0, testDstVector2.Length);
+            _testDstVectors = new float[][] { testDstVectorAligned1, testDstVectorAligned2 };
         }

         [Theory]
         [InlineData(0, 0, 0, new float[] { -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f, -416.6801f })]
         [InlineData(1, 1, 0, new float[] { 1496f, 3672f, 5848f, 8024f, 10200f, 12376f, 14552f, 16728f })]
         [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })]
-        public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];

-            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -100,15 +99,15 @@ public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
         [InlineData(0, 0, 0, new float[] { -416.6801f, -415.6801f, -414.6801f, -413.6801f, -412.6801f, -411.6801f, -410.6801f, -409.6801f })]
         [InlineData(1, 1, 0, new float[] { 1496f, 3673f, 5850f, 8027f, 10204f, 12381f, 14558f, 16735f })]
         [InlineData(1, 0, 1, new float[] { 204f, 493f, 782f, 1071f, 1360f, 1649f, 1938f, 2227f, 2516f, 2805f, 3094f, 3383f, 3672f, 3961f, 4250f, 4539f })]
-        public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];

-            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -116,15 +115,15 @@ public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expect
         [InlineData(0, 0, 0, new float[] { 70.56001f, -85.68f, -351.36f, 498.24f, -3829.32f, -969.48f, 1168.2f, 118.44f })]
         [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })]
         [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })]
-        public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulTranTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];

-            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -132,15 +131,15 @@ public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expec
         [InlineData(0, 0, 0, new float[] { 70.56001f, -84.68f, -349.36f, 501.24f, -3825.32f, -964.48f, 1174.2f, 125.44f })]
         [InlineData(1, 0, 1, new float[] { 2724f, 2761f, 2798f, 2835f, 2872f, 2909f, 2946f, 2983f, 3020f, 3057f, 3094f, 3131f, 3168f, 3205f, 3242f, 3279f })]
         [InlineData(1, 1, 0, new float[] { 11016f, 11153f, 11290f, 11427f, 11564f, 11701f, 11838f, 11975f })]
-        public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulTranAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];

-            CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -148,16 +147,16 @@ public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] ex
         [InlineData(0, 0, 0, new float[] { 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f, 38.25002f })]
         [InlineData(1, 1, 0, new float[] { 910f, 2190f, 3470f, 4750f, 6030f, 7310f, 8590f, 9870f })]
         [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })]
-        public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulPTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];
             int[] idx = _testIndexArray;

-            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -165,16 +164,16 @@ public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected
         [InlineData(0, 0, 0, new float[] { 38.25002f, 39.25002f, 40.25002f, 41.25002f, 42.25002f, 43.25002f, 44.25002f, 45.25002f })]
         [InlineData(1, 1, 0, new float[] { 910f, 2191f, 3472f, 4753f, 6034f, 7315f, 8596f, 9877f })]
         [InlineData(1, 0, 1, new float[] { 95f, 232f, 369f, 506f, 643f, 780f, 917f, 1054f, 1191f, 1328f, 1465f, 1602f, 1739f, 1876f, 2013f, 2150f })]
-        public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulPAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];
             int[] idx = _testIndexArray;

-            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -182,16 +181,16 @@ public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expec
         [InlineData(0, 0, 0, new float[] { 33.32f, -40.46f, -165.92f, 235.28f, -1808.29f, -457.81f, 551.65f, 55.93f })]
         [InlineData(1, 0, 1, new float[] { 1265f, 1282f, 1299f, 1316f, 1333f, 1350f, 1367f, 1384f, 1401f, 1418f, 1435f, 1452f, 1469f, 1486f, 1503f, 1520f })]
         [InlineData(1, 1, 0, new float[] { 6720f, 6800f, 6880f, 6960f, 7040f, 7120f, 7200f, 7280f })]
-        public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulTranPTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];
             int[] idx = _testIndexArray;

-            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -199,16 +198,16 @@ public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expe
         [InlineData(0, 0, 0, new float[] { 33.32f, -39.46f, -163.92f, 238.28f, -1804.29f, -452.81f, 557.65f, 62.93f })]
         [InlineData(1, 0, 1, new float[] { 1265f, 1283f, 1301f, 1319f, 1337f, 1355f, 1373f, 1391f, 1409f, 1427f, 1445f, 1463f, 1481f, 1499f, 1517f, 1535f })]
         [InlineData(1, 1, 0, new float[] { 6720f, 6801f, 6882f, 6963f, 7044f, 7125f, 7206f, 7287f })]
-        public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        public void MatMulTranPAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            AlignedArray mat = _testMatrices[matTest];
-            AlignedArray src = _testSrcVectors[srcTest];
-            AlignedArray dst = _testDstVectors[dstTest];
+            float[] mat = _testMatrices[matTest];
+            float[] src = _testSrcVectors[srcTest];
+            float[] dst = _testDstVectors[dstTest];
             int[] idx = _testIndexArray;

-            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Size);
-            float[] actual = new float[dst.Size];
-            dst.CopyTo(actual, 0, dst.Size);
+            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, src.Length);
+            float[] actual = new float[dst.Length];
+            Array.Copy(dst, 0, actual, 0, dst.Length);
             Assert.Equal(expected, actual, _matMulComparer);
         }

@@ -456,16 +455,6 @@ public void SumSqDiffUTest(int test, float expected)
             Assert.Equal(expected, actual, 2);
         }

-        [Theory]
-        [InlineData(0, 393.96f)]
-        [InlineData(1, 390.67f)]
-        public void SumAbsUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.SumAbs(src, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
         [Theory]
         [InlineData(0, 393.96f)]
         [InlineData(1, 392.37f)]
@@ -476,16 +465,6 @@ public void SumAbsDiffUTest(int test, float expected)
             Assert.Equal(expected, actual, 2);
         }

-        [Theory]
-        [InlineData(0, 106.37f)]
-        [InlineData(1, 106.37f)]
-        public void MaxAbsUTest(int test, float expected)
-        {
-            float[] src = (float[])_testArrays[test].Clone();
-            var actual = CpuMathUtils.MaxAbs(src, src.Length);
-            Assert.Equal(expected, actual, 2);
-        }
-
         [Theory]
         [InlineData(0, 108.07f)]
         [InlineData(1, 108.07f)]
@@ -550,34 +529,6 @@ public void Dist2Test(int test, float expected)
             Assert.Equal(expected, actual, 0);
         }

-        [Theory]
-        [InlineData(0, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })]
-        [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })]
-        public void ZeroItemsUTest(int test, int[] idx, float[] expected)
-        {
-            AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment);
-            src.CopyFrom(_testSrcVectors[test]);
-
-            CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx);
-            float[] actual = new float[src.Size];
-            src.CopyTo(actual, 0, src.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
-        [Theory]
-        [InlineData(0, new int[] { 0, 2, 5 }, new float[] { 0f, 2f, 0f, 4f, 5f, 6f, 0f, 8f })]
-        [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 9f, 0f, 11f, 12f, 0f, 0f, 0f, 16f })]
-        public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
-        {
-            AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment);
-            src.CopyFrom(_testSrcVectors[test]);
-
-            CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx);
-            float[] actual = new float[src.Size];
-            src.CopyTo(actual, 0, src.Size);
-            Assert.Equal(expected, actual, _comparer);
-        }
-
         [Theory]
         [InlineData(0)]
         [InlineData(1)]