diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/AlignedArray.cs b/src/Microsoft.ML.CpuMath/AlignedArray.cs similarity index 53% rename from src/Microsoft.ML.StandardLearners/FactorizationMachine/AlignedArray.cs rename to src/Microsoft.ML.CpuMath/AlignedArray.cs index f01da9fe28..0a631be0e9 100644 --- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/AlignedArray.cs +++ b/src/Microsoft.ML.CpuMath/AlignedArray.cs @@ -5,7 +5,7 @@ using Microsoft.ML.Runtime.Internal.CpuMath.Core; using System; -namespace Microsoft.ML.Runtime.FactorizationMachine +namespace Microsoft.ML.Runtime.Internal.CpuMath { using Float = System.Single; @@ -17,6 +17,7 @@ namespace Microsoft.ML.Runtime.FactorizationMachine /// /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). /// + [BestFriend] internal sealed class AlignedArray { // Items includes "head" items filled with NaN, followed by _size entries, followed by "tail" @@ -109,6 +110,60 @@ public Float this[int index] } } + public void CopyTo(Span dst, int index, int count) + { + Contracts.Assert(0 <= count && count <= _size); + Contracts.Assert(dst != null); + Contracts.Assert(0 <= index && index <= dst.Length - count); + Items.AsSpan(_base, count).CopyTo(dst.Slice(index)); + } + + public void CopyTo(int start, Span dst, int index, int count) + { + Contracts.Assert(0 <= count); + Contracts.Assert(0 <= start && start <= _size - count); + Contracts.Assert(dst != null); + Contracts.Assert(0 <= index && index <= dst.Length - count); + Items.AsSpan(start + _base, count).CopyTo(dst.Slice(index)); + } + + public void CopyFrom(ReadOnlySpan src) + { + Contracts.Assert(src.Length <= _size); + src.CopyTo(Items.AsSpan(_base)); + } + + public void CopyFrom(int start, ReadOnlySpan src) + { + Contracts.Assert(0 <= start && start <= _size - src.Length); + src.CopyTo(Items.AsSpan(start + _base)); + } + + // Copies values from a sparse vector. + // valuesSrc contains only the non-zero entries. Those are copied into their logical positions in the dense array. + // rgposSrc contains the logical positions + offset of the non-zero entries in the dense array. + // rgposSrc runs parallel to the valuesSrc array. + public void CopyFrom(ReadOnlySpan rgposSrc, ReadOnlySpan valuesSrc, int posMin, int iposMin, int iposLim, bool zeroItems) + { + Contracts.Assert(rgposSrc != null); + Contracts.Assert(valuesSrc != null); + Contracts.Assert(rgposSrc.Length <= valuesSrc.Length); + Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); + + // Zeroing-out and setting the values in one-pass does not seem to give any perf benefit. + // So explicitly zeroing and then setting the values. 
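> Reviewer note, as a reading aid for the sparse `CopyFrom` overload above: `valuesSrc` holds only the non-zero entries, and `rgposSrc` runs parallel to it, holding each entry's logical position offset by `posMin`. A minimal scalar sketch of that mapping, using a plain `float[]` in place of `AlignedArray` (all names here are illustrative only):

```csharp
using System;

static class SparseCopyDemo
{
    static void Main()
    {
        float[] dense = new float[8];          // dense destination, logical size 8
        int[] rgposSrc = { 3, 5, 9 };          // logical positions + posMin offset
        float[] valuesSrc = { 1.5f, 2.5f, 3.5f };
        int posMin = 2;                        // rgposSrc[ipos] - posMin is the dense slot

        // Scatter the non-zero entries into their logical positions.
        for (int ipos = 0; ipos < rgposSrc.Length; ipos++)
            dense[rgposSrc[ipos] - posMin] = valuesSrc[ipos];

        Console.WriteLine(string.Join(", ", dense)); // 0, 1.5, 0, 2.5, 0, 0, 0, 3.5
    }
}
```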
+ if (zeroItems) + ZeroItems(); + + for (int ipos = iposMin; ipos < iposLim; ++ipos) + { + Contracts.Assert(posMin <= rgposSrc[ipos]); + int iv = _base + rgposSrc[ipos] - posMin; + Contracts.Assert(iv < _size + _base); + Items[iv] = valuesSrc[ipos]; + } + } + public void CopyFrom(AlignedArray src) { Contracts.Assert(src != null); @@ -116,5 +171,41 @@ public void CopyFrom(AlignedArray src) Contracts.Assert(src._cbAlign == _cbAlign); Array.Copy(src.Items, src._base, Items, _base, _size); } + + public void ZeroItems() + { + Array.Clear(Items, _base, _size); + } + + public void ZeroItems(int[] rgposSrc, int posMin, int iposMin, int iposLim) + { + Contracts.Assert(rgposSrc != null); + Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); + Contracts.Assert(iposLim - iposMin <= _size); + + int ivCur = 0; + for (int ipos = iposMin; ipos < iposLim; ++ipos) + { + int ivNextNonZero = rgposSrc[ipos] - posMin; + Contracts.Assert(ivCur <= ivNextNonZero && ivNextNonZero < _size); + while (ivCur < ivNextNonZero) + Items[_base + ivCur++] = 0; + Contracts.Assert(ivCur == ivNextNonZero); + // Skip the non-zero element at ivNextNonZero. + ivCur++; + } + + while (ivCur < _size) + Items[_base + ivCur++] = 0; + } + + // REVIEW: This is hackish and slightly dangerous. Perhaps we should wrap this in an + // IDisposable that "locks" this, prohibiting GetBase from being called, while the buffer + // is "checked out". + public void GetRawBuffer(out Float[] items, out int offset) + { + items = Items; + offset = _base; + } } } \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/AlignedMatrix.cs b/src/Microsoft.ML.CpuMath/AlignedMatrix.cs new file mode 100644 index 0000000000..6d550fc3fc --- /dev/null +++ b/src/Microsoft.ML.CpuMath/AlignedMatrix.cs @@ -0,0 +1,681 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Float = System.Single; + +using Microsoft.ML.Runtime.Internal.CpuMath.Core; +using System; +using System.Collections; +using System.Collections.Generic; + +namespace Microsoft.ML.Runtime.Internal.CpuMath +{ + using Conditional = System.Diagnostics.ConditionalAttribute; + + /// + /// This implements a logical array of Floats that is automatically aligned for SSE/AVX operations. + /// This is a thin wrapper around the AlignedArray type implemented in C++. This simply couples + /// the AlignedArray with a logical size, which does not include padding, while the AlignedArray + /// size does include padding. + /// + [BestFriend] + internal sealed class CpuAlignedVector : ICpuVector + { + private readonly AlignedArray _items; + private readonly int _size; // The logical size. + + /// + /// The value count. + /// + public int ValueCount { get { return _size; } } + + /// + /// The logical size of the vector. + /// + public int VectorSize { get { return _size; } } + + // Round cflt up to a multiple of cfltAlign. + private static int RoundUp(int cflt, int cfltAlign) + { + Contracts.Assert(0 < cflt); + // cfltAlign should be a power of two. + Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); + + // Determine the number of "blobs" of size cfltAlign. + int cblob = (cflt + cfltAlign - 1) / cfltAlign; + return cblob * cfltAlign; + } + + /// + /// Allocate an aligned vector with the given alignment (in bytes). + /// The alignment must be a power of two and at least sizeof(Float). 
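> Reviewer note: the `RoundUp` helper above (duplicated in `CpuAlignedMatrixBase`) returns the smallest multiple of `cfltAlign` that covers `cflt`, by counting whole alignment-sized "blobs". A standalone sketch with a worked AVX-sized example:

```csharp
using System;

static class RoundUpDemo
{
    // Round cflt up to a multiple of cfltAlign (a power of two), as RoundUp above does.
    static int RoundUp(int cflt, int cfltAlign)
    {
        int cblob = (cflt + cfltAlign - 1) / cfltAlign; // number of full "blobs"
        return cblob * cfltAlign;
    }

    static void Main()
    {
        // A 32-byte (AVX) alignment holds 8 floats, so a logical size of 13
        // floats is padded out to 16 physical slots.
        Console.WriteLine(RoundUp(13, 8)); // 16
        Console.WriteLine(RoundUp(16, 8)); // 16 (already a multiple)
    }
}
```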
+ /// + public CpuAlignedVector(int size, int cbAlign) + { + Contracts.Assert(0 < size); + // cbAlign should be a power of two. + Contracts.Assert(sizeof(Float) <= cbAlign); + Contracts.Assert((cbAlign & (cbAlign - 1)) == 0); + + int cfltAlign = cbAlign / sizeof(Float); + int cflt = RoundUp(size, cfltAlign); + _items = new AlignedArray(cflt, cbAlign); + _size = size; + AssertValid(); + } + + public void Dispose() + { + } + + [Conditional("DEBUG")] + private void AssertValid() + { +#if DEBUG + Contracts.Assert(0 < _size && _size <= _items.Size); + + // The padding, [_size, _items.Size), should contain zeros. + for (int i = _size; i < _items.Size; i++) + Contracts.Assert(_items[i] == 0); +#endif + } + + /// + /// The physical AligenedArray items. + /// + public AlignedArray Items { get { return _items; } } + + /// + /// The alignment. + /// + public int CbAlign + { + get { return _items.CbAlign; } + } + + /// + /// Set and get the value of the vector at the given index. + /// + /// The index + /// The value at the given index + public Float this[int index] + { + get + { + Contracts.Assert(0 <= index && index < _size); + return _items[index]; + } + set + { + Contracts.Assert(0 <= index && index < _size); + _items[index] = value; + } + } + + /// + /// Get the value of the vector at the given index. + /// + /// The index + /// The value at the given index + public Float GetValue(int i) + { + Contracts.Assert(0 <= i && i < _size); + return _items[i]; + } + + /// + /// Assign randomized values to the vector elements via the input function. + /// + /// The input rand om function that takes no arguments and returns a float value + public void Randomize(Func rand) + { + Contracts.AssertValue(rand); + for (int i = 0; i < _size; i++) + _items[i] = rand(); + } + + /// + /// Assign zeros to the vector elements. + /// + public void Zero() + { + _items.ZeroItems(); + } + + /// + /// Copy the values into dst, starting at slot ivDst and advancing ivDst. + /// + /// The destination array + /// The starting index in the destination array + public void CopyTo(Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - _size); + _items.CopyTo(dst, ivDst, _size); + ivDst += _size; + } + + /// + /// Copy the values from this vector starting at slot ivSrc into dst, starting at slot ivDst. + /// The number of values that are copied is determined by count. + /// + /// The staring index in this vector + /// The destination array + /// The starting index in the destination array + /// The number of elements to be copied + public void CopyTo(int ivSrc, Float[] dst, int ivDst, int count) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= count && count <= dst.Length); + Contracts.Assert(0 <= ivSrc && ivSrc <= _size - count); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - count); + _items.CopyTo(ivSrc, dst, ivDst, count); + } + + /// + /// Copy the values from src, starting at slot index and advancing index, into this vector. + /// + /// The source array + /// The starting index in the source array + public void CopyFrom(Float[] src, ref int index) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= index && index <= src.Length - _size); + _items.CopyFrom(src.AsSpan(index, _size)); + index += _size; + } + + /// + /// Copy the values from src, starting at slot index and advancing index, into this vector, starting at slot ivDst. + /// The number of values that are copied is determined by count. 
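> Reviewer note: the `CopyTo`/`CopyFrom` overloads above that take a `ref int` index implement a cursor pattern: each call serializes its payload and advances the caller's position, so several vectors can be packed back-to-back into one buffer. A minimal sketch of the pattern (helper names are illustrative):

```csharp
using System;

static class CursorCopyDemo
{
    // Mimics CopyTo(dst, ref ivDst): append the payload, advance the shared cursor.
    static void CopyTo(float[] src, float[] dst, ref int ivDst)
    {
        Array.Copy(src, 0, dst, ivDst, src.Length);
        ivDst += src.Length;
    }

    static void Main()
    {
        float[] a = { 1, 2 }, b = { 3, 4, 5 };
        float[] dst = new float[5];
        int ivDst = 0;
        CopyTo(a, dst, ref ivDst); // cursor advances to 2
        CopyTo(b, dst, ref ivDst); // cursor advances to 5
        Console.WriteLine(string.Join(", ", dst)); // 1, 2, 3, 4, 5
    }
}
```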
+ /// + /// The staring index in this vector + /// The source array + /// The starting index in the source array + /// The number of elements to be copied + public void CopyFrom(int ivDst, Float[] src, int ivSrc, int count) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= count && count <= src.Length); + Contracts.Assert(0 <= ivDst && ivDst <= _size - count); + Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - count); + _items.CopyFrom(ivDst, src.AsSpan(ivSrc, _size)); + } + + /// + /// Copy the values of src vector into this vector. The src vector must have the same size as this vector. + /// + /// The source vector + public void CopyFrom(CpuAlignedVector src) + { + Contracts.AssertValue(src); + Contracts.Assert(src._size == _size); + _items.CopyFrom(src._items); + } + + /// + /// Get the underlying AlignedArray as IEnumerator<Float>. + /// + public IEnumerator GetEnumerator() + { + for (int i = 0; i < _size; i++) + yield return _items[i]; + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } + + /// + /// This implements a logical matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). + /// + [BestFriend] + internal abstract class CpuAlignedMatrixBase + { + // _items includes "head" items filled with NaN, followed by RunLenPhy * RunCntPhy entries, followed by + // "tail" items, also filled with NaN. Note that RunLenPhy and RunCntPhy are divisible by the alignment + // specified in the ctor and are >= RunLen and RunCnt, respectively. It is illegal to access any slot + // outsize [_base, _base + RunLenPhy * RunCntPhy). The padding should all be zero (and maintained as such). + // The items are arranged in "runs" of length RunLen. There are RunCnt such runs. Each run ends with + // (RunLenPhy - RunLen) padding slots. There are an addition (RunCntPhy - RunCnt) padding runs of length + // RunLenPhy, which are entirely zero. Any native code should be able to assume and should maintain + // these invariants. + public AlignedArray Items { get; } + + protected readonly int FloatAlign; // The alignment. + + // Since FloatAlign is a power of two, shifting by Shift = log_2(FloatAlign) is the same as multiplying/dividing by FloatAlign. + protected readonly int Shift; + // Since FloatAlign is a power of two, bitwise and with Mask = FloatAlign - 1 will be the same as moding by FloatAlign. + protected readonly int Mask; + + // Logical length of runs (RunLen) and number of runs (RunCnt). + public readonly int RunLen; + public readonly int RunCnt; + + // Physical (padded) length and number of runs. + public readonly int RunLenPhy; + public readonly int RunCntPhy; + + /// + /// The logical number values in the matrix + /// + public int ValueCount => RunLen * RunCnt; + + /// + /// The logical number of rows + /// + public abstract int RowCount { get; } + + /// + /// The logical number of columns + /// + public abstract int ColCount { get; } + + /// + /// The physical number of rows + /// + public abstract int RowCountPhy { get; } + + /// + /// The pysical number of columns + /// + public abstract int ColCountPhy { get; } + + // Round cflt up to a multiple of cfltAlign. + protected static int RoundUp(int cflt, int cfltAlign) + { + Contracts.Assert(0 < cflt); + // cfltAlign should be a power of two. + Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); + + // Determine the number of "blobs" of size cfltAlign. 
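> Reviewer note: the run layout documented in `CpuAlignedMatrixBase` above keeps each logical run of `RunLen` values padded out to `RunLenPhy` physical slots, with the padding held at zero. A small sketch of the logical-to-physical index mapping, under assumed AVX-sized padding:

```csharp
using System;

static class RunLayoutDemo
{
    static void Main()
    {
        // A row-major 3x5 matrix padded for 8-float (AVX) runs:
        // RunLen = 5 logical columns, RunLenPhy = 8 physical slots per run.
        int runLenPhy = 8;

        // Logical (row, col) lives at physical slot row * RunLenPhy + col;
        // slots [RunLen, RunLenPhy) of each run are zero padding.
        int row = 2, col = 3;
        int physical = row * runLenPhy + col;
        Console.WriteLine(physical); // 19
    }
}
```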
+ int cblob = (cflt + cfltAlign - 1) / cfltAlign; + return cblob * cfltAlign; + } + + /// + /// Allocate an aligned matrix with the given alignment (in bytes). + /// + protected CpuAlignedMatrixBase(int runLen, int runCnt, int cbAlign) + { + Contracts.Assert(0 < runLen); + Contracts.Assert(0 < runCnt); + // cbAlign should be a power of two. + Contracts.Assert(sizeof(Float) <= cbAlign); + Contracts.Assert((cbAlign & (cbAlign - 1)) == 0); + + RunLen = runLen; + RunCnt = runCnt; + + FloatAlign = cbAlign / sizeof(Float); + Shift = GeneralUtils.CbitLowZero((uint)FloatAlign); + Mask = FloatAlign - 1; + + RunLenPhy = RoundUp(runLen, FloatAlign); + RunCntPhy = RoundUp(runCnt, FloatAlign); + Items = new AlignedArray(RunLenPhy * RunCntPhy, cbAlign); + + AssertValid(); + } + + [Conditional("DEBUG")] + protected void AssertValid() + { +#if DEBUG + Contracts.Assert(0 < RunLen && RunLen <= RunLenPhy); + Contracts.Assert(0 < RunCnt && RunCnt <= RunCntPhy); + Contracts.Assert(RunLenPhy * RunCntPhy == Items.Size); + + // Assert that the padding at the end of each run contains zeros. + for (int i = 0; i < RunCnt; i++) + { + for (int j = RunLen; j < RunLenPhy; j++) + Contracts.Assert(Items[i * RunLenPhy + j] == 0); + } + + // Assert that the padding runs contain zeros. + for (int i = RunCnt; i < RunCntPhy; i++) + { + for (int j = 0; j < RunLenPhy; j++) + Contracts.Assert(Items[i * RunLenPhy + j] == 0); + } +#endif + } + + public void Dispose() + { + } + + /// + /// Assign randomized values to the matrix elements via the input function. + /// + /// The input rand om function that takes no arguments and returns a float value + public void Randomize(Func rand) + { + Contracts.AssertValue(rand); + for (int i = 0, k = 0; i < RunCnt; i++) + { + Contracts.Assert(k == i * RunLenPhy); + for (int j = 0; j < RunLen; j++) + Items[k + j] = rand(); + k += RunLenPhy; + } + } + + /// + /// Assign zeros to the matrix elements. + /// + public void Zero() + { + Items.ZeroItems(); + } + + /// + /// Copy the values of src matrix into this matrix. The src matrix must have the same physical and logical size as this matrix. + /// + /// The source matrix + public void CopyFrom(CpuAlignedMatrixBase src) + { + AssertValid(); + Contracts.AssertValue(src); + src.AssertValid(); + Contracts.Assert(src.RunLen == RunLen); + Contracts.Assert(src.RunCnt == RunCnt); + Contracts.Assert(src.RunLenPhy == RunLenPhy); + Contracts.Assert(src.RunCntPhy == RunCntPhy); + Items.CopyFrom(src.Items); + } + } + + /// + /// This implements a logical row-major matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). + /// + [BestFriend] + internal abstract class CpuAlignedMatrixRowBase : CpuAlignedMatrixBase, ICpuBuffer + { + protected CpuAlignedMatrixRowBase(int crow, int ccol, int cbAlign) + : base(ccol, crow, cbAlign) + { + } + + /// + /// The logical number of rows + /// + public override int RowCount => RunCnt; + + /// + /// The logical number of columns + /// + public override int ColCount { get { return RunLen; } } + + /// + /// The physical number of rows + /// + public override int RowCountPhy { get { return RunCntPhy; } } + + /// + /// The physical number of columns + /// + public override int ColCountPhy { get { return RunLenPhy; } } + + /// + /// Copy the values into dst, starting at slot ivDst and advancing ivDst. 
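> Reviewer note: the `Shift`/`Mask` fields above exploit `FloatAlign` being a power of two, so division and modulo collapse to a right shift and a bitwise and. A quick demonstration:

```csharp
using System;

static class ShiftMaskDemo
{
    static void Main()
    {
        int floatAlign = 8;        // floats per 32-byte AVX block
        int shift = 3;             // log2(8), i.e. CbitLowZero(8)
        int mask = floatAlign - 1; // 0b111

        int i = 29;
        Console.WriteLine(i >> shift == i / floatAlign); // True (both are 3)
        Console.WriteLine((i & mask) == i % floatAlign); // True (both are 5)
    }
}
```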
+ /// + /// The destination array + /// The starting index in the destination array + public void CopyTo(Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ValueCount); + + if (ColCount == ColCountPhy) + { + // Can copy all at once. + Items.CopyTo(0, dst, ivDst, ValueCount); + ivDst += ValueCount; + } + else + { + // Copy each row. + int ivSrc = 0; + for (int row = 0; row < RowCount; row++) + { + Items.CopyTo(ivSrc, dst, ivDst, ColCount); + ivSrc += ColCountPhy; + ivDst += ColCount; + } + } + } + + /// + /// Copy the values from src, starting at slot ivSrc and advancing ivSrc. + /// + /// The source array + /// The starting index in the source array + public void CopyFrom(Float[] src, ref int ivSrc) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - ValueCount); + + if (ColCount == ColCountPhy) + { + Items.CopyFrom(src.AsSpan(ivSrc, ValueCount)); + ivSrc += ValueCount; + } + else + { + for (int row = 0; row < RowCount; row++) + { + Items.CopyFrom(row * ColCountPhy, src.AsSpan(ivSrc, ColCount)); + ivSrc += ColCount; + } + } + } + + /// + /// Get the underlying AlignedArray as IEnumerator<Float>. + /// + public IEnumerator GetEnumerator() + { + for (int row = 0; row < RowCount; row++) + { + int ivBase = row * ColCountPhy; + for (int col = 0; col < ColCount; col++) + yield return Items[ivBase + col]; + } + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } + + /// + /// This implements a row-major matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). + /// + [BestFriend] + internal sealed class CpuAlignedMatrixRow : CpuAlignedMatrixRowBase, ICpuFullMatrix + { + public CpuAlignedMatrixRow(int crow, int ccol, int cbAlign) + : base(crow, ccol, cbAlign) + { + } + + /// + /// The logical number of rows + /// + public override int RowCount { get { return RunCnt; } } + + /// + /// The logical number of columns + /// + public override int ColCount { get { return RunLen; } } + + /// + /// The physical number of rows + /// + public override int RowCountPhy { get { return RunCntPhy; } } + + /// + /// The physical number of columns + /// + public override int ColCountPhy { get { return RunLenPhy; } } + + /// + /// Copy the values from this matrix, starting from the row into dst, starting at slot ivDst and advancing ivDst. + /// + /// The starting row in this matrix + /// The destination array + /// The starting index in the destination array + public void CopyTo(int row, Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= row && row < RowCount); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ColCount); + + Items.CopyTo(row * ColCountPhy, dst, ivDst, ColCount); + ivDst += ColCount; + } + + /// + /// Assign zeros to the values at the indices + /// + /// The indices + public void ZeroItems(int[] indices) + { + Contracts.AssertValue(indices); + + // REVIEW: Ideally, we'd adjust the indices once so we wouldn't need to + // repeatedly deal with padding adjustments. + CpuMathUtils.ZeroMatrixItems(Items, ColCount, ColCountPhy, indices); + } + } + + /// + /// This implements a logical matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). 
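> Reviewer note: in the row-major `CopyTo`/`CopyFrom` above, when `ColCount != ColCountPhy` each row's logical payload must be moved separately, stepping the physical cursor by `ColCountPhy` and the logical cursor by `ColCount`. A scalar sketch of that loop:

```csharp
using System;

static class PaddedRowCopyDemo
{
    static void Main()
    {
        // Two rows of 3 logical columns, padded to 4 physical columns (0 = padding).
        float[] items = { 1, 2, 3, 0, 4, 5, 6, 0 };
        int colCount = 3, colCountPhy = 4, rowCount = 2;

        // Copy each row's payload, skipping the per-row padding, as the
        // ColCount != ColCountPhy branch does.
        float[] dst = new float[rowCount * colCount];
        int ivSrc = 0, ivDst = 0;
        for (int row = 0; row < rowCount; row++)
        {
            Array.Copy(items, ivSrc, dst, ivDst, colCount);
            ivSrc += colCountPhy;
            ivDst += colCount;
        }
        Console.WriteLine(string.Join(", ", dst)); // 1, 2, 3, 4, 5, 6
    }
}
```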
+ /// + [BestFriend] + internal sealed class CpuAlignedMatrixCol : CpuAlignedMatrixBase, ICpuFullMatrix + { + /// + /// Allocate an aligned matrix with the given alignment (in bytes). + /// + public CpuAlignedMatrixCol(int crow, int ccol, int cbAlign) + : base(crow, ccol, cbAlign) + { + } + + /// + /// The logical number of rows + /// + public override int RowCount { get { return RunCnt; } } + + /// + /// The logical number of columns + /// + public override int ColCount { get { return RunLen; } } + + /// + /// The physical number of rows + /// + public override int RowCountPhy { get { return RunCntPhy; } } + + /// + /// The physical number of columns + /// + public override int ColCountPhy { get { return RunLenPhy; } } + + /// + /// Copy the values into dst, starting at slot ivDst and advancing ivDst. + /// + /// The destination array + /// The starting index in the destination array + public void CopyTo(Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ValueCount); + + for (int row = 0; row < RowCount; row++) + { + for (int col = 0; col < ColCount; col++) + dst[ivDst++] = Items[row + col * RowCountPhy]; + } + } + + /// + /// Copy the values from this matrix, starting from the row into dst, starting at slot ivDst and advancing ivDst. + /// + /// The starting row in this matrix + /// The destination array + /// The starting index in the destination array + public void CopyTo(int row, Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= row && row < RowCount); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ColCount); + + for (int col = 0; col < ColCount; col++) + dst[ivDst++] = Items[row + col * RowCountPhy]; + } + + /// + /// Copy the values from src, starting at slot ivSrc and advancing ivSrc. + /// + /// The source array + /// The starting index in the source array + public void CopyFrom(Float[] src, ref int ivSrc) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - ValueCount); + for (int row = 0; row < RowCount; row++) + { + for (int col = 0; col < ColCount; col++) + Items[row + col * RowCountPhy] = src[ivSrc++]; + } + } + + /// + /// Assign zeros to the values at the indices + /// + /// The indices + public void ZeroItems(int[] indices) + { + Contracts.AssertValue(indices); + + // REVIEW: Ideally, we'd adjust the indices once so we wouldn't need to + // repeatedly deal with padding adjustments. + foreach (int iv in indices) + { + int row = iv / ColCount; + int col = iv % ColCount; + Items[row + col * ColCountPhy] = 0; + } + } + + /// + /// Get the underlying AlignedArray as IEnumerator<Float>. 
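> Reviewer note: in the column-major variant, `CopyTo` and the enumerator read element `(row, col)` from `Items[row + col * RowCountPhy]`, so a logical row-major index `iv` must first be split into its row and column. A small sketch of that mapping:

```csharp
using System;

static class ColMajorIndexDemo
{
    static void Main()
    {
        // Column-major storage: element (row, col) lives at row + col * RowCountPhy,
        // where RowCountPhy is the padded physical column height.
        int colCount = 5, rowCountPhy = 8;

        int iv = 13;             // logical row-major index
        int row = iv / colCount; // 2
        int col = iv % colCount; // 3
        Console.WriteLine(row + col * rowCountPhy); // 26
    }
}
```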
+ /// + public IEnumerator GetEnumerator() + { + for (int row = 0; row < RowCount; row++) + { + for (int col = 0; col < ColCount; col++) + yield return Items[row + col * RowCountPhy]; + } + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index d6fb169094..2156ddf5fa 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -152,6 +152,11 @@ private static Vector256 MultiplyAdd(Vector256 src1, Vector256 mat, ReadOnlySpan src, Span dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -164,9 +169,8 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr float* pDstEnd = pdst + crow; float* pDstCurrent = pdst; float* pMatCurrent = pmat; - int numRows = crow; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { Vector256 res0 = Avx.SetZeroVector256(); Vector256 res1 = Avx.SetZeroVector256(); @@ -180,11 +184,10 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr int misalignment = (int)(address % 32); int remainder = 0; - if ((misalignment & 3) != 0 || (ccol % 8 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = length % 8; - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector256 vector = Avx.LoadVector256(pSrcCurrent); @@ -256,32 +259,32 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr // unaligned loads where we mask the input each time. remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 256-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 256-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (8 - remainder); - pSrcCurrent -= (8 - remainder); + pMatCurrent -= (8 - remainder); + pSrcCurrent -= (8 - remainder); - Vector256 mask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 mask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - float* pMatTemp = pMatCurrent; - Vector256 x01 = Avx.And(mask, Avx.LoadVector256(pMatTemp)); - Vector256 x11 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); - Vector256 x21 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); - Vector256 x31 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); - Vector256 vector = Avx.And(mask, Avx.LoadVector256(pSrcCurrent)); + float* pMatTemp = pMatCurrent; + Vector256 x01 = Avx.And(mask, Avx.LoadVector256(pMatTemp)); + Vector256 x11 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); + Vector256 x21 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); + Vector256 x31 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); + Vector256 vector = Avx.And(mask, Avx.LoadVector256(pSrcCurrent)); - res0 = MultiplyAdd(x01, vector, res0); - res1 = MultiplyAdd(x11, vector, res1); - res2 = MultiplyAdd(x21, vector, res2); - res3 = MultiplyAdd(x31, vector, res3); + res0 = MultiplyAdd(x01, vector, res0); + res1 = MultiplyAdd(x11, vector, res1); + res2 = MultiplyAdd(x21, vector, res2); + res3 = 
MultiplyAdd(x31, vector, res3); - pMatCurrent += 8; - pSrcCurrent += 8; + pMatCurrent += 8; + pSrcCurrent += 8; + } } // Add up the entries of each, with the 4 results in res0 @@ -294,58 +297,17 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr pDstCurrent += 4; pMatCurrent += 3 * ccol; - numRows -= 4; - } - - // falling through the case statements - switch (numRows) - { - case 3: - *(pDstCurrent + 2) = RowMultiply(pMatCurrent + 2 * ccol, psrc, pSrcEnd, ccol); - goto case 2; - case 2: - *(pDstCurrent + 1) = RowMultiply(pMatCurrent + ccol, psrc, pSrcEnd, ccol); - goto case 1; - case 1: - *pDstCurrent = RowMultiply(pMatCurrent, psrc, pSrcEnd, ccol); - break; } } } - private static unsafe float RowMultiply(float* pMatCurrent, float* pSrcCurrent, float* pSrcEnd, int ccol) + // Partial sparse source vector. + public static unsafe void MatMulP(AlignedArray mat, ReadOnlySpan rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { - Vector256 res0 = Avx.SetZeroVector256(); - int remainder = ccol % 8; - while (pSrcCurrent + 8 <= pSrcEnd) - { - Vector256 vector = Avx.LoadVector256(pSrcCurrent); - - float* pMatTemp = pMatCurrent; - res0 = MultiplyAdd(pMatTemp, vector, res0); - - pSrcCurrent += 8; - pMatCurrent += 8; - } - - res0 = VectorSum256(in res0); - float sum = Sse.ConvertToSingle(Sse.AddScalar(Avx.GetLowerHalf(res0), GetHigh(res0))); - - // falling through the case statements - switch (remainder) - { - case 7: sum += *(pSrcCurrent + 6) * *(pMatCurrent + 6); goto case 6; - case 6: sum += *(pSrcCurrent + 5) * *(pMatCurrent + 5); goto case 5; - case 5: sum += *(pSrcCurrent + 4) * *(pMatCurrent + 4); goto case 4; - case 4: sum += *(pSrcCurrent + 3) * *(pMatCurrent + 3); goto case 3; - case 3: sum += *(pSrcCurrent + 2) * *(pMatCurrent + 2); goto case 2; - case 2: sum += *(pSrcCurrent + 1) * *(pMatCurrent + 1); goto case 1; - case 1: sum += *(pSrcCurrent) * *(pMatCurrent); break; - } - return sum; + MatMulP(mat.Items, rgposSrc, src.Items, posMin, iposMin, iposEnd, dst.Items, crow, ccol); } - // Partial sparse source vector. public static unsafe void MatMulP(ReadOnlySpan mat, ReadOnlySpan rgposSrc, ReadOnlySpan src, int posMin, int iposMin, int iposEnd, Span dst, int crow, int ccol) { @@ -499,6 +461,11 @@ Vector256 SparseMultiplicationAcrossRow() } } + public static unsafe void MatMulTran(AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + MatMulTran(mat.Items, src.Items, dst.Items, crow, ccol); + } + public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan src, Span dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -511,31 +478,43 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan h01 = Sse.LoadVector128(pSrcCurrent); + // Replicate each slot of h01 (ABCD) into its own register. 
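> Reviewer note: the new `MatMulP` overload forwards the `AlignedArray` arguments to the span-based kernel. As a reading aid, a scalar sketch of the partial-sparse product's semantics (parameter names mirror the kernel's; this is not the vectorized implementation):

```csharp
using System;

static class MatMulPDemo
{
    // Only the source slots named in rgposSrc contribute to each output row.
    static void MatMulP(float[] mat, int[] rgposSrc, float[] src,
        int posMin, int iposMin, int iposEnd, float[] dst, int crow, int ccol)
    {
        for (int i = 0; i < crow; i++)
        {
            float sum = 0;
            for (int ipos = iposMin; ipos < iposEnd; ipos++)
            {
                int col = rgposSrc[ipos] - posMin;
                sum += mat[i * ccol + col] * src[col];
            }
            dst[i] = sum;
        }
    }

    static void Main()
    {
        float[] mat = { 1, 2, 3, 4, 5, 6 }; // 2x3, row-major
        float[] src = { 10, 0, 20 };        // slots 0 and 2 are active
        int[] rgpos = { 0, 2 };
        float[] dst = new float[2];
        MatMulP(mat, rgpos, src, 0, 0, rgpos.Length, dst, 2, 3);
        Console.WriteLine(string.Join(", ", dst)); // 70, 160
    }
}
```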
+ Vector128 h11 = Avx.Permute(h01, 0x55); // B + Vector128 h21 = Avx.Permute(h01, 0xAA); // C + Vector128 h31 = Avx.Permute(h01, 0xFF); // D h01 = Avx.Permute(h01, 0x00); // A Vector256 x01 = Avx.SetHighLow(h01, h01); + Vector256 x11 = Avx.SetHighLow(h11, h11); + Vector256 x21 = Avx.SetHighLow(h21, h21); + Vector256 x31 = Avx.SetHighLow(h31, h31); + int length = crow; float* pDstCurrent = pdst; nuint address = (nuint)(pMatCurrent); int misalignment = (int)(address % 32); - if ((misalignment & 3) != 0 || (crow % 8 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = crow % 8; - while (pDstCurrent + 8 <= pDstEnd) + while (pDstCurrent < pDstEnd) { float* pMatTemp = pMatCurrent; Vector256 x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.Multiply(x11, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.Multiply(x21, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.Multiply(x31, Avx.LoadVector256(pMatTemp += crow)); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); Avx.Store(pDstCurrent, x02); pDstCurrent += 8; @@ -544,6 +523,7 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan mat, ReadOnlySpan x02 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp += crow)); + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + ((8 - misalignment) * 8)); Vector256 x3 = Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); + x02 = Avx.Or(x02, Avx.And(x3, trailingMask)); Avx.Store(pDstCurrent, x02); pMatCurrent += misalignment; @@ -578,7 +569,15 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.Multiply(x11, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.Multiply(x21, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.Multiply(x31, Avx.LoadVector256(pMatTemp += crow)); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); Avx.Store(pDstCurrent, x02); pDstCurrent += 8; @@ -592,36 +591,47 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + pMatCurrent -= (8 - remainder); + pDstCurrent -= (8 - remainder); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - float* pMatTemp = pMatCurrent; - Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); - x02 = Avx.Multiply(x01, x02); + float* pMatTemp = pMatCurrent; + Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); - Vector256 x3 = 
Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, leadingMask)); + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); - Avx.Store(pDstCurrent, x02); - pDstCurrent += 8; - pMatCurrent += 8; + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); + + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); + Vector256 x3 = Avx.LoadVector256(pDstCurrent); + x02 = Avx.Or(x02, Avx.And(x3, leadingMask)); + + Avx.Store(pDstCurrent, x02); + pDstCurrent += 8; + pMatCurrent += 8; + } } - pSrcCurrent += 1; - numCol -= 1; + pMatCurrent += 3 * crow; + pSrcCurrent += 4; } // We do 4-way unrolling - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 h01 = Sse.LoadVector128(pSrcCurrent); // Replicate each slot of h01 (ABCD) into its own register. @@ -641,10 +651,9 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); @@ -665,6 +674,7 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan mat, ReadOnlySpan trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + ((8 - misalignment) * 8)); Vector256 x3 = Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); + x02 = Avx.Or(x02, Avx.And(x3, trailingMask)); x02 = Avx.Add(x02, Avx.And(x3, leadingMask)); @@ -728,92 +738,46 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + if (remainder != 0) + { + pMatCurrent -= (8 - remainder); + pDstCurrent -= (8 - remainder); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - float* pMatTemp = pMatCurrent; - Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); - Vector256 x12 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - Vector256 x22 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - Vector256 x32 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + float* pMatTemp = pMatCurrent; + Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - x02 = Avx.Multiply(x01, x02); - x12 = Avx.Multiply(x11, x12); - x22 = Avx.Multiply(x21, x22); - x32 = Avx.Multiply(x31, x32); + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); - x02 = Avx.Add(x02, x12); - x22 = Avx.Add(x22, x32); - x02 = Avx.Add(x02, x22); + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); - Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); - Vector256 x3 = Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, leadingMask)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); + Vector256 x3 = Avx.LoadVector256(pDstCurrent); + x02 = Avx.Or(x02, Avx.And(x3, leadingMask)); - x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); + x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); - Avx.Store(pDstCurrent, x02); - pDstCurrent += 8; - pMatCurrent += 8; + Avx.Store(pDstCurrent, x02); + pDstCurrent += 8; + 
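> Reviewer note, for reference while reading the rewritten `MatMulTran`: the kernel steps `pMatTemp` by `crow`, so each matrix column has length and stride `crow`, and the result is `dst[i] = Σ_j mat[j * crow + i] * src[j]`. A scalar sketch of that semantics (not the unrolled AVX path):

```csharp
using System;

static class MatMulTranDemo
{
    // dst = mat' * src, with mat stored as ccol columns of length crow.
    static void MatMulTran(float[] mat, float[] src, float[] dst, int crow, int ccol)
    {
        for (int i = 0; i < crow; i++)
        {
            float sum = 0;
            for (int j = 0; j < ccol; j++)
                sum += mat[j * crow + i] * src[j];
            dst[i] = sum;
        }
    }

    static void Main()
    {
        float[] mat = { 1, 2, 3, 4, 5, 6 }; // two length-3 columns
        float[] src = { 10, 100 };
        float[] dst = new float[3];
        MatMulTran(mat, src, dst, crow: 3, ccol: 2);
        Console.WriteLine(string.Join(", ", dst)); // 410, 520, 630
    }
}
```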
pMatCurrent += 8; + } } pMatCurrent += 3 * crow; pSrcCurrent += 4; - numCol -= 4; - } - - // falling through the case statements - switch (numCol) - { - case 3: ColumnMultiply(pMatCurrent + 2 * crow, pSrcCurrent + 2, pdst, pDstEnd, crow); goto case 2; - case 2: ColumnMultiply(pMatCurrent + crow, pSrcCurrent + 1, pdst, pDstEnd, crow); goto case 1; - case 1: ColumnMultiply(pMatCurrent, pSrcCurrent, pdst, pDstEnd, crow); break; } } } - private static unsafe void ColumnMultiply(float* pMatCurrent, float* pSrcCurrent, float* pdst, float* pDstEnd, int crow) - { - Vector128 h01 = Sse.LoadVector128(pSrcCurrent); - // Replicate each slot of h01 (ABCD) into its own register. - h01 = Avx.Permute(h01, 0x00); // A - Vector256 x01 = Avx.SetHighLow(h01, h01); - int remainder = crow % 8; - float* pDstCurrent = pdst; - - while (pDstCurrent + 8 <= pDstEnd) - { - // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads - // (due to semantics of the legacy encoding). - // We don't need an assert, since the instruction will throw for unaligned inputs. - float* pMatTemp = pMatCurrent; - Vector256 x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); - x02 = Avx.Add(x02, Avx.LoadVector256(pDstCurrent)); - - Avx.Store(pDstCurrent, x02); - pDstCurrent += 8; - pMatCurrent += 8; - } - - // falling through the case statements - switch (remainder) - { - case 7: *(pDstCurrent + 6) += *(pSrcCurrent) * *(pMatCurrent + 6); goto case 6; - case 6: *(pDstCurrent + 5) += *(pSrcCurrent) * *(pMatCurrent + 5); goto case 5; - case 5: *(pDstCurrent + 4) += *(pSrcCurrent) * *(pMatCurrent + 4); goto case 4; - case 4: *(pDstCurrent + 3) += *(pSrcCurrent) * *(pMatCurrent + 3); goto case 3; - case 3: *(pDstCurrent + 2) += *(pSrcCurrent) * *(pMatCurrent + 2); goto case 2; - case 2: *(pDstCurrent + 1) += *(pSrcCurrent) * *(pMatCurrent + 1); goto case 1; - case 1: *pDstCurrent += *(pSrcCurrent) * *(pMatCurrent); break; - } - return; - } - // dst[i] += scale public static unsafe void AddScalarU(float scalar, Span dst) { diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs new file mode 100644 index 0000000000..33690055b2 --- /dev/null +++ b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs @@ -0,0 +1,148 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Internal.CpuMath.Core; + +namespace Microsoft.ML.Runtime.Internal.CpuMath +{ + [BestFriend] + internal static class CpuAligenedMathUtils + where TMatrix : CpuAlignedMatrixBase, ICpuFullMatrix + { + /// + /// Assert the compatibility of the underlying AlignedArray for the input matrix in terms of alignment amount. + /// + /// The input matrix + public static void AssertCompatible(ICpuFullMatrix values) + { +#if DEBUG + var mat = values as TMatrix; + Contracts.AssertValue(mat); + Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); +#endif + } + + /// + /// Assert the compatibility of the underlying AlignedArray for the input vector in terms of alignment amount. 
+ /// + /// The input vector + public static void AssertCompatible(ICpuVector values) + { +#if DEBUG + CpuAlignedVector vec = values as CpuAlignedVector; + Contracts.AssertValue(vec); + Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); +#endif + } + + private static TMatrix A(ICpuFullMatrix x) + { + AssertCompatible(x); + return (TMatrix)x; + } + + private static CpuAlignedVector A(ICpuVector x) + { + AssertCompatible(x); + return (CpuAlignedVector)x; + } + + private static void AssertCompatibleCore(ICpuMatrix mat, ICpuVector src, ICpuVector dst) + { + AssertCompatible(src); + AssertCompatible(dst); + Contracts.Assert(mat.ColCount == src.VectorSize); + Contracts.Assert(mat.RowCount == dst.VectorSize); + } + + /// + /// Asserts the following: + /// 1. The compatibility of the underlying AlignedArray for mat in terms of alignment amount. + /// 2. The compatibility of the underlying AlignedArray for src in terms of alignment amount. + /// 3. The compatibility of the underlying AlignedArray for dst in terms of alignment amount. + /// 4. The compatibility of the matrix-vector multiplication mat * src = dst. + /// + /// + /// + /// + public static void AssertCompatible(ICpuFullMatrix mat, ICpuVector src, ICpuVector dst) + { + // Also check the physical sizes. + AssertCompatible(mat); + AssertCompatibleCore(mat, src, dst); + var m = A(mat); + Contracts.Assert(m.ColCountPhy == A(src).Items.Size); + Contracts.Assert(m.RowCountPhy == A(dst).Items.Size); + } + + /// + /// Matrix multiplication: + /// dst = mat * src + /// + /// The multiplier matrix + /// The source vector + /// The destination vector + public static void MatTimesSrc(ICpuFullMatrix mat, ICpuVector src, ICpuVector dst) + { + bool colMajor = typeof(TMatrix) == typeof(CpuAlignedMatrixCol); + AssertCompatible(mat, src, dst); + var m = A(mat); + CpuMathUtils.MatrixTimesSource(colMajor, m.Items, A(src).Items, A(dst).Items, m.RunCnt); + } + + /// + /// Matrix transpose multiplication: + /// dst = mat' * src + /// + /// The multiplier matrix + /// The source vector + /// The destination vector + public static void MatTranTimesSrc(ICpuFullMatrix mat, ICpuVector src, ICpuVector dst) + { + bool colMajor = typeof(TMatrix) == typeof(CpuAlignedMatrixCol); + AssertCompatible(mat, dst, src); + var m = A(mat); + CpuMathUtils.MatrixTimesSource(!colMajor, m.Items, A(src).Items, A(dst).Items, m.RunCnt); + } + } + + public static class GeneralUtils + { + /// + /// Count the number of zero bits in the lonest string of zero's from the lowest significant bit of the input integer. 
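> Reviewer note: `CbitLowZero` counts the trailing zero bits of its argument by a binary search over half-words, bytes, nibbles and bit pairs; for a power of two this is exactly log2, which is how `Shift` is derived from `FloatAlign` in `CpuAlignedMatrixBase`. A small usage sketch, assuming access to the `Microsoft.ML.Runtime.Internal.CpuMath` namespace declared above:

```csharp
using System;
using Microsoft.ML.Runtime.Internal.CpuMath;

static class CbitLowZeroDemo
{
    static void Main()
    {
        // 0x50 is 0b0101_0000: the low 4 bits are zero.
        Console.WriteLine(GeneralUtils.CbitLowZero(0x50)); // 4
        Console.WriteLine(GeneralUtils.CbitLowZero(8));    // 3 (log2 of 8)
        Console.WriteLine(GeneralUtils.CbitLowZero(0));    // 32 (no set bit at all)
    }
}
```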
+ /// + /// The input integer + /// + public static int CbitLowZero(uint u) + { + if (u == 0) + return 32; + + int cbit = 0; + if ((u & 0x0000FFFF) == 0) + { + cbit += 16; + u >>= 16; + } + if ((u & 0x000000FF) == 0) + { + cbit += 8; + u >>= 8; + } + if ((u & 0x0000000F) == 0) + { + cbit += 4; + u >>= 4; + } + if ((u & 0x00000003) == 0) + { + cbit += 2; + u >>= 2; + } + if ((u & 0x00000001) == 0) + cbit += 1; + return cbit; + } + } +} diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 6546eeef31..d895e590a9 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -11,62 +11,76 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static partial class CpuMathUtils { - public static void MatrixTimesSource(bool transpose, ReadOnlySpan matrix, ReadOnlySpan source, Span destination, int stride) + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray + private const int Vector128Alignment = 16; + + // The count of bytes in Vector256, corresponding to _cbAlign in AlignedArray + private const int Vector256Alignment = 32; + + // The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray + private const int FloatAlignment = 4; + + // If neither AVX nor SSE is supported, return basic alignment for a 4-byte float. + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + public static int GetVectorAlignment() + => Avx.IsSupported ? Vector256Alignment : (Sse.IsSupported ? Vector128Alignment : FloatAlignment); + + public static void MatrixTimesSource(bool transpose, AlignedArray matrix, AlignedArray source, AlignedArray destination, int stride) { - Contracts.AssertNonEmpty(matrix); - Contracts.AssertNonEmpty(source); - Contracts.AssertNonEmpty(destination); - Contracts.Assert(matrix.Length == destination.Length * source.Length); + Contracts.Assert(matrix.Size == destination.Size * source.Size); Contracts.Assert(stride >= 0); - if (!transpose) + if (Avx.IsSupported) { - if (Avx.IsSupported && source.Length >= 8) + if (!transpose) { - Contracts.Assert(stride <= destination.Length); - AvxIntrinsics.MatMul(matrix, source, destination, stride, source.Length); + Contracts.Assert(stride <= destination.Size); + AvxIntrinsics.MatMul(matrix, source, destination, stride, source.Size); } - else if (Sse.IsSupported && source.Length >= 4) + else { - Contracts.Assert(stride <= destination.Length); - SseIntrinsics.MatMul(matrix, source, destination, stride, source.Length); + Contracts.Assert(stride <= source.Size); + AvxIntrinsics.MatMulTran(matrix, source, destination, destination.Size, stride); + } + } + else if (Sse.IsSupported) + { + if (!transpose) + { + Contracts.Assert(stride <= destination.Size); + SseIntrinsics.MatMul(matrix, source, destination, stride, source.Size); } else { - Contracts.Assert(stride <= destination.Length); + Contracts.Assert(stride <= source.Size); + SseIntrinsics.MatMulTran(matrix, source, destination, destination.Size, stride); + } + } + else + { + if (!transpose) + { + Contracts.Assert(stride <= destination.Size); for (int i = 0; i < stride; i++) { float dotProduct = 0; - for (int j = 0; j < source.Length; j++) + for (int j = 0; j < source.Size; j++) { - dotProduct += matrix[i * source.Length + j] * source[j]; + dotProduct += matrix[i * source.Size + j] * source[j]; } destination[i] = dotProduct; } } - } - else - { - if (Avx.IsSupported && destination.Length >= 8) - { - 
Contracts.Assert(stride <= source.Length); - AvxIntrinsics.MatMulTran(matrix, source, destination, destination.Length, stride); - } - else if (Sse.IsSupported && destination.Length >=4) - { - Contracts.Assert(stride <= source.Length); - SseIntrinsics.MatMulTran(matrix, source, destination, destination.Length, stride); - } else { - Contracts.Assert(stride <= source.Length); - for (int i = 0; i < destination.Length; i++) + Contracts.Assert(stride <= source.Size); + for (int i = 0; i < destination.Size; i++) { float dotProduct = 0; for (int j = 0; j < stride; j++) { - dotProduct += matrix[j * destination.Length + i] * source[j]; + dotProduct += matrix[j * source.Size + i] * source[j]; } destination[i] = dotProduct; @@ -75,22 +89,17 @@ public static void MatrixTimesSource(bool transpose, ReadOnlySpan matrix, } } - public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan rgposSrc, ReadOnlySpan sourceValues, - int posMin, int iposMin, int iposLimit, Span destination, int stride) + public static void MatrixTimesSource(AlignedArray matrix, ReadOnlySpan rgposSrc, AlignedArray sourceValues, + int posMin, int iposMin, int iposLimit, AlignedArray destination, int stride) { Contracts.Assert(iposMin >= 0); Contracts.Assert(iposMin <= iposLimit); Contracts.Assert(iposLimit <= rgposSrc.Length); - Contracts.AssertNonEmpty(matrix); - Contracts.AssertNonEmpty(sourceValues); - Contracts.AssertNonEmpty(destination); - Contracts.AssertNonEmpty(rgposSrc); - Contracts.Assert(stride > 0); - Contracts.Assert(matrix.Length == destination.Length * sourceValues.Length); + Contracts.Assert(matrix.Size == destination.Size * sourceValues.Size); if (iposMin >= iposLimit) { - destination.Clear(); + destination.ZeroItems(); return; } @@ -99,24 +108,24 @@ public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan left, ReadOnlySpan } } + public static void ZeroMatrixItems(AlignedArray destination, int ccol, int cfltRow, int[] indices) + { + Contracts.Assert(ccol > 0); + Contracts.Assert(ccol <= cfltRow); + + if (ccol == cfltRow) + { + ZeroItemsU(destination, destination.Size, indices, indices.Length); + } + else + { + ZeroMatrixItemsCore(destination, destination.Size, ccol, cfltRow, indices, indices.Length); + } + } + + private static unsafe void ZeroItemsU(AlignedArray destination, int c, int[] indices, int cindices) + { + fixed (float* pdst = &destination.Items[0]) + fixed (int* pidx = &indices[0]) + { + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); + pdst[index] = 0; + } + } + } + + private static unsafe void ZeroMatrixItemsCore(AlignedArray destination, int c, int ccol, int cfltRow, int[] indices, int cindices) + { + fixed (float* pdst = &destination.Items[0]) + fixed (int* pidx = &indices[0]) + { + int ivLogMin = 0; + int ivLogLim = ccol; + int ivPhyMin = 0; + + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); + + int col = index - ivLogMin; + if ((uint)col >= (uint)ccol) + { + Contracts.Assert(ivLogMin > index || index >= ivLogLim); + + int row = index / ccol; + ivLogMin = row * ccol; + ivLogLim = ivLogMin + ccol; + ivPhyMin = row * cfltRow; + + Contracts.Assert(index >= ivLogMin); + Contracts.Assert(index < ivLogLim); + col = index - ivLogMin; + } + + pdst[ivPhyMin + col] = 0; + } + } + } + public static void SdcaL1UpdateDense(float primalUpdate, int count, ReadOnlySpan source, float threshold, Span v, Span w) { 
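> Reviewer note: `ZeroMatrixItemsCore` above avoids a division per index by tracking the current logical row window (`ivLogMin`/`ivLogLim`), but its net effect is the simpler mapping sketched here, translating logical row-major indices (width `ccol`) into a buffer whose rows have physical stride `cfltRow`:

```csharp
using System;

static class ZeroPaddedItemsDemo
{
    static void Main()
    {
        int ccol = 3, cfltRow = 4;                  // logical width, physical stride
        float[] items = { 1, 2, 3, 0, 4, 5, 6, 0 }; // two padded rows
        int[] indices = { 1, 4 };                   // logical slots to zero

        foreach (int index in indices)
        {
            int row = index / ccol;
            int col = index % ccol;
            items[row * cfltRow + col] = 0;
        }
        Console.WriteLine(string.Join(", ", items)); // 1, 0, 3, 0, 4, 0, 6, 0
    }
}
```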
Contracts.AssertNonEmpty(source); diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 400b70d651..5ecbc62be1 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -11,10 +11,17 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath [BestFriend] internal static partial class CpuMathUtils { - public static void MatrixTimesSource(bool transpose, ReadOnlySpan matrix, ReadOnlySpan source, Span destination, int stride) => SseUtils.MatTimesSrc(transpose, matrix, source, destination, stride); + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray + private const int Vector128Alignment = 16; - public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan rgposSrc, ReadOnlySpan sourceValues, - int posMin, int iposMin, int iposLimit, Span destination, int stride) => SseUtils.MatTimesSrc(matrix, rgposSrc, sourceValues, posMin, iposMin, iposLimit, destination, stride); + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + public static int GetVectorAlignment() + => Vector128Alignment; + + public static void MatrixTimesSource(bool transpose, AlignedArray matrix, AlignedArray source, AlignedArray destination, int stride) => SseUtils.MatTimesSrc(transpose, matrix, source, destination, stride); + + public static void MatrixTimesSource(AlignedArray matrix, ReadOnlySpan rgposSrc, AlignedArray sourceValues, + int posMin, int iposMin, int iposLimit, AlignedArray destination, int stride) => SseUtils.MatTimesSrc(matrix, rgposSrc, sourceValues, posMin, iposMin, iposLimit, destination, stride); public static void Add(float value, Span destination) => SseUtils.Add(value, destination); @@ -56,6 +63,8 @@ public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan left, ReadOnlySpan right, int count) => SseUtils.L2DistSquared(left, right, count); + public static void ZeroMatrixItems(AlignedArray destination, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(destination, ccol, cfltRow, indices); + public static void SdcaL1UpdateDense(float primalUpdate, int count, ReadOnlySpan source, float threshold, Span v, Span w) => SseUtils.SdcaL1UpdateDense(primalUpdate, count, source, threshold, v, w); diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index d57b400e9c..8b1c4da70f 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -15,53 +15,74 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath [BestFriend] internal static class SseUtils { - public static void MatTimesSrc(bool tran, ReadOnlySpan mat, ReadOnlySpan src, Span dst, int crun) + public const int CbAlign = 16; + + private static bool Compat(AlignedArray a) + { + Contracts.AssertValue(a); + Contracts.Assert(a.Size > 0); + return a.CbAlign == CbAlign; + } + + private static unsafe float* Ptr(AlignedArray a, float* p) + { + Contracts.AssertValue(a); + float* q = p + a.GetBase((long)p); + Contracts.Assert(((long)q & (CbAlign - 1)) == 0); + return q; + } + + public static void MatTimesSrc(bool tran, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) { - Contracts.Assert(mat.Length == dst.Length * src.Length); + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + Contracts.Assert(mat.Size == dst.Size * src.Size); unsafe { - fixed (float* pmat = &mat[0]) - fixed (float* psrc = &src[0]) - fixed (float* pdst = &dst[0]) + fixed (float* pmat = 
&mat.Items[0]) + fixed (float* psrc = &src.Items[0]) + fixed (float* pdst = &dst.Items[0]) { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Length); - Thunk.MatMul(pmat, psrc, pdst, crun, src.Length); + Contracts.Assert(0 <= crun && crun <= dst.Size); + Thunk.MatMul(Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), crun, src.Size); } else { - Contracts.Assert(0 <= crun && crun <= src.Length); - Thunk.MatMulTran(pmat, psrc, pdst, dst.Length, crun); + Contracts.Assert(0 <= crun && crun <= src.Size); + Thunk.MatMulTran(Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), dst.Size, crun); } } } } - public static void MatTimesSrc(ReadOnlySpan mat, ReadOnlySpan rgposSrc, ReadOnlySpan srcValues, - int posMin, int iposMin, int iposLim, Span dst, int crun) + public static void MatTimesSrc(AlignedArray mat, ReadOnlySpan rgposSrc, AlignedArray srcValues, + int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(srcValues)); + Contracts.Assert(Compat(dst)); Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); - Contracts.Assert(mat.Length == dst.Length * srcValues.Length); + Contracts.Assert(mat.Size == dst.Size * srcValues.Size); if (iposMin >= iposLim) { - dst.Clear(); + dst.ZeroItems(); return; } - Contracts.AssertNonEmpty(rgposSrc); - unsafe { - fixed (float* pdst = &dst[0]) - fixed (float* pmat = &mat[0]) - fixed (float* psrc = &srcValues[0]) + fixed (float* pdst = &dst.Items[0]) + fixed (float* pmat = &mat.Items[0]) + fixed (float* psrc = &srcValues.Items[0]) fixed (int* ppossrc = &rgposSrc[0]) { - Contracts.Assert(0 <= crun && crun <= dst.Length); - Thunk.MatMulP(pmat, ppossrc, psrc, posMin, iposMin, iposLim, pdst, crun, srcValues.Length); + Contracts.Assert(0 <= crun && crun <= dst.Size); + Thunk.MatMulP(Ptr(mat, pmat), ppossrc, Ptr(srcValues, psrc), posMin, iposMin, iposLim, Ptr(dst, pdst), crun, srcValues.Size); } } } @@ -345,6 +366,23 @@ public static float L2DistSquared(ReadOnlySpan a, ReadOnlySpan b, } } + public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) + { + Contracts.Assert(0 < ccol && ccol <= cfltRow); + + unsafe + { + fixed (float* pdst = &dst.Items[0]) + fixed (int* pi = &indices[0]) + { + if (ccol == cfltRow) + Thunk.ZeroItemsU(Ptr(dst, pdst), dst.Size, pi, indices.Length); + else + Thunk.ZeroMatrixItemsCore(Ptr(dst, pdst), dst.Size, ccol, cfltRow, pi, indices.Length); + } + } + } + public static void SdcaL1UpdateDense(float primalUpdate, int count, ReadOnlySpan src, float threshold, Span v, Span w) { Contracts.AssertNonEmpty(src); diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 98394f5d7d..b83fd6bbc6 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -116,6 +116,11 @@ internal static Vector128 GetNewDst128(in Vector128 xDst1, in Vect } // Multiply matrix times vector into vector. 
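> Reviewer note: the `Compat`/`Ptr` helpers above rely on `AlignedArray.GetBase` to bump a freshly pinned pointer to the next 16-byte boundary, which the assert `((long)q & (CbAlign - 1)) == 0` then verifies. A minimal sketch of the arithmetic involved, using a hypothetical pinned address:

```csharp
using System;

static class AlignOffsetDemo
{
    static void Main()
    {
        long addr = 0x1004; // hypothetical pinned address, 4 bytes past a boundary
        int cbAlign = 16;   // SseUtils.CbAlign

        long misalignment = addr & (cbAlign - 1);          // 4 bytes
        long cbShift = (cbAlign - misalignment) % cbAlign; // 12 bytes to the boundary
        long floatOffset = cbShift / sizeof(float);        // 3 float slots

        Console.WriteLine(((addr + cbShift) & (cbAlign - 1)) == 0); // True
        Console.WriteLine(floatOffset);                             // 3
    }
}
```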
+ public static unsafe void MatMul(AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + MatMul(mat.Items, src.Items, dst.Items, crow, ccol); + } + public static unsafe void MatMul(ReadOnlySpan<float> mat, ReadOnlySpan<float> src, Span<float> dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -128,9 +133,8 @@ public static unsafe void MatMul(ReadOnlySpan<float> mat, ReadOnlySpan<float> sr float* pDstEnd = pdst + crow; float* pDstCurrent = pdst; float* pMatCurrent = pmat; - int numRows = crow; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { Vector128<float> res0 = Sse.SetZeroVector128(); Vector128<float> res1 = Sse.SetZeroVector128(); @@ -144,11 +148,10 @@ public static unsafe void MatMul(ReadOnlySpan<float> mat, ReadOnlySpan<float> sr int misalignment = (int)(address % 16); int remainder = 0; - if ((misalignment & 3) != 0 || (ccol % 4 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = length % 4; - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector128<float> vector = Sse.LoadVector128(pSrcCurrent); @@ -230,32 +233,32 @@ public static unsafe void MatMul(ReadOnlySpan<float> mat, ReadOnlySpan<float> sr // unaligned loads where we mask the input each time. remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (4 - remainder); - pSrcCurrent -= (4 - remainder); + pMatCurrent -= (4 - remainder); + pSrcCurrent -= (4 - remainder); - Vector128<float> mask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128<float> mask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - float* pMatTemp = pMatCurrent; - Vector128<float> x01 = Sse.And(mask, Sse.LoadVector128(pMatTemp)); - Vector128<float> x11 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); - Vector128<float> x21 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); - Vector128<float> x31 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); - Vector128<float> vector = Sse.And(mask, Sse.LoadVector128(pSrcCurrent)); + float* pMatTemp = pMatCurrent; + Vector128<float> x01 = Sse.And(mask, Sse.LoadVector128(pMatTemp)); + Vector128<float> x11 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); + Vector128<float> x21 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); + Vector128<float> x31 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); + Vector128<float> vector = Sse.And(mask, Sse.LoadVector128(pSrcCurrent)); - res0 = Sse.Add(res0, Sse.Multiply(x01, vector)); - res1 = Sse.Add(res1, Sse.Multiply(x11, vector)); - res2 = Sse.Add(res2, Sse.Multiply(x21, vector)); - res3 = Sse.Add(res3, Sse.Multiply(x31, vector)); + res0 = Sse.Add(res0, Sse.Multiply(x01, vector)); + res1 = Sse.Add(res1, Sse.Multiply(x11, vector)); + res2 = Sse.Add(res2, Sse.Multiply(x21, vector)); + res3 = Sse.Add(res3, Sse.Multiply(x31, vector)); - pMatCurrent += 4; - pSrcCurrent += 4; + pMatCurrent += 4; + pSrcCurrent += 4; + } } // Add up the entries of each, with the 4 results in res0 @@ -266,56 +269,17 @@ public static unsafe void MatMul(ReadOnlySpan<float> mat, ReadOnlySpan<float> sr Sse.Store(pDstCurrent, res0); pDstCurrent += 4; pMatCurrent += 3 * ccol; - numRows -= 4; - } - - // falling through the case statements - switch (numRows) - { - case 3: - *(pDstCurrent + 2) = RowMultiply(pMatCurrent + 2 * ccol, psrc, pSrcEnd, ccol); - goto case 2; - case 2: - *(pDstCurrent + 1) = RowMultiply(pMatCurrent + ccol, psrc, pSrcEnd, ccol); - goto case 1; - case 1: - *pDstCurrent = RowMultiply(pMatCurrent, psrc, pSrcEnd, ccol); - break; } } } - private static unsafe float RowMultiply(float* pMatCurrent, float* pSrcCurrent, float* pSrcEnd, int ccol) + // Partial sparse source vector. + public static unsafe void MatMulP(AlignedArray mat, ReadOnlySpan<int> rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { - Vector128<float> res0 = Sse.SetZeroVector128(); - int remainder = ccol % 4; - while (pSrcCurrent + 4 <= pSrcEnd) - { - Vector128<float> vector = Sse.LoadVector128(pSrcCurrent); - - float* pMatTemp = pMatCurrent; - Vector128<float> x01 = Sse.Multiply(vector, Sse.LoadVector128(pMatTemp)); - res0 = Sse.Add(res0, x01); - - pSrcCurrent += 4; - pMatCurrent += 4; - } - - // Add up the entries of each, with the 4 results in res0 - res0 = VectorSum128(in res0); - float sum = Sse.ConvertToSingle(res0); - - // falling through the case statements - switch (remainder) - { - case 3: sum += *(pSrcCurrent + 2) * *(pMatCurrent + 2); goto case 2; - case 2: sum += *(pSrcCurrent + 1) * *(pMatCurrent + 1); goto case 1; - case 1: sum += *(pSrcCurrent) * *(pMatCurrent); break; - } - return sum; + MatMulP(mat.Items, rgposSrc, src.Items, posMin, iposMin, iposEnd, dst.Items, crow, ccol); } - // Partial sparse source vector. public static unsafe void MatMulP(ReadOnlySpan<float> mat, ReadOnlySpan<int> rgposSrc, ReadOnlySpan<float> src, int posMin, int iposMin, int iposEnd, Span<float> dst, int crow, int ccol) { @@ -472,6 +436,11 @@ Vector128<float> SparseMultiplicationAcrossRow() } } + public static unsafe void MatMulTran(AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + MatMulTran(mat.Items, src.Items, dst.Items, crow, ccol); + } + public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> src, Span<float> dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -484,14 +453,16 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> x01 = Sse.LoadVector128(pSrcCurrent); + // Replicate each 32-bit slot of x01 (ABCD) into its own register. 
+ Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55); // B + Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA); // C + Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF); // D x01 = Sse.Shuffle(x01, x01, 0x00); // A int length = crow; @@ -500,14 +471,20 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> x02 = Sse.Multiply(x01, Sse.LoadVector128(pMatTemp)); + Vector128<float> x12 = Sse.Multiply(x11, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x22 = Sse.Multiply(x21, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x32 = Sse.Multiply(x31, Sse.LoadVector128(pMatTemp += crow)); + + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); Sse.Store(pDstCurrent, x02); pDstCurrent += 4; @@ -516,6 +493,7 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> x02 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp)); + Vector128<float> x12 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x22 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x32 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp += crow)); + x02 = Sse.Multiply(x01, x02); + x12 = Sse.Multiply(x11, x12); + x22 = Sse.Multiply(x21, x22); + x32 = Sse.Multiply(x31, x32); + + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); Vector128<float> trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); Vector128<float> x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); + x02 = Sse.Or(x02, Sse.And(x3, trailingMask)); Sse.Store(pDstCurrent, x02); pMatCurrent += misalignment; @@ -549,7 +538,15 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> x02 = Sse.Multiply(x01, Sse.LoadAlignedVector128(pMatTemp)); + Vector128<float> x12 = Sse.Multiply(x11, Sse.LoadAlignedVector128(pMatTemp += crow)); + Vector128<float> x22 = Sse.Multiply(x21, Sse.LoadAlignedVector128(pMatTemp += crow)); + Vector128<float> x32 = Sse.Multiply(x31, Sse.LoadAlignedVector128(pMatTemp += crow)); + + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); Sse.Store(pDstCurrent, x02); pDstCurrent += 4; @@ -563,36 +560,47 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128<float> trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - float* pMatTemp = pMatCurrent; - Vector128<float> x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); - x02 = Sse.Multiply(x01, x02); + float* pMatTemp = pMatCurrent; + Vector128<float> x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); + Vector128<float> x12 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x22 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x32 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - Vector128<float> leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); - Vector128<float> x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, leadingMask)); + x02 = Sse.Multiply(x01, x02); + x12 = Sse.Multiply(x11, x12); + x22 = Sse.Multiply(x21, x22); + x32 = Sse.Multiply(x31, x32); - Sse.Store(pDstCurrent, x02); - pDstCurrent += 4; - pMatCurrent += 4; + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); + + Vector128<float> leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + Vector128<float> x3 = Sse.LoadVector128(pDstCurrent); + x02 = Sse.Or(x02, Sse.And(x3, leadingMask)); + + Sse.Store(pDstCurrent, x02); + pDstCurrent += 4; + pMatCurrent += 4; + } } - numCol -= 1; - pSrcCurrent += 1; + pMatCurrent += 3 * crow; + pSrcCurrent += 4; } // We do 4-way unrolling - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector128<float> x01 = Sse.LoadVector128(pSrcCurrent); // Replicate each 32-bit slot of x01 (ABCD) into its own register. @@ -607,10 +615,9 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> x02 = Sse.Multiply(x01, Sse.LoadVector128(pMatTemp)); @@ -631,6 +638,7 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); Vector128<float> x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); + x02 = Sse.Or(x02, Sse.And(x3, trailingMask)); x02 = Sse.Add(x02, Sse.And(x3, leadingMask)); @@ -693,84 +701,43 @@ public static unsafe void MatMulTran(ReadOnlySpan<float> mat, ReadOnlySpan<float> Vector128<float> trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + if (remainder != 0) + { + pMatCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); + Vector128<float> trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - float* pMatTemp = pMatCurrent; - Vector128<float> x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); - Vector128<float> x12 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - Vector128<float> x22 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - Vector128<float> x32 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + float* pMatTemp = pMatCurrent; + Vector128<float> x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); + Vector128<float> x12 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x22 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128<float> x32 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - x02 = Sse.Multiply(x01, x02); - x12 = Sse.Multiply(x11, x12); - x22 = Sse.Multiply(x21, x22); - x32 = Sse.Multiply(x31, x32); + x02 = Sse.Multiply(x01, x02); + x12 = Sse.Multiply(x11, x12); + x22 = Sse.Multiply(x21, x22); + x32 = Sse.Multiply(x31, x32); - x02 = Sse.Add(x02, x12); - x22 = Sse.Add(x22, x32); - x02 = Sse.Add(x02, x22); + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); - Vector128<float> leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); - Vector128<float> x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, leadingMask)); + Vector128<float> leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + Vector128<float> x3 = Sse.LoadVector128(pDstCurrent); + x02 = Sse.Or(x02, Sse.And(x3, leadingMask)); - x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); - Sse.Store(pDstCurrent, x02); - pDstCurrent += 4; - pMatCurrent += 4; + x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); + Sse.Store(pDstCurrent, x02); + pDstCurrent += 4; + pMatCurrent += 4; + } } pMatCurrent += 3 * crow; pSrcCurrent += 4; - numCol -= 4; } - - // falling through the case statements - switch (numCol) - { - case 3: ColumnMultiply(pMatCurrent + 2 * crow, pSrcCurrent + 2, pdst, pDstEnd, crow); goto case 2; - case 2: ColumnMultiply(pMatCurrent + crow, pSrcCurrent + 1, pdst, pDstEnd, crow); goto case 1; - case 1: ColumnMultiply(pMatCurrent, pSrcCurrent, pdst, pDstEnd, crow); break; - } - } - } - - private static unsafe void ColumnMultiply(float* pMatCurrent, float* pSrcCurrent, 
float* pdst, float* pDstEnd, int crow) - { - Vector128<float> x01 = Sse.LoadVector128(pSrcCurrent); - // Replicate each slot of h01 (ABCD) into its own register. - x01 = Sse.Shuffle(x01, x01, 0x00); // A - int remainder = crow % 4; - float* pDstCurrent = pdst; - - while (pDstCurrent + 4 <= pDstEnd) - { - // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads - // (due to semantics of the legacy encoding). - // We don't need an assert, since the instruction will throw for unaligned inputs. - float* pMatTemp = pMatCurrent; - Vector128<float> x02 = Sse.Multiply(x01, Sse.LoadVector128(pMatTemp)); - x02 = Sse.Add(x02, Sse.LoadVector128(pDstCurrent)); - - Sse.Store(pDstCurrent, x02); - pDstCurrent += 4; - pMatCurrent += 4; - } - - // falling through the case statements - switch (remainder) - { - case 3: *(pDstCurrent + 2) += *(pSrcCurrent) * *(pMatCurrent + 2); goto case 2; - case 2: *(pDstCurrent + 1) += *(pSrcCurrent) * *(pMatCurrent + 1); goto case 1; - case 1: *pDstCurrent += *(pSrcCurrent) * *(pMatCurrent); break; } - return; } // dst[i] += scale diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index 9505db8766..8ff725b54a 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -86,6 +86,12 @@ public static extern void MatMulP(/*const*/ float* pmat, /*const*/ int* pposSrc, [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float Dist2(/*const*/ float* px, /*const*/ float* py, int c); + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] + public static extern void ZeroItemsU(float* pd, int c, /*const*/ int* pindices, int cindices); + + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] + public static extern void ZeroMatrixItemsCore(float* pd, int c, int ccol, int cfltRow, /*const*/ int* pindices, int cindices); + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void SdcaL1UpdateU(float primalUpdate, /*const*/ float* ps, float threshold, float* pd1, float* pd2, int c); diff --git a/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs b/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs index 88cbc536dd..a473ccec29 100644 --- a/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs +++ b/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs @@ -174,8 +174,8 @@ public ModelInfo Info private Single[] _alpha; private Single[] _state; private readonly FixedSizeQueue<Single> _buffer; - private float[] _x; - private float[] _xSmooth; + private CpuAlignedVector _x; + private CpuAlignedVector _xSmooth; private int _windowSize; private readonly int _seriesLength; private readonly RankSelectionMethod _rankSelectionMethod; @@ -188,14 +188,14 @@ public ModelInfo Info private readonly IHost _host; - private float[] _wTrans; + private CpuAlignedMatrixRow _wTrans; private Single _observationNoiseVariance; private Single _observationNoiseMean; private Single _autoregressionNoiseVariance; private Single _autoregressionNoiseMean; private int _rank; - private float[] _y; + private CpuAlignedVector _y; private Single _nextPrediction; /// @@ -290,8 +290,8 @@ public AdaptiveSingularSpectrumSequenceModeler(IHostEnvironment env, int trainSi _alpha = new Single[windowSize - 1]; _state = new Single[windowSize - 1]; - _x = new float[windowSize]; - _xSmooth = new float[windowSize]; + _x = new CpuAlignedVector(windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(windowSize, SseUtils.CbAlign); 
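
[Editor's note] The hunks above and below replace flat float[] state with the aligned CpuMath types so the SSE kernels can assume vector-aligned buffers. A minimal sketch of the allocation/copy idiom, using only constructors and calls that appear in this diff (the Project helper and its arguments are hypothetical; CpuAligenedMathUtils is spelled as in the diff):

    // Hypothetical helper illustrating the aligned-buffer idiom from this PR.
    private static CpuAlignedVector Project(float[] weights, float[] window, int rank, int windowSize)
    {
        var wTrans = new CpuAlignedMatrixRow(rank, windowSize, SseUtils.CbAlign);
        int i = 0;
        wTrans.CopyFrom(weights, ref i);        // flat row-major source; the cursor advances
        var x = new CpuAlignedVector(windowSize, SseUtils.CbAlign);
        int j = 0;
        x.CopyFrom(window, ref j);              // dense copy into the aligned buffer
        var y = new CpuAlignedVector(rank, SseUtils.CbAlign);
        CpuAligenedMathUtils.MatTimesSrc(wTrans, x, y);   // y = W * x, as in Consume() below
        return y;
    }
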
ShouldComputeForecastIntervals = shouldComputeForecastIntervals; _observationNoiseVariance = 0; @@ -345,14 +345,14 @@ private AdaptiveSingularSpectrumSequenceModeler(AdaptiveSingularSpectrumSequence _state = new Single[_windowSize - 1]; Array.Copy(model._state, _state, _windowSize - 1); - _x = new float[_windowSize]; - _xSmooth = new float[_windowSize]; + _x = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); if (model._wTrans != null) { - _y = new float[_rank]; - _wTrans = new float[_rank * _windowSize]; - Array.Copy(model._wTrans, _wTrans, _rank * _windowSize); + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); + _wTrans.CopyFrom(model._wTrans); } } @@ -452,16 +452,18 @@ public AdaptiveSingularSpectrumSequenceModeler(IHostEnvironment env, ModelLoadCo { var tempArray = ctx.Reader.ReadFloatArray(); _host.CheckDecode(Utils.Size(tempArray) == _rank * _windowSize); - _wTrans = new float[_rank * _windowSize]; - Array.Copy(tempArray, _wTrans, tempArray.Length); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); + int i = 0; + _wTrans.CopyFrom(tempArray, ref i); tempArray = ctx.Reader.ReadFloatArray(); - _y = new float[_rank]; - Array.Copy(tempArray, _y, tempArray.Length); + i = 0; + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); + _y.CopyFrom(tempArray, ref i); } _buffer = TimeSeriesUtils.DeserializeFixedSizeQueueSingle(ctx.Reader, _host); - _x = new float[_windowSize]; - _xSmooth = new float[_windowSize]; + _x = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); } public override void Save(ModelSaveContext ctx) @@ -525,11 +527,14 @@ public override void Save(ModelSaveContext ctx) if (_wTrans != null) { + // REVIEW: this may not be the most efficient way for serializing an aligned matrix. var tempArray = new Single[_rank * _windowSize]; - Array.Copy(_wTrans, tempArray, _wTrans.Length); + int iv = 0; + _wTrans.CopyTo(tempArray, ref iv); ctx.Writer.WriteSingleArray(tempArray); tempArray = new float[_rank]; - Array.Copy(_y, tempArray, tempArray.Length); + iv = 0; + _y.CopyTo(tempArray, ref iv); ctx.Writer.WriteSingleArray(tempArray); } @@ -1125,14 +1130,15 @@ internal override void Consume(ref Single input, bool updateModel = false) if (_wTrans == null) { - _y = new float[_rank]; - _wTrans = new float[_rank * _windowSize]; + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); Single[] vecs = new Single[_rank * _windowSize]; for (i = 0; i < _rank; ++i) vecs[(_windowSize + 1) * i] = 1; - Array.Copy(_wTrans, vecs, _rank * _windowSize); + i = 0; + _wTrans.CopyFrom(vecs, ref i); } // Forming vector x @@ -1151,10 +1157,10 @@ internal override void Consume(ref Single input, bool updateModel = false) _x[_windowSize - 1] = input; // Computing y: Eq. 
(11) in https://hal-institut-mines-telecom.archives-ouvertes.fr/hal-00479772/file/twocolumns.pdf - CpuMathUtils.MatrixTimesSource(transpose: false, _wTrans, _x, _y, _y.Length); + CpuAligenedMathUtils.MatTimesSrc(_wTrans, _x, _y); // Updating the state vector - CpuMathUtils.MatrixTimesSource(transpose: true, _wTrans, _y, _xSmooth, _y.Length); + CpuAligenedMathUtils.MatTranTimesSrc(_wTrans, _y, _xSmooth); _nextPrediction = _autoregressionNoiseMean + _observationNoiseMean; for (i = 0; i < _windowSize - 2; ++i) @@ -1305,8 +1311,8 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) _maxRank = _windowSize / 2; _alpha = new Single[_windowSize - 1]; _state = new Single[_windowSize - 1]; - _x = new float[_windowSize]; - _xSmooth = new float[_windowSize]; + _x = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); TrainCore(dataArray, originalSeriesLength); return; @@ -1343,11 +1349,12 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) } // Setting the the y vector - _y = new float[_rank]; + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); // Setting the weight matrix - _wTrans = new float[_rank * _windowSize]; - Array.Copy(leftSingularVecs, _wTrans, _wTrans.Length); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); + i = 0; + _wTrans.CopyFrom(leftSingularVecs, ref i); // Setting alpha Single nu = 0; @@ -1357,7 +1364,7 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) nu += _y[i] * _y[i]; } - CpuMathUtils.MatrixTimesSource(transpose: true, _wTrans, _y, _xSmooth, _y.Length); + CpuAligenedMathUtils.MatTranTimesSrc(_wTrans, _y, _xSmooth); for (i = 0; i < _windowSize - 1; ++i) _alpha[i] = _xSmooth[i] / (1 - nu); @@ -1402,8 +1409,8 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) _x[i - originalSeriesLength + _windowSize] = dataArray[i]; } - CpuMathUtils.MatrixTimesSource(transpose: false, _wTrans, _x, _y, _y.Length); - CpuMathUtils.MatrixTimesSource(transpose: true, _wTrans, _y, _xSmooth, _y.Length); + CpuAligenedMathUtils.MatTimesSrc(_wTrans, _x, _y); + CpuAligenedMathUtils.MatTranTimesSrc(_wTrans, _y, _xSmooth); for (i = 1; i < _windowSize; ++i) { diff --git a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs index 8f4e2b6221..f1b7d49895 100644 --- a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs +++ b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs @@ -95,10 +95,10 @@ private sealed class TransformInfo public readonly int SrcDim; // the matrix containing the random fourier vectors - public readonly float[] RndFourierVectors; + public readonly AlignedArray RndFourierVectors; // the random rotations - public readonly float[] RotationTerms; + public readonly AlignedArray RotationTerms; private readonly IFourierDistributionSampler _matrixGenerator; private readonly bool _useSin; @@ -120,10 +120,10 @@ public TransformInfo(IHost host, ColumnInfo column, int d, float avgDist) var generator = column.Generator; _matrixGenerator = generator.CreateComponent(host, avgDist); - int roundedUpD = RoundToMultipleOf4(NewDim); - int roundedUpNumFeatures = RoundToMultipleOf4(SrcDim); - RndFourierVectors = new float[roundedUpD * roundedUpNumFeatures]; - RotationTerms = _useSin ? 
null : new float[roundedUpD]; + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -154,10 +154,10 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, string director ctx.LoadModelOrNull(env, out _matrixGenerator, directoryName)); // initialize the transform matrix - int roundedUpD = RoundToMultipleOf4(NewDim); - int roundedUpNumFeatures = RoundToMultipleOf4(SrcDim); - RndFourierVectors = new float[roundedUpD * roundedUpNumFeatures]; - RotationTerms = _useSin ? null : new float[roundedUpD]; + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -225,6 +225,8 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; + private static readonly int _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); + private static string TestColumnType(ColumnType type) { if (type.ItemType == NumberType.Float && type.IsKnownSizeVector) @@ -293,11 +295,16 @@ public RandomFourierFeaturizingTransformer(IHostEnvironment env, IDataView input } } - private static int RoundToMultipleOf4(int number) + // Round cflt up to a multiple of cfltAlign. + private static int RoundUp(int cflt, int cfltAlign) { - Contracts.Assert(0 < number); - int multipleOf4 = (number + 3) / 4; - return multipleOf4 * 4; + Contracts.Assert(0 < cflt); + // cfltAlign should be a power of two. + Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); + + // Determine the number of "blobs" of size cfltAlign. 
+ int cblob = (cflt + cfltAlign - 1) / cfltAlign; + return cblob * cfltAlign; } private float[] GetAvgDistances(ColumnInfo[] columns, IDataView input) @@ -548,14 +555,14 @@ private ValueGetter<VBuffer<float>> GetterFromVectorType(Row input, int iinfo) var getSrc = input.GetGetter<VBuffer<float>>(_srcCols[iinfo]); var src = default(VBuffer<float>); - var features = new float[RoundToMultipleOf4(_srcTypes[iinfo].ValueCount)]; - var product = new float[RoundToMultipleOf4(_parent._transformInfos[iinfo].NewDim)]; + var featuresAligned = new AlignedArray(RoundUp(_srcTypes[iinfo].ValueCount, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_parent._transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); return (ref VBuffer<float> dst) => { getSrc(ref src); - TransformFeatures(in src, ref dst, _parent._transformInfos[iinfo], features, product); + TransformFeatures(in src, ref dst, _parent._transformInfos[iinfo], featuresAligned, productAligned); }; } @@ -565,8 +572,8 @@ private ValueGetter<VBuffer<float>> GetterFromFloatType(Row input, int iinfo) var getSrc = input.GetGetter<float>(_srcCols[iinfo]); var src = default(float); - var featuresAligned = new float[4]; - var productAligned = new float[RoundToMultipleOf4(_parent._transformInfos[iinfo].NewDim)]; + var featuresAligned = new AlignedArray(RoundUp(1, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_parent._transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); var oneDimensionalVector = new VBuffer<float>(1, new float[] { 0 }); @@ -580,7 +587,7 @@ private ValueGetter<VBuffer<float>> GetterFromFloatType(Row input, int iinfo) } private void TransformFeatures(in VBuffer<float> src, ref VBuffer<float> dst, TransformInfo transformInfo, - float[] features, float[] product) + AlignedArray featuresAligned, AlignedArray productAligned) { Host.Check(src.Length == transformInfo.SrcDim, "column does not have the expected dimensionality."); @@ -599,9 +606,9 @@ private void TransformFeatures(in VBuffer<float> src, ref VBuffer<float> dst, Tr if (src.IsDense) { - src.GetValues().CopyTo(features); - CpuMathUtils.MatrixTimesSource(transpose: false, transformInfo.RndFourierVectors, features, product, - RoundToMultipleOf4(transformInfo.NewDim)); + featuresAligned.CopyFrom(src.GetValues()); + CpuMathUtils.MatrixTimesSource(false, transformInfo.RndFourierVectors, featuresAligned, productAligned, + transformInfo.NewDim); } else { @@ -609,21 +616,15 @@ private void TransformFeatures(in VBuffer<float> src, ref VBuffer<float> dst, Tr // no need to zero them out. 
var srcValues = src.GetValues(); var srcIndices = src.GetIndices(); - - for (int i = 0; i < srcValues.Length; i++) - { - int iv = srcIndices[i]; - features[iv] = srcValues[i]; - } - - CpuMathUtils.MatrixTimesSource(transformInfo.RndFourierVectors, srcIndices, features, 0, 0, - srcValues.Length, product, RoundToMultipleOf4(transformInfo.NewDim)); + featuresAligned.CopyFrom(srcIndices, srcValues, 0, 0, srcValues.Length, zeroItems: false); + CpuMathUtils.MatrixTimesSource(transformInfo.RndFourierVectors, srcIndices, featuresAligned, 0, 0, + srcValues.Length, productAligned, transformInfo.NewDim); } var dstEditor = VBufferEditor.Create(ref dst, newDstLength); for (int i = 0; i < transformInfo.NewDim; i++) { - var dotProduct = product[i]; + var dotProduct = productAligned[i]; if (transformInfo.RotationTerms != null) dstEditor.Values[i] = (float)MathUtils.Cos(dotProduct + transformInfo.RotationTerms[i]) * scale; else diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index 2240d630eb..607af332d1 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -57,69 +57,15 @@ const unsigned int TrailingAlignmentMask[16] = 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, }; -float RowMultiply(const float* pMatCurrent, const float* pSrcCurrent, const float* pSrcEnd, int ccol) -{ - __m128 res0 = _mm_setzero_ps(); - int remainder = ccol % 4; - - while (pSrcCurrent + 4 <= pSrcEnd) - { - __m128 vector = _mm_loadu_ps(pSrcCurrent); - const float* pMatTemp = pMatCurrent; - __m128 x01 = _mm_mul_ps(vector, _mm_loadu_ps(pMatTemp)); - res0 = _mm_add_ps(res0, x01); - - pSrcCurrent += 4; - pMatCurrent += 4; - } - - res0 = _mm_hadd_ps(res0, res0); - res0 = _mm_hadd_ps(res0, res0); - - float sum = _mm_cvtss_f32(res0); - - // falling through the case statements - switch (remainder) - { - case 3: - sum += *(pSrcCurrent + 2) * *(pMatCurrent + 2); - case 2: - sum += *(pSrcCurrent + 1) * *(pMatCurrent + 1); - case 1: - sum += *(pSrcCurrent) * *(pMatCurrent); - } - return sum; -} - // Multiply matrix times vector into vector. EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) { - if (ccol < 4) - { - for (int i = 0 ; i < crow; i++) - { - float dotProduct = 0; - switch (ccol) - { - case 3: - dotProduct += pmat[i * ccol + 2] * psrc[2]; - case 2: - dotProduct += pmat[i * ccol + 1] * psrc[1]; - case 1: - dotProduct += pmat[i * ccol + 0] * psrc[0]; - } - pdst[i] = dotProduct; - } - return; - } - const float * pSrcEnd = psrc + ccol; const float * pDstEnd = pdst + crow; float* pDstCurrent = pdst; const float* pMatCurrent = pmat; - int numRows = crow; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { __m128 res0 = _mm_setzero_ps(); __m128 res1 = res0; @@ -133,11 +79,10 @@ EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout uintptr_t misalignment = address % 16; int remainder = 0; - if ((misalignment & 3) != 0 || (ccol % 4 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = length % 4; - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { __m128 vector = _mm_loadu_ps(pSrcCurrent); @@ -216,32 +161,32 @@ EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout // unaligned loads where we mask the input each time. 
remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (4 - remainder); - pSrcCurrent -= (4 - remainder); + pMatCurrent -= (4 - remainder); + pSrcCurrent -= (4 - remainder); - __m128 mask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 mask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); - const float* pMatTemp = pMatCurrent; - __m128 x01 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp)); - __m128 x11 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); - __m128 x21 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); - __m128 x31 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); - __m128 vector = _mm_and_ps(mask, _mm_loadu_ps(pSrcCurrent)); + const float* pMatTemp = pMatCurrent; + __m128 x01 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp)); + __m128 x11 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); + __m128 x21 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); + __m128 x31 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); + __m128 vector = _mm_and_ps(mask, _mm_loadu_ps(pSrcCurrent)); - res0 = _mm_add_ps(res0, _mm_mul_ps(x01, vector)); - res1 = _mm_add_ps(res1, _mm_mul_ps(x11, vector)); - res2 = _mm_add_ps(res2, _mm_mul_ps(x21, vector)); - res3 = _mm_add_ps(res3, _mm_mul_ps(x31, vector)); + res0 = _mm_add_ps(res0, _mm_mul_ps(x01, vector)); + res1 = _mm_add_ps(res1, _mm_mul_ps(x11, vector)); + res2 = _mm_add_ps(res2, _mm_mul_ps(x21, vector)); + res3 = _mm_add_ps(res3, _mm_mul_ps(x31, vector)); - pMatCurrent += 4; - pSrcCurrent += 4; + pMatCurrent += 4; + pSrcCurrent += 4; + } } // Add up the entries of each, with the 4 results in res0 @@ -253,19 +198,6 @@ EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout pDstCurrent += 4; pMatCurrent += 3 * ccol; - numRows -= 4; - } - - // falling through the case statements - switch(numRows) - { - case 3: - *(pDstCurrent + 2) = RowMultiply(pMatCurrent + 2 * ccol, psrc, pSrcEnd, ccol); - case 2: - *(pDstCurrent + 1) = RowMultiply(pMatCurrent + 1 * ccol, psrc, pSrcEnd, ccol); - case 1: - *pDstCurrent = RowMultiply(pMatCurrent, psrc, pSrcEnd, ccol); - break; } } @@ -424,62 +356,21 @@ EXPORT_API(void) MatMulP(_In_ const float * pmat, _In_ const int * pposSrc, _In_ } } -void ColumnMultiply(const float* pMatCurrent, const float* pSrcCurrent, float* pdst, const float* pDstEnd, int crow) -{ - __m128 x01 = _mm_loadu_ps(pSrcCurrent); - x01 = _mm_shuffle_ps(x01, x01, 0x00); - float* pDstCurrent = pdst; - int remainder = crow % 4; - - while (pDstCurrent + 4 <= pDstEnd) - { - const float* pMatTemp = pMatCurrent; - __m128 x02 = _mm_mul_ps(x01, _mm_loadu_ps(pMatTemp)); - x02 = _mm_add_ps(x02, _mm_loadu_ps(pDstCurrent)); - - _mm_storeu_ps(pDstCurrent, x02); - - pDstCurrent += 4; - pMatCurrent += 4; - } - - // falling through the case statements - switch (remainder) - { - case 3: *(pDstCurrent + 2) += *(pSrcCurrent) * *(pMatCurrent + 2); - case 2: *(pDstCurrent + 1) += *(pSrcCurrent) * *(pMatCurrent + 1); - case 1: *pDstCurrent += *(pSrcCurrent) * *(pMatCurrent); break; - } - return; -} - EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const 
float * psrc, _Inout_ float * pdst, int crow, int ccol) { - if (crow < 4) - { - for (int i = 0 ; i < crow; i++) - { - float dotProduct = 0; - for (int j = 0; j < ccol; j++) - { - dotProduct += pmat[j * crow + i] * psrc[j]; - } - pdst[i] = dotProduct; - } - return; - } - const float * pSrcEnd = psrc + ccol; const float * pDstEnd = pdst + crow; const float* pMatCurrent = pmat; const float* pSrcCurrent = psrc; - int remainder = 0; - int numCol = ccol; if (pSrcCurrent < pSrcEnd) { __m128 x01 = _mm_loadu_ps(pSrcCurrent); + // Replicate each slot of x01 into its own register. + __m128 x11 = _mm_shuffle_ps(x01, x01, 0x55); + __m128 x21 = _mm_shuffle_ps(x01, x01, 0xAA); + __m128 x31 = _mm_shuffle_ps(x01, x01, 0xFF); x01 = _mm_shuffle_ps(x01, x01, 0x00); int length = crow; @@ -489,14 +380,20 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I uintptr_t misalignment = address % 16; int remainder = 0; - if ((misalignment & 3) != 0 || (crow % 4 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = crow % 4; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_mul_ps(x01, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_mul_ps(x11, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_mul_ps(x21, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_mul_ps(x31, _mm_loadu_ps(pMatTemp += crow)); + + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); _mm_storeu_ps(pDstCurrent, x02); pDstCurrent += 4; @@ -505,6 +402,7 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I } else { + int remainder = 0; if (misalignment != 0) { // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then @@ -517,11 +415,22 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I // We only align pMat since it has significantly more reads. const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp += crow)); + x02 = _mm_mul_ps(x01, x02); + x12 = _mm_mul_ps(x11, x12); + x22 = _mm_mul_ps(x21, x22); + x32 = _mm_mul_ps(x31, x32); + + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, trailingMask)); _mm_storeu_ps(pDstCurrent, x02); pMatCurrent += misalignment; @@ -538,6 +447,13 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I { const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_mul_ps(x01, _mm_load_ps(pMatTemp)); + __m128 x12 = _mm_mul_ps(x11, _mm_load_ps(pMatTemp += crow)); + __m128 x22 = _mm_mul_ps(x21, _mm_load_ps(pMatTemp += crow)); + __m128 x32 = _mm_mul_ps(x31, _mm_load_ps(pMatTemp += crow)); + + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); _mm_storeu_ps(pDstCurrent, x02); @@ -552,36 +468,47 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I // unaligned loads where we mask the input each time. 
remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (4 - remainder); - pDstCurrent -= (4 - remainder); + pMatCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); - __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + + const float* pMatTemp = pMatCurrent; + __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - const float* pMatTemp = pMatCurrent; - __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); - x02 = _mm_mul_ps(x01, x02); + x02 = _mm_mul_ps(x01, x02); + x12 = _mm_mul_ps(x11, x12); + x22 = _mm_mul_ps(x21, x22); + x32 = _mm_mul_ps(x31, x32); - __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); - __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, leadingMask)); + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); - _mm_storeu_ps(pDstCurrent, x02); - pMatCurrent += 4; - pDstCurrent += 4; + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); + __m128 x3 = _mm_loadu_ps(pDstCurrent); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, leadingMask)); + + _mm_storeu_ps(pDstCurrent, x02); + pMatCurrent += 4; + pDstCurrent += 4; + } } - numCol -= 1; - pSrcCurrent += 1; + pMatCurrent += 3 * crow; + pSrcCurrent += 4; } - - while (pSrcCurrent + 4 <= pSrcEnd) + + while (pSrcCurrent < pSrcEnd) { __m128 x01 = _mm_loadu_ps(pSrcCurrent); // Replicate each slot of x01 into its own register. 
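
[Editor's note] The shuffle constants above implement a 4-way broadcast: each two-bit field of the control byte selects a source lane, so 0x00/0x55/0xAA/0xFF replicate lanes A..D across the register. A standalone sketch of one unrolled MatMulTran step, written with the managed intrinsics that appear earlier in this diff (the method wrapper and its parameters are illustrative; requires System.Runtime.Intrinsics.X86):

    // One 4x4 step: dst[0..3] gets the sum of four scaled matrix columns.
    static unsafe void MatMulTran4x4Step(float* pMatCurrent, float* pSrcCurrent, float* pDstCurrent, int crow)
    {
        Vector128<float> x01 = Sse.LoadVector128(pSrcCurrent);   // ABCD
        Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55);      // BBBB
        Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA);      // CCCC
        Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF);      // DDDD
        x01 = Sse.Shuffle(x01, x01, 0x00);                       // AAAA

        float* pMatTemp = pMatCurrent;
        Vector128<float> x02 = Sse.Multiply(x01, Sse.LoadVector128(pMatTemp));
        Vector128<float> x12 = Sse.Multiply(x11, Sse.LoadVector128(pMatTemp += crow));
        Vector128<float> x22 = Sse.Multiply(x21, Sse.LoadVector128(pMatTemp += crow));
        Vector128<float> x32 = Sse.Multiply(x31, Sse.LoadVector128(pMatTemp += crow));

        x02 = Sse.Add(x02, x12);   // pairwise adds keep the dependency chain short
        x22 = Sse.Add(x22, x32);
        x02 = Sse.Add(x02, x22);
        Sse.Store(pDstCurrent, x02);
    }

Note also that the masked edge merges now use a bitwise Or instead of Add: after masking, the freshly computed lanes and the preserved destination lanes are disjoint, so Or expresses the merge exactly (and sidesteps the signed-zero wrinkle of adding zeroed lanes). That appears to be the motivation for the Add-to-Or changes in these hunks.
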
@@ -597,10 +524,9 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I uintptr_t misalignment = address % 16; int remainder = 0; - if ((misalignment & 3) != 0 || (crow % 4 != 0)) + if ((misalignment & 3) != 0) { - remainder = length % 4; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_mul_ps(x01, _mm_loadu_ps(pMatTemp)); @@ -621,6 +547,7 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I } else { + int remainder = 0; if (misalignment != 0) { misalignment >>= 2; @@ -646,7 +573,7 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, trailingMask)); x02 = _mm_add_ps(x02, _mm_and_ps(x3, leadingMask)); _mm_storeu_ps(pDstCurrent, x02); @@ -682,52 +609,43 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I { remainder = length; } - } - if (remainder != 0) - { - pMatCurrent -= (4 - remainder); - pDstCurrent -= (4 - remainder); + if (remainder != 0) + { + pMatCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); - __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); - const float* pMatTemp = pMatCurrent; - __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); - __m128 x12 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - __m128 x22 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - __m128 x32 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + const float* pMatTemp = pMatCurrent; + __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - x02 = _mm_mul_ps(x01, x02); - x12 = _mm_mul_ps(x11, x12); - x22 = _mm_mul_ps(x21, x22); - x32 = _mm_mul_ps(x31, x32); + x02 = _mm_mul_ps(x01, x02); + x12 = _mm_mul_ps(x11, x12); + x22 = _mm_mul_ps(x21, x22); + x32 = _mm_mul_ps(x31, x32); - x02 = _mm_add_ps(x02, x12); - x22 = _mm_add_ps(x22, x32); - x02 = _mm_add_ps(x02, x22); + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); - __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); - __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, leadingMask)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); + __m128 x3 = _mm_loadu_ps(pDstCurrent); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, leadingMask)); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); - _mm_storeu_ps(pDstCurrent, x02); - pMatCurrent += 4; - pDstCurrent += 4; + x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); + _mm_storeu_ps(pDstCurrent, x02); + pMatCurrent += 4; + pDstCurrent += 4; + } } - numCol -= 4; pMatCurrent += 3 * crow; pSrcCurrent += 4; } - - // falling through the case statements - switch (numCol) - { - case 3: ColumnMultiply(pMatCurrent + 2 * crow, pSrcCurrent + 2, pdst, pDstEnd, crow); - case 2: ColumnMultiply(pMatCurrent + crow, pSrcCurrent + 1, pdst, pDstEnd, crow); - 
case 1: ColumnMultiply(pMatCurrent, pSrcCurrent, pdst, pDstEnd, crow); break; - } } // pd[i] += a @@ -1320,6 +1238,43 @@ EXPORT_API(float) Dist2(const float * px, const float * py, int c) return norm2; } +EXPORT_API(void) ZeroItemsU(_Inout_ float * pd, int c, _In_ const int * pindices, int cindices) +{ + DEBUG_ONLY(c); + for (int i = 0; i < cindices; ++i) + { + int iv = pindices[i]; + assert(0 <= iv && iv < c); + pd[iv] = 0; + } +} + +EXPORT_API(void) ZeroMatrixItemsCore(_Inout_ float * pd, int c, int ccol, int cfltRow, _In_ const int * pindices, int cindices) +{ + DEBUG_ONLY(c); + int ivLogMin = 0; + int ivLogLim = ccol; + int ivPhyMin = 0; + for (int i = 0; i < cindices; ++i) + { + int iv = pindices[i]; + assert(0 <= iv && iv < c); + + int col = iv - ivLogMin; + if ((unsigned int)col >= (unsigned int)ccol) + { + assert(ivLogMin > iv || iv >= ivLogLim); + int row = iv / ccol; + ivLogMin = row * ccol; + ivLogLim = ivLogMin + ccol; + ivPhyMin = row * cfltRow; + assert(ivLogMin <= iv && iv < ivLogLim); + col = iv - ivLogMin; + } + pd[ivPhyMin + col] = 0; + } +} + EXPORT_API(void) SdcaL1UpdateU(float primalUpdate, _In_ const float * ps, float threshold, _Inout_ float *pd1, _Inout_ float * pd2, int c) { const float * psLim = ps + c; diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs index 5adb641b92..25996ec42c 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs @@ -14,14 +14,14 @@ public class CpuMathUtilsUnitTests { private readonly float[][] _testArrays; private readonly int[] _testIndexArray; - private readonly float[][] _testMatrices; - private readonly float[][] _testSrcVectors; - private readonly float[][] _testDstVectors; + private readonly AlignedArray[] _testMatrices; + private readonly AlignedArray[] _testSrcVectors; + private readonly AlignedArray[] _testDstVectors; + private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); private readonly FloatEqualityComparer _comparer; private readonly FloatEqualityComparerForMatMul _matMulComparer; private const float DefaultScale = 1.7f; - private const int DefaultSeed = 253421; public CpuMathUtilsUnitTests() { @@ -50,19 +50,34 @@ public CpuMathUtilsUnitTests() testMatrix2[i] = i + 1; } - _testMatrices = new float[][] { testMatrix1, testMatrix2 }; + AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, _vectorAlignment); + AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, _vectorAlignment); + testMatrixAligned1.CopyFrom(testMatrix1); + testMatrixAligned2.CopyFrom(testMatrix2); + + _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; // Padded source vectors whose dimensions are multiples of 8 float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f }; - _testSrcVectors = new float[][] { testSrcVector1, testSrcVector2 }; + AlignedArray testSrcVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testSrcVectorAligned2 = new AlignedArray(16, _vectorAlignment); + testSrcVectorAligned1.CopyFrom(testSrcVector1); + testSrcVectorAligned2.CopyFrom(testSrcVector2); + + _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; // Padded destination vectors whose dimensions are multiples of 8 float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 
7f }; float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f }; - _testDstVectors = new float[][] { testDstVector1, testDstVector2 }; + AlignedArray testDstVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testDstVectorAligned2 = new AlignedArray(16, _vectorAlignment); + testDstVectorAligned1.CopyFrom(testDstVector1); + testDstVectorAligned2.CopyFrom(testDstVector2); + + _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } [Theory] @@ -71,125 +86,29 @@ public CpuMathUtilsUnitTests() [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })] public void MatMulTest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] mat = _testMatrices[matTest]; - float[] src = _testSrcVectors[srcTest]; - float[] dst = _testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; - CpuMathUtils.MatrixTimesSource(false, mat, src, dst, dst.Length); - float[] actual = new float[dst.Length]; - Array.Copy(dst, actual, dst.Length); + CpuMathUtils.MatrixTimesSource(false, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, _matMulComparer); } - [Theory] - [InlineData(10, 5)] - [InlineData(10, 8)] - [InlineData(10, 11)] - [InlineData(11, 8)] - [InlineData(8, 23)] - [InlineData(2, 8)] - [InlineData(2, 9)] - [InlineData(2, 3)] - [InlineData(2, 5)] - [InlineData(4, 5)] - [InlineData(4, 7)] - [InlineData(4, 9)] - [InlineData(5, 7)] - [InlineData(5, 9)] - private void MatMulAnyDimensionTest(int col, int row) - { - Random rand = new Random(DefaultSeed); - float[] mat = new float[col * row]; - for (int i = 0; i < col * row; i++) - { - mat[i] = rand.Next(-10, 10); - } - - float[] src = new float[col]; - for (int i = 0; i < col; i++) - { - src[i] = rand.Next(-10, 10); - } - - float[] dst = new float[row]; - float[] expected = new float[row]; - - for (int i = 0; i < row; i++) - { - float dotProduct = 0; - for (int j = 0; j < src.Length; j++) - { - dotProduct += mat[i * src.Length + j] * src[j]; - } - - expected[i] = dotProduct; - } - - CpuMathUtils.MatrixTimesSource(false, mat, src, dst, dst.Length); - Assert.Equal(expected, dst, _matMulComparer); - } - - [Theory] - [InlineData(10, 5)] - [InlineData(10, 8)] - [InlineData(10, 11)] - [InlineData(11, 8)] - [InlineData(8, 23)] - [InlineData(2, 8)] - [InlineData(2, 9)] - [InlineData(2, 3)] - [InlineData(2, 5)] - [InlineData(4, 5)] - [InlineData(4, 7)] - [InlineData(4, 9)] - [InlineData(5, 7)] - [InlineData(5, 9)] - private void MatMulTranAnyDimensionTest(int col, int row) - { - float[] mat = new float[col * row]; - Random rand = new Random(DefaultSeed); - for (int i = 0; i < col * row; i++) - { - mat[i] = rand.Next(0, 10); - } - - float[] src = new float[row]; - for (int i = 0; i < row; i++) - { - src[i] = rand.Next(0, 10); - } - - float[] dst = new float[col]; - float[] expected = new float[col]; - - for (int i = 0; i < dst.Length; i++) - { - float dotProduct = 0; - for (int j = 0; j < row; j++) - { - dotProduct += mat[j * dst.Length + i] * src[j]; - } - - expected[i] = dotProduct; - } - - CpuMathUtils.MatrixTimesSource(true, mat, src, dst, row); - Assert.Equal(expected, dst, _matMulComparer); - } - [Theory] [InlineData(0, 0, 0, new float[] { 70.56001f, -85.68f, -351.36f, 498.24f, 
-3829.32f, -969.48f, 1168.2f, 118.44f })] [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })] [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })] public void MatMulTranTest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] mat = _testMatrices[matTest]; - float[] src = _testSrcVectors[srcTest]; - float[] dst = _testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; - CpuMathUtils.MatrixTimesSource(true, mat, src, dst, src.Length); - float[] actual = new float[dst.Length]; - Array.Copy(dst, actual, dst.Length); + CpuMathUtils.MatrixTimesSource(true, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, _matMulComparer); } @@ -199,14 +118,14 @@ public void MatMulTranTest(int matTest, int srcTest, int dstTest, float[] expect [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })] public void MatTimesSrcSparseTest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] mat = _testMatrices[matTest]; - float[] src = _testSrcVectors[srcTest]; - float[] dst = _testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; int[] idx = _testIndexArray; - CpuMathUtils.MatrixTimesSource(mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Length); - float[] actual = new float[dst.Length]; - Array.Copy(dst, actual, dst.Length); + CpuMathUtils.MatrixTimesSource(mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, _matMulComparer); } @@ -548,6 +467,34 @@ public void Dist2Test(int test, float expected) Assert.Equal(expected, actual, 0); } + [Theory] + [InlineData(0, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })] + public void ZeroItemsUTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, new int[] { 0, 2, 5 }, new float[] { 0f, 2f, 0f, 4f, 5f, 6f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 9f, 0f, 11f, 12f, 0f, 0f, 0f, 16f })] + public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, _comparer); + } + [Theory] [InlineData(0)] [InlineData(1)] diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs index d8141426ea..16ceae22d3 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs @@ -17,7 +17,6 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.TestFramework; using Xunit; -using Xunit.Abstractions; namespace Microsoft.ML.Runtime.RunTests { @@ -775,6 +774,7 @@ protected bool SaveLoadTransposed(IDataView view, IHostEnvironment env, string s public abstract partial class TestDataViewBase : BaseTestBaseline { + public class SentimentData { [ColumnName("Label")] @@ -934,7 +934,7 @@ protected bool CheckSameValues(RowCursor curs1, RowCursor curs2, bool exactTypes var comp = comps[col]; if (comp != null && !comp()) { - Fail($"Different values in column {col} of row {curs1.Position}"); + Fail("Different values in column {0} of row {1}", col, curs1.Position); return Failed(); } if (idComp != null && !idComp()) @@ -1158,10 +1158,12 @@ protected Func<bool> GetColumnComparer(Row r1, Row r2, int col, ColumnType type, throw Contracts.Except("Unknown type in GetColumnComparer: '{0}'", type); } - private bool EqualWithEpsDouble(Double x, Double y) + private const Double DoubleEps = 1e-9; + + private static bool EqualWithEpsDouble(Double x, Double y) { // bitwise comparison is needed because Abs(Inf-Inf) and Abs(NaN-NaN) are not 0s. 
- return FloatUtils.GetBits(x) == FloatUtils.GetBits(y) || CompareNumbersWithTolerance(x, y, null, 3); + return FloatUtils.GetBits(x) == FloatUtils.GetBits(y) || Math.Abs(x - y) < DoubleEps; } private const float SingleEps = 1e-6f; diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs index 90428934ba..39340d225b 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs @@ -72,7 +72,7 @@ public void SavePipeSsaSpike() Done(); } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline public void SavePipeSsaSpikeNoData() { string pathData = DeleteOutputPath("SavePipe", "SsaSpikeNoData.txt"); diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index 274a229630..0bd4604daa 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -87,7 +87,7 @@ public void ChangeDetection() } } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline public void ChangePointDetectionWithSeasonality() { var env = new MLContext(conc: 1); @@ -130,14 +130,14 @@ public void ChangePointDetectionWithSeasonality() while (enumerator.MoveNext() && index < expectedValues.Count) { row = enumerator.Current; - Assert.Equal(expectedValues[index++], row.Change[0], precision: 5); // Alert - Assert.Equal(expectedValues[index++], row.Change[1], precision: 5); // Raw score - Assert.Equal(expectedValues[index++], row.Change[2], precision: 5); // P-Value score - Assert.Equal(expectedValues[index++], row.Change[3], precision: 5); // Martingale score + Assert.Equal(expectedValues[index++], row.Change[0], precision: 7); // Alert + Assert.Equal(expectedValues[index++], row.Change[1], precision: 7); // Raw score + Assert.Equal(expectedValues[index++], row.Change[2], precision: 7); // P-Value score + Assert.Equal(expectedValues[index++], row.Change[3], precision: 7); // Martingale score } } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] public void ChangePointDetectionWithSeasonalityPredictionEngineNoColumn() { const int ChangeHistorySize = 10; @@ -190,10 +190,10 @@ public void ChangePointDetectionWithSeasonalityPredictionEngineNoColumn() var engine2 = model2.CreateTimeSeriesPredictionFunction(ml); var prediction2 = engine2.Predict(new Data(1)); //Raw score after first input. - Assert.Equal(1.1661833524703979, prediction2.Change[1], precision: 4); // Raw score + Assert.Equal(1.1661833524703979, prediction2.Change[1], precision: 5); // Raw score prediction2 = engine2.Predict(new Data(1)); //Raw score after second input. - Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 5); // Raw score //Even though time series column is not requested it will // pass the observation through time series transform and update the state with the first input. @@ -210,10 +210,10 @@ public void ChangePointDetectionWithSeasonalityPredictionEngineNoColumn() //and raw score should match the raw score obtained by passing the two input in the first model. 
var engine3 = model3.CreateTimeSeriesPredictionFunction(ml); var prediction3 = engine3.Predict(new Data(1)); - Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 5); // Raw score } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] public void ChangePointDetectionWithSeasonalityPredictionEngine() { const int ChangeHistorySize = 10; @@ -264,7 +264,7 @@ public void ChangePointDetectionWithSeasonalityPredictionEngine() //Model 1: Prediction #2 prediction = engine.Predict(new Data(1)); Assert.Equal(0, prediction.Change[0], precision: 7); // Alert - Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 5); // Raw score Assert.Equal(0.14823824685192111, prediction.Change[2], precision: 5); // P-Value score Assert.Equal(1.5292508189989167E-07, prediction.Change[3], precision: 7); // Martingale score @@ -277,7 +277,7 @@ public void ChangePointDetectionWithSeasonalityPredictionEngine() engine = model2.CreateTimeSeriesPredictionFunction(ml); prediction = engine.Predict(new Data(1)); Assert.Equal(0, prediction.Change[0], precision: 7); // Alert - Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 5); // Raw score Assert.Equal(0.14823824685192111, prediction.Change[2], precision: 5); // P-Value score Assert.Equal(1.5292508189989167E-07, prediction.Change[3], precision: 5); // Martingale score } diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs index 0e6a5063bd..a7892701d7 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs @@ -41,7 +41,7 @@ public TimeSeriesEstimatorTests(ITestOutputHelper output) : base(output) { } - [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCoreAnd64BitProcess))] // 32bit and netcore3.0 output differs from Baseline + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline void TestSsaChangePointEstimator() { int Confidence = 95; @@ -75,7 +75,7 @@ void TestSsaChangePointEstimator() Done(); } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline void TestSsaSpikeEstimator() { int Confidence = 95;
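
[Editor's note] On the precision changes in these test hunks: xunit's Assert.Equal(double expected, double actual, int precision) rounds both values to that many decimal places before comparing, so moving from precision: 4 to precision: 5 tightens the check tenfold. A quick illustration (the first literal is taken from the tests above; the second is invented for contrast):

    Assert.Equal(0.12216401100158691, 0.12216849, precision: 4); // passes: both round to 0.1222
    // At precision: 5 the same pair would fail: 0.12216 vs 0.12217.

The remaining [Fact]-to-ConditionalFact moves simply gate these tests off on .NET Core 3.0, where the numeric output differs from the checked-in baselines.
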