@@ -20,6 +20,10 @@ internal static class AvxIntrinsics
20
20
{
21
21
// 256-bit float mask in which every 32-bit lane holds 0x7FFFFFFF (all bits set except the sign bit).
// NOTE(review): presumably ANDed with a vector to clear sign bits (absolute value) — usage is not visible here; confirm.
private static readonly Vector256 < float > _absMask256 = Avx . StaticCast < int , float > ( Avx . SetAllVector256 ( 0x7FFFFFFF ) ) ;
22
22
23
+ // The number of 32-bit floats held by one Vector256<float> (256 bits / 32 bits = 8); used as the per-iteration element stride of the AVX loops
24
+ private const int AvxAlignment = 8 ;
25
+
26
+ // The size in bytes of one Vector256<T> (256 bits = 32 bytes), corresponding to _cbAlign in AlignedArray
23
27
private const int Vector256Alignment = 32 ;
24
28
25
29
[ MethodImplAttribute ( MethodImplOptions . AggressiveInlining ) ]
@@ -415,32 +419,32 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
415
419
{
416
420
fixed ( float * pdst = dst )
417
421
{
418
- float * pDstEnd = pdst + dst . Length ;
419
- float * pDstCurrent = pdst ;
420
-
421
422
Vector256 < float > scalarVector256 = Avx . SetAllVector256 ( scalar ) ;
423
+ int countAvx = Math . DivRem ( dst . Length , AvxAlignment , out int remainderAvx ) ;
424
+ float * pDstCurrent = pdst ;
422
425
423
- while ( pDstCurrent + 8 <= pDstEnd )
426
+ for ( int i = 0 ; i < countAvx ; i ++ )
424
427
{
425
428
Vector256 < float > dstVector = Avx . LoadVector256 ( pDstCurrent ) ;
426
429
dstVector = Avx . Add ( dstVector , scalarVector256 ) ;
427
430
Avx . Store ( pDstCurrent , dstVector ) ;
428
431
429
- pDstCurrent += 8 ;
432
+ pDstCurrent += AvxAlignment ;
430
433
}
431
434
432
435
Vector128 < float > scalarVector128 = Sse . SetAllVector128 ( scalar ) ;
436
+ int countSse = Math . DivRem ( remainderAvx , SseIntrinsics . SseAlignment , out int remainderSse ) ;
433
437
434
- if ( pDstCurrent + 4 <= pDstEnd )
438
+ if ( countSse > 0 )
435
439
{
436
440
Vector128 < float > dstVector = Sse . LoadVector128 ( pDstCurrent ) ;
437
441
dstVector = Sse . Add ( dstVector , scalarVector128 ) ;
438
442
Sse . Store ( pDstCurrent , dstVector ) ;
439
443
440
- pDstCurrent += 4 ;
444
+ pDstCurrent += SseIntrinsics . SseAlignment ;
441
445
}
442
446
443
- while ( pDstCurrent < pDstEnd )
447
+ for ( int i = 0 ; i < remainderSse ; i ++ )
444
448
{
445
449
Vector128 < float > dstVector = Sse . LoadScalarVector128 ( pDstCurrent ) ;
446
450
dstVector = Sse . AddScalar ( dstVector , scalarVector128 ) ;
0 commit comments