@@ -65,7 +65,7 @@ ESIMD_INLINE void cmk_write(ty *buf, uint32_t offset, simd<ty, size> v) {
 // Function bitonic_exchange{1,2,4,8} compares and swaps elements with
 // the particular strides
 ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
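Each bitonic_exchange* step boils down to a per-lane masked min/max pick, with the simd_mask flip choosing ascending versus descending lanes. A minimal sketch of that pattern (not this file's exact implementation; it assumes the experimental ESIMD helpers esimd_min/esimd_max and simd::merge, with the headers this file already includes):

    // Sketch only: per-lane compare-and-swap under a flip mask. "partner"
    // holds each lane's exchange partner at the current stride (8/4/2/1).
    template <int N>
    ESIMD_INLINE simd<uint32_t, N> cmp_swap(simd<uint32_t, N> cur,
                                            simd<uint32_t, N> partner,
                                            simd_mask<N> flip) {
      simd<uint32_t, N> lo = esimd_min(cur, partner); // ascending lanes
      simd<uint32_t, N> hi = esimd_max(cur, partner); // descending lanes
      lo.merge(hi, flip); // lanes with flip = 1 keep the max instead
      return lo;
    }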
@@ -80,7 +80,7 @@ bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
 }
 
 ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
@@ -109,7 +109,7 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
 // each mov copies four 64-bit data, which is 4X SIMD efficiency
 // improvement over the straightforward implementation.
 ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
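The "4X" comment refers to moving data through a wider view: reinterpreting the 32-bit lanes as 64-bit ones lets each mov carry a neighboring pair. A hedged sketch of the stride-2 case (assumed shapes and region selects, not this file's exact code):

    // Sketch only: stride-2 exchange through a 64-bit view. Lane i of the
    // long long view holds uint32_t elements {2i, 2i+1}, so swapping
    // adjacent 64-bit lanes exchanges uint32_t pairs at distance 2.
    ESIMD_INLINE simd<uint32_t, 32> swap_stride2(simd<uint32_t, 32> chunk) {
      simd<uint32_t, 32> out;
      auto src = chunk.bit_cast_view<long long>(); // 16 x 64-bit view
      auto dst = out.bit_cast_view<long long>();
      dst.select<8, 2>(0) = src.select<8, 2>(1); // even 64-bit lanes <- odd
      dst.select<8, 2>(1) = src.select<8, 2>(0); // odd 64-bit lanes <- even
      return out;
    }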
@@ -124,7 +124,7 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
 }
 
 ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
   simd<uint32_t, BASE_SZ> B;
 #pragma unroll
   // each thread is handling 256-element chunk. Each iteration
@@ -219,8 +219,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
   // similar to bitonic_exchange{1,2,4,8}.
 
   // exchange 8
-  simd<ushort, 32> flip13 = esimd_unpack_mask<32>(0xff00ff00); // (init_mask13);
-  simd<ushort, 32> flip14 = esimd_unpack_mask<32>(0x00ff00ff); // (init_mask14);
+  simd_mask<32> flip13 = esimd_unpack_mask<32>(0xff00ff00); // (init_mask13);
+  simd_mask<32> flip14 = esimd_unpack_mask<32>(0x00ff00ff); // (init_mask14);
   simd<uint32_t, BASE_SZ> B;
   for (int i = 0; i < BASE_SZ; i += 32) {
     B.select<8, 1>(i) = A.select<8, 1>(i + 8);
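The literal passed to esimd_unpack_mask encodes the flip pattern bit-per-lane: lane i of the resulting mask is bit i of the word, so 0xff00ff00 marks lanes 8-15 and 24-31, the upper half of each 16-element pair in the stride-8 exchange. A scalar model of that expansion, for illustration only (not the ESIMD implementation):

    // Illustration only: what esimd_unpack_mask<32>(bits) produces per lane.
    #include <array>
    #include <cstdint>
    std::array<uint16_t, 32> unpack_mask32(uint32_t bits) {
      std::array<uint16_t, 32> lanes{};
      for (int i = 0; i < 32; ++i)
        lanes[i] = (bits >> i) & 1u; // lane i <- bit i of the literal
      return lanes;
    }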
@@ -239,8 +239,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
   }
 
   // exchange 4
-  simd<ushort, 32> flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); // (init_mask15);
-  simd<ushort, 32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); // (init_mask16);
+  simd_mask<32> flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); // (init_mask15);
+  simd_mask<32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); // (init_mask16);
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
     auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
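Here the 4x8 bit_cast_view turns the stride-4 exchange into a 2D region copy: row r of the view holds elements 8r..8r+7, so columns 0-3 trade places with columns 4-7 across all rows at once. A hedged sketch of that pattern (illustrative offsets, not this hunk's exact selects):

    // Sketch only: stride-4 exchange as a 2D region swap on a 4 x 8 view.
    ESIMD_INLINE simd<uint32_t, 32> swap_stride4(simd<uint32_t, 32> chunk) {
      simd<uint32_t, 32> out;
      auto src = chunk.bit_cast_view<uint32_t, 4, 8>(); // 4 rows x 8 lanes
      auto dst = out.bit_cast_view<uint32_t, 4, 8>();
      dst.select<4, 1, 4, 1>(0, 0) = src.select<4, 1, 4, 1>(0, 4); // i <- i^4
      dst.select<4, 1, 4, 1>(0, 4) = src.select<4, 1, 4, 1>(0, 0);
      return out;
    }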
@@ -259,8 +259,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
   }
 
   // exchange 2
-  simd<ushort, 32> flip17 = esimd_unpack_mask<32>(0xcccccccc); // (init_mask17);
-  simd<ushort, 32> flip18 = esimd_unpack_mask<32>(0x33333333); // (init_mask18);
+  simd_mask<32> flip17 = esimd_unpack_mask<32>(0xcccccccc); // (init_mask17);
+  simd_mask<32> flip18 = esimd_unpack_mask<32>(0x33333333); // (init_mask18);
 #pragma unroll
   for (int i = 0; i < BASE_SZ; i += 32) {
     auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
@@ -279,8 +279,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
                              flip18);
   }
   // exchange 1
-  simd<ushort, 32> flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); // (init_mask19);
-  simd<ushort, 32> flip20 = esimd_unpack_mask<32>(0x55555555); // (init_mask20);
+  simd_mask<32> flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); // (init_mask19);
+  simd_mask<32> flip20 = esimd_unpack_mask<32>(0x55555555); // (init_mask20);
 #pragma unroll
   // Each iteration compares and swaps 2 32-element chunks
   for (int i = 0; i < BASE_SZ; i += 32) {
@@ -323,28 +323,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(uint32_t *buf1, uint32_t *buf2,
   simd<uint32_t, BASE_SZ> B;
   A = cmk_read<uint32_t, BASE_SZ>(buf1, offset);
 
-  simd<ushort, 32> flip1 = esimd_unpack_mask<32>(0x66666666); // (init_mask1);
+  simd_mask<32> flip1 = esimd_unpack_mask<32>(0x66666666); // (init_mask1);
 
   simd<unsigned short, 32> mask;
   // stage 0
   B = bitonic_exchange1(A, flip1);
   // stage 1
-  simd<ushort, 32> flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); // (init_mask2);
-  simd<ushort, 32> flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); // (init_mask3);
+  simd_mask<32> flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); // (init_mask2);
+  simd_mask<32> flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); // (init_mask3);
   A = bitonic_exchange2(B, flip2);
   B = bitonic_exchange1(A, flip3);
   // stage 2
-  simd<ushort, 32> flip4 = esimd_unpack_mask<32>(0x0ff00ff0); // (init_mask4);
-  simd<ushort, 32> flip5 = esimd_unpack_mask<32>(0x33cc33cc); // (init_mask5);
-  simd<ushort, 32> flip6 = esimd_unpack_mask<32>(0x55aa55aa); // (init_mask6);
+  simd_mask<32> flip4 = esimd_unpack_mask<32>(0x0ff00ff0); // (init_mask4);
+  simd_mask<32> flip5 = esimd_unpack_mask<32>(0x33cc33cc); // (init_mask5);
+  simd_mask<32> flip6 = esimd_unpack_mask<32>(0x55aa55aa); // (init_mask6);
   A = bitonic_exchange4(B, flip4);
   B = bitonic_exchange2(A, flip5);
   A = bitonic_exchange1(B, flip6);
   // stage 3
-  simd<ushort, 32> flip7 = esimd_unpack_mask<32>(0x00ffff00); // (init_mask7);
-  simd<ushort, 32> flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); // (init_mask8);
-  simd<ushort, 32> flip9 = esimd_unpack_mask<32>(0x3333cccc); // (init_mask9);
-  simd<ushort, 32> flip10 = esimd_unpack_mask<32>(0x5555aaaa); // (init_mask10);
+  simd_mask<32> flip7 = esimd_unpack_mask<32>(0x00ffff00); // (init_mask7);
+  simd_mask<32> flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); // (init_mask8);
+  simd_mask<32> flip9 = esimd_unpack_mask<32>(0x3333cccc); // (init_mask9);
+  simd_mask<32> flip10 = esimd_unpack_mask<32>(0x5555aaaa); // (init_mask10);
   B = bitonic_exchange8(A, flip7);
   A = bitonic_exchange4(B, flip8);
   B = bitonic_exchange2(A, flip9);
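The hand-unrolled stages above follow the standard bitonic schedule: sorting 2^n elements takes n stages, and stage k runs k+1 exchange passes at strides 2^k down to 1, which is why stage 3 chains bitonic_exchange8/4/2/1; stages needing larger strides are handled separately (cf. bitonic_merge above). A compact sketch of that schedule (exchange and flip_for are hypothetical helpers, not from this file):

    // Hypothetical helpers, illustration only: the schedule behind the
    // unrolled stages. For 256 = 2^8 elements there are stages 0..7.
    for (int stage = 0; stage < 8; ++stage)
      for (int stride = 1 << stage; stride >= 1; stride >>= 1)
        exchange(stride, flip_for(stage, stride)); // cf. bitonic_exchange*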