Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 184bff5

Browse files
authored
[ESIMD] Use new simd_mask<N> class instead of simd<unsigned short,N> (#388)
* [ESIMD] Add simd_mask test, fix other tests to use simd_mask<N>. * [ESIMD] Fix unsupported simd_view += id<1> operation in test. * Fix simd mask usage in ESIMD tests, stride=1 for 1-element select. Complementary test change to intel/llvm#4230 Signed-off-by: kbobrovs <[email protected]>
1 parent f05e1bd commit 184bff5

18 files changed

+728
-71
lines changed

SYCL/ESIMD/BitonicSortK.cpp

+22-22
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ const mask_type_t<32> init_mask20 = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
148148
// Function bitonic_exchange{1,2,4,8} compares and swaps elements with
149149
// the particular strides
150150
ESIMD_INLINE simd<uint32_t, BASE_SZ>
151-
bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
151+
bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
152152
simd<uint32_t, BASE_SZ> B;
153153
#pragma unroll
154154
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -163,7 +163,7 @@ bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
163163
}
164164

165165
ESIMD_INLINE simd<uint32_t, BASE_SZ>
166-
bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
166+
bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
167167
simd<uint32_t, BASE_SZ> B;
168168
#pragma unroll
169169
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -192,7 +192,7 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
192192
// each mov copies four 64-bit data, which is 4X SIMD efficiency
193193
// improvement over the straightforward implementation.
194194
ESIMD_INLINE simd<uint32_t, BASE_SZ>
195-
bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
195+
bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
196196
simd<uint32_t, BASE_SZ> B;
197197
#pragma unroll
198198
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -207,7 +207,7 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
207207
}
208208

209209
ESIMD_INLINE simd<uint32_t, BASE_SZ>
210-
bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
210+
bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
211211
simd<uint32_t, BASE_SZ> B;
212212
#pragma unroll
213213
// each thread is handling 256-element chunk. Each iteration
@@ -302,8 +302,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
302302
// similar to bitonic_exchange{1,2,4,8}.
303303

304304
// exchange 8
305-
simd<ushort, 32> flip13(init_mask13);
306-
simd<ushort, 32> flip14(init_mask14);
305+
simd_mask<32> flip13(init_mask13);
306+
simd_mask<32> flip14(init_mask14);
307307
simd<uint32_t, BASE_SZ> B;
308308
for (int i = 0; i < BASE_SZ; i += 32) {
309309
B.select<8, 1>(i) = A.select<8, 1>(i + 8);
@@ -322,8 +322,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
322322
}
323323

324324
// exchange 4
325-
simd<ushort, 32> flip15(init_mask15);
326-
simd<ushort, 32> flip16(init_mask16);
325+
simd_mask<32> flip15(init_mask15);
326+
simd_mask<32> flip16(init_mask16);
327327
#pragma unroll
328328
for (int i = 0; i < BASE_SZ; i += 32) {
329329
auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
@@ -342,8 +342,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
342342
}
343343

344344
// exchange 2
345-
simd<ushort, 32> flip17(init_mask17);
346-
simd<ushort, 32> flip18(init_mask18);
345+
simd_mask<32> flip17(init_mask17);
346+
simd_mask<32> flip18(init_mask18);
347347
#pragma unroll
348348
for (int i = 0; i < BASE_SZ; i += 32) {
349349
auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
@@ -362,8 +362,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
362362
flip18);
363363
}
364364
// exchange 1
365-
simd<ushort, 32> flip19(init_mask19);
366-
simd<ushort, 32> flip20(init_mask20);
365+
simd_mask<32> flip19(init_mask19);
366+
simd_mask<32> flip20(init_mask20);
367367
#pragma unroll
368368
// Each iteration compares and swaps 2 32-element chunks
369369
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -406,28 +406,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(AccTy1 buf1, AccTy2 buf2, uint32_t idx) {
406406
simd<uint32_t, BASE_SZ> B;
407407
A = cmk_read<uint32_t, BASE_SZ, AccTy1>(buf1, offset);
408408

409-
simd<ushort, 32> flip1(init_mask1);
409+
simd_mask<32> flip1(init_mask1);
410410

411411
simd<unsigned short, 32> mask;
412412
// stage 0
413413
B = bitonic_exchange1(A, flip1);
414414
// stage 1
415-
simd<ushort, 32> flip2(init_mask2);
416-
simd<ushort, 32> flip3(init_mask3);
415+
simd_mask<32> flip2(init_mask2);
416+
simd_mask<32> flip3(init_mask3);
417417
A = bitonic_exchange2(B, flip2);
418418
B = bitonic_exchange1(A, flip3);
419419
// stage 2
420-
simd<ushort, 32> flip4(init_mask4);
421-
simd<ushort, 32> flip5(init_mask5);
422-
simd<ushort, 32> flip6(init_mask6);
420+
simd_mask<32> flip4(init_mask4);
421+
simd_mask<32> flip5(init_mask5);
422+
simd_mask<32> flip6(init_mask6);
423423
A = bitonic_exchange4(B, flip4);
424424
B = bitonic_exchange2(A, flip5);
425425
A = bitonic_exchange1(B, flip6);
426426
// stage 3
427-
simd<ushort, 32> flip7(init_mask7);
428-
simd<ushort, 32> flip8(init_mask8);
429-
simd<ushort, 32> flip9(init_mask9);
430-
simd<ushort, 32> flip10(init_mask10);
427+
simd_mask<32> flip7(init_mask7);
428+
simd_mask<32> flip8(init_mask8);
429+
simd_mask<32> flip9(init_mask9);
430+
simd_mask<32> flip10(init_mask10);
431431
B = bitonic_exchange8(A, flip7);
432432
A = bitonic_exchange4(B, flip8);
433433
B = bitonic_exchange2(A, flip9);

SYCL/ESIMD/BitonicSortKv2.cpp

+22-22
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ ESIMD_INLINE void cmk_write(ty *buf, uint32_t offset, simd<ty, size> v) {
6565
// Function bitonic_exchange{1,2,4,8} compares and swaps elements with
6666
// the particular strides
6767
ESIMD_INLINE simd<uint32_t, BASE_SZ>
68-
bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
68+
bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
6969
simd<uint32_t, BASE_SZ> B;
7070
#pragma unroll
7171
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -80,7 +80,7 @@ bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
8080
}
8181

8282
ESIMD_INLINE simd<uint32_t, BASE_SZ>
83-
bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
83+
bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
8484
simd<uint32_t, BASE_SZ> B;
8585
#pragma unroll
8686
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -109,7 +109,7 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
109109
// each mov copies four 64-bit data, which is 4X SIMD efficiency
110110
// improvement over the straightforward implementation.
111111
ESIMD_INLINE simd<uint32_t, BASE_SZ>
112-
bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
112+
bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
113113
simd<uint32_t, BASE_SZ> B;
114114
#pragma unroll
115115
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -124,7 +124,7 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
124124
}
125125

126126
ESIMD_INLINE simd<uint32_t, BASE_SZ>
127-
bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
127+
bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
128128
simd<uint32_t, BASE_SZ> B;
129129
#pragma unroll
130130
// each thread is handling 256-element chunk. Each iteration
@@ -219,8 +219,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
219219
// similar to bitonic_exchange{1,2,4,8}.
220220

221221
// exchange 8
222-
simd<ushort, 32> flip13 = esimd_unpack_mask<32>(0xff00ff00); //(init_mask13);
223-
simd<ushort, 32> flip14 = esimd_unpack_mask<32>(0x00ff00ff); //(init_mask14);
222+
simd_mask<32> flip13 = esimd_unpack_mask<32>(0xff00ff00); //(init_mask13);
223+
simd_mask<32> flip14 = esimd_unpack_mask<32>(0x00ff00ff); //(init_mask14);
224224
simd<uint32_t, BASE_SZ> B;
225225
for (int i = 0; i < BASE_SZ; i += 32) {
226226
B.select<8, 1>(i) = A.select<8, 1>(i + 8);
@@ -239,8 +239,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
239239
}
240240

241241
// exchange 4
242-
simd<ushort, 32> flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); //(init_mask15);
243-
simd<ushort, 32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); //(init_mask16);
242+
simd_mask<32> flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); //(init_mask15);
243+
simd_mask<32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); //(init_mask16);
244244
#pragma unroll
245245
for (int i = 0; i < BASE_SZ; i += 32) {
246246
auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
@@ -259,8 +259,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
259259
}
260260

261261
// exchange 2
262-
simd<ushort, 32> flip17 = esimd_unpack_mask<32>(0xcccccccc); //(init_mask17);
263-
simd<ushort, 32> flip18 = esimd_unpack_mask<32>(0x33333333); //(init_mask18);
262+
simd_mask<32> flip17 = esimd_unpack_mask<32>(0xcccccccc); //(init_mask17);
263+
simd_mask<32> flip18 = esimd_unpack_mask<32>(0x33333333); //(init_mask18);
264264
#pragma unroll
265265
for (int i = 0; i < BASE_SZ; i += 32) {
266266
auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
@@ -279,8 +279,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
279279
flip18);
280280
}
281281
// exchange 1
282-
simd<ushort, 32> flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); //(init_mask19);
283-
simd<ushort, 32> flip20 = esimd_unpack_mask<32>(0x55555555); //(init_mask20);
282+
simd_mask<32> flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); //(init_mask19);
283+
simd_mask<32> flip20 = esimd_unpack_mask<32>(0x55555555); //(init_mask20);
284284
#pragma unroll
285285
// Each iteration compares and swaps 2 32-element chunks
286286
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -323,28 +323,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(uint32_t *buf1, uint32_t *buf2,
323323
simd<uint32_t, BASE_SZ> B;
324324
A = cmk_read<uint32_t, BASE_SZ>(buf1, offset);
325325

326-
simd<ushort, 32> flip1 = esimd_unpack_mask<32>(0x66666666); //(init_mask1);
326+
simd_mask<32> flip1 = esimd_unpack_mask<32>(0x66666666); //(init_mask1);
327327

328328
simd<unsigned short, 32> mask;
329329
// stage 0
330330
B = bitonic_exchange1(A, flip1);
331331
// stage 1
332-
simd<ushort, 32> flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); //(init_mask2);
333-
simd<ushort, 32> flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); //(init_mask3);
332+
simd_mask<32> flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); //(init_mask2);
333+
simd_mask<32> flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); //(init_mask3);
334334
A = bitonic_exchange2(B, flip2);
335335
B = bitonic_exchange1(A, flip3);
336336
// stage 2
337-
simd<ushort, 32> flip4 = esimd_unpack_mask<32>(0x0ff00ff0); //(init_mask4);
338-
simd<ushort, 32> flip5 = esimd_unpack_mask<32>(0x33cc33cc); //(init_mask5);
339-
simd<ushort, 32> flip6 = esimd_unpack_mask<32>(0x55aa55aa); //(init_mask6);
337+
simd_mask<32> flip4 = esimd_unpack_mask<32>(0x0ff00ff0); //(init_mask4);
338+
simd_mask<32> flip5 = esimd_unpack_mask<32>(0x33cc33cc); //(init_mask5);
339+
simd_mask<32> flip6 = esimd_unpack_mask<32>(0x55aa55aa); //(init_mask6);
340340
A = bitonic_exchange4(B, flip4);
341341
B = bitonic_exchange2(A, flip5);
342342
A = bitonic_exchange1(B, flip6);
343343
// stage 3
344-
simd<ushort, 32> flip7 = esimd_unpack_mask<32>(0x00ffff00); //(init_mask7);
345-
simd<ushort, 32> flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); //(init_mask8);
346-
simd<ushort, 32> flip9 = esimd_unpack_mask<32>(0x3333cccc); //(init_mask9);
347-
simd<ushort, 32> flip10 = esimd_unpack_mask<32>(0x5555aaaa); //(init_mask10);
344+
simd_mask<32> flip7 = esimd_unpack_mask<32>(0x00ffff00); //(init_mask7);
345+
simd_mask<32> flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); //(init_mask8);
346+
simd_mask<32> flip9 = esimd_unpack_mask<32>(0x3333cccc); //(init_mask9);
347+
simd_mask<32> flip10 = esimd_unpack_mask<32>(0x5555aaaa); //(init_mask10);
348348
B = bitonic_exchange8(A, flip7);
349349
A = bitonic_exchange4(B, flip8);
350350
B = bitonic_exchange2(A, flip9);

SYCL/ESIMD/PrefixSum.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
162162
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);
163163

164164
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, ..., 7
165-
simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
165+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
166166
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
167167
sizeof(unsigned);
168168
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);
@@ -182,7 +182,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
182182
simd<unsigned, TUPLE_SZ> prev = 0;
183183
for (unsigned i = 0; i < remaining; i += 32) {
184184

185-
simd<ushort, 32> p = elm32 < remaining;
185+
simd_mask<32> p = elm32 < remaining;
186186

187187
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
188188
p);

SYCL/ESIMD/Prefix_Local_sum1.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ void cmk_sum_tuple_count(unsigned int *buf, unsigned int h_pos) {
101101
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);
102102

103103
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, ..., 7
104-
simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
104+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
105105
voff = (voff + ((h_pos + 1) * PREFIX_ENTRIES * TUPLE_SZ - TUPLE_SZ)) *
106106
sizeof(unsigned);
107107
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);

SYCL/ESIMD/Prefix_Local_sum2.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
9595
simd<unsigned, 8> result = 0;
9696
result.select<TUPLE_SZ, 1>(0) = sum;
9797
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, ..., 7
98-
simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
98+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
9999
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
100100
sizeof(unsigned);
101101
scatter<unsigned, 8>(buf, result, voff, p);

SYCL/ESIMD/Prefix_Local_sum3.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
124124
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);
125125

126126
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, ..., 7
127-
simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
127+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
128128
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
129129
sizeof(unsigned);
130130
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);
@@ -175,7 +175,7 @@ void cmk_acum_iterative_low(unsigned *buf, unsigned h_pos,
175175
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);
176176

177177
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, ..., 7
178-
simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
178+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
179179
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
180180
sizeof(unsigned);
181181
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);
@@ -195,7 +195,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
195195
simd<unsigned, TUPLE_SZ> prev = 0;
196196
for (unsigned i = 0; i < remaining; i += 32) {
197197

198-
simd<ushort, 32> p = elm32 < remaining;
198+
simd_mask<32> p = elm32 < remaining;
199199

200200
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
201201
p);

SYCL/ESIMD/Stencil.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ int main(int argc, char *argv[]) {
164164
in.row(i + 10).select<WIDTH, 1>(5) * 0.02f;
165165

166166
// predicate output
167-
simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;
167+
simd_mask<WIDTH> p = (elm16 + h_pos * WIDTH) < (DIM_SIZE - 10);
168168

169169
simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
170170
scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);

0 commit comments

Comments
 (0)