This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[ESIMD] Use new simd_mask<N> class instead of simd<unsigned short,N> #388

Merged 3 commits on Sep 21, 2021
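At a glance, the whole change is a mechanical type substitution: masks previously spelled simd<unsigned short, N> (a.k.a. simd<ushort, N>) become the dedicated simd_mask<N> class, and lane comparisons now produce that type directly. A minimal sketch of the new idiom, assuming the sycl::ext::intel::experimental::esimd namespace these tests build against; make_even_mask and the 16-lane width are illustrative, not from the PR:

#include <sycl/ext/intel/experimental/esimd.hpp>
using namespace sycl::ext::intel::experimental::esimd;

// Old spelling: simd<unsigned short, 16> m = (idx & 1) == 0;
// New spelling: the dedicated mask class, one 0/1 element per lane.
ESIMD_INLINE simd_mask<16> make_even_mask() {
  simd<unsigned, 16> idx(0, 1);     // lane indices 0, 1, ..., 15
  simd_mask<16> m = (idx & 1) == 0; // comparisons now yield simd_mask<N>
  return m;
}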
44 changes: 22 additions & 22 deletions SYCL/ESIMD/BitonicSortK.cpp
@@ -148,7 +148,7 @@ const mask_type_t<32> init_mask20 = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
// Function bitonic_exchange{1,2,4,8} compares and swaps elements with
// the particular strides
ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -163,7 +163,7 @@ bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
}

ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -192,7 +192,7 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
// each mov copies four 64-bit data, which is 4X SIMD efficiency
// improvement over the straightforward implementation.
ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -207,7 +207,7 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
}

ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
// each thread is handling 256-element chunk. Each iteration
@@ -302,8 +302,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
// similar to bitonic_exchange{1,2,4,8}.

// exchange 8
-simd<ushort, 32> flip13(init_mask13);
-simd<ushort, 32> flip14(init_mask14);
+simd_mask<32> flip13(init_mask13);
+simd_mask<32> flip14(init_mask14);
simd<uint32_t, BASE_SZ> B;
for (int i = 0; i < BASE_SZ; i += 32) {
B.select<8, 1>(i) = A.select<8, 1>(i + 8);
@@ -322,8 +322,8 @@ }
}

// exchange 4
-simd<ushort, 32> flip15(init_mask15);
-simd<ushort, 32> flip16(init_mask16);
+simd_mask<32> flip15(init_mask15);
+simd_mask<32> flip16(init_mask16);
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
@@ -342,8 +342,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
}

// exchange 2
-simd<ushort, 32> flip17(init_mask17);
-simd<ushort, 32> flip18(init_mask18);
+simd_mask<32> flip17(init_mask17);
+simd_mask<32> flip18(init_mask18);
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
@@ -362,8 +362,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
flip18);
}
// exchange 1
-simd<ushort, 32> flip19(init_mask19);
-simd<ushort, 32> flip20(init_mask20);
+simd_mask<32> flip19(init_mask19);
+simd_mask<32> flip20(init_mask20);
#pragma unroll
// Each iteration compares and swaps 2 32-element chunks
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -406,28 +406,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(AccTy1 buf1, AccTy2 buf2, uint32_t idx) {
simd<uint32_t, BASE_SZ> B;
A = cmk_read<uint32_t, BASE_SZ, AccTy1>(buf1, offset);

-simd<ushort, 32> flip1(init_mask1);
+simd_mask<32> flip1(init_mask1);

simd<unsigned short, 32> mask;
// stage 0
B = bitonic_exchange1(A, flip1);
// stage 1
-simd<ushort, 32> flip2(init_mask2);
-simd<ushort, 32> flip3(init_mask3);
+simd_mask<32> flip2(init_mask2);
+simd_mask<32> flip3(init_mask3);
A = bitonic_exchange2(B, flip2);
B = bitonic_exchange1(A, flip3);
// stage 2
-simd<ushort, 32> flip4(init_mask4);
-simd<ushort, 32> flip5(init_mask5);
-simd<ushort, 32> flip6(init_mask6);
+simd_mask<32> flip4(init_mask4);
+simd_mask<32> flip5(init_mask5);
+simd_mask<32> flip6(init_mask6);
A = bitonic_exchange4(B, flip4);
B = bitonic_exchange2(A, flip5);
A = bitonic_exchange1(B, flip6);
// stage 3
-simd<ushort, 32> flip7(init_mask7);
-simd<ushort, 32> flip8(init_mask8);
-simd<ushort, 32> flip9(init_mask9);
-simd<ushort, 32> flip10(init_mask10);
+simd_mask<32> flip7(init_mask7);
+simd_mask<32> flip8(init_mask8);
+simd_mask<32> flip9(init_mask9);
+simd_mask<32> flip10(init_mask10);
B = bitonic_exchange8(A, flip7);
A = bitonic_exchange4(B, flip8);
B = bitonic_exchange2(A, flip9);
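The exchange bodies elided above all consume flip the same way; what follows is a hedged sketch of that pattern (not the files' elided code), assuming simd's member merge(Val, Mask) from the ESIMD API these tests target:

// Lanes where flip is non-zero take B's element; the rest keep A's.
ESIMD_INLINE simd<uint32_t, 32> masked_pick(simd<uint32_t, 32> A,
                                            simd<uint32_t, 32> B,
                                            simd_mask<32> flip) {
  simd<uint32_t, 32> R = A;
  R.merge(B, flip); // overwrite only the lanes selected by the mask
  return R;
}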
44 changes: 22 additions & 22 deletions SYCL/ESIMD/BitonicSortKv2.cpp
@@ -65,7 +65,7 @@ ESIMD_INLINE void cmk_write(ty *buf, uint32_t offset, simd<ty, size> v) {
// Function bitonic_exchange{1,2,4,8} compares and swaps elements with
// the particular strides
ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -80,7 +80,7 @@ bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
}

ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -109,7 +109,7 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
// each mov copies four 64-bit data, which is 4X SIMD efficiency
// improvement over the straightforward implementation.
ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -124,7 +124,7 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
}

ESIMD_INLINE simd<uint32_t, BASE_SZ>
-bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
+bitonic_exchange1(simd<uint32_t, BASE_SZ> A, simd_mask<32> flip) {
simd<uint32_t, BASE_SZ> B;
#pragma unroll
// each thread is handling 256-element chunk. Each iteration
@@ -219,8 +219,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
// similar to bitonic_exchange{1,2,4,8}.

// exchange 8
-simd<ushort, 32> flip13 = esimd_unpack_mask<32>(0xff00ff00); //(init_mask13);
-simd<ushort, 32> flip14 = esimd_unpack_mask<32>(0x00ff00ff); //(init_mask14);
+simd_mask<32> flip13 = esimd_unpack_mask<32>(0xff00ff00); //(init_mask13);
+simd_mask<32> flip14 = esimd_unpack_mask<32>(0x00ff00ff); //(init_mask14);
simd<uint32_t, BASE_SZ> B;
for (int i = 0; i < BASE_SZ; i += 32) {
B.select<8, 1>(i) = A.select<8, 1>(i + 8);
@@ -239,8 +239,8 @@ }
}

// exchange 4
-simd<ushort, 32> flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); //(init_mask15);
-simd<ushort, 32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); //(init_mask16);
+simd_mask<32> flip15 = esimd_unpack_mask<32>(0xf0f0f0f0); //(init_mask15);
+simd_mask<32> flip16 = esimd_unpack_mask<32>(0x0f0f0f0f); //(init_mask16);
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
@@ -259,8 +259,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
}

// exchange 2
-simd<ushort, 32> flip17 = esimd_unpack_mask<32>(0xcccccccc); //(init_mask17);
-simd<ushort, 32> flip18 = esimd_unpack_mask<32>(0x33333333); //(init_mask18);
+simd_mask<32> flip17 = esimd_unpack_mask<32>(0xcccccccc); //(init_mask17);
+simd_mask<32> flip18 = esimd_unpack_mask<32>(0x33333333); //(init_mask18);
#pragma unroll
for (int i = 0; i < BASE_SZ; i += 32) {
auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
@@ -279,8 +279,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
flip18);
}
// exchange 1
-simd<ushort, 32> flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); //(init_mask19);
-simd<ushort, 32> flip20 = esimd_unpack_mask<32>(0x55555555); //(init_mask20);
+simd_mask<32> flip19 = esimd_unpack_mask<32>(0xaaaaaaaa); //(init_mask19);
+simd_mask<32> flip20 = esimd_unpack_mask<32>(0x55555555); //(init_mask20);
#pragma unroll
// Each iteration compares and swaps 2 32-element chunks
for (int i = 0; i < BASE_SZ; i += 32) {
@@ -323,28 +323,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(uint32_t *buf1, uint32_t *buf2,
simd<uint32_t, BASE_SZ> B;
A = cmk_read<uint32_t, BASE_SZ>(buf1, offset);

-simd<ushort, 32> flip1 = esimd_unpack_mask<32>(0x66666666); //(init_mask1);
+simd_mask<32> flip1 = esimd_unpack_mask<32>(0x66666666); //(init_mask1);

simd<unsigned short, 32> mask;
// stage 0
B = bitonic_exchange1(A, flip1);
// stage 1
-simd<ushort, 32> flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); //(init_mask2);
-simd<ushort, 32> flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); //(init_mask3);
+simd_mask<32> flip2 = esimd_unpack_mask<32>(0x3c3c3c3c); //(init_mask2);
+simd_mask<32> flip3 = esimd_unpack_mask<32>(0x5a5a5a5a); //(init_mask3);
A = bitonic_exchange2(B, flip2);
B = bitonic_exchange1(A, flip3);
// stage 2
-simd<ushort, 32> flip4 = esimd_unpack_mask<32>(0x0ff00ff0); //(init_mask4);
-simd<ushort, 32> flip5 = esimd_unpack_mask<32>(0x33cc33cc); //(init_mask5);
-simd<ushort, 32> flip6 = esimd_unpack_mask<32>(0x55aa55aa); //(init_mask6);
+simd_mask<32> flip4 = esimd_unpack_mask<32>(0x0ff00ff0); //(init_mask4);
+simd_mask<32> flip5 = esimd_unpack_mask<32>(0x33cc33cc); //(init_mask5);
+simd_mask<32> flip6 = esimd_unpack_mask<32>(0x55aa55aa); //(init_mask6);
A = bitonic_exchange4(B, flip4);
B = bitonic_exchange2(A, flip5);
A = bitonic_exchange1(B, flip6);
// stage 3
-simd<ushort, 32> flip7 = esimd_unpack_mask<32>(0x00ffff00); //(init_mask7);
-simd<ushort, 32> flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); //(init_mask8);
-simd<ushort, 32> flip9 = esimd_unpack_mask<32>(0x3333cccc); //(init_mask9);
-simd<ushort, 32> flip10 = esimd_unpack_mask<32>(0x5555aaaa); //(init_mask10);
+simd_mask<32> flip7 = esimd_unpack_mask<32>(0x00ffff00); //(init_mask7);
+simd_mask<32> flip8 = esimd_unpack_mask<32>(0x0f0ff0f0); //(init_mask8);
+simd_mask<32> flip9 = esimd_unpack_mask<32>(0x3333cccc); //(init_mask9);
+simd_mask<32> flip10 = esimd_unpack_mask<32>(0x5555aaaa); //(init_mask10);
B = bitonic_exchange8(A, flip7);
A = bitonic_exchange4(B, flip8);
B = bitonic_exchange2(A, flip9);
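Unlike BitonicSortK.cpp, this variant builds its flip masks from packed bit patterns instead of init tables: esimd_unpack_mask<N>(v) expands the low N bits of v into an N-lane 0/1 vector, bit i feeding lane i, and after this PR the result is held as a simd_mask<N>. For instance, 0x55555555 reproduces the {1, 0, 1, 0, ...} init_mask pattern quoted at the top of BitonicSortK.cpp:

simd_mask<32> even = esimd_unpack_mask<32>(0x55555555); // lanes 0, 2, 4, ... set
simd_mask<32> odd = esimd_unpack_mask<32>(0xaaaaaaaa);  // lanes 1, 3, 5, ... set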
4 changes: 2 additions & 2 deletions SYCL/ESIMD/PrefixSum.cpp
@@ -162,7 +162,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);

simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
-simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
+simd_mask<8> p = voff < TUPLE_SZ;    // predicate
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
sizeof(unsigned);
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);
@@ -182,7 +182,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
simd<unsigned, TUPLE_SZ> prev = 0;
for (unsigned i = 0; i < remaining; i += 32) {

-simd<ushort, 32> p = elm32 < remaining;
+simd_mask<32> p = elm32 < remaining;

S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
p);
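The PrefixSum hunks show the most common predicate idiom in these tests: compare lane indices against a bound, then hand the resulting mask to a masked memory access. A self-contained sketch under the same namespace assumption; store_tuple and the TUPLE_SZ value are illustrative, not from the PR:

#include <sycl/ext/intel/experimental/esimd.hpp>
using namespace sycl::ext::intel::experimental::esimd;

constexpr unsigned TUPLE_SZ = 4;

ESIMD_INLINE void store_tuple(unsigned *buf, simd<unsigned, 8> vals) {
  simd<unsigned, 8> voff(0, 1);     // element offsets 0, 1, ..., 7
  simd_mask<8> p = voff < TUPLE_SZ; // predicate: only the first TUPLE_SZ lanes
  voff *= sizeof(unsigned);         // element offsets -> byte offsets
  scatter<unsigned, 8>(buf, vals, voff, p); // lanes with p == 0 are not written
}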
2 changes: 1 addition & 1 deletion SYCL/ESIMD/Prefix_Local_sum1.cpp
@@ -101,7 +101,7 @@ void cmk_sum_tuple_count(unsigned int *buf, unsigned int h_pos) {
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);

simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
-simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
+simd_mask<8> p = voff < TUPLE_SZ;    // predicate
voff = (voff + ((h_pos + 1) * PREFIX_ENTRIES * TUPLE_SZ - TUPLE_SZ)) *
sizeof(unsigned);
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);
2 changes: 1 addition & 1 deletion SYCL/ESIMD/Prefix_Local_sum2.cpp
@@ -95,7 +95,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
simd<unsigned, 8> result = 0;
result.select<TUPLE_SZ, 1>(0) = sum;
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
-simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
+simd_mask<8> p = voff < TUPLE_SZ;    // predicate
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
sizeof(unsigned);
scatter<unsigned, 8>(buf, result, voff, p);
6 changes: 3 additions & 3 deletions SYCL/ESIMD/Prefix_Local_sum3.cpp
@@ -124,7 +124,7 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);

simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
-simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
+simd_mask<8> p = voff < TUPLE_SZ;    // predicate
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
sizeof(unsigned);
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);
@@ -175,7 +175,7 @@ void cmk_acum_iterative_low(unsigned *buf, unsigned h_pos,
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);

simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
-simd<ushort, 8> p = voff < TUPLE_SZ; // predicate
+simd_mask<8> p = voff < TUPLE_SZ;    // predicate
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
sizeof(unsigned);
scatter<unsigned, 8>(buf, S.select<8, 1>(0), voff, p);
@@ -195,7 +195,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
simd<unsigned, TUPLE_SZ> prev = 0;
for (unsigned i = 0; i < remaining; i += 32) {

-simd<ushort, 32> p = elm32 < remaining;
+simd_mask<32> p = elm32 < remaining;

S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
p);
2 changes: 1 addition & 1 deletion SYCL/ESIMD/Stencil.cpp
@@ -164,7 +164,7 @@ int main(int argc, char *argv[]) {
in.row(i + 10).select<WIDTH, 1>(5) * 0.02f;

// predicate output
-simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;
+simd_mask<WIDTH> p = (elm16 + h_pos * WIDTH) < (DIM_SIZE - 10);

simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);