Skip to content

Commit 42c0092

Browse files
authored
[SYCL] Add tests for atomic operations on malloc_shared-allocated memory (#12480)
This duplicates existing SYCL atomic tests that use buffers with identical tests that use USM and memory allocated using `sycl::malloc_shared`. The motivation behind exercising atomics on `malloc_shared`-allocated memory is to catch issues such as ROCm/ROCm#848 where native atomic instructions may fail when crossing the PCIe bus.
1 parent 68fd440 commit 42c0092

File tree

12 files changed

+839
-0
lines changed

12 files changed

+839
-0
lines changed

sycl/test-e2e/AtomicRef/add.h

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,47 @@ void add_fetch_test(queue q, size_t N) {
104104
assert(std::unique(output.begin(), output.end()) == output.end());
105105
}
106106

107+
template <template <typename, memory_order, memory_scope, access::address_space>
108+
class AtomicRef,
109+
access::address_space space, typename T, typename Difference = T,
110+
memory_order order = memory_order::relaxed,
111+
memory_scope scope = memory_scope::device>
112+
void add_fetch_test_usm_shared(queue q, size_t N) {
113+
T *sum = malloc_shared<T>(1, q);
114+
T *output = malloc_shared<T>(N, q);
115+
T *output_begin = &output[0], *output_end = &output[N];
116+
sum[0] = T(0);
117+
std::fill(output_begin, output_end, T(0));
118+
{
119+
q.submit([&](handler &cgh) {
120+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
121+
int gid = it.get_id(0);
122+
auto atm = AtomicRef < T,
123+
(order == memory_order::acquire || order == memory_order::release)
124+
? memory_order::relaxed
125+
: order,
126+
scope, space > (sum[0]);
127+
output[gid] = atm.fetch_add(Difference(1), order);
128+
});
129+
}).wait_and_throw();
130+
}
131+
132+
// All work-items increment by 1, so final value should be equal to N.
133+
assert(sum[0] == T(N));
134+
135+
// Fetch returns original value: will be in [0, N-1].
136+
auto min_e = std::min_element(output_begin, output_end);
137+
auto max_e = std::max_element(output_begin, output_end);
138+
assert(*min_e == T(0) && *max_e == T(N - 1));
139+
140+
// Intermediate values should be unique.
141+
std::sort(output_begin, output_end);
142+
assert(std::unique(output_begin, output_end) == output_end);
143+
144+
free(sum, q);
145+
free(output, q);
146+
}
147+
107148
template <template <typename, memory_order, memory_scope, access::address_space>
108149
class AtomicRef,
109150
access::address_space space, typename T, typename Difference = T,
@@ -146,6 +187,47 @@ void add_plus_equal_test(queue q, size_t N) {
146187
assert(std::unique(output.begin(), output.end()) == output.end());
147188
}
148189

190+
template <template <typename, memory_order, memory_scope, access::address_space>
191+
class AtomicRef,
192+
access::address_space space, typename T, typename Difference = T,
193+
memory_order order = memory_order::relaxed,
194+
memory_scope scope = memory_scope::device>
195+
void add_plus_equal_test_usm_shared(queue q, size_t N) {
196+
T *sum = malloc_shared<T>(1, q);
197+
T *output = malloc_shared<T>(N, q);
198+
T *output_begin = &output[0], *output_end = &output[N];
199+
sum[0] = T(0);
200+
std::fill(output_begin, output_end, T(0));
201+
{
202+
q.submit([&](handler &cgh) {
203+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
204+
int gid = it.get_id(0);
205+
auto atm = AtomicRef < T,
206+
(order == memory_order::acquire || order == memory_order::release)
207+
? memory_order::relaxed
208+
: order,
209+
scope, space > (sum[0]);
210+
output[gid] = atm += Difference(1);
211+
});
212+
}).wait_and_throw();
213+
}
214+
215+
// All work-items increment by 1, so final value should be equal to N.
216+
assert(sum[0] == T(N));
217+
218+
// += returns updated value: will be in [1, N].
219+
auto min_e = std::min_element(output_begin, output_end);
220+
auto max_e = std::max_element(output_begin, output_end);
221+
assert(*min_e == T(1) && *max_e == T(N));
222+
223+
// Intermediate values should be unique.
224+
std::sort(output_begin, output_end);
225+
assert(std::unique(output_begin, output_end) == output_end);
226+
227+
free(sum, q);
228+
free(output, q);
229+
}
230+
149231
template <template <typename, memory_order, memory_scope, access::address_space>
150232
class AtomicRef,
151233
access::address_space space, typename T, typename Difference = T,
@@ -188,6 +270,46 @@ void add_pre_inc_test(queue q, size_t N) {
188270
assert(std::unique(output.begin(), output.end()) == output.end());
189271
}
190272

273+
template <template <typename, memory_order, memory_scope, access::address_space>
274+
class AtomicRef,
275+
access::address_space space, typename T, typename Difference = T,
276+
memory_order order = memory_order::relaxed,
277+
memory_scope scope = memory_scope::device>
278+
void add_pre_inc_test_usm_shared(queue q, size_t N) {
279+
T *sum = malloc_shared<T>(1, q);
280+
T *output = malloc_shared<T>(N, q);
281+
T *output_begin = &output[0], *output_end = &output[N];
282+
sum[0] = T(0);
283+
{
284+
q.submit([&](handler &cgh) {
285+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
286+
int gid = it.get_id(0);
287+
auto atm = AtomicRef < T,
288+
(order == memory_order::acquire || order == memory_order::release)
289+
? memory_order::relaxed
290+
: order,
291+
scope, space > (sum[0]);
292+
output[gid] = ++atm;
293+
});
294+
}).wait_and_throw();
295+
}
296+
297+
// All work-items increment by 1, so final value should be equal to N.
298+
assert(sum[0] == T(N));
299+
300+
// Pre-increment returns updated value: will be in [1, N].
301+
auto min_e = std::min_element(output_begin, output_end);
302+
auto max_e = std::max_element(output_begin, output_end);
303+
assert(*min_e == T(1) && *max_e == T(N));
304+
305+
// Intermediate values should be unique.
306+
std::sort(output_begin, output_end);
307+
assert(std::unique(output_begin, output_end) == output_end);
308+
309+
free(sum, q);
310+
free(output, q);
311+
}
312+
191313
template <template <typename, memory_order, memory_scope, access::address_space>
192314
class AtomicRef,
193315
access::address_space space, typename T, typename Difference = T,
@@ -230,6 +352,46 @@ void add_post_inc_test(queue q, size_t N) {
230352
assert(std::unique(output.begin(), output.end()) == output.end());
231353
}
232354

355+
template <template <typename, memory_order, memory_scope, access::address_space>
356+
class AtomicRef,
357+
access::address_space space, typename T, typename Difference = T,
358+
memory_order order = memory_order::relaxed,
359+
memory_scope scope = memory_scope::device>
360+
void add_post_inc_test_usm_shared(queue q, size_t N) {
361+
T *sum = malloc_shared<T>(1, q);
362+
T *output = malloc_shared<T>(N, q);
363+
T *output_begin = &output[0], *output_end = &output[N];
364+
sum[0] = T(0);
365+
{
366+
q.submit([&](handler &cgh) {
367+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
368+
int gid = it.get_id(0);
369+
auto atm = AtomicRef < T,
370+
(order == memory_order::acquire || order == memory_order::release)
371+
? memory_order::relaxed
372+
: order,
373+
scope, space > (sum[0]);
374+
output[gid] = atm++;
375+
});
376+
}).wait_and_throw();
377+
}
378+
379+
// All work-items increment by 1, so final value should be equal to N.
380+
assert(sum[0] == T(N));
381+
382+
// Post-increment returns original value: will be in [0, N-1].
383+
auto min_e = std::min_element(output_begin, output_end);
384+
auto max_e = std::max_element(output_begin, output_end);
385+
assert(*min_e == T(0) && *max_e == T(N - 1));
386+
387+
// Intermediate values should be unique.
388+
std::sort(output_begin, output_end);
389+
assert(std::unique(output_begin, output_end) == output_end);
390+
391+
free(sum, q);
392+
free(output, q);
393+
}
394+
233395
template <access::address_space space, typename T, typename Difference = T,
234396
memory_order order = memory_order::relaxed,
235397
memory_scope scope = memory_scope::device>
@@ -241,6 +403,7 @@ void add_test(queue q, size_t N) {
241403
space == access::address_space::global_space ||
242404
(space == access::address_space::generic_space && !TEST_GENERIC_IN_LOCAL);
243405
constexpr bool do_ext_tests = space != access::address_space::generic_space;
406+
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
244407
if constexpr (do_local_tests) {
245408
#ifdef RUN_DEPRECATED
246409
if constexpr (do_ext_tests) {
@@ -259,23 +422,47 @@ void add_test(queue q, size_t N) {
259422
order, scope>(q, N);
260423
add_plus_equal_test<::sycl::ext::oneapi::atomic_ref, space, T, Difference,
261424
order, scope>(q, N);
425+
if (do_usm_tests) {
426+
add_fetch_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space, T,
427+
Difference, order, scope>(q, N);
428+
add_plus_equal_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space,
429+
T, Difference, order, scope>(q, N);
430+
}
262431
if constexpr (!std::is_floating_point_v<T>) {
263432
add_pre_inc_test<::sycl::ext::oneapi::atomic_ref, space, T, Difference,
264433
order, scope>(q, N);
265434
add_post_inc_test<::sycl::ext::oneapi::atomic_ref, space, T, Difference,
266435
order, scope>(q, N);
436+
if (do_usm_tests) {
437+
add_pre_inc_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space, T,
438+
Difference, order, scope>(q, N);
439+
add_post_inc_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space,
440+
T, Difference, order, scope>(q, N);
441+
}
267442
}
268443
}
269444
#else
270445
add_fetch_test<::sycl::atomic_ref, space, T, Difference, order, scope>(q,
271446
N);
272447
add_plus_equal_test<::sycl::atomic_ref, space, T, Difference, order, scope>(
273448
q, N);
449+
if (do_usm_tests) {
450+
add_fetch_test_usm_shared<::sycl::atomic_ref, space, T, Difference, order,
451+
scope>(q, N);
452+
add_plus_equal_test_usm_shared<::sycl::atomic_ref, space, T, Difference,
453+
order, scope>(q, N);
454+
}
274455
if constexpr (!std::is_floating_point_v<T>) {
275456
add_pre_inc_test<::sycl::atomic_ref, space, T, Difference, order, scope>(
276457
q, N);
277458
add_post_inc_test<::sycl::atomic_ref, space, T, Difference, order, scope>(
278459
q, N);
460+
if (do_usm_tests) {
461+
add_pre_inc_test_usm_shared<::sycl::atomic_ref, space, T, Difference,
462+
order, scope>(q, N);
463+
add_post_inc_test_usm_shared<::sycl::atomic_ref, space, T, Difference,
464+
order, scope>(q, N);
465+
}
279466
}
280467
#endif
281468
}

sycl/test-e2e/AtomicRef/and.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,44 @@ void and_global_test(queue q) {
9797
assert(std::unique(output.begin(), output.end()) == output.end());
9898
}
9999

100+
template <template <typename, memory_order, memory_scope, access::address_space>
101+
class AtomicRef,
102+
access::address_space space, typename T,
103+
memory_order order = memory_order::relaxed,
104+
memory_scope scope = memory_scope::device>
105+
void and_global_test_usm_shared(queue q) {
106+
const size_t N = 32;
107+
const T initial = T((1ll << N) - 1);
108+
T *cum = malloc_shared<T>(1, q);
109+
cum[0] = initial;
110+
T *output = malloc_shared<T>(N, q);
111+
T *output_begin = &output[0], *output_end = &output[N];
112+
std::fill(output_begin, output_end, T(0));
113+
{
114+
q.submit([&](handler &cgh) {
115+
cgh.parallel_for(range<1>(N), [=](item<1> it) {
116+
size_t gid = it.get_id(0);
117+
auto atm = AtomicRef < T,
118+
(order == memory_order::acquire || order == memory_order::release)
119+
? memory_order::relaxed
120+
: order,
121+
scope, space > (cum[0]);
122+
output[gid] = atm.fetch_and(~T(1ll << gid), order);
123+
});
124+
}).wait_and_throw();
125+
}
126+
127+
// Final value should be equal to 0.
128+
assert(cum[0] == 0);
129+
130+
// All other values should be unique; each work-item sets one bit to 0.
131+
std::sort(output_begin, output_end);
132+
assert(std::unique(output_begin, output_end) == output_end);
133+
134+
free(cum, q);
135+
free(output, q);
136+
}
137+
100138
template <access::address_space space, typename T,
101139
memory_order order = memory_order::relaxed,
102140
memory_scope scope = memory_scope::device>
@@ -108,6 +146,7 @@ void and_test(queue q) {
108146
space == access::address_space::global_space ||
109147
(space == access::address_space::generic_space && !TEST_GENERIC_IN_LOCAL);
110148
constexpr bool do_ext_tests = space != access::address_space::generic_space;
149+
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
111150
if constexpr (do_local_tests) {
112151
#ifdef RUN_DEPRECATED
113152
if constexpr (do_ext_tests) {
@@ -123,9 +162,16 @@ void and_test(queue q) {
123162
if constexpr (do_ext_tests) {
124163
and_global_test<::sycl::ext::oneapi::atomic_ref, space, T, order, scope>(
125164
q);
165+
if (do_usm_tests) {
166+
and_global_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space, T,
167+
order, scope>(q);
168+
}
126169
}
127170
#else
128171
and_global_test<::sycl::atomic_ref, space, T, order, scope>(q);
172+
if (do_usm_tests) {
173+
and_global_test_usm_shared<::sycl::atomic_ref, space, T, order, scope>(q);
174+
}
129175
#endif
130176
}
131177
}

sycl/test-e2e/AtomicRef/assignment.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,63 @@ void assignment_test(queue q, size_t N) {
4040
assert(assignment >= T(0) && assignment <= T(N - 1));
4141
}
4242

43+
template <template <typename, memory_order, memory_scope, access::address_space>
44+
class AtomicRef,
45+
access::address_space address_space, typename T>
46+
class assignment_usm_kernel;
47+
48+
template <template <typename, memory_order, memory_scope, access::address_space>
49+
class AtomicRef,
50+
access::address_space address_space, typename T>
51+
void assignment_test_usm_shared(queue q, size_t N) {
52+
T initial = T(N);
53+
T *st = malloc_shared<T>(1, q);
54+
st[0] = initial;
55+
{
56+
q.submit([&](handler &cgh) {
57+
cgh.parallel_for<assignment_usm_kernel<AtomicRef, address_space, T>>(
58+
range<1>(N), [=](item<1> it) {
59+
size_t gid = it.get_id(0);
60+
auto atm = AtomicRef<T, memory_order::relaxed,
61+
memory_scope::device, address_space>(st[0]);
62+
atm = T(gid);
63+
});
64+
}).wait_and_throw();
65+
}
66+
67+
// The initial value should have been overwritten by a work-item ID.
68+
// Atomicity isn't tested here, but support for assignment() is.
69+
assert(st[0] != initial);
70+
assert(st[0] >= T(0) && st[0] <= T(N - 1));
71+
72+
free(st, q);
73+
}
74+
4375
template <typename T> void assignment_test(queue q, size_t N) {
76+
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
4477
#ifdef RUN_DEPRECATED
4578
assignment_test<::sycl::ext::oneapi::atomic_ref,
4679
access::address_space::global_space, T>(q, N);
80+
if (do_usm_tests) {
81+
assignment_test_usm_shared<::sycl::ext::oneapi::atomic_ref,
82+
access::address_space::global_space, T>(q, N);
83+
}
4784
#else
4885
assignment_test<::sycl::atomic_ref, access::address_space::global_space, T>(
4986
q, N);
87+
if (do_usm_tests) {
88+
assignment_test_usm_shared<::sycl::atomic_ref,
89+
access::address_space::global_space, T>(q, N);
90+
}
5091
#endif
5192
}
5293

5394
template <typename T> void assignment_generic_test(queue q, size_t N) {
95+
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
5496
assignment_test<::sycl::atomic_ref, access::address_space::generic_space, T>(
5597
q, N);
98+
if (do_usm_tests) {
99+
assignment_test_usm_shared<::sycl::atomic_ref,
100+
access::address_space::generic_space, T>(q, N);
101+
}
56102
}

0 commit comments

Comments
 (0)