Skip to content

Commit c18cf38

Browse files
authored
[SYCL][COMPAT] Support for experimental masked shifts (#12972)
This PR adds an experimental approach to support `select_from_sub_group`, `shift_sub_group_left`, `shift_sub_group_right`, and `permute_sub_group_by_xor` functions on SPIRV devices. As these are based on still to be supported intrinsics, tests have been left out.
1 parent 5f4629b commit c18cf38

File tree

2 files changed

+232
-0
lines changed

2 files changed

+232
-0
lines changed

sycl/doc/syclcompat/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,6 +1088,10 @@ functionality to `sycl::select_from_group`, `sycl::shift_group_left`,
10881088
However, they provide an optional argument to represent the `logical_group` size
10891089
(default 32).
10901090

1091+
Experimental support for masked versions of `select_from_sub_group`,
1092+
`shift_sub_group_left`, `shift_sub_group_right` and `permute_sub_group_by_xor` is
1093+
provided only for SPIRV devices.
1094+
10911095
```c++
10921096
namespace syclcompat {
10931097

@@ -1116,6 +1120,25 @@ template <typename ValueT>
11161120
ValueT permute_sub_group_by_xor(sycl::sub_group g, ValueT x, unsigned int mask,
11171121
int logical_sub_group_size = 32);
11181122

1123+
namespace experimental {
1124+
1125+
template <typename ValueT>
1126+
ValueT select_from_sub_group(unsigned int member_mask, sycl::sub_group g, ValueT x,
1127+
int remote_local_id, int logical_sub_group_size = 32);
1128+
1129+
template <typename ValueT>
1130+
ValueT shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, ValueT x,
1131+
unsigned int delta, int logical_sub_group_size = 32);
1132+
1133+
template <typename ValueT>
1134+
ValueT shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, ValueT x,
1135+
unsigned int delta, int logical_sub_group_size = 32);
1136+
1137+
template <typename ValueT>
1138+
ValueT permute_sub_group_by_xor(unsigned int member_mask, sycql::sub_group g, ValueT x,
1139+
unsigned int mask, int logical_sub_group_size = 32);
1140+
1141+
} // namespace experimental
11191142
} // namespace syclcompat
11201143
```
11211144

sycl/include/syclcompat/util.hpp

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,25 @@
3939
#include <syclcompat/math.hpp>
4040
#include <syclcompat/memory.hpp>
4141

42+
// TODO: Remove these function definitions once they exist in the DPC++ compiler
43+
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
44+
template <typename T>
45+
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
46+
__attribute__((noduplicate)) T
47+
__spirv_GroupNonUniformShuffle(__spv::Scope::Flag, T, unsigned) noexcept;
48+
49+
template <typename T>
50+
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
51+
__attribute__((noduplicate)) T
52+
__spirv_GroupNonUniformShuffleDown(__spv::Scope::Flag, T,
53+
unsigned) noexcept;
54+
55+
template <typename T>
56+
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
57+
__attribute__((noduplicate)) T
58+
__spirv_GroupNonUniformShuffleUp(__spv::Scope::Flag, T, unsigned) noexcept;
59+
#endif
60+
4261
namespace syclcompat {
4362

4463
namespace detail {
@@ -269,6 +288,196 @@ T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
269288
: id);
270289
}
271290

291+
namespace experimental {
292+
/// Masked version of select_from_sub_group, which execute masked sub-group
293+
/// operation. The parameter member_mask indicating the work-items participating
294+
/// the call. Whether the n-th bit is set to 1 representing whether the
295+
/// work-item with id n is participating the call. All work-items named in
296+
/// member_mask must be executed with the same member_mask, or the result is
297+
/// undefined.
298+
/// \tparam T Input value type
299+
/// \param [in] member_mask Input mask
300+
/// \param [in] g Input sub_group
301+
/// \param [in] x Input value
302+
/// \param [in] remote_local_id Input source work item id
303+
/// \param [in] logical_sub_group_size Input logical sub_group size
304+
/// \returns The result
305+
template <typename T>
306+
T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x,
307+
int remote_local_id, int logical_sub_group_size = 32) {
308+
unsigned int start_index =
309+
g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
310+
unsigned logical_remote_id =
311+
start_index + remote_local_id % logical_sub_group_size;
312+
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
313+
#if defined(__SPIR__)
314+
return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
315+
logical_remote_id);
316+
#else
317+
// TODO: Check
318+
// https://github.com/intel/llvm/pull/12972#issuecomment-2034218881 for the
319+
// __NVPTX__ implementation
320+
throw sycl::exception(sycl::errc::runtime,
321+
"[SYCLcompat] Masked version of select_from_sub_group "
322+
"only supports SPIR-V backends.");
323+
#endif // __SPIR__
324+
#else
325+
(void)g;
326+
(void)x;
327+
(void)remote_local_id;
328+
(void)logical_sub_group_size;
329+
(void)member_mask;
330+
throw sycl::exception(
331+
sycl::errc::runtime,
332+
"[SYCLcompat] Masked version of select_from_sub_group not "
333+
"supported on host device and none intel compiler.");
334+
#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
335+
}
336+
337+
/// Masked version of shift_sub_group_left, which execute masked sub-group
338+
/// operation. The parameter member_mask indicating the work-items participating
339+
/// the call. Whether the n-th bit is set to 1 representing whether the
340+
/// work-item with id n is participating the call. All work-items named in
341+
/// member_mask must be executed with the same member_mask, or the result is
342+
/// undefined.
343+
/// \tparam T Input value type
344+
/// \param [in] member_mask Input mask
345+
/// \param [in] g Input sub_group
346+
/// \param [in] x Input value
347+
/// \param [in] delta Input delta
348+
/// \param [in] logical_sub_group_size Input logical sub_group size
349+
/// \returns The result
350+
template <typename T>
351+
T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x,
352+
unsigned int delta, int logical_sub_group_size = 32) {
353+
unsigned int id = g.get_local_linear_id();
354+
unsigned int end_index =
355+
(id / logical_sub_group_size + 1) * logical_sub_group_size;
356+
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
357+
#if defined(__SPIR__)
358+
T result =
359+
__spirv_GroupNonUniformShuffleDown(__spv::Scope::Subgroup, x, delta);
360+
if ((id + delta) >= end_index) {
361+
result = x;
362+
}
363+
return result;
364+
#else
365+
// TODO: Check
366+
// https://github.com/intel/llvm/pull/12972#issuecomment-2034218881 for the
367+
// __NVPTX__ implementation
368+
throw sycl::exception(sycl::errc::runtime,
369+
"[SYCLcompat] Masked version of shift_sub_group_left "
370+
"only supports SPIR-V backends.");
371+
#endif // __SPIR__
372+
#else
373+
(void)g;
374+
(void)x;
375+
(void)delta;
376+
(void)logical_sub_group_size;
377+
(void)member_mask;
378+
throw sycl::exception(
379+
sycl::errc::runtime,
380+
"[SYCLcompat] Masked version of select_from_sub_group not "
381+
"supported on host device and none intel compiler.");
382+
#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
383+
}
384+
385+
/// Masked version of shift_sub_group_right, which execute masked sub-group
386+
/// operation. The parameter member_mask indicating the work-items participating
387+
/// the call. Whether the n-th bit is set to 1 representing whether the
388+
/// work-item with id n is participating the call. All work-items named in
389+
/// member_mask must be executed with the same member_mask, or the result is
390+
/// undefined.
391+
/// \tparam T Input value type
392+
/// \param [in] member_mask Input mask
393+
/// \param [in] g Input sub_group
394+
/// \param [in] x Input value
395+
/// \param [in] delta Input delta
396+
/// \param [in] logical_sub_group_size Input logical sub_group size
397+
/// \returns The result
398+
template <typename T>
399+
T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x,
400+
unsigned int delta, int logical_sub_group_size = 32) {
401+
unsigned int id = g.get_local_linear_id();
402+
unsigned int start_index =
403+
id / logical_sub_group_size * logical_sub_group_size;
404+
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
405+
#if defined(__SPIR__)
406+
T result = __spirv_GroupNonUniformShuffleUp(__spv::Scope::Subgroup, x, delta);
407+
if ((id - start_index) < delta) {
408+
result = x;
409+
}
410+
return result;
411+
#else
412+
// TODO: Check
413+
// https://github.com/intel/llvm/pull/12972#issuecomment-2034218881 for the
414+
// __NVPTX__ implementation
415+
throw sycl::exception(sycl::errc::runtime,
416+
"Masked version of shift_sub_group_right "
417+
"only supports SPIR-V backends.");
418+
#endif // __SPIR__
419+
#else
420+
(void)g;
421+
(void)x;
422+
(void)delta;
423+
(void)logical_sub_group_size;
424+
(void)member_mask;
425+
throw sycl::exception(sycl::errc::runtime,
426+
"Masked version of select_from_sub_group not "
427+
"supported on host device and none intel compiler.");
428+
#endif // __SYCL_DEVICE_ONLY && __INTEL_LLVM_COMPILER
429+
}
430+
431+
/// Masked version of permute_sub_group_by_xor, which execute masked sub-group
432+
/// operation. The parameter member_mask indicating the work-items participating
433+
/// the call. Whether the n-th bit is set to 1 representing whether the
434+
/// work-item with id n is participating the call. All work-items named in
435+
/// member_mask must be executed with the same member_mask, or the result is
436+
/// undefined.
437+
/// \tparam T Input value type
438+
/// \param [in] member_mask Input mask
439+
/// \param [in] g Input sub_group
440+
/// \param [in] x Input value
441+
/// \param [in] mask Input mask
442+
/// \param [in] logical_sub_group_size Input logical sub_group size
443+
/// \returns The result
444+
template <typename T>
445+
T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x,
446+
unsigned int mask, int logical_sub_group_size = 32) {
447+
unsigned int id = g.get_local_linear_id();
448+
unsigned int start_index =
449+
id / logical_sub_group_size * logical_sub_group_size;
450+
unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
451+
unsigned logical_remote_id = (target_offset < logical_sub_group_size)
452+
? start_index + target_offset
453+
: id;
454+
#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
455+
#if defined(__SPIR__)
456+
return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
457+
logical_remote_id);
458+
#else
459+
// TODO: Check
460+
// https://github.com/intel/llvm/pull/12972#issuecomment-2034218881 for the
461+
// __NVPTX__ implementation
462+
throw sycl::exception(
463+
sycl::errc::runtime,
464+
"[SYCLcompat] Masked version of permute_sub_group_by_xor "
465+
"only supports SPIR-V backends.");
466+
#endif // __SPIR__
467+
#else
468+
(void)g;
469+
(void)x;
470+
(void)mask;
471+
(void)logical_sub_group_size;
472+
(void)member_mask;
473+
throw sycl::exception(
474+
sycl::errc::runtime,
475+
"[SYCLcompat]Masked version of select_from_sub_group not "
476+
"supported on host device and none intel compiler.");
477+
#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
478+
}
479+
} // namespace experimental
480+
272481
/// Inherited from the original SYCLomatic compatibility headers.
273482
/// @return compiler's SYCL version if defined, 202000 otherwise.
274483
inline int get_sycl_language_version() {

0 commit comments

Comments
 (0)