Skip to content

[SYCL] Sub-group load/store for raw pointers #3255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 12, 2021
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,18 @@ The load and store sub-group functions enable developers to assert that all work
|===
|Function|Description

|+template <typename T> T load(sub_group sg, const T *src)+
|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior.

|+template <typename T, access::address_space Space> T load(sub_group sg, const multi_ptr<T,Space> src)+
|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

|+template <int N, typename T, access::address_space Space> vec<T,N> load(sub_group sg, const multi_ptr<T,Space> src)+
|Load contiguous data from _src_. Returns _N_ elements per work-item, corresponding to the _N_ memory locations at _src_ + +i+ * +get_max_local_range()+ + +get_local_id()+ for +i+ between 0 and _N_. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

|+template <typename T> void store(sub_group sg, T *dst, const T& x)+
|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior.

|+template <typename T, access::address_space Space> void store(sub_group sg, multi_ptr<T,Space> dst, const T& x)+
|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

Expand All @@ -165,6 +171,7 @@ None.
|========================================
|Rev|Date|Author|Changes
|1|2020-03-16|John Pennycook|*Initial public working draft*
|2|2021-02-26|Vladimir Lazarev|*Add load/store method for raw pointers*
|========================================

//************************************************************************
Expand Down
44 changes: 44 additions & 0 deletions sycl/include/CL/__spirv/spirv_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,50 @@ __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long)
__SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long long)
__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min)
__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max)
extern SYCL_EXTERNAL __attribute__((opencl_generic)) void *
__spirv_PtrCastToGeneric(const void *Ptr) noexcept;

extern SYCL_EXTERNAL __attribute__((opencl_global)) void *
__spirv_GenericCastToPtrExplicit_ToGlobal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept;

extern SYCL_EXTERNAL __attribute__((opencl_local)) void *
__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept;

extern SYCL_EXTERNAL __attribute__((opencl_private)) void *
__spirv_GenericCastToPtrExplicit_ToPrivate(
const void *Ptr, __spv::StorageClass::Flag S) noexcept;

template <typename dataT>
extern __attribute__((opencl_generic)) dataT *
__spirv_PtrCastToGeneric(const void *Ptr) noexcept {
return (__attribute__((opencl_generic)) dataT *)__spirv_PtrCastToGeneric(Ptr);
}

template <typename dataT>
extern __attribute__((opencl_global)) dataT *
__spirv_GenericCastToPtrExplicit_ToGlobal(
const void *Ptr, __spv::StorageClass::Flag S) noexcept {
return (__attribute__((opencl_global))
dataT *)__spirv_GenericCastToPtrExplicit_ToGlobal(Ptr, S);
}

template <typename dataT>
extern __attribute__((opencl_local)) dataT *
__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept {
return (__attribute__((opencl_local))
dataT *)__spirv_GenericCastToPtrExplicit_ToLocal(Ptr, S);
}

template <typename dataT>
extern __attribute__((opencl_private)) dataT *
__spirv_GenericCastToPtrExplicit_ToPrivate(
const void *Ptr, __spv::StorageClass::Flag S) noexcept {
return (__attribute__((opencl_private))
dataT *)__spirv_GenericCastToPtrExplicit_ToPrivate(Ptr, S);
}

template <typename dataT>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL dataT
Expand Down
40 changes: 40 additions & 0 deletions sycl/include/CL/__spirv/spirv_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,46 @@ struct Scope {
Flag flag_value;
};

struct StorageClass {
enum Flag : uint32_t {
UniformConstant = 0,
Input = 1,
Uniform = 2,
Output = 3,
Workgroup = 4,
CrossWorkgroup = 5,
Private = 6,
Function = 7,
Generic = 8,
PushConstant = 9,
AtomicCounter = 10,
Image = 11,
StorageBuffer = 12,
CallableDataKHR = 5328,
CallableDataNV = 5328,
IncomingCallableDataKHR = 5329,
IncomingCallableDataNV = 5329,
RayPayloadKHR = 5338,
RayPayloadNV = 5338,
HitAttributeKHR = 5339,
HitAttributeNV = 5339,
IncomingRayPayloadKHR = 5342,
IncomingRayPayloadNV = 5342,
ShaderRecordBufferKHR = 5343,
ShaderRecordBufferNV = 5343,
PhysicalStorageBuffer = 5349,
PhysicalStorageBufferEXT = 5349,
CodeSectionINTEL = 5605,
CapabilityUSMStorageClassesINTEL = 5935,
DeviceOnlyINTEL = 5936,
HostOnlyINTEL = 5937,
Max = 0x7fffffff,
};
constexpr StorageClass(Flag flag) : flag_value(flag) {}
constexpr operator uint32_t() const { return flag_value; }
Flag flag_value;
};

struct MemorySemanticsMask {

enum Flag : uint32_t {
Expand Down
99 changes: 99 additions & 0 deletions sycl/include/CL/sycl/ONEAPI/sub_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,52 @@ struct sub_group {

/* --- sub_group load/stores --- */
/* these can map to SIMD or block read/write hardware where available */
#ifdef __SYCL_DEVICE_ONLY__
// Method for decorated pointer
template <typename T>
detail::enable_if_t<
!std::is_same<typename detail::remove_AS<T>::type, T>::value, T>
load(T *src) const {
return load(sycl::multi_ptr<typename detail::remove_AS<T>::type,
sycl::detail::deduce_AS<T>::value>(
(typename detail::remove_AS<T>::type *)src));
}

// Method for raw pointer
template <typename T>
detail::enable_if_t<
std::is_same<typename detail::remove_AS<T>::type, T>::value, T>
load(T *src) const {

#ifdef __NVPTX__
return src[get_local_id()[0]];
#else // __NVPTX__
auto l = __spirv_GenericCastToPtrExplicit_ToLocal<T>(
src, __spv::StorageClass::Workgroup);
if (l)
return load(l);

auto g = __spirv_GenericCastToPtrExplicit_ToGlobal<T>(
src, __spv::StorageClass::CrossWorkgroup);
if (g)
return load(g);

auto p = __spirv_GenericCastToPtrExplicit_ToPrivate<T>(
src, __spv::StorageClass::Function);
assert((p == nullptr) &&
"Sub-group load() is not supported for private pointers.");

// Fallback for other address spaces to be mapped to global
return load(__spirv_PtrCastToGeneric<T>(src));
#endif // __NVPTX__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> T load(T *src) const {
(void)src;
throw runtime_error("Sub-groups are not supported on host device.",
PI_INVALID_DEVICE);
}
#endif //__SYCL_DEVICE_ONLY__

template <typename T, access::address_space Space>
sycl::detail::enable_if_t<
Expand Down Expand Up @@ -315,6 +361,59 @@ struct sub_group {
#endif
}

#ifdef __SYCL_DEVICE_ONLY__
// Method for decorated pointer
template <typename T>
detail::enable_if_t<
!std::is_same<typename detail::remove_AS<T>::type, T>::value>
store(T *dst, const typename detail::remove_AS<T>::type &x) const {
store(sycl::multi_ptr<typename detail::remove_AS<T>::type,
sycl::detail::deduce_AS<T>::value>(
(typename detail::remove_AS<T>::type *)dst),
x);
}

// Method for raw pointer
template <typename T>
detail::enable_if_t<
std::is_same<typename detail::remove_AS<T>::type, T>::value>
store(T *dst, const typename detail::remove_AS<T>::type &x) const {

#ifdef __NVPTX__
dst[get_local_id()[0]] = x;
#else // __NVPTX__
auto l = __spirv_GenericCastToPtrExplicit_ToLocal<T>(
dst, __spv::StorageClass::Workgroup);
if (l) {
store(l, x);
return;
}

auto g = __spirv_GenericCastToPtrExplicit_ToGlobal<T>(
dst, __spv::StorageClass::CrossWorkgroup);
if (g) {
store(g, x);
return;
}

auto p = __spirv_GenericCastToPtrExplicit_ToPrivate<T>(
dst, __spv::StorageClass::Function);
assert((p == nullptr) &&
"Sub-group store() is not supported for private pointers.");

// Fallback for other address spaces to be mapped to global
store(__spirv_PtrCastToGeneric<T>(dst), x);
#endif // __NVPTX__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> void store(T *dst, const T &x) const {
(void)dst;
(void)x;
throw runtime_error("Sub-groups are not supported on host device.",
PI_INVALID_DEVICE);
}
#endif //__SYCL_DEVICE_ONLY__

template <typename T, access::address_space Space>
sycl::detail::enable_if_t<
sycl::detail::sub_group::AcceptableForGlobalLoadStore<T, Space>::value>
Expand Down
60 changes: 44 additions & 16 deletions sycl/include/CL/sycl/access/access.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,15 @@ constexpr bool modeWritesNewData(access::mode m) {
#define __OPENCL_LOCAL_AS__ __attribute__((opencl_local))
#define __OPENCL_CONSTANT_AS__ __attribute__((opencl_constant))
#define __OPENCL_PRIVATE_AS__ __attribute__((opencl_private))
#define __OPENCL_GENERIC_AS__ __attribute__((opencl_generic))
#else
#define __OPENCL_GLOBAL_AS__
#define __OPENCL_GLOBAL_DEVICE_AS__
#define __OPENCL_GLOBAL_HOST_AS__
#define __OPENCL_LOCAL_AS__
#define __OPENCL_CONSTANT_AS__
#define __OPENCL_PRIVATE_AS__
#define __OPENCL_GENERIC_AS__
#endif

template <access::target accessTarget> struct TargetToAS {
Expand Down Expand Up @@ -187,17 +189,15 @@ template <typename ElementType>
struct DecoratedType<ElementType, access::address_space::local_space> {
using type = __OPENCL_LOCAL_AS__ ElementType;
};
template <class T> struct remove_AS { typedef T type; };

template <class T>
struct remove_AS {
typedef T type;
template <class T> struct deduce_AS {
static const access::address_space value =
access::address_space::global_space;
};

#ifdef __SYCL_DEVICE_ONLY__
template <class T>
struct remove_AS<__OPENCL_GLOBAL_AS__ T> {
typedef T type;
};
template <class T> struct remove_AS<__OPENCL_GLOBAL_AS__ T> { typedef T type; };

#ifdef __ENABLE_USM_ADDR_SPACE__
template <class T> struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
Expand All @@ -207,22 +207,50 @@ template <class T> struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
template <class T> struct remove_AS<__OPENCL_GLOBAL_HOST_AS__ T> {
typedef T type;
};

template <class T> struct deduce_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
static const access::address_space value =
access::address_space::global_device_space;
};

template <class T> struct deduce_AS<__OPENCL_GLOBAL_HOST_AS__ T> {
static const access::address_space value =
access::address_space::global_host_space;
};
#endif // __ENABLE_USM_ADDR_SPACE__

template <class T>
struct remove_AS<__OPENCL_PRIVATE_AS__ T> {
template <class T> struct remove_AS<__OPENCL_PRIVATE_AS__ T> {
typedef T type;
};

template <class T>
struct remove_AS<__OPENCL_LOCAL_AS__ T> {
template <class T> struct remove_AS<__OPENCL_LOCAL_AS__ T> { typedef T type; };

template <class T> struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
typedef T type;
};

template <class T>
struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
template <class T> struct remove_AS<__OPENCL_GENERIC_AS__ T> {
typedef T type;
};

template <class T> struct deduce_AS<__OPENCL_PRIVATE_AS__ T> {
static const access::address_space value =
access::address_space::private_space;
};

template <class T> struct deduce_AS<__OPENCL_LOCAL_AS__ T> {
static const access::address_space value = access::address_space::local_space;
};

template <class T> struct deduce_AS<__OPENCL_CONSTANT_AS__ T> {
static const access::address_space value =
access::address_space::constant_space;
};

template <class T> struct deduce_AS<__OPENCL_GENERIC_AS__ T> {
static const access::address_space value =
access::address_space::global_space;
};
#endif

#undef __OPENCL_GLOBAL_AS__
Expand All @@ -231,8 +259,8 @@ struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
#undef __OPENCL_LOCAL_AS__
#undef __OPENCL_CONSTANT_AS__
#undef __OPENCL_PRIVATE_AS__

#undef __OPENCL_GENERIC_AS__
} // namespace detail

} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
2 changes: 1 addition & 1 deletion sycl/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ set_target_properties(check-sycl-deploy PROPERTIES FOLDER "SYCL tests")
add_lit_testsuite(check-sycl-spirv "Running device-agnostic SYCL regression tests for SPIR-V"
${CMAKE_CURRENT_BINARY_DIR}
ARGS ${RT_TEST_ARGS}
PARAMS "SYCL_TRIPLE=spir64-unknown-linux-sycldevice"
PARAMS "SYCL_TRIPLE=spir64-unknown-unknown-sycldevice"
DEPENDS ${SYCL_TEST_DEPS}
EXCLUDE_FROM_CHECK_ALL
)
Expand Down
Loading