Skip to content

[SYCL] Sub-group load/store for raw pointers #3255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 12, 2021
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,18 @@ The load and store sub-group functions enable developers to assert that all work
|===
|Function|Description

|+template <typename T> T load(sub_group sg, const T *src)+
|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to other address spaces will cause the run time assertion.

|+template <typename T, access::address_space Space> T load(sub_group sg, const multi_ptr<T,Space> src)+
|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

|+template <int N, typename T, access::address_space Space> vec<T,N> load(sub_group sg, const multi_ptr<T,Space> src)+
|Load contiguous data from _src_. Returns _N_ elements per work-item, corresponding to the _N_ memory locations at _src_ + +i+ * +get_max_local_range()+ + +get_local_id()+ for +i+ between 0 and _N_. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

|+template <typename T> void store(sub_group sg, T *dst, const T& x)+
|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to other address spaces will cause the run time assertion.

|+template <typename T, access::address_space Space> void store(sub_group sg, multi_ptr<T,Space> dst, const T& x)+
|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

Expand All @@ -165,6 +171,7 @@ None.
|========================================
|Rev|Date|Author|Changes
|1|2020-03-16|John Pennycook|*Initial public working draft*
|2|2021-02-26|Vladimir Lazarev|*Add load/store method for raw pointers*
|========================================

//************************************************************************
Expand Down
24 changes: 24 additions & 0 deletions sycl/include/CL/__spirv/spirv_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,30 @@ __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long long)
__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min)
__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max)

extern SYCL_EXTERNAL __attribute__((opencl_global)) void *
__spirv_GenericCastToPtrExplicit_ToGlobal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept;

extern SYCL_EXTERNAL __attribute__((opencl_local)) void *
__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept;

template <typename dataT>
extern __attribute__((opencl_global)) dataT *
__spirv_GenericCastToPtrExplicit_ToGlobal(
const void *Ptr, __spv::StorageClass::Flag S) noexcept {
return (__attribute__((opencl_global))
dataT *)__spirv_GenericCastToPtrExplicit_ToGlobal(Ptr, S);
}

template <typename dataT>
extern __attribute__((opencl_local)) dataT *
__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept {
return (__attribute__((opencl_local))
dataT *)__spirv_GenericCastToPtrExplicit_ToLocal(Ptr, S);
}

template <typename dataT>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL dataT
__spirv_SubgroupShuffleINTEL(dataT Data, uint32_t InvocationId) noexcept;
Expand Down
40 changes: 40 additions & 0 deletions sycl/include/CL/__spirv/spirv_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,46 @@ struct Scope {
Flag flag_value;
};

struct StorageClass {
enum Flag : uint32_t {
UniformConstant = 0,
Input = 1,
Uniform = 2,
Output = 3,
Workgroup = 4,
CrossWorkgroup = 5,
Private = 6,
Function = 7,
Generic = 8,
PushConstant = 9,
AtomicCounter = 10,
Image = 11,
StorageBuffer = 12,
CallableDataKHR = 5328,
CallableDataNV = 5328,
IncomingCallableDataKHR = 5329,
IncomingCallableDataNV = 5329,
RayPayloadKHR = 5338,
RayPayloadNV = 5338,
HitAttributeKHR = 5339,
HitAttributeNV = 5339,
IncomingRayPayloadKHR = 5342,
IncomingRayPayloadNV = 5342,
ShaderRecordBufferKHR = 5343,
ShaderRecordBufferNV = 5343,
PhysicalStorageBuffer = 5349,
PhysicalStorageBufferEXT = 5349,
CodeSectionINTEL = 5605,
CapabilityUSMStorageClassesINTEL = 5935,
DeviceOnlyINTEL = 5936,
HostOnlyINTEL = 5937,
Max = 0x7fffffff,
};
constexpr StorageClass(Flag flag) : flag_value(flag) {}
constexpr operator uint32_t() const { return flag_value; }
Flag flag_value;
};

struct MemorySemanticsMask {

enum Flag : uint32_t {
Expand Down
90 changes: 90 additions & 0 deletions sycl/include/CL/sycl/ONEAPI/sub_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,47 @@ struct sub_group {

/* --- sub_group load/stores --- */
/* these can map to SIMD or block read/write hardware where available */
#ifdef __SYCL_DEVICE_ONLY__
// Method for decorated pointer
template <typename T>
detail::enable_if_t<
!std::is_same<typename detail::remove_AS<T>::type, T>::value, T>
load(T *src) const {
return load(sycl::multi_ptr<typename detail::remove_AS<T>::type,
sycl::detail::deduce_AS<T>::value>(
(typename detail::remove_AS<T>::type *)src));
}

// Method for raw pointer
template <typename T>
detail::enable_if_t<
std::is_same<typename detail::remove_AS<T>::type, T>::value, T>
load(T *src) const {

#ifdef __NVPTX__
return src[get_local_id()[0]];
#else // __NVPTX__
auto l = __spirv_GenericCastToPtrExplicit_ToLocal<T>(
src, __spv::StorageClass::Workgroup);
if (l)
return load(l);

auto g = __spirv_GenericCastToPtrExplicit_ToGlobal<T>(
src, __spv::StorageClass::CrossWorkgroup);
if (g)
return load(g);

assert(!"Sub-group load() is supported for local or global pointers only.");
return {};
#endif // __NVPTX__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> T load(T *src) const {
(void)src;
throw runtime_error("Sub-groups are not supported on host device.",
PI_INVALID_DEVICE);
}
#endif //__SYCL_DEVICE_ONLY__

template <typename T, access::address_space Space>
sycl::detail::enable_if_t<
Expand Down Expand Up @@ -315,6 +356,55 @@ struct sub_group {
#endif
}

#ifdef __SYCL_DEVICE_ONLY__
// Method for decorated pointer
template <typename T>
detail::enable_if_t<
!std::is_same<typename detail::remove_AS<T>::type, T>::value>
store(T *dst, const typename detail::remove_AS<T>::type &x) const {
store(sycl::multi_ptr<typename detail::remove_AS<T>::type,
sycl::detail::deduce_AS<T>::value>(
(typename detail::remove_AS<T>::type *)dst),
x);
}

// Method for raw pointer
template <typename T>
detail::enable_if_t<
std::is_same<typename detail::remove_AS<T>::type, T>::value>
store(T *dst, const typename detail::remove_AS<T>::type &x) const {

#ifdef __NVPTX__
dst[get_local_id()[0]] = x;
#else // __NVPTX__
auto l = __spirv_GenericCastToPtrExplicit_ToLocal<T>(
dst, __spv::StorageClass::Workgroup);
if (l) {
store(l, x);
return;
}

auto g = __spirv_GenericCastToPtrExplicit_ToGlobal<T>(
dst, __spv::StorageClass::CrossWorkgroup);
if (g) {
store(g, x);
return;
}

assert(
!"Sub-group store() is supported for local or global pointers only.");
return;
#endif // __NVPTX__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> void store(T *dst, const T &x) const {
(void)dst;
(void)x;
throw runtime_error("Sub-groups are not supported on host device.",
PI_INVALID_DEVICE);
}
#endif //__SYCL_DEVICE_ONLY__

template <typename T, access::address_space Space>
sycl::detail::enable_if_t<
sycl::detail::sub_group::AcceptableForGlobalLoadStore<T, Space>::value>
Expand Down
57 changes: 39 additions & 18 deletions sycl/include/CL/sycl/access/access.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,18 +187,16 @@ template <typename ElementType>
struct DecoratedType<ElementType, access::address_space::local_space> {
using type = __OPENCL_LOCAL_AS__ ElementType;
};

template <class T>
struct remove_AS {
typedef T type;
};
template <class T> struct remove_AS { typedef T type; };

#ifdef __SYCL_DEVICE_ONLY__
template <class T>
struct remove_AS<__OPENCL_GLOBAL_AS__ T> {
typedef T type;
template <class T> struct deduce_AS {
static_assert(!std::is_same<typename detail::remove_AS<T>::type, T>::value,
"Only types with address space attributes are supported");
};

template <class T> struct remove_AS<__OPENCL_GLOBAL_AS__ T> { typedef T type; };

#ifdef __ENABLE_USM_ADDR_SPACE__
template <class T> struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
typedef T type;
Expand All @@ -207,21 +205,45 @@ template <class T> struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
template <class T> struct remove_AS<__OPENCL_GLOBAL_HOST_AS__ T> {
typedef T type;
};

template <class T> struct deduce_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
static const access::address_space value =
access::address_space::global_device_space;
};

template <class T> struct deduce_AS<__OPENCL_GLOBAL_HOST_AS__ T> {
static const access::address_space value =
access::address_space::global_host_space;
};
#endif // __ENABLE_USM_ADDR_SPACE__

template <class T>
struct remove_AS<__OPENCL_PRIVATE_AS__ T> {
template <class T> struct remove_AS<__OPENCL_PRIVATE_AS__ T> {
typedef T type;
};

template <class T>
struct remove_AS<__OPENCL_LOCAL_AS__ T> {
template <class T> struct remove_AS<__OPENCL_LOCAL_AS__ T> { typedef T type; };

template <class T> struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
typedef T type;
};

template <class T>
struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
typedef T type;
template <class T> struct deduce_AS<__OPENCL_GLOBAL_AS__ T> {
static const access::address_space value =
access::address_space::global_space;
};

template <class T> struct deduce_AS<__OPENCL_PRIVATE_AS__ T> {
static const access::address_space value =
access::address_space::private_space;
};

template <class T> struct deduce_AS<__OPENCL_LOCAL_AS__ T> {
static const access::address_space value = access::address_space::local_space;
};

template <class T> struct deduce_AS<__OPENCL_CONSTANT_AS__ T> {
static const access::address_space value =
access::address_space::constant_space;
};
#endif

Expand All @@ -231,8 +253,7 @@ struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
#undef __OPENCL_LOCAL_AS__
#undef __OPENCL_CONSTANT_AS__
#undef __OPENCL_PRIVATE_AS__

} // namespace detail

} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
2 changes: 1 addition & 1 deletion sycl/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ set_target_properties(check-sycl-deploy PROPERTIES FOLDER "SYCL tests")
add_lit_testsuite(check-sycl-spirv "Running device-agnostic SYCL regression tests for SPIR-V"
${CMAKE_CURRENT_BINARY_DIR}
ARGS ${RT_TEST_ARGS}
PARAMS "SYCL_TRIPLE=spir64-unknown-linux-sycldevice"
PARAMS "SYCL_TRIPLE=spir64-unknown-unknown-sycldevice"
DEPENDS ${SYCL_TEST_DEPS}
EXCLUDE_FROM_CHECK_ALL
)
Expand Down
38 changes: 38 additions & 0 deletions sycl/test/basic_tests/address_space_traits.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// RUN: %clangxx -fsycl -fsycl-device-only -D__ENABLE_USM_ADDR_SPACE__ -fsycl-targets=%sycl_triple %s -c

#include <CL/sycl.hpp>
#include <cassert>

using namespace cl::sycl;
int main() {

queue myQueue;
myQueue.submit([&](handler &cgh) {
cgh.single_task<class dummy>([=]() {
static_assert(
detail::deduce_AS<__attribute__((opencl_global)) int>::value ==
access::address_space::global_space,
"Unexpected address space");
static_assert(
detail::deduce_AS<__attribute__((opencl_local)) int>::value ==
access::address_space::local_space,
"Unexpected address space");
static_assert(
detail::deduce_AS<__attribute__((opencl_private)) int>::value ==
access::address_space::private_space,
"Unexpected address space");
static_assert(
detail::deduce_AS<__attribute__((opencl_constant)) int>::value ==
access::address_space::constant_space,
"Unexpected address space");
static_assert(
detail::deduce_AS<__attribute__((opencl_global_device)) int>::value ==
access::address_space::global_device_space,
"Unexpected address space");
static_assert(
detail::deduce_AS<__attribute__((opencl_global_host)) int>::value ==
access::address_space::global_host_space,
"Unexpected address space");
});
});
}
Loading