[SYCL] Add fixed_size_group support to algorithms #9181

Merged · 7 commits · Apr 25, 2023
69 changes: 69 additions & 0 deletions sycl/include/CL/__spirv/spirv_ops.hpp
@@ -972,6 +972,10 @@ template <typename ValueT, typename IdT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT);

template <typename ValueT, typename IdT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformShuffle(__spv::Scope::Flag, ValueT, IdT) noexcept;

__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool
__spirv_GroupNonUniformAll(__spv::Scope::Flag, bool);

@@ -1030,6 +1034,71 @@ template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

template <typename ValueT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT,
unsigned int);

extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void
__clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept;

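For context (not part of the diff): each of the reduction builtins declared above takes a scope, a GroupOperation value, the operand, and a cluster size, and ClusteredReduce combines the operand across clusters of that many consecutive invocations. A minimal host-side sketch of those semantics, using a hypothetical clustered_reduce helper over a flat array standing in for the sub-group, might look like this:

#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

// Host-side model of ClusteredReduce: every element receives the reduction of
// the cluster of ClusterSize consecutive elements it belongs to. Assumes the
// input size is a multiple of ClusterSize.
template <typename T, typename BinaryOp>
std::vector<T> clustered_reduce(const std::vector<T> &Values,
                                std::size_t ClusterSize, BinaryOp Op) {
  std::vector<T> Result(Values.size());
  for (std::size_t Base = 0; Base < Values.size(); Base += ClusterSize) {
    T Acc = Values[Base];
    for (std::size_t I = 1; I < ClusterSize; ++I)
      Acc = Op(Acc, Values[Base + I]);
    for (std::size_t I = 0; I < ClusterSize; ++I)
      Result[Base + I] = Acc;
  }
  return Result;
}

int main() {
  // Eight invocations split into clusters of four; each cluster sums to 4.
  std::vector<int> Ones(8, 1);
  auto Sums = clustered_reduce(Ones, 4, std::plus<int>{});
  for (int S : Sums)
    assert(S == 4);
  return 0;
}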
3 changes: 2 additions & 1 deletion sycl/include/CL/__spirv/spirv_types.hpp
@@ -109,7 +109,8 @@ struct MemorySemanticsMask {
enum class GroupOperation : uint32_t {
Reduce = 0,
InclusiveScan = 1,
ExclusiveScan = 2
ExclusiveScan = 2,
ClusteredReduce = 3,
};

#if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1)
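Not part of the diff: the new ClusteredReduce enumerator is the value the algorithm layer passes as the GroupOperation operand of the clustered builtins, e.g. static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce) in the GroupAll/GroupAny overloads further down.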
87 changes: 87 additions & 0 deletions sycl/include/sycl/detail/spirv.hpp
@@ -26,6 +26,7 @@ namespace oneapi {
struct sub_group;
namespace experimental {
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
} // namespace experimental
} // namespace oneapi
} // namespace ext
@@ -65,6 +66,12 @@ struct group_scope<sycl::ext::oneapi::experimental::ballot_group<ParentGroup>> {
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
};

template <size_t PartitionSize, typename ParentGroup>
struct group_scope<sycl::ext::oneapi::experimental::fixed_size_group<
PartitionSize, ParentGroup>> {
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
};

// Generic shuffles and broadcasts may require multiple calls to
// intrinsics, and should use the fewest broadcasts possible
// - Loop over chunks until remaining bytes < chunk size
@@ -118,6 +125,16 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
}
}
template <size_t PartitionSize, typename ParentGroup>
bool GroupAll(
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
bool pred) {
// GroupNonUniformAll doesn't support cluster size, so use a reduction
return __spirv_GroupNonUniformBitwiseAnd(
group_scope<ParentGroup>::value,
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
static_cast<uint32_t>(pred), PartitionSize);
}

template <typename Group> bool GroupAny(Group, bool pred) {
return __spirv_GroupAny(group_scope<Group>::value, pred);
@@ -134,6 +151,16 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
}
}
template <size_t PartitionSize, typename ParentGroup>
bool GroupAny(
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
bool pred) {
// GroupNonUniformAny doesn't support cluster size, so use a reduction
return __spirv_GroupNonUniformBitwiseOr(
group_scope<ParentGroup>::value,
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
static_cast<uint32_t>(pred), PartitionSize);
}
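Illustration (not part of the diff): because GroupNonUniformAll/Any take no cluster size, the two overloads above compute all-of and any-of as a ClusteredReduce with bitwise AND and OR over 0/1 predicate values. A host-side model of that trick:

#include <cassert>
#include <cstddef>
#include <vector>

// all-of over a cluster == bitwise AND of 0/1 predicates;
// any-of over a cluster == bitwise OR of 0/1 predicates.
static bool cluster_all(const std::vector<bool> &Pred, std::size_t Base,
                        std::size_t ClusterSize) {
  unsigned Acc = 1;
  for (std::size_t I = 0; I < ClusterSize; ++I)
    Acc &= static_cast<unsigned>(Pred[Base + I]);
  return Acc != 0;
}

static bool cluster_any(const std::vector<bool> &Pred, std::size_t Base,
                        std::size_t ClusterSize) {
  unsigned Acc = 0;
  for (std::size_t I = 0; I < ClusterSize; ++I)
    Acc |= static_cast<unsigned>(Pred[Base + I]);
  return Acc != 0;
}

int main() {
  std::vector<bool> Pred = {true, true, false, true};
  assert(!cluster_all(Pred, 0, 4)); // one false member fails all-of
  assert(cluster_any(Pred, 0, 4));  // one true member satisfies any-of
  return 0;
}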

// Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
// FIXME: Do not special-case for half or vec once all backends support all data
@@ -231,6 +258,29 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
OCLX, OCLId);
}
}
template <size_t PartitionSize, typename ParentGroup, typename T, typename IdT>
EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
T x, IdT local_id) {
// Remap local_id to its original numbering in ParentGroup
auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;

// TODO: Refactor to avoid duplication after design settles.
using GroupIdT = typename GroupId<ParentGroup>::type;
GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
using OCLT = detail::ConvertToOpenCLType_t<T>;
using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);

// NonUniformBroadcast requires Id to be dynamically uniform, which does not
// hold here; each partition is broadcasting a separate index. We could fall
// back to either NonUniformShuffle or a NonUniformBroadcast per partition,
// and it's unclear which will be faster in practice.
return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value, OCLX,
OCLId);
}
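As a worked example of the remapping above (illustration only): with PartitionSize = 8, a work-item in partition 2 that passes local_id = 3 targets lane 2 * 8 + 3 = 19 of the parent sub-group, so the shuffle fetches the value held by that sub-group member.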

template <typename Group, typename T, typename IdT>
EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
Expand Down Expand Up @@ -950,6 +1000,43 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
} else { \
return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
} \
} \
\
template <__spv::GroupOperation Op, size_t PartitionSize, \
typename ParentGroup, typename T> \
inline T Group##Instruction( \
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> \
g, \
T x) { \
using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
\
using OCLT = std::conditional_t< \
std::is_same<ConvertedT, cl_char>() || \
std::is_same<ConvertedT, cl_short>(), \
cl_int, \
std::conditional_t<std::is_same<ConvertedT, cl_uchar>() || \
std::is_same<ConvertedT, cl_ushort>(), \
cl_uint, ConvertedT>>; \
OCLT Arg = x; \
constexpr auto Scope = group_scope<ParentGroup>::value; \
/* SPIR-V only defines a ClusteredReduce, with no equivalents for scan. */ \
/* Emulate Clustered*Scan using control flow to separate clusters. */ \
if constexpr (Op == __spv::GroupOperation::Reduce) { \
constexpr auto OpInt = \
static_cast<unsigned int>(__spv::GroupOperation::ClusteredReduce); \
return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \
PartitionSize); \
} else { \
T tmp; \
for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \
++Cluster) { \
if (Cluster == g.get_group_linear_id()) { \
constexpr auto OpInt = static_cast<unsigned int>(Op); \
tmp = __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \
} \
} \
return tmp; \
} \
}
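Illustration (not part of the diff): SPIR-V defines only ClusteredReduce, so the else branch above serializes the partitions, each one in turn running an ordinary non-uniform scan while the others skip the call. A host-side sketch of the per-cluster scan semantics this produces:

#include <cassert>
#include <cstddef>
#include <vector>

// Each cluster of ClusterSize consecutive elements is scanned independently.
// Assumes the input size is a multiple of ClusterSize.
std::vector<int> clustered_inclusive_scan(const std::vector<int> &Values,
                                          std::size_t ClusterSize) {
  std::vector<int> Result(Values.size());
  for (std::size_t Base = 0; Base < Values.size(); Base += ClusterSize) {
    int Acc = 0;
    for (std::size_t I = 0; I < ClusterSize; ++I) {
      Acc += Values[Base + I];
      Result[Base + I] = Acc;
    }
  }
  return Result;
}

int main() {
  // Scanning ones in clusters of four yields 1,2,3,4 within every cluster.
  std::vector<int> Ones(8, 1);
  auto Scanned = clustered_inclusive_scan(Ones, 4);
  assert(Scanned[3] == 4 && Scanned[4] == 1 && Scanned[7] == 4);
  return 0;
}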

__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
@@ -137,5 +137,11 @@ struct is_user_constructed_group<fixed_size_group<PartitionSize, ParentGroup>>
: std::true_type {};

} // namespace ext::oneapi::experimental

template <size_t PartitionSize, typename ParentGroup>
struct is_group<
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>>
: std::true_type {};

} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
@@ -63,6 +63,7 @@ namespace ext::oneapi::experimental {

// Forward declarations of non-uniform group types for algorithm definitions
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;

} // namespace ext::oneapi::experimental

133 changes: 133 additions & 0 deletions sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
@@ -0,0 +1,133 @@
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED: cpu || cuda || hip

#include <algorithm>
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>
namespace syclex = sycl::ext::oneapi::experimental;

template <size_t PartitionSize> class TestKernel;

template <size_t PartitionSize> void test() {
sycl::queue Q;

constexpr uint32_t SGSize = 32;
auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
if (std::find(SGSizes.begin(), SGSizes.end(), SGSize) == SGSizes.end()) {
std::cout << "Test skipped due to missing support for sub-group size 32."
<< std::endl;
return;
}

sycl::buffer<size_t, 1> TmpBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> BarrierBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> BroadcastBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> AnyBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> AllBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> NoneBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> ReduceBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> ExScanBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> IncScanBuf{sycl::range{SGSize}};

const auto NDR = sycl::nd_range<1>{SGSize, SGSize};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only};
sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only};
sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only};
sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only};
sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only};
sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only};
sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

// Split into partitions of fixed size
auto Partition = syclex::get_fixed_size_group<PartitionSize>(SG);

// Check all other members' writes are visible after a barrier.
TmpAcc[WI] = 1;
sycl::group_barrier(Partition);
size_t Visible = 0;
for (size_t Other = 0; Other < SGSize; ++Other) {
if ((WI / PartitionSize) == (Other / PartitionSize)) {
Visible += TmpAcc[Other];
}
}
BarrierAcc[WI] = (Visible == PartitionSize);

// Simple check of group algorithms.
uint32_t OriginalLID = SG.get_local_linear_id();
uint32_t LID = Partition.get_local_linear_id();

uint32_t PartitionLeader =
(OriginalLID / PartitionSize) * PartitionSize;
uint32_t BroadcastResult =
sycl::group_broadcast(Partition, OriginalLID, 0);
BroadcastAcc[WI] = (BroadcastResult == PartitionLeader);

bool AnyResult = sycl::any_of_group(Partition, (LID == 0));
AnyAcc[WI] = (AnyResult == true);

bool Predicate = ((OriginalLID / PartitionSize) % 2 == 0);
bool AllResult = sycl::all_of_group(Partition, Predicate);
if (Predicate) {
AllAcc[WI] = (AllResult == true);
} else {
AllAcc[WI] = (AllResult == false);
}

bool NoneResult = sycl::none_of_group(Partition, Predicate);
if (Predicate) {
NoneAcc[WI] = (NoneResult == false);
} else {
NoneAcc[WI] = (NoneResult == true);
}

uint32_t ReduceResult =
sycl::reduce_over_group(Partition, 1, sycl::plus<>());
ReduceAcc[WI] = (ReduceResult == PartitionSize);

uint32_t ExScanResult =
sycl::exclusive_scan_over_group(Partition, 1, sycl::plus<>());
ExScanAcc[WI] = (ExScanResult == LID);

uint32_t IncScanResult =
sycl::inclusive_scan_over_group(Partition, 1, sycl::plus<>());
IncScanAcc[WI] = (IncScanResult == LID + 1);
};
CGH.parallel_for<TestKernel<PartitionSize>>(NDR, KernelFunc);
});

sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only};
sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only};
sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only};
sycl::host_accessor AllAcc{AllBuf, sycl::read_only};
sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only};
sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
for (int WI = 0; WI < SGSize; ++WI) {
assert(BarrierAcc[WI] == true);
assert(BroadcastAcc[WI] == true);
assert(AnyAcc[WI] == true);
assert(AllAcc[WI] == true);
assert(NoneAcc[WI] == true);
assert(ReduceAcc[WI] == true);
assert(ExScanAcc[WI] == true);
assert(IncScanAcc[WI] == true);
}
}

int main() {
test<1>();
test<2>();
test<4>();
test<8>();
test<16>();
test<32>();
return 0;
}
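For reference (illustration only): with PartitionSize = 4, every partition in the kernel above should see group_broadcast return the partition leader's sub-group index, reduce_over_group of 1 return 4, the exclusive scan of 1 equal the partition-local id (0, 1, 2, 3), and the inclusive scan equal that id plus one (1, 2, 3, 4).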