diff --git a/sycl/include/sycl/detail/group_sort_impl.hpp b/sycl/include/sycl/detail/group_sort_impl.hpp
index 7410761cac303..6974413492a7c 100644
--- a/sycl/include/sycl/detail/group_sort_impl.hpp
+++ b/sycl/include/sycl/detail/group_sort_impl.hpp
@@ -15,11 +15,43 @@
 #include
 #include
 #include
+#include <sycl/sycl_span.hpp>
+
+#include <memory>
 
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
 
+// Helpers for sorting algorithms
+#ifdef __SYCL_DEVICE_ONLY__
+template <typename T, typename Group>
+static __SYCL_ALWAYS_INLINE T *align_scratch(sycl::span<std::byte> scratch,
+                                             Group g,
+                                             size_t number_of_elements) {
+  // Adjust the scratch pointer based on alignment of the type T.
+  // Per extension specification if scratch size is less than the value
+  // returned by memory_required then behavior is undefined, so we don't check
+  // that the scratch size satisfies the requirement.
+  T *scratch_begin = nullptr;
+  // We must have a barrier here before array placement new because it is
+  // possible that scratch memory is already in use, so we need to synchronize
+  // work items.
+  sycl::group_barrier(g);
+  if (g.leader()) {
+    void *scratch_ptr = scratch.data();
+    size_t space = scratch.size();
+    scratch_ptr = std::align(alignof(T), number_of_elements * sizeof(T),
+                             scratch_ptr, space);
+    scratch_begin = ::new (scratch_ptr) T[number_of_elements];
+  }
+  // Broadcast leader's pointer (the beginning of the scratch) to all work
+  // items in the group.
+  scratch_begin = sycl::group_broadcast(g, scratch_begin);
+  return scratch_begin;
+}
+#endif
+
 // ---- merge sort implementation
 
 // following two functions could be useless if std::[lower|upper]_bound worked
diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
index e790678e7e884..2885a7673795b 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
@@ -15,9 +15,10 @@
 #include <sycl/detail/pi.h>             // for PI_ERROR_INVALID_DEVICE
 #include <sycl/exception.hpp>           // for sycl_category, exception
 #include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16
-#include <sycl/memory_enums.hpp>        // for memory_scope
-#include <sycl/range.hpp>               // for range
-#include <sycl/sycl_span.hpp>           // for span
+#include <sycl/ext/oneapi/properties/properties.hpp>
+#include <sycl/memory_enums.hpp> // for memory_scope
+#include <sycl/range.hpp>        // for range
+#include <sycl/sycl_span.hpp>    // for span
 
 #ifdef __SYCL_DEVICE_ONLY__
 #include <sycl/detail/group_sort_impl.hpp>
@@ -36,6 +37,54 @@ namespace sycl {
 inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
+enum class group_algorithm_data_placement { blocked, striped };
+
+struct input_data_placement_key
+    : detail::compile_time_property_key<detail::PropKind::InputDataPlacement> {
+  template <group_algorithm_data_placement Placement>
+  using value_t =
+      property_value<input_data_placement_key,
+                     std::integral_constant<int, static_cast<int>(Placement)>>;
+};
+
+struct output_data_placement_key
+    : detail::compile_time_property_key<
+          detail::PropKind::OutputDataPlacement> {
+  template <group_algorithm_data_placement Placement>
+  using value_t =
+      property_value<output_data_placement_key,
+                     std::integral_constant<int, static_cast<int>(Placement)>>;
+};
+
+template <group_algorithm_data_placement Placement>
+inline constexpr input_data_placement_key::value_t<Placement>
+    input_data_placement;
+
+template <group_algorithm_data_placement Placement>
+inline constexpr output_data_placement_key::value_t<Placement>
+    output_data_placement;
+
+namespace detail {
+
+template <typename Properties>
+constexpr bool isInputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<input_data_placement_key>())
+    return properties.template get_property<input_data_placement_key>() ==
+           input_data_placement<group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+template <typename Properties>
+constexpr bool isOutputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<output_data_placement_key>())
+    return properties.template get_property<output_data_placement_key>() ==
+           output_data_placement<group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+} // namespace detail
+
 // ---- group helpers
 template <typename Group, std::size_t Extent> class group_with_scratchpad {
   Group g;
@@ -63,26 +112,9 @@ template <typename Compare = std::less<>> class default_sorter {
   void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
                   [[maybe_unused]] Ptr last) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
     using T = typename sycl::detail::GetValueType<Ptr>::type;
-    T *scratch_begin = nullptr;
     size_t n = last - first;
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr = std::align(alignof(T), n * sizeof(T), scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[n];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
     sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
 #else
     throw sycl::exception(
@@ -94,29 +126,10 @@ template <typename Compare = std::less<>> class default_sorter {
   template <typename Group, typename T>
   T operator()([[maybe_unused]] Group g, T val) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
-    T *scratch_begin = nullptr;
     std::size_t local_id = g.get_local_linear_id();
     auto range_size = g.get_local_range().size();
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr =
-          std::align(alignof(T), /* output storage and temporary storage */ 2 *
-                                     range_size * sizeof(T),
-                     scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[2 * range_size];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g, /* output storage and temporary storage */ 2 * range_size);
     scratch_begin[local_id] = val;
     sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
                              scratch_begin + range_size);
@@ -252,26 +265,9 @@ template <typename CompareT = std::less<>> class joint_sorter {
   void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
                   [[maybe_unused]] Ptr last) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
     using T = typename sycl::detail::GetValueType<Ptr>::type;
-    T *scratch_begin = nullptr;
     size_t n = last - first;
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr = std::align(alignof(T), n * sizeof(T), scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[n];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
     sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
 #else
     throw sycl::exception(
@@ -300,29 +296,10 @@ class group_sorter {
   template <typename Group>
   T operator()([[maybe_unused]] Group g, T val) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Adjust the scratch pointer based on alignment of the type T.
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
-    T *scratch_begin = nullptr;
     std::size_t local_id = g.get_local_linear_id();
     auto range_size = g.get_local_range().size();
-    // We must have a barrier here before array placement new because it is
-    // possible that scratch memory is already in use, so we need to synchronize
-    // work items.
-    sycl::group_barrier(g);
-    if (g.leader()) {
-      void *scratch_ptr = scratch.data();
-      size_t space = scratch.size();
-      scratch_ptr =
-          std::align(alignof(T), /* output storage and temporary storage */ 2 *
-                                     range_size * sizeof(T),
-                     scratch_ptr, space);
-      scratch_begin = ::new (scratch_ptr) T[2 * range_size];
-    }
-    // Broadcast leader's pointer (the beginning of the scratch) to all work
-    // items in the group.
-    scratch_begin = sycl::group_broadcast(g, scratch_begin);
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g, /* output storage and temporary storage */ 2 * range_size);
     scratch_begin[local_id] = val;
     sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
                              scratch_begin + range_size);
@@ -335,6 +312,34 @@ class group_sorter {
     return val;
   }
 
+  template <typename Group, typename Properties>
+  void operator()([[maybe_unused]] Group g,
+                  [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                  [[maybe_unused]] Properties properties) {
+#ifdef __SYCL_DEVICE_ONLY__
+    std::size_t local_id = g.get_local_linear_id();
+    auto wg_size = g.get_local_range().size();
+    auto number_of_elements = wg_size * ElementsPerWorkItem;
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g,
+        /* output storage and temporary storage */ 2 * number_of_elements);
+    for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i)
+      scratch_begin[local_id * ElementsPerWorkItem + i] = values[i];
+    sycl::detail::merge_sort(g, scratch_begin, number_of_elements, comp,
+                             scratch_begin + number_of_elements);
+
+    std::size_t shift{};
+    for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i) {
+      if constexpr (detail::isOutputBlocked(properties)) {
+        shift = local_id * ElementsPerWorkItem + i;
+      } else {
+        shift = i * wg_size + local_id;
+      }
+      values[i] = scratch_begin[shift];
+    }
+#endif
+  }
+
   static std::size_t memory_required(sycl::memory_scope scope,
                                      size_t range_size) {
     return 2 * joint_sorter<>::template memory_required<T>(
                    scope, range_size * ElementsPerWorkItem);
@@ -480,6 +485,19 @@ class group_sorter {
 #endif
   }
 
+  template <typename Group, typename Properties>
+  void operator()([[maybe_unused]] Group g,
+                  [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                  [[maybe_unused]] Properties properties) {
+#ifdef __SYCL_DEVICE_ONLY__
+    sycl::detail::privateStaticSort<
+        /*is_key_value=*/false, detail::isOutputBlocked(properties),
+        OrderT == sorting_order::ascending, ElementsPerWorkItem, bits>(
+        g, values.data(), /*empty*/ values.data(), scratch.data(), first_bit,
+        last_bit);
+#endif
+  }
+
   static constexpr size_t
   memory_required([[maybe_unused]] sycl::memory_scope scope,
                   size_t range_size) {
diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
index d5b67d0cb4df9..5dece1c54f7c4 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
@@ -114,6 +114,59 @@ sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
                 default_sorters::group_sorter<T>(exec.get_memory()));
 }
 
+template <typename Group, typename T, std::size_t ElementsPerWorkItem,
+          typename Sorter, typename Properties = empty_properties_t>
+std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Properties>>,
+                 void>
+sort_over_group([[maybe_unused]] Group g,
+                [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                [[maybe_unused]] Sorter sorter,
+                [[maybe_unused]] Properties properties = {}) {
+#ifdef __SYCL_DEVICE_ONLY__
+  return sorter(g, values, properties);
+#else
+  throw sycl::exception(
+      std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+      "Group algorithms are not supported on host device.");
+#endif
+}
+
+template <typename Group, typename T, std::size_t Extent,
+          std::size_t ElementsPerWorkItem,
+          typename Properties = empty_properties_t>
+std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Properties>>,
+                 void>
+sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
+                sycl::span<T, ElementsPerWorkItem> values,
+                Properties properties = {}) {
+  return sort_over_group(
+      exec.get_group(), values,
+      default_sorters::group_sorter<T, std::less<>, ElementsPerWorkItem>(
+          exec.get_memory()),
+      properties);
+}
+
+template <typename Group, typename T, typename Compare, std::size_t Extent,
+          std::size_t ElementsPerWorkItem,
+          typename Properties = empty_properties_t>
+std::enable_if_t<!sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Compare>> &&
+                     sycl::ext::oneapi::experimental::is_property_list_v<
+                         std::decay_t<Properties>>,
+                 void>
+sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
+                sycl::span<T, ElementsPerWorkItem> values, Compare comp,
+                Properties properties = {}) {
+  return sort_over_group(
+      exec.get_group(), values,
+      default_sorters::group_sorter<T, Compare, ElementsPerWorkItem>(
+          exec.get_memory(), comp),
+      properties);
+}
+
 // ---- joint_sort
 template <typename Group, typename Ptr, typename Sorter>
 std::enable_if_t<detail::is_sorter<Sorter, Group, Ptr>::value, void>
diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp
index 559dd20c5fe09..3f1bb28268d39 100644
--- a/sycl/include/sycl/ext/oneapi/properties/property.hpp
+++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp
@@ -207,8 +207,10 @@ enum PropKind : uint32_t {
   SingleTaskKernel = 66,
   IndirectlyCallable = 67,
   CallsIndirectly = 68,
+  InputDataPlacement = 69,
+  OutputDataPlacement = 70,
   // PropKindSize must always be the last value.
-  PropKindSize = 69,
+  PropKindSize = 71,
 };
 
 struct property_key_base_tag {};
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp
new file mode 100644
index 0000000000000..b7ced46f10edb
--- /dev/null
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp
@@ -0,0 +1,250 @@
+// REQUIRES: sg-8
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies sorting APIs for fixed-size array input from the
+// group_sort extension.
+#include "common.hpp" +#include + +#include +#include +#include +#include +#include + +template +void RunSortOverGroupArray(sycl::queue &Q, const std::vector &DataToSort, + const Compare &Comp, Property Prop) { + + const size_t WorkSize = DataToSort.size() / ElementsPerWorkItem; + const size_t NumSubGroups = WorkSize / ReqSubGroupSize + 1; + + const sycl::nd_range NDRange = [&]() { + if constexpr (Dims == 1) + return sycl::nd_range<1>{{WorkSize}, {WorkSize}}; + else + return sycl::nd_range<2>{{1, WorkSize}, {1, WorkSize}}; + static_assert(Dims < 3, + "Only one and two dimensional kernels are supported"); + }(); + + using DefaultSorterT = + oneapi_exp::default_sorters::group_sorter; + + using RadixSorterT = oneapi_exp::radix_sorters::group_sorter< + typename ConvertToSimpleType::Type, + ConvertToSortingOrder::Type, ElementsPerWorkItem>; + + constexpr bool IsSG = (UseGroup == UseGroupT::SubGroup); + auto Scope = + IsSG ? sycl::memory_scope::sub_group : sycl::memory_scope::work_group; + auto WGSize = NDRange.get_local_range().size(); + auto GroupSize = IsSG ? ReqSubGroupSize : WGSize; + std::size_t LocalMemorySizeDefault = + DefaultSorterT::memory_required(Scope, GroupSize); + std::size_t LocalMemorySizeRadix = + RadixSorterT::memory_required(Scope, GroupSize); + std::array, 4> DataToSortCase = {DataToSort, DataToSort, + DataToSort, DataToSort}; + + // Sort data using 3 different versions of sort_over_group API + { + std::array>, 4> BufToSort; + for (int i = 0; i < 4; i++) + BufToSort[i].reset(new sycl::buffer(DataToSortCase[i].data(), + DataToSortCase[i].size())); + + Q.submit([&](sycl::handler &CGH) { + auto AccToSort0 = sycl::accessor(*BufToSort[0], CGH); + auto AccToSort1 = sycl::accessor(*BufToSort[1], CGH); + auto AccToSort2 = sycl::accessor(*BufToSort[2], CGH); + auto AccToSort3 = sycl::accessor(*BufToSort[3], CGH); + + // Allocate local memory for all sub-groups in a work-group + const size_t TotalLocalMemSizeDefault = + IsSG ? LocalMemorySizeDefault * NumSubGroups + : LocalMemorySizeDefault; + sycl::local_accessor ScratchDefault( + {TotalLocalMemSizeDefault}, CGH); + + const size_t TotalLocalMemSizeRadix = + IsSG ? LocalMemorySizeRadix * NumSubGroups : LocalMemorySizeRadix; + + sycl::local_accessor ScratchRadix({TotalLocalMemSizeRadix}, + CGH); + + CGH.parallel_for( + NDRange, [=](sycl::nd_item id) [[intel::reqd_sub_group_size( + ReqSubGroupSize)]] { + const size_t GlobalLinearID = id.get_global_linear_id(); + using RadixSorterT = oneapi_exp::radix_sorters::group_sorter< + typename ConvertToSimpleType::Type, + ConvertToSortingOrder::Type, ElementsPerWorkItem>; + + auto Group = [&]() { + if constexpr (IsSG) + return id.get_sub_group(); + else + return id.get_group(); + }(); + + // Each sub-group should use its own part of the scratch pad + const size_t ScratchShiftDefault = + IsSG ? 
+                 IsSG ? id.get_sub_group().get_group_linear_id() *
+                            LocalMemorySizeDefault
+                      : 0;
+             std::byte *ScratchPtrDefault =
+                 &ScratchDefault[0] + ScratchShiftDefault;
+
+             T ValsPrivate[ElementsPerWorkItem];
+
+             auto ReadToPrivate = [&](auto Acc) {
+               for (std::size_t I = 0; I < ElementsPerWorkItem; ++I)
+                 ValsPrivate[I] = Acc[GlobalLinearID * ElementsPerWorkItem + I];
+             };
+             auto WriteToGlobal = [&](auto Acc) {
+               for (std::size_t I = 0; I < ElementsPerWorkItem; ++I)
+                 Acc[GlobalLinearID * ElementsPerWorkItem + I] = ValsPrivate[I];
+             };
+
+             auto Scratch =
+                 sycl::span{ScratchPtrDefault, LocalMemorySizeDefault};
+             auto PrivateArr = sycl::span<T, ElementsPerWorkItem>{
+                 ValsPrivate, ValsPrivate + ElementsPerWorkItem};
+             if constexpr (std::is_same_v<Compare, std::less<T>>) {
+               ReadToPrivate(AccToSort0);
+               oneapi_exp::sort_over_group(
+                   oneapi_exp::group_with_scratchpad(Group, Scratch),
+                   PrivateArr, Prop); // (4)
+               WriteToGlobal(AccToSort0);
+             }
+
+             ReadToPrivate(AccToSort1);
+             oneapi_exp::sort_over_group(
+                 oneapi_exp::group_with_scratchpad(Group, Scratch), PrivateArr,
+                 Comp, Prop); // (5)
+             WriteToGlobal(AccToSort1);
+
+             ReadToPrivate(AccToSort2);
+             oneapi_exp::sort_over_group(Group, PrivateArr,
+                                         DefaultSorterT(Scratch), Prop); // (6)
+             WriteToGlobal(AccToSort2);
+
+             // Each sub-group should use its own part of the scratch pad
+             const size_t ScratchShiftRadix =
+                 IsSG ? id.get_sub_group().get_group_linear_id() *
+                            LocalMemorySizeRadix
+                      : 0;
+             std::byte *ScratchPtrRadix = &ScratchRadix[0] + ScratchShiftRadix;
+
+             // Radix doesn't support custom types
+             if constexpr (!std::is_same_v<CustomType, T>) {
+               ReadToPrivate(AccToSort3);
+               oneapi_exp::sort_over_group(
+                   Group, PrivateArr,
+                   RadixSorterT(
+                       sycl::span{ScratchPtrRadix, LocalMemorySizeRadix}),
+                   Prop); // (6)
+               WriteToGlobal(AccToSort3);
+             }
+           });
+     }).wait_and_throw();
+  }
+
+  // Verification
+  {
+    // Emulate independent sorting of each work-group/sub-group
+    const size_t ChunkSize = GroupSize * ElementsPerWorkItem;
+    std::vector<T> TempSorted = DataToSort;
+    auto It = TempSorted.begin();
+    for (; (It + ChunkSize) < TempSorted.end(); It += ChunkSize)
+      std::sort(It, It + ChunkSize, Comp);
+
+    // Sort the remainder
+    std::sort(It, TempSorted.end(), Comp);
+    std::vector<T> DataSorted;
+    DataSorted.resize(TempSorted.size());
+    writeBlockedOrStriped(/*In */ TempSorted, /* Out */ DataSorted, GroupSize,
+                          ElementsPerWorkItem, Prop);
+
+    if constexpr (std::is_same_v<Compare, std::less<T>>)
+      assert(DataToSortCase[0] == DataSorted);
+
+    assert(DataToSortCase[1] == DataSorted);
+    assert(DataToSortCase[2] == DataSorted);
+    // Radix doesn't support custom types
+    if constexpr (!std::is_same_v<CustomType, T>)
+      assert(DataToSortCase[3] == DataSorted);
+  }
+}
+
+template <
+    UseGroupT UseGroup, int Dim, size_t ElementsPerWorkItem, class T,
+    typename Compare,
+    typename Properties = sycl::ext::oneapi::experimental::empty_properties_t>
+void RunOnData(sycl::queue &Q, const std::vector<T> &Data,
+               const Compare &Comparator, Properties Prop = {}) {
+  if constexpr (UseGroup == UseGroupT::SubGroup)
+    if (Q.get_backend() == sycl::backend::ext_oneapi_cuda ||
+        Q.get_backend() == sycl::backend::ext_oneapi_hip) {
+      std::cout << "Note! Skipping sub group testing on CUDA and HIP backends"
+                << std::endl;
+      return;
+    }
+
+  RunSortOverGroupArray<UseGroup, Dim, ElementsPerWorkItem>(Q, Data,
+                                                            Comparator, Prop);
+}
+
+template <class T> void RunOverType(sycl::queue &Q, size_t DataSize) {
+  constexpr size_t PerWI = 4;
+  std::vector<T> ArrayDataRandom(DataSize * PerWI);
+
+  // Fill using random numbers
+  std::default_random_engine generator;
+  std::normal_distribution distribution((10.0), (2.0));
+  for (T &Elem : ArrayDataRandom)
+    Elem = T(distribution(generator));
+
+  auto blocked = oneapi_exp::properties{oneapi_exp::input_data_placement<
+      oneapi_exp::group_algorithm_data_placement::blocked>};
+  auto striped = oneapi_exp::properties{oneapi_exp::input_data_placement<
+      oneapi_exp::group_algorithm_data_placement::striped>};
+
+  RunOnData<UseGroupT::WorkGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                               std::less<T>{}, blocked);
+  RunOnData<UseGroupT::WorkGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                               std::greater<T>{}, striped);
+  RunOnData<UseGroupT::WorkGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                               std::less<T>{}, blocked);
+  RunOnData<UseGroupT::WorkGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                               std::greater<T>{}, striped);
+  RunOnData<UseGroupT::SubGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                              std::greater<T>{}, blocked);
+  RunOnData<UseGroupT::SubGroup, 1, PerWI, T>(Q, ArrayDataRandom,
+                                              std::less<T>{}, striped);
+  RunOnData<UseGroupT::SubGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                              std::greater<T>{}, blocked);
+  RunOnData<UseGroupT::SubGroup, 2, PerWI, T>(Q, ArrayDataRandom,
+                                              std::less<T>{}, striped);
+}
+
+int main() {
+  try {
+    sycl::queue Q;
+
+    static constexpr size_t Size = 18;
+
+    RunOverType<std::int32_t>(Q, Size);
+    RunOverType<CustomType>(Q, Size);
+
+    std::cout << "Test passed." << std::endl;
+    return 0;
+  } catch (std::exception &E) {
+    std::cout << "Test failed" << std::endl;
+    std::cout << E.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
index bec228a074b47..8f3317addb6e0 100644
--- a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
@@ -48,3 +48,45 @@ constexpr size_t ReqSubGroupSize = 8;
 template <UseGroupT UseGroup, int Dims, class T, class Compare>
 class KernelNameOverGroup;
 template <UseGroupT UseGroup, int Dims, class T, class Compare>
 class KernelNameJoint;
+
+template <typename Properties>
+constexpr bool IsOutputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<
+                    oneapi_exp::output_data_placement_key>())
+    return properties.template get_property<
+               oneapi_exp::output_data_placement_key>() ==
+           oneapi_exp::output_data_placement<
+               oneapi_exp::group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+// Write data from In to Out in a blocked/striped way.
+template <typename T, typename Properties>
+void writeBlockedOrStriped(const std::vector<T> &In, std::vector<T> &Out,
+                           size_t MaxGroupSize, size_t ElementsPerWorkItem,
+                           Properties Prop) {
+  assert(In.size() == Out.size());
+  size_t index = {};
+  size_t shift = {};
+  auto ChunkSize = MaxGroupSize * ElementsPerWorkItem;
+  for (std::uint32_t ChunkStart = 0; ChunkStart < In.size();
+       ChunkStart += ChunkSize) {
+    auto GroupSize = (In.size() - ChunkStart) >= ChunkSize
+                         ? MaxGroupSize
+                         : (In.size() - ChunkStart) / ElementsPerWorkItem;
+    for (std::uint32_t j = 0; j < GroupSize; ++j) {
+      for (std::uint32_t k = 0; k < ElementsPerWorkItem; ++k) {
+        index = ChunkStart + j * ElementsPerWorkItem + k;
+        if constexpr (IsOutputBlocked(Prop)) {
+          shift = index;
+        } else {
+          shift = ChunkStart + k * GroupSize + j;
+        }
+        if (index < Out.size() && shift < In.size())
+          Out[index] = In[shift];
+      }
+    }
+  }
+}
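
---

For reviewers: below is a minimal, self-contained sketch of how the new span overload of `sort_over_group` added by this diff is meant to be called from user code. It is not part of the PR; the names `WGSize`, `PerWI`, and `N` are illustrative, and it assumes a device whose maximum work-group size is at least 8. Each work item holds `PerWI` private elements, the whole work-group is sorted cooperatively, and the `output_data_placement` property selects striped write-back into the spans.

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <numeric>
#include <vector>

namespace oneapi_exp = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q;
  constexpr size_t WGSize = 8, PerWI = 4, N = WGSize * PerWI;

  std::vector<int> Data(N);
  std::iota(Data.rbegin(), Data.rend(), 0); // descending input

  {
    sycl::buffer<int> Buf(Data.data(), Data.size());
    Q.submit([&](sycl::handler &CGH) {
      auto Acc = sycl::accessor(Buf, CGH);
      // Ask the default sorter how much local scratch it needs for this
      // group size; undersized scratch is UB per the extension spec.
      using SorterT =
          oneapi_exp::default_sorters::group_sorter<int, std::less<>, PerWI>;
      const size_t ScratchSize =
          SorterT::memory_required(sycl::memory_scope::work_group, WGSize);
      sycl::local_accessor<std::byte, 1> Scratch({ScratchSize}, CGH);

      CGH.parallel_for(
          sycl::nd_range<1>{{WGSize}, {WGSize}}, [=](sycl::nd_item<1> It) {
            const size_t LID = It.get_local_linear_id();
            int Vals[PerWI];
            for (size_t I = 0; I < PerWI; ++I) // blocked read
              Vals[I] = Acc[LID * PerWI + I];

            // New overload: sort PerWI elements per work item and write the
            // result back to the span in striped order.
            oneapi_exp::sort_over_group(
                oneapi_exp::group_with_scratchpad(
                    It.get_group(), sycl::span{&Scratch[0], ScratchSize}),
                sycl::span<int, PerWI>{Vals},
                oneapi_exp::properties{oneapi_exp::output_data_placement<
                    oneapi_exp::group_algorithm_data_placement::striped>});

            for (size_t I = 0; I < PerWI; ++I) // striped write-back
              Acc[I * WGSize + LID] = Vals[I];
          });
    });
  } // Buf's destructor copies the sorted data back into Data.
}
```

With striped output placement, work item `LID` receives elements `LID`, `LID + WGSize`, `LID + 2 * WGSize`, ..., so the striped write-back above lands the fully sorted sequence contiguously in global memory; with the default blocked placement the write-back loop would instead use `Acc[LID * PerWI + I]`.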