Skip to content

[SYCL] Add sorting APIs for fixed-size private array input #14185

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions sycl/include/sycl/detail/group_sort_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,43 @@
#include <sycl/builtins.hpp>
#include <sycl/group_algorithm.hpp>
#include <sycl/group_barrier.hpp>
#include <sycl/sycl_span.hpp>

#include <memory>

namespace sycl {
inline namespace _V1 {
namespace detail {

// Helpers for sorting algorithms
#ifdef __SYCL_DEVICE_ONLY__
template <typename T, typename Group>
static __SYCL_ALWAYS_INLINE T *align_scratch(sycl::span<std::byte> scratch,
Group g,
size_t number_of_elements) {
// Adjust the scratch pointer based on alignment of the type T.
// Per extension specification if scratch size is less than the value
// returned by memory_required then behavior is undefined, so we don't check
// that the scratch size statisfies the requirement.
T *scratch_begin = nullptr;
// We must have a barrier here before array placement new because it is
// possible that scratch memory is already in use, so we need to synchronize
// work items.
sycl::group_barrier(g);
if (g.leader()) {
void *scratch_ptr = scratch.data();
size_t space = scratch.size();
scratch_ptr = std::align(alignof(T), number_of_elements * sizeof(T),
scratch_ptr, space);
scratch_begin = ::new (scratch_ptr) T[number_of_elements];
}
// Broadcast leader's pointer (the beginning of the scratch) to all work
// items in the group.
scratch_begin = sycl::group_broadcast(g, scratch_begin);
return scratch_begin;
}
#endif

// ---- merge sort implementation

// following two functions could be useless if std::[lower|upper]_bound worked
Expand Down
180 changes: 99 additions & 81 deletions sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
#include <sycl/detail/pi.h> // for PI_ERROR_INVALID_DEVICE
#include <sycl/exception.hpp> // for sycl_category, exception
#include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16
#include <sycl/memory_enums.hpp> // for memory_scope
#include <sycl/range.hpp> // for range
#include <sycl/sycl_span.hpp> // for span
#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/memory_enums.hpp> // for memory_scope
#include <sycl/range.hpp> // for range
#include <sycl/sycl_span.hpp> // for span

#ifdef __SYCL_DEVICE_ONLY__
#include <sycl/detail/group_sort_impl.hpp>
Expand All @@ -36,6 +37,54 @@ namespace sycl {
inline namespace _V1 {
namespace ext::oneapi::experimental {

enum class group_algorithm_data_placement { blocked, striped };

struct input_data_placement_key
: detail::compile_time_property_key<detail::PropKind::InputDataPlacement> {
template <group_algorithm_data_placement Placement>
using value_t =
property_value<input_data_placement_key,
std::integral_constant<int, static_cast<int>(Placement)>>;
};

struct output_data_placement_key
: detail::compile_time_property_key<detail::PropKind::OutputDataPlacement> {
template <group_algorithm_data_placement Placement>
using value_t =
property_value<output_data_placement_key,
std::integral_constant<int, static_cast<int>(Placement)>>;
};

template <group_algorithm_data_placement Placement>
inline constexpr input_data_placement_key::value_t<Placement>
input_data_placement;

template <group_algorithm_data_placement Placement>
inline constexpr output_data_placement_key::value_t<Placement>
output_data_placement;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We still have output_data_placement_key and input_data_placement_key in namespace sycl::ext::oneapi::experimental.
Is it expected? What to do with that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is expected, those ones are necessary for kernel properties functionality to work.

Copy link
Contributor

@andreyfe1 andreyfe1 Jun 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, we need to change the Spec accordingly, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes


namespace detail {

template <typename Properties>
constexpr bool isInputBlocked(Properties properties) {
if constexpr (properties.template has_property<input_data_placement_key>())
return properties.template get_property<input_data_placement_key>() ==
input_data_placement<group_algorithm_data_placement::blocked>;
else
return true;
}

template <typename Properties>
constexpr bool isOutputBlocked(Properties properties) {
if constexpr (properties.template has_property<output_data_placement_key>())
return properties.template get_property<output_data_placement_key>() ==
output_data_placement<group_algorithm_data_placement::blocked>;
else
return true;
}

} // namespace detail

// ---- group helpers
template <typename Group, size_t Extent> class group_with_scratchpad {
Group g;
Expand Down Expand Up @@ -63,26 +112,9 @@ template <typename Compare = std::less<>> class default_sorter {
void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
[[maybe_unused]] Ptr last) {
#ifdef __SYCL_DEVICE_ONLY__
// Adjust the scratch pointer based on alignment of the type T.
// Per extension specification if scratch size is less than the value
// returned by memory_required then behavior is undefined, so we don't check
// that the scratch size statisfies the requirement.
using T = typename sycl::detail::GetValueType<Ptr>::type;
T *scratch_begin = nullptr;
size_t n = last - first;
// We must have a barrier here before array placement new because it is
// possible that scratch memory is already in use, so we need to synchronize
// work items.
sycl::group_barrier(g);
if (g.leader()) {
void *scratch_ptr = scratch.data();
size_t space = scratch.size();
scratch_ptr = std::align(alignof(T), n * sizeof(T), scratch_ptr, space);
scratch_begin = ::new (scratch_ptr) T[n];
}
// Broadcast leader's pointer (the beginning of the scratch) to all work
// items in the group.
scratch_begin = sycl::group_broadcast(g, scratch_begin);
T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
#else
throw sycl::exception(
Expand All @@ -94,29 +126,10 @@ template <typename Compare = std::less<>> class default_sorter {
template <typename Group, typename T>
T operator()([[maybe_unused]] Group g, T val) {
#ifdef __SYCL_DEVICE_ONLY__
// Adjust the scratch pointer based on alignment of the type T.
// Per extension specification if scratch size is less than the value
// returned by memory_required then behavior is undefined, so we don't check
// that the scratch size statisfies the requirement.
T *scratch_begin = nullptr;
std::size_t local_id = g.get_local_linear_id();
auto range_size = g.get_local_range().size();
// We must have a barrier here before array placement new because it is
// possible that scratch memory is already in use, so we need to synchronize
// work items.
sycl::group_barrier(g);
if (g.leader()) {
void *scratch_ptr = scratch.data();
size_t space = scratch.size();
scratch_ptr =
std::align(alignof(T), /* output storage and temporary storage */ 2 *
range_size * sizeof(T),
scratch_ptr, space);
scratch_begin = ::new (scratch_ptr) T[2 * range_size];
}
// Broadcast leader's pointer (the beginning of the scratch) to all work
// items in the group.
scratch_begin = sycl::group_broadcast(g, scratch_begin);
T *scratch_begin = sycl::detail::align_scratch<T>(
scratch, g, /* output storage and temporary storage */ 2 * range_size);
scratch_begin[local_id] = val;
sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
scratch_begin + range_size);
Expand Down Expand Up @@ -252,26 +265,9 @@ template <typename CompareT = std::less<>> class joint_sorter {
void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
[[maybe_unused]] Ptr last) {
#ifdef __SYCL_DEVICE_ONLY__
// Adjust the scratch pointer based on alignment of the type T.
// Per extension specification if scratch size is less than the value
// returned by memory_required then behavior is undefined, so we don't check
// that the scratch size statisfies the requirement.
using T = typename sycl::detail::GetValueType<Ptr>::type;
T *scratch_begin = nullptr;
size_t n = last - first;
// We must have a barrier here before array placement new because it is
// possible that scratch memory is already in use, so we need to synchronize
// work items.
sycl::group_barrier(g);
if (g.leader()) {
void *scratch_ptr = scratch.data();
size_t space = scratch.size();
scratch_ptr = std::align(alignof(T), n * sizeof(T), scratch_ptr, space);
scratch_begin = ::new (scratch_ptr) T[n];
}
// Broadcast leader's pointer (the beginning of the scratch) to all work
// items in the group.
scratch_begin = sycl::group_broadcast(g, scratch_begin);
T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
#else
throw sycl::exception(
Expand Down Expand Up @@ -300,29 +296,10 @@ class group_sorter {

template <typename Group> T operator()([[maybe_unused]] Group g, T val) {
#ifdef __SYCL_DEVICE_ONLY__
// Adjust the scratch pointer based on alignment of the type T.
// Per extension specification if scratch size is less than the value
// returned by memory_required then behavior is undefined, so we don't check
// that the scratch size statisfies the requirement.
T *scratch_begin = nullptr;
std::size_t local_id = g.get_local_linear_id();
auto range_size = g.get_local_range().size();
// We must have a barrier here before array placement new because it is
// possible that scratch memory is already in use, so we need to synchronize
// work items.
sycl::group_barrier(g);
if (g.leader()) {
void *scratch_ptr = scratch.data();
size_t space = scratch.size();
scratch_ptr =
std::align(alignof(T), /* output storage and temporary storage */ 2 *
range_size * sizeof(T),
scratch_ptr, space);
scratch_begin = ::new (scratch_ptr) T[2 * range_size];
}
// Broadcast leader's pointer (the beginning of the scratch) to all work
// items in the group.
scratch_begin = sycl::group_broadcast(g, scratch_begin);
T *scratch_begin = sycl::detail::align_scratch<T>(
scratch, g, /* output storage and temporary storage */ 2 * range_size);
scratch_begin[local_id] = val;
sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
scratch_begin + range_size);
Expand All @@ -335,6 +312,34 @@ class group_sorter {
return val;
}

template <typename Group, typename Properties>
void operator()([[maybe_unused]] Group g,
[[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
[[maybe_unused]] Properties properties) {
#ifdef __SYCL_DEVICE_ONLY__
std::size_t local_id = g.get_local_linear_id();
auto wg_size = g.get_local_range().size();
auto number_of_elements = wg_size * ElementsPerWorkItem;
T *scratch_begin = sycl::detail::align_scratch<T>(
scratch, g,
/* output storage and temporary storage */ 2 * number_of_elements);
for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i)
scratch_begin[local_id * ElementsPerWorkItem + i] = values[i];
sycl::detail::merge_sort(g, scratch_begin, number_of_elements, comp,
scratch_begin + number_of_elements);

std::size_t shift{};
for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i) {
if constexpr (detail::isOutputBlocked(properties)) {
shift = local_id * ElementsPerWorkItem + i;
} else {
shift = i * wg_size + local_id;
}
values[i] = scratch_begin[shift];
}
#endif
}

static std::size_t memory_required(sycl::memory_scope scope,
size_t range_size) {
return 2 * joint_sorter<>::template memory_required<T>(
Expand Down Expand Up @@ -480,6 +485,19 @@ class group_sorter {
#endif
}

template <typename Group, typename Properties>
void operator()([[maybe_unused]] Group g,
[[maybe_unused]] sycl::span<ValT, ElementsPerWorkItem> values,
[[maybe_unused]] Properties properties) {
#ifdef __SYCL_DEVICE_ONLY__
sycl::detail::privateStaticSort<
/*is_key_value=*/false, detail::isOutputBlocked(properties),
OrderT == sorting_order::ascending, ElementsPerWorkItem, bits>(
g, values.data(), /*empty*/ values.data(), scratch.data(), first_bit,
last_bit);
#endif
}

static constexpr size_t
memory_required([[maybe_unused]] sycl::memory_scope scope,
size_t range_size) {
Expand Down
53 changes: 53 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,59 @@ sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
default_sorters::group_sorter<T>(exec.get_memory()));
}

template <typename Group, typename T, std::size_t ElementsPerWorkItem,
typename Sorter,
typename Properties = ext::oneapi::experimental::empty_properties_t>
std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
std::decay_t<Properties>>,
void>
sort_over_group([[maybe_unused]] Group g,
[[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
[[maybe_unused]] Sorter sorter,
[[maybe_unused]] Properties properties = {}) {
#ifdef __SYCL_DEVICE_ONLY__
return sorter(g, values, properties);
#else
throw sycl::exception(
std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
"Group algorithms are not supported on host device.");
#endif
}

template <typename Group, typename T, std::size_t Extent,
std::size_t ElementsPerWorkItem,
typename Properties = ext::oneapi::experimental::empty_properties_t>
std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
std::decay_t<Properties>>,
void>
sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
sycl::span<T, ElementsPerWorkItem> values,
Properties properties = {}) {
return sort_over_group(
exec.get_group(), values,
default_sorters::group_sorter<T, std::less<T>, ElementsPerWorkItem>(
exec.get_memory()),
properties);
}

template <typename Group, typename T, std::size_t Extent,
std::size_t ElementsPerWorkItem, typename Compare,
typename Properties = ext::oneapi::experimental::empty_properties_t>
std::enable_if_t<!sycl::ext::oneapi::experimental::is_property_list_v<
std::decay_t<Compare>> &&
sycl::ext::oneapi::experimental::is_property_list_v<
std::decay_t<Properties>>,
void>
sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
sycl::span<T, ElementsPerWorkItem> values, Compare comp,
Properties properties = {}) {
return sort_over_group(
exec.get_group(), values,
default_sorters::group_sorter<T, Compare, ElementsPerWorkItem>(
exec.get_memory(), comp),
properties);
}

// ---- joint_sort
template <typename Group, typename Iter, typename Sorter>
std::enable_if_t<detail::is_sorter<Sorter, Group, Iter>::value, void>
Expand Down
4 changes: 3 additions & 1 deletion sycl/include/sycl/ext/oneapi/properties/property.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,10 @@ enum PropKind : uint32_t {
SingleTaskKernel = 66,
IndirectlyCallable = 67,
CallsIndirectly = 68,
InputDataPlacement = 69,
OutputDataPlacement = 70,
// PropKindSize must always be the last value.
PropKindSize = 69,
PropKindSize = 71,
};

struct property_key_base_tag {};
Expand Down
Loading
Loading