Skip to content

[SYCL][Group algorithms] Add group sorting algorithms implementation #4439

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Sep 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ namespace sycl::ext::oneapi::experimental {
class default_sorter {
public:
template<std::size_t Extent>
default_sorter(sycl::span<uint8_t, Extent> scratch, Compare comp = Compare());
default_sorter(sycl::span<std::byte, Extent> scratch, Compare comp = Compare());

template<typename Group, typename Ptr>
void operator()(Group g, Ptr first, Ptr last);
Expand All @@ -167,7 +167,7 @@ namespace sycl::ext::oneapi::experimental {
class radix_sorter {
public:
template<std::size_t Extent>
radix_sorter(sycl::span<uint8_t, Extent> scratch,
radix_sorter(sycl::span<std::byte, Extent> scratch,
const std::bitset<sizeof(T) * CHAR_BIT> mask =
std::bitset<sizeof(T) * CHAR_BIT> (std::numeric_limits<unsigned long long>::max()));

Expand Down Expand Up @@ -215,7 +215,7 @@ Table 4. Constructors of the `default_sorter` class.
|Constructor|Description

|`template<std::size_t Extent>
default_sorter(sycl::span<uint8_t, Extent> scratch, Compare comp = Compare())`
default_sorter(sycl::span<std::byte, Extent> scratch, Compare comp = Compare())`
|Creates the `default_sorter` object using `comp`.
Additional memory for the algorithm is provided using `scratch`.
If `scratch.size()` is less than the value returned by
Expand Down Expand Up @@ -264,7 +264,7 @@ Table 6. Constructors of the `radix_sorter` class.
|Constructor|Description

|`template<std::size_t Extent>
radix_sorter(sycl::span<uint8_t, Extent> scratch, const std::bitset<sizeof(T) * CHAR_BIT> mask = std::bitset<sizeof(T) * CHAR_BIT>
radix_sorter(sycl::span<std::byte, Extent> scratch, const std::bitset<sizeof(T) * CHAR_BIT> mask = std::bitset<sizeof(T) * CHAR_BIT>
(std::numeric_limits<unsigned long long>::max()))`
|Creates the `radix_sorter` object to sort values considering only bits
that corresponds to 1 in `mask`.
Expand Down Expand Up @@ -350,16 +350,16 @@ namespace sycl::ext::oneapi::experimental {
class group_with_scratchpad
{
public:
group_with_scratchpad(Group group, sycl::span<uint8_t, Extent> scratch);
group_with_scratchpad(Group group, sycl::span<std::byte, Extent> scratch);
Group get_group() const;

sycl::span<uint8_t, Extent>
sycl::span<std::byte, Extent>
get_memory() const;
};

// Deduction guides
template<typename Group, std::size_t Extent>
group_with_scratchpad(Group, sycl::span<uint8_t, Extent>)
group_with_scratchpad(Group, sycl::span<std::byte, Extent>)
-> group_with_scratchpad<Group, Extent>;

}
Expand All @@ -372,7 +372,7 @@ Table 9. Constructors of the `group_with_scratchpad` class.
|===
|Constructor|Description

|`group_with_scratchpad(Group group, sycl::span<uint8_t, Extent> scratch)`
|`group_with_scratchpad(Group group, sycl::span<std::byte, Extent> scratch)`
|Creates the `group_with_scratchpad` object using `group` and `scratch`.
`sycl::is_group_v<std::decay_t<Group>>` must be true.
`scratch.size()` must not be less than value returned by the `memory_required` method
Expand All @@ -388,7 +388,7 @@ Table 10. Member functions of the `group_with_scratchpad` class.
|`Group get_group() const`
|Returns the `Group` class object that is handled by the `group_with_scratchpad` object.

|`sycl::span<uint8_t, Extent>
|`sycl::span<std::byte, Extent>
get_memory() const`
|Returns `sycl::span` that represents an additional memory
that is handled by the `group_with_scratchpad` object.
Expand Down Expand Up @@ -508,7 +508,7 @@ size_t temp_memory_size =

q.submit([&](sycl::handler& h) {
auto acc = sycl::accessor(buf, h);
auto scratch = sycl::local_accessor<uint8_t, 1>( {temp_memory_size}, h );
auto scratch = sycl::local_accessor<std::byte, 1>( {temp_memory_size}, h );

h.parallel_for(
sycl::nd_range<1>{ /*global_size = */ {256}, /*local_size = */ {256} },
Expand Down Expand Up @@ -546,7 +546,7 @@ size_t temp_memory_size =

q.submit([&](sycl::handler& h) {
auto acc = sycl::accessor(buf, h);
auto scratch = sycl::local_accessor<uint8_t, 1>( {temp_memory_size}, h);
auto scratch = sycl::local_accessor<std::byte, 1>( {temp_memory_size}, h);

h.parallel_for(
sycl::nd_range<1>{ local_range, local_range },
Expand Down Expand Up @@ -583,7 +583,7 @@ size_t temp_memory_size =
q.submit([&](sycl::handler& h) {
auto keys_acc = sycl::accessor(keys_buf, h);
auto vals_acc = sycl::accessor(vals_buf, h);
auto scratch = sycl::local_accessor<uint8_t, 1>( {temp_memory_size}, h);
auto scratch = sycl::local_accessor<std::byte, 1>( {temp_memory_size}, h);

h.parallel_for(
sycl::nd_range<1>{ /*global_size = */ {1024}, /*local_size = */ {256} },
Expand Down
256 changes: 256 additions & 0 deletions sycl/include/CL/sycl/detail/group_sort_impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
//==------------ group_sort_impl.hpp ---------------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file includes some functions for group sorting algorithm implementations
//

#pragma once

#if __cplusplus >= 201703L
#include <CL/sycl/detail/helpers.hpp>

#ifdef __SYCL_DEVICE_ONLY__

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace detail {

// ---- merge sort implementation

// following two functions could be useless if std::[lower|upper]_bound worked
// well
template <typename Acc, typename Value, typename Compare>
std::size_t lower_bound(Acc acc, std::size_t first, std::size_t last,
const Value &value, Compare comp) {
std::size_t n = last - first;
std::size_t cur = n;
std::size_t it;
while (n > 0) {
it = first;
cur = n / 2;
it += cur;
if (comp(acc[it], value)) {
n -= cur + 1, first = ++it;
} else
n = cur;
}
return first;
}

template <typename Acc, typename Value, typename Compare>
std::size_t upper_bound(Acc acc, const std::size_t first,
const std::size_t last, const Value &value,
Compare comp) {
return detail::lower_bound(acc, first, last, value,
[comp](auto x, auto y) { return !comp(y, x); });
}

// swap for all data types including tuple-like types
template <typename T> void swap_tuples(T &a, T &b) { std::swap(a, b); }

template <template <typename...> class TupleLike, typename T1, typename T2>
void swap_tuples(TupleLike<T1, T2> &&a, TupleLike<T1, T2> &&b) {
std::swap(std::get<0>(a), std::get<0>(b));
std::swap(std::get<1>(a), std::get<1>(b));
}

template <typename Iter> struct GetValueType {
using type = typename std::iterator_traits<Iter>::value_type;
};

template <typename ElementType, access::address_space Space>
struct GetValueType<sycl::multi_ptr<ElementType, Space>> {
using type = ElementType;
};

// since we couldn't assign data to raw memory, it's better to use placement
// for first assignment
template <typename Acc, typename T>
void set_value(Acc ptr, const std::size_t idx, const T &val, bool is_first) {
if (is_first) {
::new (ptr + idx) T(val);
} else {
ptr[idx] = val;
}
}

template <typename InAcc, typename OutAcc, typename Compare>
void merge(const std::size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
const std::size_t start_1, const std::size_t end_1,
const std::size_t end_2, const std::size_t start_out, Compare comp,
const std::size_t chunk, bool is_first) {
const std::size_t start_2 = end_1;
// Borders of the sequences to merge within this call
const std::size_t local_start_1 =
sycl::min(static_cast<std::size_t>(offset + start_1), end_1);
const std::size_t local_end_1 =
sycl::min(static_cast<std::size_t>(local_start_1 + chunk), end_1);
const std::size_t local_start_2 =
sycl::min(static_cast<std::size_t>(offset + start_2), end_2);
const std::size_t local_end_2 =
sycl::min(static_cast<std::size_t>(local_start_2 + chunk), end_2);

const std::size_t local_size_1 = local_end_1 - local_start_1;
const std::size_t local_size_2 = local_end_2 - local_start_2;

// TODO: process cases where all elements of 1st sequence > 2nd, 2nd > 1st
// to improve performance

// Process 1st sequence
if (local_start_1 < local_end_1) {
// Reduce the range for searching within the 2nd sequence and handle bound
// items find left border in 2nd sequence
const auto local_l_item_1 = in_acc1[local_start_1];
std::size_t l_search_bound_2 =
detail::lower_bound(in_acc1, start_2, end_2, local_l_item_1, comp);
const std::size_t l_shift_1 = local_start_1 - start_1;
const std::size_t l_shift_2 = l_search_bound_2 - start_2;

set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_1,
is_first);

std::size_t r_search_bound_2{};
// find right border in 2nd sequence
if (local_size_1 > 1) {
const auto local_r_item_1 = in_acc1[local_end_1 - 1];
r_search_bound_2 = detail::lower_bound(in_acc1, l_search_bound_2, end_2,
local_r_item_1, comp);
const auto r_shift_1 = local_end_1 - 1 - start_1;
const auto r_shift_2 = r_search_bound_2 - start_2;

set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_1,
is_first);
}

// Handle intermediate items
for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; ++idx) {
const auto intermediate_item_1 = in_acc1[idx];
// we shouldn't seek in whole 2nd sequence. Just for the part where the
// 1st sequence should be
l_search_bound_2 =
detail::lower_bound(in_acc1, l_search_bound_2, r_search_bound_2,
intermediate_item_1, comp);
const std::size_t shift_1 = idx - start_1;
const std::size_t shift_2 = l_search_bound_2 - start_2;

set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_1,
is_first);
}
}
// Process 2nd sequence
if (local_start_2 < local_end_2) {
// Reduce the range for searching within the 1st sequence and handle bound
// items find left border in 1st sequence
const auto local_l_item_2 = in_acc1[local_start_2];
std::size_t l_search_bound_1 =
detail::upper_bound(in_acc1, start_1, end_1, local_l_item_2, comp);
const std::size_t l_shift_1 = l_search_bound_1 - start_1;
const std::size_t l_shift_2 = local_start_2 - start_2;

set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_2,
is_first);

std::size_t r_search_bound_1{};
// find right border in 1st sequence
if (local_size_2 > 1) {
const auto local_r_item_2 = in_acc1[local_end_2 - 1];
r_search_bound_1 = detail::upper_bound(in_acc1, l_search_bound_1, end_1,
local_r_item_2, comp);
const std::size_t r_shift_1 = r_search_bound_1 - start_1;
const std::size_t r_shift_2 = local_end_2 - 1 - start_2;

set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_2,
is_first);
}

// Handle intermediate items
for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) {
const auto intermediate_item_2 = in_acc1[idx];
// we shouldn't seek in whole 1st sequence. Just for the part where the
// 2nd sequence should be
l_search_bound_1 =
detail::upper_bound(in_acc1, l_search_bound_1, r_search_bound_1,
intermediate_item_2, comp);
const std::size_t shift_1 = l_search_bound_1 - start_1;
const std::size_t shift_2 = idx - start_2;

set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_2,
is_first);
}
}
}

template <typename Iter, typename Compare>
void bubble_sort(Iter first, const std::size_t begin, const std::size_t end,
Compare comp) {
if (begin < end) {
for (std::size_t i = begin; i < end; ++i) {
// Handle intermediate items
for (std::size_t idx = i + 1; idx < end; ++idx) {
if (comp(first[idx], first[i])) {
detail::swap_tuples(first[i], first[idx]);
}
}
}
}
}

template <typename Group, typename Iter, typename Compare>
void merge_sort(Group group, Iter first, const std::size_t n, Compare comp,
std::byte *scratch) {
using T = typename GetValueType<Iter>::type;
auto id = sycl::detail::Builder::getNDItem<Group::dimensions>();
const std::size_t idx = id.get_local_linear_id();
const std::size_t local = group.get_local_range().size();
const std::size_t chunk = (n - 1) / local + 1;

// we need to sort within work item first
bubble_sort(first, idx * chunk, sycl::min((idx + 1) * chunk, n), comp);
id.barrier();

T *temp = reinterpret_cast<T *>(scratch);
bool data_in_temp = false;
bool is_first = true;
std::size_t sorted_size = 1;
while (sorted_size * chunk < n) {
const std::size_t start_1 =
sycl::min(2 * sorted_size * chunk * (idx / sorted_size), n);
const std::size_t end_1 = sycl::min(start_1 + sorted_size * chunk, n);
const std::size_t end_2 = sycl::min(end_1 + sorted_size * chunk, n);
const std::size_t offset = chunk * (idx % sorted_size);

if (!data_in_temp) {
merge(offset, first, temp, start_1, end_1, end_2, start_1, comp, chunk,
is_first);
} else {
merge(offset, temp, first, start_1, end_1, end_2, start_1, comp, chunk,
/*is_first*/ false);
}
id.barrier();

data_in_temp = !data_in_temp;
sorted_size *= 2;
if (is_first)
is_first = false;
}

// copy back if data is in a temporary storage
if (data_in_temp) {
for (std::size_t i = 0; i < chunk; ++i) {
if (idx * chunk + i < n) {
first[idx * chunk + i] = temp[idx * chunk + i];
}
}
id.barrier();
}
}

} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
#endif
#endif // __cplusplus >=201703L
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/group_algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <CL/sycl/nd_item.hpp>
#include <CL/sycl/sub_group.hpp>
#include <sycl/ext/oneapi/functional.hpp>
#include <sycl/ext/oneapi/group_sort.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
Expand Down
Loading