Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Extend sub-group load/store tests to cover 3-, 16-elements vectors #253

Merged
merged 7 commits into from
May 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 21 additions & 9 deletions SYCL/SubGroup/helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
#include <CL/sycl.hpp>
#include <bitset>
#include <cmath>
#include <complex>
#include <iostream>
Expand Down Expand Up @@ -36,6 +37,17 @@ template <typename T2> struct utils<T2, 2> {
std::to_string((T2)v.s1()) + " )";
}
};
template <typename T2> struct utils<T2, 3> {
static T2 add_vec(const vec<T2, 3> &v) { return v.s0() + v.s1() + v.s2(); }
static bool cmp_vec(const vec<T2, 3> &v, const vec<T2, 3> &r) {
return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2();
}
static std::string stringify_vec(const vec<T2, 2> &v) {
return std::string("(") + std::to_string((T2)v.s0()) + ", " +
std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) +
" )";
}
};
template <typename T2> struct utils<T2, 4> {
static T2 add_vec(const vec<T2, 4> &v) {
return v.s0() + v.s1() + v.s2() + v.s3();
Expand Down Expand Up @@ -98,9 +110,12 @@ template <typename T2> struct utils<T2, 16> {

template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
if (std::is_floating_point<T>::value) {
if (std::fabs(val - ref) > 0.01) {
std::cout << "Unexpected result for " << name << ": " << (double)val
<< " expected value: " << (double)ref << std::endl;
auto cmp_val = std::bitset<CHAR_BIT * sizeof(T)>(val);
auto cmp_ref = std::bitset<CHAR_BIT * sizeof(T)>(ref);
if (cmp_val != cmp_ref) {
std::cout << "Unexpected result for " << name << ": " << val << "("
<< cmp_val << ") expected value: " << ref << "(" << cmp_ref
<< ")" << std::endl;
exit(1);
}
} else {
Expand All @@ -115,12 +130,9 @@ template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
template <typename T>
void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
const char *name) {
if (std::fabs(val.real() - ref.real()) > 0.01 ||
std::fabs(val.imag() - ref.imag()) > 0.01) {
std::cout << "Unexpected result for " << name << ": " << val
<< " expected value: " << ref << std::endl;
exit(1);
}
std::string Name{name};
exit_if_not_equal(val.real(), ref.real(), (Name + ".real()").c_str());
exit_if_not_equal(val.imag(), ref.imag(), (Name + ".imag()").c_str());
}

template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {
Expand Down
50 changes: 30 additions & 20 deletions SYCL/SubGroup/load_store.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
// UNSUPPORTED: cpu
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
// runtime for every supported ISA
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
// runtime for every supported ISA
// RUNx %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//
Expand All @@ -23,15 +21,10 @@ template <typename T, int N> class sycl_subgr;
using namespace cl::sycl;

template <typename T, int N> void check(queue &Queue) {
const int G = 1024, L = 128;
const int G = 512, L = 256;

// Pad arrays based on sub-group size to ensure no out-of-bounds accesses
// Workaround for info::device::sub_group_sizes support on some devices
size_t max_sg_size = 128;
#if 0
auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
#endif

try {
nd_range<1> NdRange(G, L);
Expand All @@ -41,7 +34,7 @@ template <typename T, int N> void check(queue &Queue) {
auto acc = syclbuf.template get_access<access::mode::read_write>();
for (int i = 0; i < G; i++) {
acc[i] = i;
acc[i] += 0.1; // Check that floating point types are not casted to int
acc[i] += 0.25; // Check that floating point types are not casted to int
}
}
Queue.submit([&](handler &cgh) {
Expand All @@ -51,22 +44,24 @@ template <typename T, int N> void check(queue &Queue) {
{L + max_sg_size * N}, cgh);
cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
ONEAPI::sub_group SG = NdItem.get_sub_group();
if (SG.get_group_id().get(0) % N == 0) {
size_t SGOffset =
SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
auto SGid = SG.get_group_id().get(0);
auto SGsize = SG.get_max_local_range().get(0);
/* Avoid overlapping data ranges inside and between local groups */
if (SGid % N == 0 && (SGid + N) * SGsize <= L) {
size_t SGOffset = SGid * SGsize;
size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
multi_ptr<T, access::address_space::global_space> mp(
&acc[WGSGoffset]);
multi_ptr<T, access::address_space::local_space> MPL(
&LocalMem[SGOffset]);
// Add all values in read block
vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
vec<T, N> v(SG.load<N, T>(mp));
SG.store<N, T>(MPL, v);
vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
SG.store<N, T>(mp, t);
}
if (NdItem.get_global_id(0) == 0)
sgsizeacc[0] = SG.get_max_local_range()[0];
sgsizeacc[0] = SGsize;
});
});
auto acc = syclbuf.template get_access<access::mode::read_write>();
Expand All @@ -86,12 +81,11 @@ template <typename T, int N> void check(queue &Queue) {
ref = acc[j - (SGid % N) * sg_size];
} else {
for (int i = 0; i < N; i++) {
ref += (T)(j + i * sg_size) + 0.1;
ref += (T)(j + i * sg_size) + 0.25;
}
ref *= N;
}
/* There is no defined out-of-range behavior for these functions. */
if ((SGid + N) * sg_size < L) {
if ((SGid + N) * sg_size <= L) {
std::string s("Vector<");
s += std::string(typeid(ref).name()) + std::string(",") +
std::to_string(N) + std::string(">[") + std::to_string(j) +
Expand Down Expand Up @@ -181,37 +175,47 @@ int main() {
check<aligned_int>(Queue);
check<aligned_int, 1>(Queue);
check<aligned_int, 2>(Queue);
check<aligned_int, 3>(Queue);
check<aligned_int, 4>(Queue);
check<aligned_int, 8>(Queue);
check<aligned_int, 16>(Queue);
typedef unsigned int aligned_uint __attribute__((aligned(16)));
check<aligned_uint>(Queue);
check<aligned_uint, 1>(Queue);
check<aligned_uint, 2>(Queue);
check<aligned_uint, 3>(Queue);
check<aligned_uint, 4>(Queue);
check<aligned_uint, 8>(Queue);
check<aligned_uint, 16>(Queue);
typedef float aligned_float __attribute__((aligned(16)));
check<aligned_float>(Queue);
check<aligned_float, 1>(Queue);
check<aligned_float, 2>(Queue);
check<aligned_float, 3>(Queue);
check<aligned_float, 4>(Queue);
check<aligned_float, 8>(Queue);
check<aligned_float, 16>(Queue);
}
if (Queue.get_device().has_extension("cl_intel_subgroups_short") ||
PlatformName.find("CUDA") != std::string::npos) {
typedef short aligned_short __attribute__((aligned(16)));
check<aligned_short>(Queue);
check<aligned_short, 1>(Queue);
check<aligned_short, 2>(Queue);
check<aligned_short, 3>(Queue);
check<aligned_short, 4>(Queue);
check<aligned_short, 8>(Queue);
check<aligned_short, 16>(Queue);
if (Queue.get_device().has_extension("cl_khr_fp16") ||
PlatformName.find("CUDA") != std::string::npos) {
typedef half aligned_half __attribute__((aligned(16)));
check<aligned_half>(Queue);
check<aligned_half, 1>(Queue);
check<aligned_half, 2>(Queue);
check<aligned_half, 3>(Queue);
check<aligned_half, 4>(Queue);
check<aligned_half, 8>(Queue);
check<aligned_half, 16>(Queue);
}
}
if (Queue.get_device().has_extension("cl_intel_subgroups_long") ||
Expand All @@ -220,20 +224,26 @@ int main() {
check<aligned_long>(Queue);
check<aligned_long, 1>(Queue);
check<aligned_long, 2>(Queue);
check<aligned_long, 3>(Queue);
check<aligned_long, 4>(Queue);
check<aligned_long, 8>(Queue);
check<aligned_long, 16>(Queue);
typedef unsigned long aligned_ulong __attribute__((aligned(16)));
check<aligned_ulong>(Queue);
check<aligned_ulong, 1>(Queue);
check<aligned_ulong, 2>(Queue);
check<aligned_ulong, 3>(Queue);
check<aligned_ulong, 4>(Queue);
check<aligned_ulong, 8>(Queue);
check<aligned_ulong, 16>(Queue);
typedef double aligned_double __attribute__((aligned(16)));
check<aligned_double>(Queue);
check<aligned_double, 1>(Queue);
check<aligned_double, 2>(Queue);
check<aligned_double, 3>(Queue);
check<aligned_double, 4>(Queue);
check<aligned_double, 8>(Queue);
check<aligned_double, 16>(Queue);
}
std::cout << "Test passed." << std::endl;
return 0;
Expand Down