Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit c78a789

Browse files
authored
[SYCL] Extend sub-group load/store tests to cover 3-, 16-elements vectors (#253)
- Add processing of vectors of 3 and 16 elements. - Increase local group size to fit 16-element vectors when sub-group size is 16. - Tune check to avoid overlapping data ranges inside and between local groups. - Implemented bitwise comparison for floating point types except half. Valid range for half type is too narrow and will require to rework data for several tests. This is out of scope for current PR. .
1 parent 2e66331 commit c78a789

File tree

2 files changed

+51
-29
lines changed

2 files changed

+51
-29
lines changed

SYCL/SubGroup/helper.hpp

+21-9
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88
#include <CL/sycl.hpp>
9+
#include <bitset>
910
#include <cmath>
1011
#include <complex>
1112
#include <iostream>
@@ -36,6 +37,17 @@ template <typename T2> struct utils<T2, 2> {
3637
std::to_string((T2)v.s1()) + " )";
3738
}
3839
};
40+
template <typename T2> struct utils<T2, 3> {
41+
static T2 add_vec(const vec<T2, 3> &v) { return v.s0() + v.s1() + v.s2(); }
42+
static bool cmp_vec(const vec<T2, 3> &v, const vec<T2, 3> &r) {
43+
return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2();
44+
}
45+
static std::string stringify_vec(const vec<T2, 2> &v) {
46+
return std::string("(") + std::to_string((T2)v.s0()) + ", " +
47+
std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) +
48+
" )";
49+
}
50+
};
3951
template <typename T2> struct utils<T2, 4> {
4052
static T2 add_vec(const vec<T2, 4> &v) {
4153
return v.s0() + v.s1() + v.s2() + v.s3();
@@ -98,9 +110,12 @@ template <typename T2> struct utils<T2, 16> {
98110

99111
template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
100112
if (std::is_floating_point<T>::value) {
101-
if (std::fabs(val - ref) > 0.01) {
102-
std::cout << "Unexpected result for " << name << ": " << (double)val
103-
<< " expected value: " << (double)ref << std::endl;
113+
auto cmp_val = std::bitset<CHAR_BIT * sizeof(T)>(val);
114+
auto cmp_ref = std::bitset<CHAR_BIT * sizeof(T)>(ref);
115+
if (cmp_val != cmp_ref) {
116+
std::cout << "Unexpected result for " << name << ": " << val << "("
117+
<< cmp_val << ") expected value: " << ref << "(" << cmp_ref
118+
<< ")" << std::endl;
104119
exit(1);
105120
}
106121
} else {
@@ -115,12 +130,9 @@ template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
115130
template <typename T>
116131
void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
117132
const char *name) {
118-
if (std::fabs(val.real() - ref.real()) > 0.01 ||
119-
std::fabs(val.imag() - ref.imag()) > 0.01) {
120-
std::cout << "Unexpected result for " << name << ": " << val
121-
<< " expected value: " << ref << std::endl;
122-
exit(1);
123-
}
133+
std::string Name{name};
134+
exit_if_not_equal(val.real(), ref.real(), (Name + ".real()").c_str());
135+
exit_if_not_equal(val.imag(), ref.imag(), (Name + ".imag()").c_str());
124136
}
125137

126138
template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {

SYCL/SubGroup/load_store.cpp

+30-20
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
// UNSUPPORTED: cpu
2-
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
3-
// runtime for every supported ISA
4-
//
51
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
62
// RUN: %HOST_RUN_PLACEHOLDER %t.out
7-
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
4+
// runtime for every supported ISA
5+
// RUNx %CPU_RUN_PLACEHOLDER %t.out
86
// RUN: %GPU_RUN_PLACEHOLDER %t.out
97
// RUN: %ACC_RUN_PLACEHOLDER %t.out
108
//
@@ -23,15 +21,10 @@ template <typename T, int N> class sycl_subgr;
2321
using namespace cl::sycl;
2422

2523
template <typename T, int N> void check(queue &Queue) {
26-
const int G = 1024, L = 128;
24+
const int G = 512, L = 256;
2725

28-
// Pad arrays based on sub-group size to ensure no out-of-bounds accesses
29-
// Workaround for info::device::sub_group_sizes support on some devices
30-
size_t max_sg_size = 128;
31-
#if 0
3226
auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
3327
size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
34-
#endif
3528

3629
try {
3730
nd_range<1> NdRange(G, L);
@@ -41,7 +34,7 @@ template <typename T, int N> void check(queue &Queue) {
4134
auto acc = syclbuf.template get_access<access::mode::read_write>();
4235
for (int i = 0; i < G; i++) {
4336
acc[i] = i;
44-
acc[i] += 0.1; // Check that floating point types are not casted to int
37+
acc[i] += 0.25; // Check that floating point types are not casted to int
4538
}
4639
}
4740
Queue.submit([&](handler &cgh) {
@@ -51,22 +44,24 @@ template <typename T, int N> void check(queue &Queue) {
5144
{L + max_sg_size * N}, cgh);
5245
cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
5346
ONEAPI::sub_group SG = NdItem.get_sub_group();
54-
if (SG.get_group_id().get(0) % N == 0) {
55-
size_t SGOffset =
56-
SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
47+
auto SGid = SG.get_group_id().get(0);
48+
auto SGsize = SG.get_max_local_range().get(0);
49+
/* Avoid overlapping data ranges inside and between local groups */
50+
if (SGid % N == 0 && (SGid + N) * SGsize <= L) {
51+
size_t SGOffset = SGid * SGsize;
5752
size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
5853
multi_ptr<T, access::address_space::global_space> mp(
5954
&acc[WGSGoffset]);
6055
multi_ptr<T, access::address_space::local_space> MPL(
6156
&LocalMem[SGOffset]);
6257
// Add all values in read block
63-
vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
58+
vec<T, N> v(SG.load<N, T>(mp));
6459
SG.store<N, T>(MPL, v);
6560
vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
6661
SG.store<N, T>(mp, t);
6762
}
6863
if (NdItem.get_global_id(0) == 0)
69-
sgsizeacc[0] = SG.get_max_local_range()[0];
64+
sgsizeacc[0] = SGsize;
7065
});
7166
});
7267
auto acc = syclbuf.template get_access<access::mode::read_write>();
@@ -86,12 +81,11 @@ template <typename T, int N> void check(queue &Queue) {
8681
ref = acc[j - (SGid % N) * sg_size];
8782
} else {
8883
for (int i = 0; i < N; i++) {
89-
ref += (T)(j + i * sg_size) + 0.1;
84+
ref += (T)(j + i * sg_size) + 0.25;
9085
}
91-
ref *= N;
9286
}
9387
/* There is no defined out-of-range behavior for these functions. */
94-
if ((SGid + N) * sg_size < L) {
88+
if ((SGid + N) * sg_size <= L) {
9589
std::string s("Vector<");
9690
s += std::string(typeid(ref).name()) + std::string(",") +
9791
std::to_string(N) + std::string(">[") + std::to_string(j) +
@@ -181,37 +175,47 @@ int main() {
181175
check<aligned_int>(Queue);
182176
check<aligned_int, 1>(Queue);
183177
check<aligned_int, 2>(Queue);
178+
check<aligned_int, 3>(Queue);
184179
check<aligned_int, 4>(Queue);
185180
check<aligned_int, 8>(Queue);
181+
check<aligned_int, 16>(Queue);
186182
typedef unsigned int aligned_uint __attribute__((aligned(16)));
187183
check<aligned_uint>(Queue);
188184
check<aligned_uint, 1>(Queue);
189185
check<aligned_uint, 2>(Queue);
186+
check<aligned_uint, 3>(Queue);
190187
check<aligned_uint, 4>(Queue);
191188
check<aligned_uint, 8>(Queue);
189+
check<aligned_uint, 16>(Queue);
192190
typedef float aligned_float __attribute__((aligned(16)));
193191
check<aligned_float>(Queue);
194192
check<aligned_float, 1>(Queue);
195193
check<aligned_float, 2>(Queue);
194+
check<aligned_float, 3>(Queue);
196195
check<aligned_float, 4>(Queue);
197196
check<aligned_float, 8>(Queue);
197+
check<aligned_float, 16>(Queue);
198198
}
199199
if (Queue.get_device().has_extension("cl_intel_subgroups_short") ||
200200
PlatformName.find("CUDA") != std::string::npos) {
201201
typedef short aligned_short __attribute__((aligned(16)));
202202
check<aligned_short>(Queue);
203203
check<aligned_short, 1>(Queue);
204204
check<aligned_short, 2>(Queue);
205+
check<aligned_short, 3>(Queue);
205206
check<aligned_short, 4>(Queue);
206207
check<aligned_short, 8>(Queue);
208+
check<aligned_short, 16>(Queue);
207209
if (Queue.get_device().has_extension("cl_khr_fp16") ||
208210
PlatformName.find("CUDA") != std::string::npos) {
209211
typedef half aligned_half __attribute__((aligned(16)));
210212
check<aligned_half>(Queue);
211213
check<aligned_half, 1>(Queue);
212214
check<aligned_half, 2>(Queue);
215+
check<aligned_half, 3>(Queue);
213216
check<aligned_half, 4>(Queue);
214217
check<aligned_half, 8>(Queue);
218+
check<aligned_half, 16>(Queue);
215219
}
216220
}
217221
if (Queue.get_device().has_extension("cl_intel_subgroups_long") ||
@@ -220,20 +224,26 @@ int main() {
220224
check<aligned_long>(Queue);
221225
check<aligned_long, 1>(Queue);
222226
check<aligned_long, 2>(Queue);
227+
check<aligned_long, 3>(Queue);
223228
check<aligned_long, 4>(Queue);
224229
check<aligned_long, 8>(Queue);
230+
check<aligned_long, 16>(Queue);
225231
typedef unsigned long aligned_ulong __attribute__((aligned(16)));
226232
check<aligned_ulong>(Queue);
227233
check<aligned_ulong, 1>(Queue);
228234
check<aligned_ulong, 2>(Queue);
235+
check<aligned_ulong, 3>(Queue);
229236
check<aligned_ulong, 4>(Queue);
230237
check<aligned_ulong, 8>(Queue);
238+
check<aligned_ulong, 16>(Queue);
231239
typedef double aligned_double __attribute__((aligned(16)));
232240
check<aligned_double>(Queue);
233241
check<aligned_double, 1>(Queue);
234242
check<aligned_double, 2>(Queue);
243+
check<aligned_double, 3>(Queue);
235244
check<aligned_double, 4>(Queue);
236245
check<aligned_double, 8>(Queue);
246+
check<aligned_double, 16>(Queue);
237247
}
238248
std::cout << "Test passed." << std::endl;
239249
return 0;

0 commit comments

Comments
 (0)