1
- // UNSUPPORTED: cpu
2
- // #2252 Disable until all variants of built-ins are available in OpenCL CPU
3
- // runtime for every supported ISA
4
- //
5
1
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
6
2
// RUN: %HOST_RUN_PLACEHOLDER %t.out
7
- // RUN: %CPU_RUN_PLACEHOLDER %t.out
3
+ // #2252 Disable until all variants of built-ins are available in OpenCL CPU
4
+ // runtime for every supported ISA
5
+ // RUNx %CPU_RUN_PLACEHOLDER %t.out
8
6
// RUN: %GPU_RUN_PLACEHOLDER %t.out
9
7
// RUN: %ACC_RUN_PLACEHOLDER %t.out
10
8
//
@@ -23,15 +21,10 @@ template <typename T, int N> class sycl_subgr;
23
21
using namespace cl ::sycl;
24
22
25
23
template <typename T, int N> void check (queue &Queue) {
26
- const int G = 1024 , L = 128 ;
24
+ const int G = 512 , L = 256 ;
27
25
28
- // Pad arrays based on sub-group size to ensure no out-of-bounds accesses
29
- // Workaround for info::device::sub_group_sizes support on some devices
30
- size_t max_sg_size = 128 ;
31
- #if 0
32
26
auto sg_sizes = Queue.get_device ().get_info <info::device::sub_group_sizes>();
33
27
size_t max_sg_size = *std::max_element (sg_sizes.begin (), sg_sizes.end ());
34
- #endif
35
28
36
29
try {
37
30
nd_range<1 > NdRange (G, L);
@@ -41,7 +34,7 @@ template <typename T, int N> void check(queue &Queue) {
41
34
auto acc = syclbuf.template get_access <access ::mode::read_write>();
42
35
for (int i = 0 ; i < G; i++) {
43
36
acc[i] = i;
44
- acc[i] += 0.1 ; // Check that floating point types are not casted to int
37
+ acc[i] += 0.25 ; // Check that floating point types are not casted to int
45
38
}
46
39
}
47
40
Queue.submit ([&](handler &cgh) {
@@ -51,22 +44,24 @@ template <typename T, int N> void check(queue &Queue) {
51
44
{L + max_sg_size * N}, cgh);
52
45
cgh.parallel_for <sycl_subgr<T, N>>(NdRange, [=](nd_item<1 > NdItem) {
53
46
ONEAPI::sub_group SG = NdItem.get_sub_group ();
54
- if (SG.get_group_id ().get (0 ) % N == 0 ) {
55
- size_t SGOffset =
56
- SG.get_group_id ().get (0 ) * SG.get_max_local_range ().get (0 );
47
+ auto SGid = SG.get_group_id ().get (0 );
48
+ auto SGsize = SG.get_max_local_range ().get (0 );
49
+ /* Avoid overlapping data ranges inside and between local groups */
50
+ if (SGid % N == 0 && (SGid + N) * SGsize <= L) {
51
+ size_t SGOffset = SGid * SGsize;
57
52
size_t WGSGoffset = NdItem.get_group (0 ) * L + SGOffset;
58
53
multi_ptr<T, access ::address_space::global_space> mp (
59
54
&acc[WGSGoffset]);
60
55
multi_ptr<T, access ::address_space::local_space> MPL (
61
56
&LocalMem[SGOffset]);
62
57
// Add all values in read block
63
- vec<T, N> v (utils<T, N>:: add_vec ( SG.load <N, T>(mp) ));
58
+ vec<T, N> v (SG.load <N, T>(mp));
64
59
SG.store <N, T>(MPL, v);
65
60
vec<T, N> t (utils<T, N>::add_vec (SG.load <N, T>(MPL)));
66
61
SG.store <N, T>(mp, t);
67
62
}
68
63
if (NdItem.get_global_id (0 ) == 0 )
69
- sgsizeacc[0 ] = SG. get_max_local_range ()[ 0 ] ;
64
+ sgsizeacc[0 ] = SGsize ;
70
65
});
71
66
});
72
67
auto acc = syclbuf.template get_access <access ::mode::read_write>();
@@ -86,12 +81,11 @@ template <typename T, int N> void check(queue &Queue) {
86
81
ref = acc[j - (SGid % N) * sg_size];
87
82
} else {
88
83
for (int i = 0 ; i < N; i++) {
89
- ref += (T)(j + i * sg_size) + 0.1 ;
84
+ ref += (T)(j + i * sg_size) + 0.25 ;
90
85
}
91
- ref *= N;
92
86
}
93
87
/* There is no defined out-of-range behavior for these functions. */
94
- if ((SGid + N) * sg_size < L) {
88
+ if ((SGid + N) * sg_size <= L) {
95
89
std::string s (" Vector<" );
96
90
s += std::string (typeid (ref).name ()) + std::string (" ," ) +
97
91
std::to_string (N) + std::string (" >[" ) + std::to_string (j) +
@@ -181,37 +175,47 @@ int main() {
181
175
check<aligned_int>(Queue);
182
176
check<aligned_int, 1 >(Queue);
183
177
check<aligned_int, 2 >(Queue);
178
+ check<aligned_int, 3 >(Queue);
184
179
check<aligned_int, 4 >(Queue);
185
180
check<aligned_int, 8 >(Queue);
181
+ check<aligned_int, 16 >(Queue);
186
182
typedef unsigned int aligned_uint __attribute__ ((aligned (16 )));
187
183
check<aligned_uint>(Queue);
188
184
check<aligned_uint, 1 >(Queue);
189
185
check<aligned_uint, 2 >(Queue);
186
+ check<aligned_uint, 3 >(Queue);
190
187
check<aligned_uint, 4 >(Queue);
191
188
check<aligned_uint, 8 >(Queue);
189
+ check<aligned_uint, 16 >(Queue);
192
190
typedef float aligned_float __attribute__ ((aligned (16 )));
193
191
check<aligned_float>(Queue);
194
192
check<aligned_float, 1 >(Queue);
195
193
check<aligned_float, 2 >(Queue);
194
+ check<aligned_float, 3 >(Queue);
196
195
check<aligned_float, 4 >(Queue);
197
196
check<aligned_float, 8 >(Queue);
197
+ check<aligned_float, 16 >(Queue);
198
198
}
199
199
if (Queue.get_device ().has_extension (" cl_intel_subgroups_short" ) ||
200
200
PlatformName.find (" CUDA" ) != std::string::npos) {
201
201
typedef short aligned_short __attribute__ ((aligned (16 )));
202
202
check<aligned_short>(Queue);
203
203
check<aligned_short, 1 >(Queue);
204
204
check<aligned_short, 2 >(Queue);
205
+ check<aligned_short, 3 >(Queue);
205
206
check<aligned_short, 4 >(Queue);
206
207
check<aligned_short, 8 >(Queue);
208
+ check<aligned_short, 16 >(Queue);
207
209
if (Queue.get_device ().has_extension (" cl_khr_fp16" ) ||
208
210
PlatformName.find (" CUDA" ) != std::string::npos) {
209
211
typedef half aligned_half __attribute__ ((aligned (16 )));
210
212
check<aligned_half>(Queue);
211
213
check<aligned_half, 1 >(Queue);
212
214
check<aligned_half, 2 >(Queue);
215
+ check<aligned_half, 3 >(Queue);
213
216
check<aligned_half, 4 >(Queue);
214
217
check<aligned_half, 8 >(Queue);
218
+ check<aligned_half, 16 >(Queue);
215
219
}
216
220
}
217
221
if (Queue.get_device ().has_extension (" cl_intel_subgroups_long" ) ||
@@ -220,20 +224,26 @@ int main() {
220
224
check<aligned_long>(Queue);
221
225
check<aligned_long, 1 >(Queue);
222
226
check<aligned_long, 2 >(Queue);
227
+ check<aligned_long, 3 >(Queue);
223
228
check<aligned_long, 4 >(Queue);
224
229
check<aligned_long, 8 >(Queue);
230
+ check<aligned_long, 16 >(Queue);
225
231
typedef unsigned long aligned_ulong __attribute__ ((aligned (16 )));
226
232
check<aligned_ulong>(Queue);
227
233
check<aligned_ulong, 1 >(Queue);
228
234
check<aligned_ulong, 2 >(Queue);
235
+ check<aligned_ulong, 3 >(Queue);
229
236
check<aligned_ulong, 4 >(Queue);
230
237
check<aligned_ulong, 8 >(Queue);
238
+ check<aligned_ulong, 16 >(Queue);
231
239
typedef double aligned_double __attribute__ ((aligned (16 )));
232
240
check<aligned_double>(Queue);
233
241
check<aligned_double, 1 >(Queue);
234
242
check<aligned_double, 2 >(Queue);
243
+ check<aligned_double, 3 >(Queue);
235
244
check<aligned_double, 4 >(Queue);
236
245
check<aligned_double, 8 >(Queue);
246
+ check<aligned_double, 16 >(Queue);
237
247
}
238
248
std::cout << " Test passed." << std::endl;
239
249
return 0 ;
0 commit comments