Skip to content

Commit f874f01

Browse files
sndmitrievbb-sycl
authored andcommitted
[SYCL][ESIMD] Add tests for lsc mem access APIs (intel#817)
* [SYCL][ESIMD] Add tests for lsc mem access APIs Signed-off-by: Sergey Dmitriev <[email protected]>
1 parent a0eb6be commit f874f01

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+2800
-0
lines changed

SYCL/ESIMD/lsc/Inputs/common.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
//==------- common.hpp - DPC++ ESIMD on-device test ------------------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <stdlib.h>
10+
11+
template <int case_num> class KernelID;
12+
13+
template <typename T> T get_rand() {
14+
T v = rand();
15+
if constexpr (sizeof(T) > 4)
16+
v = (v << 32) | rand();
17+
return v;
18+
}
Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
//==---------------- lsc_block_load.hpp - DPC++ ESIMD on-device test -------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <CL/sycl.hpp>
10+
#include <sycl/ext/intel/esimd.hpp>
11+
12+
#include <iostream>
13+
14+
#include "common.hpp"
15+
16+
using namespace cl::sycl;
17+
using namespace sycl::ext::intel::esimd;
18+
using namespace sycl::ext::intel::esimd::detail;
19+
using namespace sycl::ext::intel::experimental::esimd;
20+
using namespace sycl::ext::intel::experimental::esimd::detail;
21+
22+
template <int case_num, typename T, uint32_t Groups, uint32_t Threads,
23+
int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
24+
bool Transposed = false, bool Transformed = false,
25+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
26+
bool use_prefetch = false>
27+
bool test(unsigned SurfaceWidth, unsigned SurfaceHeight, unsigned SurfacePitch,
28+
int X, int Y) {
29+
// Some restrictions based on documentation
30+
static_assert(!(Transposed && Transformed),
31+
"Transposed and transformed is not supported");
32+
static_assert(BlockWidth > 0, "Block width must be positive");
33+
static_assert(BlockHeight > 0, "Block height must be positive");
34+
35+
if constexpr (Transposed) {
36+
static_assert(NBlocks == 1, "Transposed expected to be 1 block only");
37+
static_assert(sizeof(T) >= 4, "Transposed can only use D32 and D64");
38+
if constexpr (sizeof(T) == 4) {
39+
static_assert(BlockWidth <= 8,
40+
"D32 transposed allow only block width 8 and less");
41+
static_assert(BlockHeight <= 32,
42+
"D32 transposed allow only block height 32 and less");
43+
}
44+
if constexpr (sizeof(T) == 8) {
45+
static_assert(BlockWidth == 1 || BlockWidth == 2 || BlockWidth == 4,
46+
"D64 transposed allow only block width 1/2/4");
47+
static_assert(BlockHeight == 8,
48+
"D64 transposed allow only block height 8");
49+
}
50+
} else if constexpr (Transformed) {
51+
static_assert(sizeof(T) <= 2, "Transformed can only use D8 and D16");
52+
if constexpr (sizeof(T) == 2 && NBlocks == 4) {
53+
static_assert(BlockWidth <= 8,
54+
"Transformed D16x4 allow only block width 8 and less");
55+
}
56+
static_assert((sizeof(T) * BlockWidth) % 4 == 0,
57+
"Transformed block width must be aligned by DW");
58+
static_assert(BlockWidth <= 16,
59+
"Transformed block width must be 16 and less");
60+
static_assert(BlockWidth >= (4 / sizeof(T)),
61+
"Minimal transformed block width depends on data size");
62+
static_assert(BlockHeight <= 32,
63+
"Transformed block height must be 32 and less");
64+
static_assert(BlockHeight >= (4 / sizeof(T)),
65+
"Minimal transformed block height depends on data size");
66+
} else {
67+
static_assert((sizeof(T) * BlockWidth) % 4 == 0,
68+
"Block width must be aligned by DW");
69+
static_assert(sizeof(T) * BlockWidth * NBlocks <= 64,
70+
"Total block width must be 64B or less");
71+
static_assert(BlockHeight <= 32, "Block height must be 32 or less");
72+
if constexpr (sizeof(T) == 4) {
73+
static_assert(NBlocks < 4, "D32 restricted to use 1 or 2 blocks only");
74+
}
75+
if constexpr (sizeof(T) == 8) {
76+
static_assert(NBlocks < 2, "D64 restricted to use 1 block only");
77+
}
78+
}
79+
80+
constexpr int N =
81+
get_lsc_block_2d_data_size<T, NBlocks, BlockHeight, BlockWidth,
82+
Transposed, Transformed>();
83+
/* Due to store2d a is subject to stricter restrictions:
84+
* NBlocks always 1, no Transposed, no Transformed, max BlockHeight 8.
85+
* Series of 2d stores with height 1 are used to write loaded data to output
86+
* buffer. Also Transformed load2d extends BlockWidth to the next power of 2
87+
* and rounds up BlockHeight.
88+
*/
89+
constexpr int SH = Transformed
90+
? roundUpNextMultiple<BlockHeight, 4 / sizeof(T)>()
91+
: BlockHeight;
92+
constexpr int SW = Transformed ? getNextPowerOf2<BlockWidth>() : BlockWidth;
93+
constexpr int SN = get_lsc_block_2d_data_size<T, 1u, 1u, SW, false, false>();
94+
95+
std::cout << "N = " << N << std::endl;
96+
std::cout << "SN = " << SN << std::endl;
97+
std::cout << "W = " << BlockWidth << " SW = " << SW << std::endl;
98+
std::cout << "H = " << BlockHeight << " SH = " << SH << std::endl;
99+
100+
T old_val = get_rand<T>();
101+
102+
auto GPUSelector = gpu_selector{};
103+
auto q = queue{GPUSelector};
104+
auto dev = q.get_device();
105+
std::cout << "Running case #" << case_num << " on "
106+
<< dev.get_info<info::device::name>() << "\n";
107+
auto ctx = q.get_context();
108+
109+
// workgroups
110+
cl::sycl::range<1> GlobalRange{Groups};
111+
// threads in each group
112+
cl::sycl::range<1> LocalRange{Threads};
113+
cl::sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
114+
115+
unsigned SurfaceSize = SurfacePitch * SurfaceHeight * NBlocks;
116+
unsigned Size = SurfaceSize * Groups * Threads;
117+
118+
T *out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
119+
for (int i = 0; i < Size; i++)
120+
out[i] = old_val;
121+
122+
T *in = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
123+
for (int i = 0; i < Size; i++)
124+
in[i] = get_rand<T>();
125+
126+
try {
127+
auto e = q.submit([&](handler &cgh) {
128+
cgh.parallel_for<KernelID<case_num>>(
129+
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
130+
uint16_t globalID = ndi.get_global_id(0);
131+
uint32_t off = globalID * SurfaceSize;
132+
133+
unsigned width = SurfaceWidth * sizeof(T) - 1;
134+
unsigned height = SurfaceHeight - 1;
135+
unsigned pitch = SurfacePitch * sizeof(T) - 1;
136+
137+
simd<T, N> vals;
138+
if constexpr (use_prefetch) {
139+
lsc_prefetch2d<T, BlockWidth, BlockHeight, NBlocks, L1H, L3H>(
140+
in + off, width, height, pitch, X, Y);
141+
vals =
142+
lsc_load2d<T, BlockWidth, BlockHeight, NBlocks, Transposed,
143+
Transformed>(in + off, width, height, pitch, X, Y);
144+
} else {
145+
vals = lsc_load2d<T, BlockWidth, BlockHeight, NBlocks, Transposed,
146+
Transformed, L1H, L3H>(in + off, width, height,
147+
pitch, X, Y);
148+
}
149+
150+
for (int i = 0; i < NBlocks; i++) {
151+
for (int j = 0; j < SH; j++) {
152+
simd<T, SN> v =
153+
vals.template select<SN, 1>(i * SN * SH + j * SW);
154+
lsc_store2d<T, SW>(
155+
out + off, SurfaceWidth * sizeof(T) - 1, SurfaceHeight - 1,
156+
SurfacePitch * sizeof(T) - 1, X + i * SW, Y + j, v);
157+
}
158+
}
159+
});
160+
});
161+
e.wait();
162+
} catch (cl::sycl::exception const &e) {
163+
std::cout << "SYCL exception caught: " << e.what() << '\n';
164+
sycl::free(out, ctx);
165+
sycl::free(in, ctx);
166+
return false;
167+
}
168+
169+
bool passed = true;
170+
171+
if constexpr (Transposed) {
172+
for (int gid = 0; gid < Groups * Threads; gid++) {
173+
int dx = 0, dy = 0;
174+
for (int j = 0; j < SurfaceHeight; j++) {
175+
for (int i = 0; i < SurfacePitch; i++) {
176+
T e = old_val;
177+
// index in linear buffer
178+
int idx = i + j * SurfacePitch + gid * SurfaceSize;
179+
180+
// check if inside block
181+
if ((i >= X) && (i < X + BlockWidth) && (j >= Y) &&
182+
(j < Y + BlockHeight)) {
183+
if (i < SurfaceWidth) {
184+
if (X + dx < SurfaceWidth)
185+
e = in[(X + dx) + (Y + dy) * SurfacePitch + gid * SurfaceSize];
186+
else
187+
e = (T)0;
188+
}
189+
dy += 1;
190+
if (dy == BlockHeight) {
191+
dy = 0;
192+
dx += 1;
193+
}
194+
}
195+
196+
if (out[idx] != e) {
197+
passed = false;
198+
std::cout << "out" << idx << "] = 0x" << std::hex
199+
<< (uint64_t)out[idx] << " vs etalon = 0x" << (uint64_t)e
200+
<< std::dec << std::endl;
201+
}
202+
}
203+
}
204+
}
205+
} else if constexpr (Transformed) {
206+
constexpr int scale = 4 / sizeof(T);
207+
for (int gid = 0; gid < Groups * Threads; gid++) {
208+
for (int j = 0; j < SurfaceHeight; j++) {
209+
for (int i = 0; i < SurfacePitch; i++) {
210+
T e = old_val;
211+
// index in linear buffer
212+
int idx = i + j * SurfacePitch + gid * SurfaceSize;
213+
214+
// check if inside block
215+
if ((i >= X) && (i < X + SW * NBlocks) && (j >= Y) && (j < Y + SH)) {
216+
int di = i - X;
217+
int dj = j - Y;
218+
int bn = di / SW;
219+
220+
int dx, dy;
221+
dx = di / scale + bn * (BlockWidth - SW / scale) +
222+
(dj % scale) * SW / scale;
223+
dy = dj + di % scale - dj % scale;
224+
225+
if (i < SurfaceWidth) {
226+
if (dx < BlockWidth * (bn + 1) && (dx + X) < SurfaceWidth &&
227+
(dy + Y) < SurfaceHeight)
228+
e = in[(X + dx) + (Y + dy) * SurfacePitch + gid * SurfaceSize];
229+
else
230+
e = (T)0;
231+
}
232+
}
233+
234+
if (out[idx] != e) {
235+
passed = false;
236+
std::cout << std::hex << "out[0x" << idx << "] = 0x"
237+
<< (uint64_t)out[idx] << " vs etalon = 0x" << (uint64_t)e
238+
<< std::dec << std::endl;
239+
}
240+
}
241+
}
242+
}
243+
} else {
244+
for (int gid = 0; gid < Groups * Threads; gid++) {
245+
for (int j = 0; j < SurfaceHeight; j++) {
246+
for (int i = 0; i < SurfacePitch; i++) {
247+
T e = old_val;
248+
// index in linear buffer
249+
int idx = i + j * SurfacePitch + gid * SurfaceSize;
250+
251+
// check if inside block
252+
if ((i >= X) && (i < X + BlockWidth * NBlocks) &&
253+
(i < SurfaceWidth) && (j >= Y) && (j < Y + BlockHeight))
254+
e = in[idx];
255+
256+
if (out[idx] != e) {
257+
passed = false;
258+
std::cout << "out[" << idx << "] = 0x" << std::hex
259+
<< (uint64_t)out[idx] << " vs etalon = 0x" << (uint64_t)e
260+
<< std::dec << std::endl;
261+
}
262+
}
263+
}
264+
}
265+
}
266+
267+
if (!passed)
268+
std::cout << "Case #" << case_num << " FAILED" << std::endl;
269+
270+
sycl::free(out, ctx);
271+
sycl::free(in, ctx);
272+
273+
return passed;
274+
}

0 commit comments

Comments
 (0)