Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 7f0ca80

Browse files
authored
[ESIMD] Add a test for lsc_fence intrinsic (#1071)
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent dbe4ce8 commit 7f0ca80

File tree

1 file changed

+204
-0
lines changed

1 file changed

+204
-0
lines changed

SYCL/ESIMD/lsc/lsc_fence.cpp

+204
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
//==------------ lsc_fence.cpp - DPC++ ESIMD on-device test ----------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-pvc
9+
// UNSUPPORTED: cuda || hip
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
13+
// This test verifies the intrinsic lsc_fence on PVC.
14+
// It is based on https://en.wikipedia.org/wiki/Memory_barrier#Example
15+
16+
#include <CL/sycl.hpp>
17+
#include <algorithm>
18+
#include <cmath>
19+
#include <iostream>
20+
#include <numeric>
21+
#include <sycl/ext/intel/esimd.hpp>
22+
23+
using namespace cl::sycl;
24+
using namespace sycl::ext::intel::esimd;
25+
using namespace sycl::ext::intel::experimental::esimd;
26+
27+
// Returns 'true' if the test passes.
28+
bool testGlobal(queue &Q, int OddOrEven /*0 - Even, 1 - Odd*/) {
29+
constexpr size_t LocalSize = 16;
30+
constexpr unsigned int SIMDSize = 8;
31+
constexpr size_t Size = LocalSize * SIMDSize * 64;
32+
33+
int NumErrors = 0;
34+
std::cout << "Running testGlobal() with OddOrEven = " << OddOrEven << "...";
35+
36+
int *A = aligned_alloc_shared<int>(128, Size, Q);
37+
int *B = aligned_alloc_shared<int>(128, Size, Q);
38+
int *ResA = aligned_alloc_shared<int>(128, Size, Q);
39+
int *ResB = aligned_alloc_shared<int>(128, Size, Q);
40+
std::fill(A, A + Size, 0);
41+
std::fill(B, B + Size, 0);
42+
std::fill(ResA, ResA + Size, 0);
43+
std::fill(ResB, ResB + Size, 0);
44+
45+
try {
46+
Q.submit([&](handler &h) {
47+
range<1> GlobalRange(Size / SIMDSize);
48+
range<1> LocalRange(LocalSize);
49+
// There are many pairs of threads.
50+
// 0-th thread:
51+
// A = 5;
52+
// lsc_fence;
53+
// B = 1;
54+
// 1-st thread:
55+
// BB = B; // Don't use infinite loop 'while(B == 0) {}' as
56+
// // it was in the original example at wiki-page.
57+
// // That loop had incorrect assumption about compiler
58+
// // would keep the load inside the loop.
59+
// lsc_fence;
60+
// AA = A; // If B is 1, then A must be equal to 5.
61+
h.parallel_for(nd_range<1>{GlobalRange, LocalRange},
62+
[=](nd_item<1> NdId) SYCL_ESIMD_KERNEL {
63+
auto GID = NdId.get_global_linear_id();
64+
auto Offset = GID / 2 * SIMDSize;
65+
auto ByteOffset = Offset * sizeof(int);
66+
67+
if (NdId.get_local_linear_id() % 2 == OddOrEven) {
68+
// First thread: write data and condition
69+
// and provoke gpu to reorder instructions
70+
lsc_block_store<int, SIMDSize>(
71+
A + Offset, simd<int, SIMDSize>{5});
72+
73+
// Protect from reordering the writes to A and B.
74+
lsc_fence<lsc_memory_kind::untyped_global>();
75+
76+
lsc_block_store<int, SIMDSize>(
77+
B + Offset, simd<int, SIMDSize>{1});
78+
} else {
79+
auto BVec = lsc_block_load<int, SIMDSize>(B + Offset);
80+
81+
// Protect from reordering the reads from B and A.
82+
lsc_fence<lsc_memory_kind::untyped_global>();
83+
84+
auto AVec = lsc_block_load<int, SIMDSize>(A + Offset);
85+
86+
AVec.copy_to(ResA + Offset);
87+
BVec.copy_to(ResB + Offset);
88+
}
89+
});
90+
}).wait();
91+
} catch (sycl::exception e) {
92+
std::cout << "\nSYCL exception caught: " << e.what();
93+
NumErrors = 1000;
94+
}
95+
96+
for (int I = 0; I < Size / 2 && NumErrors < 10; I++) {
97+
if (ResB[I] != 0 && ResA[I] == 0) {
98+
NumErrors++;
99+
std::cout << I << ": Error - B was written before A: A = " << ResA[I]
100+
<< ", B = " << ResB[I] << std::endl;
101+
}
102+
}
103+
104+
free(A, Q);
105+
free(B, Q);
106+
free(ResA, Q);
107+
free(ResB, Q);
108+
std::cout << " Done" << (NumErrors ? " with ERRORS" : "") << std::endl;
109+
return NumErrors == 0;
110+
}
111+
112+
bool testLocal(queue &Q) {
113+
constexpr size_t LocalSize = 16;
114+
constexpr unsigned int SIMDSize = 8;
115+
constexpr size_t Size = LocalSize * SIMDSize * 64;
116+
117+
std::cout << "Running testLocal()...";
118+
119+
int NumErrors = 0;
120+
int *ResA = aligned_alloc_shared<int>(128, Size, Q);
121+
int *ResB = aligned_alloc_shared<int>(128, Size, Q);
122+
std::fill(ResA, ResA + Size, 0);
123+
std::fill(ResB, ResB + Size, 0);
124+
125+
try {
126+
Q.submit([&](handler &h) {
127+
range<2> GlobalRange(Size / SIMDSize, 2);
128+
range<2> LocalRange(LocalSize, 2);
129+
// There are many pairs of threads.
130+
// 0-th thread:
131+
// A_SLM = 5;
132+
// mem_fence
133+
// B_SLM = 1;
134+
// 1-st thread:
135+
// BB = B_SLM;
136+
// mem_fence
137+
// AA = A_SLM; // If BB is 1, then AA must be 5.
138+
h.parallel_for(
139+
nd_range<2>{GlobalRange, LocalRange},
140+
[=](nd_item<2> NdId) SYCL_ESIMD_KERNEL {
141+
constexpr int SLMSize = LocalSize * SIMDSize * 2 * sizeof(int);
142+
// Allocate SLM memory and initialize it with zeroes.
143+
slm_init<SLMSize>();
144+
if (NdId.get_local_linear_id() == 0) {
145+
for (int I = 0; I < SLMSize; I += SIMDSize * sizeof(int))
146+
lsc_slm_block_store<int, SIMDSize>(I, simd<int, SIMDSize>(0));
147+
}
148+
barrier();
149+
150+
auto Offset = NdId.get_local_id(0) * SIMDSize;
151+
auto ByteOffsetA = Offset * sizeof(int);
152+
auto ByteOffsetB = SLMSize / 2 + ByteOffsetA;
153+
if (NdId.get_local_id(1) == 0) {
154+
lsc_slm_block_store<int, SIMDSize>(ByteOffsetA,
155+
simd<int, SIMDSize>(5));
156+
lsc_fence<lsc_memory_kind::shared_local>();
157+
lsc_slm_block_store<int, SIMDSize>(ByteOffsetB,
158+
simd<int, SIMDSize>(1));
159+
} else {
160+
simd<int, SIMDSize> BVec =
161+
lsc_slm_block_load<int, SIMDSize>(ByteOffsetB);
162+
lsc_fence<lsc_memory_kind::shared_local>();
163+
simd<int, SIMDSize> AVec =
164+
lsc_slm_block_load<int, SIMDSize>(ByteOffsetA);
165+
166+
AVec.copy_to(ResA + Offset);
167+
BVec.copy_to(ResB + Offset);
168+
}
169+
});
170+
}).wait();
171+
} catch (sycl::exception e) {
172+
std::cout << "\nSYCL exception caught: " << e.what();
173+
NumErrors = 1000;
174+
}
175+
176+
for (int I = 0; I < Size && NumErrors < 10; I++) {
177+
if (ResB[I] != 0 && ResA[I] == 0) {
178+
NumErrors++;
179+
std::cout << I << ": Error - B was written before A: A = " << ResA[I]
180+
<< ", B = " << ResB[I] << std::endl;
181+
}
182+
}
183+
184+
std::cout << " Done" << (NumErrors ? " with ERRORS" : "") << std::endl;
185+
186+
free(ResA, Q);
187+
free(ResB, Q);
188+
return NumErrors == 0;
189+
}
190+
191+
int main() {
192+
queue Q;
193+
std::cout << "Running on " << Q.get_device().get_info<info::device::name>()
194+
<< std::endl;
195+
196+
bool Passed = true;
197+
Passed &= testGlobal(Q, 0);
198+
Passed &= testGlobal(Q, 1);
199+
200+
Passed &= testLocal(Q);
201+
202+
std::cout << (Passed != 0 ? "PASSED" : "FAILED") << std::endl;
203+
return Passed ? 0 : 1;
204+
}

0 commit comments

Comments
 (0)