Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit b3b471b

Browse files
authored
[SYCL][Fusion] Test caching of fused kernels (#1551)
Check that JIT compilation for kernel fusion is or is not repeated, depending on whether a newly submitted sequence of kernels is equivalent to a previous sequence. Next to the sequence of kernels, other invocation information, e.g., the user-specified internalization properties play a role to assess equivalence with a previous compilation. Different scenarios are tested by the test added in this PR. Implementation: intel/llvm#8051 Signed-off-by: Lukas Sommer <[email protected]>
1 parent 4b9e723 commit b3b471b

File tree

1 file changed

+142
-0
lines changed

1 file changed

+142
-0
lines changed

SYCL/KernelFusion/jit_caching.cpp

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
3+
// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
4+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
5+
// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
6+
// UNSUPPORTED: cuda || hip
7+
// REQUIRES: fusion
8+
9+
// Test caching for JIT fused kernels. Also test for debug messages being
10+
// printed when SYCL_RT_WARNING_LEVEL=1.
11+
12+
#include <iostream>
13+
#include <sycl/sycl.hpp>
14+
15+
using namespace sycl;
16+
17+
constexpr size_t dataSize = 512;
18+
19+
enum class Internalization { None, Local, Private };
20+
21+
void performFusion(queue &q, Internalization internalize, range<1> globalSize,
22+
int beta, int gamma, bool insertBarriers = false) {
23+
int alpha = 1;
24+
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
25+
26+
for (size_t i = 0; i < dataSize; ++i) {
27+
in1[i] = i * 2;
28+
in2[i] = i * 3;
29+
in3[i] = i * 4;
30+
tmp[i] = -1;
31+
out[i] = -1;
32+
}
33+
{
34+
buffer<int> bIn1{in1, globalSize};
35+
buffer<int> bIn2{in2, globalSize};
36+
buffer<int> bIn3{in3, globalSize};
37+
buffer<int> bTmp{tmp, globalSize};
38+
buffer<int> bOut{out, globalSize};
39+
40+
ext::codeplay::experimental::fusion_wrapper fw{q};
41+
fw.start_fusion();
42+
43+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
44+
45+
q.submit([&](handler &cgh) {
46+
auto accIn1 = bIn1.get_access(cgh);
47+
auto accIn2 = bIn2.get_access(cgh);
48+
property_list properties{};
49+
if (internalize == Internalization::Private) {
50+
properties = {
51+
sycl::ext::codeplay::experimental::property::promote_private{}};
52+
} else if (internalize == Internalization::Local) {
53+
properties = {
54+
sycl::ext::codeplay::experimental::property::promote_local{}};
55+
}
56+
accessor<int> accTmp = bTmp.get_access(cgh, properties);
57+
cgh.parallel_for<class KernelOne>(globalSize, [=](id<1> i) {
58+
accTmp[i] = accIn1[i] + accIn2[i] * alpha;
59+
});
60+
});
61+
62+
q.submit([&](handler &cgh) {
63+
property_list properties{};
64+
if (internalize == Internalization::Private) {
65+
properties = {
66+
sycl::ext::codeplay::experimental::property::promote_private{}};
67+
} else if (internalize == Internalization::Local) {
68+
properties = {
69+
sycl::ext::codeplay::experimental::property::promote_local{}};
70+
}
71+
accessor<int> accTmp = bTmp.get_access(cgh, properties);
72+
auto accIn3 = bIn3.get_access(cgh);
73+
auto accOut = bOut.get_access(cgh);
74+
cgh.parallel_for<class KernelTwo>(globalSize, [=](id<1> i) {
75+
accOut[i] = accTmp[i] * accIn3[i] * beta * gamma;
76+
});
77+
});
78+
79+
if (insertBarriers) {
80+
fw.complete_fusion();
81+
} else {
82+
fw.complete_fusion(
83+
{ext::codeplay::experimental::property::no_barriers{}});
84+
}
85+
86+
assert(!fw.is_in_fusion_mode() &&
87+
"Queue should not be in fusion mode anymore");
88+
}
89+
90+
// Check the results
91+
size_t numErrors = 0;
92+
size_t numInternalized = 0;
93+
for (size_t i = 0; i < dataSize; ++i) {
94+
if (i < globalSize.size() && out[i] != (20 * i * i * beta * gamma)) {
95+
++numErrors;
96+
}
97+
if (tmp[i] == -1) {
98+
++numInternalized;
99+
}
100+
}
101+
if (numErrors) {
102+
std::cout << "COMPUTATION ERROR\n";
103+
}
104+
if ((internalize == Internalization::None) && numInternalized) {
105+
std::cout << "WRONG INTERNALIZATION\n";
106+
}
107+
}
108+
109+
int main() {
110+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
111+
112+
// Initial invocation
113+
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
114+
// CHECK: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
115+
116+
// Identical invocation, should lead to JIT cache hit.
117+
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
118+
// CHECK-NEXT: JIT DEBUG: Re-using cached JIT kernel
119+
// CHECK-NEXT: INFO: Re-using existing device binary for fused kernel
120+
121+
// Invocation with a different beta. Because beta was identical to alpha so
122+
// far, this should lead to a cache miss.
123+
performFusion(q, Internalization::Private, range<1>{dataSize}, 2, 1);
124+
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
125+
126+
// Invocation with barrier insertion should lead to a cache miss.
127+
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1,
128+
/* insertBarriers */ true);
129+
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
130+
131+
// Invocation with different internalization target should lead to a cache
132+
// miss.
133+
performFusion(q, Internalization::None, range<1>{dataSize}, 1, 1);
134+
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
135+
136+
// Invocation with a different gamma should lead to a cache miss because gamma
137+
// participates in constant propagation.
138+
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 2);
139+
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
140+
141+
return 0;
142+
}

0 commit comments

Comments
 (0)