Skip to content

Commit 604f634

Browse files
authored
[SYCL][Fusion] Test fusion scheduler integration (intel/llvm-test-suite#1416)
Test integration of kernel fusion into the SYCL runtime scheduler. Check that cancellation of the fusion happens if required by synchronization rules, as described in the [extension proposal](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc#synchronization-in-the-sycl-application). Spec: intel#7098 Implementation: intel#7531 Signed-off-by: Lukas Sommer <[email protected]>
1 parent f83da8b commit 604f634

14 files changed

+919
-4
lines changed

SYCL/KernelFusion/cancel_fusion.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
24
// UNSUPPORTED: cuda || hip
35

46
// Test cancel fusion
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// UNSUPPORTED: cuda || hip
5+
6+
// Test validity of events after cancel_fusion.
7+
8+
#include "fusion_event_test_common.h"
9+
#include <sycl/sycl.hpp>
10+
11+
using namespace sycl;
12+
13+
int main() {
14+
constexpr size_t dataSize = 512;
15+
16+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
17+
18+
int *in1 = sycl::malloc_shared<int>(dataSize, q);
19+
int *in2 = sycl::malloc_shared<int>(dataSize, q);
20+
int *in3 = sycl::malloc_shared<int>(dataSize, q);
21+
int *tmp = sycl::malloc_shared<int>(dataSize, q);
22+
int *out = sycl::malloc_shared<int>(dataSize, q);
23+
24+
for (size_t i = 0; i < dataSize; ++i) {
25+
in1[i] = i * 2;
26+
in2[i] = i * 3;
27+
in3[i] = i * 4;
28+
tmp[i] = -1;
29+
out[i] = -1;
30+
}
31+
32+
ext::codeplay::experimental::fusion_wrapper fw{q};
33+
fw.start_fusion();
34+
35+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
36+
37+
auto kernel1 = q.submit([&](handler &cgh) {
38+
cgh.parallel_for<class KernelOne>(
39+
dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
40+
});
41+
42+
auto kernel2 = q.submit([&](handler &cgh) {
43+
cgh.depends_on(kernel1);
44+
cgh.parallel_for<class KernelTwo>(
45+
dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
46+
});
47+
48+
fw.cancel_fusion();
49+
50+
assert(!fw.is_in_fusion_mode() &&
51+
"Queue should not be in fusion mode anymore");
52+
53+
kernel1.wait();
54+
assert(isEventComplete(kernel1) && "Event should be complete");
55+
// The event returned by submit while in fusion mode depends on both
56+
// individual kernels to be executed.
57+
assert(kernel1.get_wait_list().size() == 2);
58+
59+
kernel2.wait();
60+
assert(isEventComplete(kernel2) && "Event should be complete");
61+
// The event returned by submit while in fusion mode depends on both
62+
// individual kernels to be executed.
63+
assert(kernel2.get_wait_list().size() == 2);
64+
65+
// Check the results
66+
for (size_t i = 0; i < dataSize; ++i) {
67+
assert(out[i] == (20 * i * i) && "Computation error");
68+
}
69+
70+
sycl::free(in1, q);
71+
sycl::free(in2, q);
72+
sycl::free(in3, q);
73+
sycl::free(tmp, q);
74+
sycl::free(out, q);
75+
76+
return 0;
77+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#include <sycl/sycl.hpp>
2+
3+
using namespace sycl;
4+
5+
static bool isEventComplete(sycl::event &ev) {
6+
return ev.get_info<info::event::command_execution_status>() ==
7+
info::event_command_status::complete;
8+
}

SYCL/KernelFusion/sync_acc_mem_op.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
3+
// RUN: %CPU_CHECK_PLACEHOLDER
4+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
5+
// RUN: %GPU_CHECK_PLACEHOLDER
6+
// UNSUPPORTED: cuda || hip
7+
8+
// Test fusion cancellation on an explicit memory operation on an accessor
9+
// happening before complete_fusion.
10+
11+
#include <sycl/sycl.hpp>
12+
13+
using namespace sycl;
14+
15+
int main() {
16+
constexpr size_t dataSize = 512;
17+
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
18+
int dst[dataSize];
19+
20+
for (size_t i = 0; i < dataSize; ++i) {
21+
in1[i] = i * 2;
22+
in2[i] = i * 3;
23+
in3[i] = i * 4;
24+
tmp[i] = -1;
25+
out[i] = -1;
26+
dst[i] = -1;
27+
}
28+
29+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
30+
31+
{
32+
buffer<int> bIn1{in1, range{dataSize}};
33+
buffer<int> bIn2{in2, range{dataSize}};
34+
buffer<int> bIn3{in3, range{dataSize}};
35+
buffer<int> bTmp{tmp, range{dataSize}};
36+
buffer<int> bOut{out, range{dataSize}};
37+
38+
ext::codeplay::experimental::fusion_wrapper fw{q};
39+
fw.start_fusion();
40+
41+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
42+
43+
q.submit([&](handler &cgh) {
44+
auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
45+
auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
46+
auto accTmp = bTmp.get_access(cgh);
47+
cgh.parallel_for<class KernelOne>(
48+
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
49+
});
50+
51+
q.submit([&](handler &cgh) {
52+
auto accTmp = bTmp.get_access(cgh);
53+
auto accIn3 = bIn3.get_access(cgh);
54+
auto accOut = bOut.get_access(cgh);
55+
cgh.parallel_for<class KernelTwo>(
56+
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
57+
});
58+
59+
// This explicit copy operation has an overlapping requirement with one of
60+
// the kernels and therefore requires synchronization. This should lead to
61+
// cancellation of the fusion.
62+
auto copyEvt = q.submit([&](handler &cgh) {
63+
auto accTmp = bTmp.get_access(cgh);
64+
cgh.copy(accTmp, dst);
65+
});
66+
67+
copyEvt.wait();
68+
69+
assert(!fw.is_in_fusion_mode() &&
70+
"Queue should not be in fusion mode anymore");
71+
72+
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
73+
}
74+
75+
// Check the results
76+
for (size_t i = 0; i < dataSize; ++i) {
77+
assert(out[i] == (20 * i * i) && "Computation error");
78+
assert(dst[i] == (5 * i) && "Computation error");
79+
}
80+
81+
return 0;
82+
}
83+
84+
// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
3+
// RUN: %CPU_CHECK_PLACEHOLDER
4+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
5+
// RUN: %GPU_CHECK_PLACEHOLDER
6+
// UNSUPPORTED: cuda || hip
7+
8+
// Test fusion cancellation on buffer destruction happening before
9+
// complete_fusion.
10+
11+
#include <sycl/sycl.hpp>
12+
13+
using namespace sycl;
14+
15+
int main() {
16+
constexpr size_t dataSize = 512;
17+
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
18+
19+
for (size_t i = 0; i < dataSize; ++i) {
20+
in1[i] = i * 2;
21+
in2[i] = i * 3;
22+
in3[i] = i * 4;
23+
tmp[i] = -1;
24+
out[i] = -1;
25+
}
26+
27+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
28+
29+
{
30+
buffer<int> bIn1{in1, range{dataSize}};
31+
buffer<int> bIn2{in2, range{dataSize}};
32+
buffer<int> bTmp{tmp, range{dataSize}};
33+
buffer<int> bOut{out, range{dataSize}};
34+
35+
ext::codeplay::experimental::fusion_wrapper fw{q};
36+
{
37+
buffer<int> bIn3{in3, range{dataSize}};
38+
39+
fw.start_fusion();
40+
41+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
42+
43+
q.submit([&](handler &cgh) {
44+
auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
45+
auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
46+
auto accTmp = bTmp.get_access(cgh);
47+
cgh.parallel_for<class KernelOne>(
48+
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
49+
});
50+
51+
q.submit([&](handler &cgh) {
52+
auto accTmp = bTmp.get_access(cgh);
53+
auto accIn3 = bIn3.get_access(cgh);
54+
auto accOut = bOut.get_access(cgh);
55+
cgh.parallel_for<class KernelTwo>(
56+
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
57+
});
58+
// Buffer bIn3, which is accessed by one of the kernels in the fusion list
59+
// goes out scope, causing a blocking wait for one of the kernels in the
60+
// fusion list. This should lead to cancellation of the fusion.
61+
}
62+
assert(!fw.is_in_fusion_mode() &&
63+
"Queue should not be in fusion mode anymore");
64+
65+
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
66+
}
67+
68+
// Check the results
69+
for (size_t i = 0; i < dataSize; ++i) {
70+
assert(out[i] == (20 * i * i) && "Computation error");
71+
}
72+
73+
return 0;
74+
}
75+
76+
// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested

SYCL/KernelFusion/sync_event_wait.cpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
3+
// RUN: %CPU_CHECK_PLACEHOLDER
4+
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
5+
// RUN: %GPU_CHECK_PLACEHOLDER
6+
// UNSUPPORTED: cuda || hip
7+
8+
// Test fusion cancellation on event::wait() happening before
9+
// complete_fusion.
10+
11+
#include <sycl/sycl.hpp>
12+
13+
using namespace sycl;
14+
15+
int main() {
16+
constexpr size_t dataSize = 512;
17+
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
18+
19+
for (size_t i = 0; i < dataSize; ++i) {
20+
in1[i] = i * 2;
21+
in2[i] = i * 3;
22+
in3[i] = i * 4;
23+
tmp[i] = -1;
24+
out[i] = -1;
25+
}
26+
27+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
28+
29+
{
30+
buffer<int> bIn1{in1, range{dataSize}};
31+
buffer<int> bIn2{in2, range{dataSize}};
32+
buffer<int> bIn3{in3, range{dataSize}};
33+
buffer<int> bTmp{tmp, range{dataSize}};
34+
buffer<int> bOut{out, range{dataSize}};
35+
36+
ext::codeplay::experimental::fusion_wrapper fw{q};
37+
fw.start_fusion();
38+
39+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
40+
41+
auto kernel1Ev = q.submit([&](handler &cgh) {
42+
auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
43+
auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
44+
auto accTmp = bTmp.get_access(cgh);
45+
cgh.parallel_for<class KernelOne>(
46+
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
47+
});
48+
49+
q.submit([&](handler &cgh) {
50+
auto accTmp = bTmp.get_access(cgh);
51+
auto accIn3 = bIn3.get_access(cgh);
52+
auto accOut = bOut.get_access(cgh);
53+
cgh.parallel_for<class KernelTwo>(
54+
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
55+
});
56+
57+
// This event::wait() causes a blocking wait for one of the kernels in the
58+
// fusion list. This should lead to cancellation of the fusion.
59+
kernel1Ev.wait();
60+
61+
assert(!fw.is_in_fusion_mode() &&
62+
"Queue should not be in fusion mode anymore");
63+
64+
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
65+
}
66+
67+
// Check the results
68+
for (size_t i = 0; i < dataSize; ++i) {
69+
assert(out[i] == (20 * i * i) && "Computation error");
70+
}
71+
72+
return 0;
73+
}
74+
75+
// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested

0 commit comments

Comments
 (0)