Skip to content

Commit 4fa727a

Browse files
[SYCL][Graph][E2E] Add an extra test to invoke copy engine on L0 (#17324)
Following #16830, this PR adds an additional e2e graph test to invoke copy engine on L0 devices and test its behaviour.
1 parent 025ad53 commit 4fa727a

File tree

2 files changed

+119
-7
lines changed

2 files changed

+119
-7
lines changed

sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ int main() {
5050
Queue.copy(DataA.data(), PtrA, Size);
5151
Queue.copy(DataB.data(), PtrB, Size);
5252
Queue.copy(DataC.data(), PtrC, Size);
53-
Queue.wait_and_throw();
5453

5554
Graph.begin_recording(Queue);
5655
Queue.submit([&](handler &CGH) {
@@ -82,15 +81,13 @@ int main() {
8281

8382
auto GraphExec = Graph.finalize();
8483

85-
event Event;
8684
for (unsigned n = 0; n < Iterations; n++) {
87-
Event =
88-
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
85+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
8986
}
9087

91-
Queue.copy(PtrA, DataA.data(), Size, Event);
92-
Queue.copy(PtrB, DataB.data(), Size, Event);
93-
Queue.copy(PtrC, DataC.data(), Size, Event);
88+
Queue.copy(PtrA, DataA.data(), Size);
89+
Queue.copy(PtrB, DataB.data(), Size);
90+
Queue.copy(PtrC, DataC.data(), Size);
9491
Queue.wait_and_throw();
9592

9693
free(PtrA, Queue);
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
//
8+
// Extra runs to test in-order command lists path
9+
// RUN: %if level_zero %{env UR_L0_USE_DRIVER_INORDER_LISTS=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
10+
// RUN: %if level_zero %{env UR_L0_USE_DRIVER_INORDER_LISTS=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
11+
//
12+
// Tests that the optimization to use the L0 Copy Engine for memory commands
13+
// does not interfere with the linear graph optimization
14+
//
15+
// REQUIRES: aspect-usm_host_allocations
16+
17+
#include "../graph_common.hpp"
18+
19+
#include <sycl/properties/queue_properties.hpp>
20+
21+
int main() {
22+
queue Queue{{sycl::property::queue::in_order{}}};
23+
24+
using T = int;
25+
26+
const T ModValue = 7;
27+
std::vector<T> DataA(Size), DataB(Size), DataC(Size);
28+
29+
std::iota(DataA.begin(), DataA.end(), 1);
30+
std::iota(DataB.begin(), DataB.end(), 10);
31+
std::iota(DataC.begin(), DataC.end(), 1000);
32+
33+
// Create reference data for output
34+
std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
35+
for (size_t i = 0; i < Iterations; i++) {
36+
for (size_t j = 0; j < Size; j++) {
37+
ReferenceA[j] += ModValue;
38+
ReferenceB[j] = ReferenceA[j];
39+
ReferenceB[j] -= ModValue;
40+
ReferenceC[j] = ReferenceB[j];
41+
ReferenceC[j] += ModValue;
42+
}
43+
}
44+
45+
ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
46+
Queue.get_device()};
47+
48+
T *PtrA = malloc_device<T>(Size, Queue);
49+
T *PtrB = malloc_device<T>(Size, Queue);
50+
T *PtrC = malloc_device<T>(Size, Queue);
51+
52+
// L0 copy engine is disabled for D2D copies so we need to create additional
53+
// D2H copy events in between to invoke it.
54+
T *PtrBHost = malloc_host<T>(Size, Queue);
55+
T *PtrCHost = malloc_host<T>(Size, Queue);
56+
57+
Queue.copy(DataA.data(), PtrA, Size);
58+
Queue.copy(DataB.data(), PtrB, Size);
59+
Queue.copy(DataC.data(), PtrC, Size);
60+
61+
Graph.begin_recording(Queue);
62+
Queue.submit([&](handler &CGH) {
63+
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
64+
auto LinID = id.get_linear_id();
65+
PtrA[LinID] += ModValue;
66+
});
67+
});
68+
69+
Queue.submit(
70+
[&](handler &CGH) { CGH.memcpy(PtrBHost, PtrA, Size * sizeof(T)); });
71+
Queue.submit(
72+
[&](handler &CGH) { CGH.memcpy(PtrB, PtrBHost, Size * sizeof(T)); });
73+
74+
Queue.submit([&](handler &CGH) {
75+
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
76+
auto LinID = id.get_linear_id();
77+
PtrB[LinID] -= ModValue;
78+
});
79+
});
80+
81+
Queue.submit(
82+
[&](handler &CGH) { CGH.memcpy(PtrCHost, PtrB, Size * sizeof(T)); });
83+
Queue.submit(
84+
[&](handler &CGH) { CGH.memcpy(PtrC, PtrCHost, Size * sizeof(T)); });
85+
86+
Queue.submit([&](handler &CGH) {
87+
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
88+
auto LinID = id.get_linear_id();
89+
PtrC[LinID] += ModValue;
90+
});
91+
});
92+
93+
Graph.end_recording();
94+
95+
auto GraphExec = Graph.finalize();
96+
97+
for (unsigned n = 0; n < Iterations; n++) {
98+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
99+
}
100+
101+
Queue.copy(PtrA, DataA.data(), Size);
102+
Queue.copy(PtrB, DataB.data(), Size);
103+
Queue.copy(PtrC, DataC.data(), Size);
104+
Queue.wait_and_throw();
105+
106+
free(PtrA, Queue);
107+
free(PtrB, Queue);
108+
free(PtrC, Queue);
109+
110+
for (size_t i = 0; i < Size; i++) {
111+
assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
112+
assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
113+
assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
114+
}
115+
}

0 commit comments

Comments
 (0)