Skip to content

Commit 4dc23a2

Browse files
[SYCL] Improve DPC++ runtime performance running on L0 (intel#3797)
Fixes a performance issue due to intel#3612. This change does two things to fix the performance issue. The first is in the Level Zero plugin: the open batch is now closed and submitted only if the event being queried is one that will be signalled by one of the commands in that batch. The second is in the SYCL runtime itself: the event cleanup code no longer queries every single event that is outstanding in the system. This is necessary to prevent the most recent events (which are likely to be in the plugin's batch) from being queried. Both changes are required to fix the performance issue.
1 parent e9cf124 commit 4dc23a2

File tree

3 files changed

+30
-11
lines changed

3 files changed

+30
-11
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

+7-2
Original file line numberDiff line numberDiff line change
@@ -4111,8 +4111,13 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
41114111
// Lock automatically releases when this goes out of scope.
41124112
std::lock_guard<std::mutex> lock(Event->Queue->PiQueueMutex);
41134113

4114-
if (auto Res = Event->Queue->executeOpenCommandList())
4115-
return Res;
4114+
// Only do the execute of the open command list if the event that
4115+
// is being queried is an event that is to be signalled by something
4116+
// currently in that open command list.
4117+
if (Event->Queue->ZeOpenCommandList == Event->ZeCommandList) {
4118+
if (auto Res = Event->Queue->executeOpenCommandList())
4119+
return Res;
4120+
}
41164121
}
41174122

41184123
ze_result_t ZeResult;

sycl/source/detail/queue_impl.cpp

+13-6
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,21 @@ void queue_impl::addSharedEvent(const event &Event) {
122122
// of them can be released.
123123
const size_t EventThreshold = 128;
124124
if (MEventsShared.size() >= EventThreshold) {
125+
// Generally, the vector is ordered so that the oldest events are in the
126+
// front and the newer events are in the end. So, search to find the first
127+
// event that isn't yet complete. All the events prior to that can be
128+
// erased. This could leave some few events further on that have completed
129+
// but are not yet erased, and that is OK. This cleanup doesn't have to be perfect.
130+
// This also keeps the algorithm linear rather than quadratic because it
131+
// doesn't continually recheck things towards the back of the list that
132+
// really haven't had time to complete.
125133
MEventsShared.erase(
126-
std::remove_if(
127-
MEventsShared.begin(), MEventsShared.end(),
128-
[](const event &E) {
129-
return E.get_info<info::event::command_execution_status>() ==
134+
MEventsShared.begin(),
135+
std::find_if(
136+
MEventsShared.begin(), MEventsShared.end(), [](const event &E) {
137+
return E.get_info<info::event::command_execution_status>() !=
130138
info::event_command_status::complete;
131-
}),
132-
MEventsShared.end());
139+
}));
133140
}
134141
MEventsShared.push_back(Event);
135142
}

sycl/unittests/queue/EventClear.cpp

+10-3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ struct TestCtx {
2323

2424
std::unique_ptr<TestCtx> TestContext;
2525

26+
const int ExpectedEventThreshold = 128;
27+
2628
pi_result redefinedUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
2729
size_t count,
2830
pi_uint32 num_events_in_waitlist,
@@ -44,10 +46,16 @@ pi_result redefinedEventGetInfo(pi_event event, pi_event_info param_name,
4446
size_t *param_value_size_ret) {
4547
EXPECT_EQ(param_name, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS)
4648
<< "Unexpected event info requested";
47-
// Report half of events as complete
49+
// Report first half of events as complete.
50+
// Report second half of events as running.
51+
// This is important, because removal algorithm assumes that
52+
// events are likely to be removed oldest first, and stops removing
53+
// at the first non-completed event.
4854
static int Counter = 0;
4955
auto *Result = reinterpret_cast<pi_event_status *>(param_value);
50-
*Result = (++Counter % 2 == 0) ? PI_EVENT_COMPLETE : PI_EVENT_RUNNING;
56+
*Result = (Counter < (ExpectedEventThreshold / 2)) ? PI_EVENT_COMPLETE
57+
: PI_EVENT_RUNNING;
58+
Counter++;
5159
return PI_SUCCESS;
5260
}
5361

@@ -117,7 +125,6 @@ TEST(QueueEventClear, CleanupOnThreshold) {
117125
queue Q{Ctx, default_selector()};
118126

119127
unsigned char *HostAlloc = (unsigned char *)malloc_host(1, Ctx);
120-
const int ExpectedEventThreshold = 128;
121128
TestContext->EventReferenceCount = ExpectedEventThreshold;
122129
for (size_t I = 0; I < ExpectedEventThreshold; ++I) {
123130
Q.memset(HostAlloc, 42, 1).wait();

0 commit comments

Comments
 (0)