Skip to content

Commit 0c9b051

Browse files
[L0] Support for counter-based events using L0 driver
Signed-off-by: Zhang, Winston <[email protected]>
1 parent 8499b57 commit 0c9b051

File tree

9 files changed

+170
-65
lines changed

9 files changed

+170
-65
lines changed

source/adapters/level_zero/context.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,8 @@ static const uint32_t MaxNumEventsPerPool = [] {
468468

469469
ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
470470
ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible,
471-
bool ProfilingEnabled, ur_device_handle_t Device) {
471+
bool ProfilingEnabled, ur_device_handle_t Device,
472+
bool CounterBasedEventEnabled, bool UsingImmCmdList) {
472473
// Lock while updating event pool machinery.
473474
std::scoped_lock<ur_mutex> Lock(ZeEventPoolCacheMutex);
474475

@@ -477,8 +478,8 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
477478
if (Device) {
478479
ZeDevice = Device->ZeDevice;
479480
}
480-
std::list<ze_event_pool_handle_t> *ZePoolCache =
481-
getZeEventPoolCache(HostVisible, ProfilingEnabled, ZeDevice);
481+
std::list<ze_event_pool_handle_t> *ZePoolCache = getZeEventPoolCache(
482+
HostVisible, ProfilingEnabled, CounterBasedEventEnabled, ZeDevice);
482483

483484
if (!ZePoolCache->empty()) {
484485
if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
@@ -510,6 +511,18 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
510511
ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
511512
if (ProfilingEnabled)
512513
ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
514+
if (CounterBasedEventEnabled) {
515+
ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
516+
ze_event_pool_counter_based_exp_desc_t counterBasedExt = {
517+
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC};
518+
if (UsingImmCmdList) {
519+
counterBasedExt.flags = ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE;
520+
} else {
521+
counterBasedExt.flags =
522+
ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE;
523+
}
524+
ZeEventPoolDesc.pNext = &counterBasedExt;
525+
}
513526
urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags);
514527

515528
std::vector<ze_device_handle_t> ZeDevices;
@@ -580,8 +593,9 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) {
580593
ZeDevice = Event->UrQueue->Device->ZeDevice;
581594
}
582595

583-
std::list<ze_event_pool_handle_t> *ZePoolCache = getZeEventPoolCache(
584-
Event->isHostVisible(), Event->isProfilingEnabled(), ZeDevice);
596+
std::list<ze_event_pool_handle_t> *ZePoolCache =
597+
getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled(),
598+
Event->usingCounterBasedEvents(), ZeDevice);
585599

586600
// Put the empty pool to the cache of the pools.
587601
if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)

source/adapters/level_zero/context.hpp

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,9 @@ struct ur_context_handle_t_ : _ur_object {
141141
// head.
142142
//
143143
// Cache of event pools to which host-visible events are added.
144-
std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{4};
144+
std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{8};
145145
std::vector<std::unordered_map<ze_device_handle_t, size_t>>
146-
ZeEventPoolCacheDeviceMap{4};
146+
ZeEventPoolCacheDeviceMap{8};
147147

148148
// This map will be used to determine if a pool is full or not
149149
// by storing number of empty slots available in the pool.
@@ -194,7 +194,9 @@ struct ur_context_handle_t_ : _ur_object {
194194
ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &,
195195
bool HostVisible,
196196
bool ProfilingEnabled,
197-
ur_device_handle_t Device);
197+
ur_device_handle_t Device,
198+
bool CounterBasedEventEnabled,
199+
bool UsingImmCmdList);
198200

199201
// Get ur_event_handle_t from cache.
200202
ur_event_handle_t getEventFromContextCache(bool HostVisible,
@@ -206,36 +208,43 @@ struct ur_context_handle_t_ : _ur_object {
206208

207209
std::list<ze_event_pool_handle_t> *
208210
getZeEventPoolCache(bool HostVisible, bool WithProfiling,
211+
bool CounterBasedEventEnabled,
209212
ze_device_handle_t ZeDevice) {
210-
if (HostVisible) {
211-
if (ZeDevice) {
212-
auto ZeEventPoolCacheMap = WithProfiling
213-
? &ZeEventPoolCacheDeviceMap[0]
214-
: &ZeEventPoolCacheDeviceMap[1];
215-
if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
216-
ZeEventPoolCache.emplace_back();
217-
ZeEventPoolCacheMap->insert(
218-
std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
219-
}
220-
return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
221-
} else {
222-
return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1];
213+
int profiling_index_a, profiling_index_b;
214+
calculateCacheIndex(HostVisible, CounterBasedEventEnabled,
215+
&profiling_index_a, &profiling_index_b);
216+
if (ZeDevice) {
217+
auto ZeEventPoolCacheMap =
218+
WithProfiling ? &ZeEventPoolCacheDeviceMap[profiling_index_a]
219+
: &ZeEventPoolCacheDeviceMap[profiling_index_b];
220+
if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
221+
ZeEventPoolCache.emplace_back();
222+
ZeEventPoolCacheMap->insert(
223+
std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
223224
}
225+
return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
224226
} else {
225-
if (ZeDevice) {
226-
auto ZeEventPoolCacheMap = WithProfiling
227-
? &ZeEventPoolCacheDeviceMap[2]
228-
: &ZeEventPoolCacheDeviceMap[3];
229-
if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
230-
ZeEventPoolCache.emplace_back();
231-
ZeEventPoolCacheMap->insert(
232-
std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
233-
}
234-
return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
235-
} else {
236-
return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3];
237-
}
227+
return WithProfiling ? &ZeEventPoolCache[profiling_index_a]
228+
: &ZeEventPoolCache[profiling_index_b];
229+
}
230+
}
231+
232+
// Computes the pair of event-pool cache slot indices for a given event
// configuration and stores them through the two output pointers.
//
// Resulting (*profiling_index_a, *profiling_index_b) values:
//   HostVisible,     not counter-based: (0, 1)
//   not HostVisible, not counter-based: (2, 3)
//   HostVisible,     counter-based:     (4, 5)
//   not HostVisible, counter-based:     (6, 7)
//
// getZeEventPoolCache uses *profiling_index_a when profiling is requested and
// *profiling_index_b otherwise.
//
// Both output pointers are written unconditionally and must be valid.
// Always returns UR_RESULT_SUCCESS.
ur_result_t calculateCacheIndex(bool HostVisible,
                                bool CounterBasedEventEnabled,
                                int *profiling_index_a,
                                int *profiling_index_b) {
  // Host-visible configurations take the first pair of slots, the
  // non-host-visible ones take the following pair.
  const int VisibilityBase = HostVisible ? 0 : 2;
  // Counter-based configurations reuse the same pairing, shifted by four.
  const int CounterBasedOffset = CounterBasedEventEnabled ? 4 : 0;
  const int FirstSlot = VisibilityBase + CounterBasedOffset;

  *profiling_index_a = FirstSlot;
  *profiling_index_b = FirstSlot + 1;
  return UR_RESULT_SUCCESS;
}
240249

241250
// Decrement number of events living in the pool upon event destroy

source/adapters/level_zero/device.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,19 @@ bool ur_device_handle_t_::useRelaxedAllocationLimits() {
10541054
return EnableRelaxedAllocationLimits;
10551055
}
10561056

1057+
// Reports whether the adapter should use the L0 driver's implementation of
// in-order lists instead of the adapter's own implementation.
//
// The answer is taken from the UR_L0_USE_DRIVER_INORDER_LISTS environment
// variable: when it is unset the result is false; otherwise the value is
// parsed with std::atoi and any non-zero result yields true. The variable is
// read once, on the first call, and the stored result is returned afterwards.
bool ur_device_handle_t_::useDriverInOrderLists() {
  static const bool DriverInOrderListsRequested = [] {
    const char *EnvValue = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS");
    return EnvValue != nullptr && std::atoi(EnvValue) != 0;
  }();
  return DriverInOrderListsRequested;
}
1069+
10571070
ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
10581071
int SubSubDeviceIndex) {
10591072
// Maintain various device properties cache.

source/adapters/level_zero/device.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ struct ur_device_handle_t_ : _ur_object {
143143
// Read env settings to select immediate commandlist mode.
144144
ImmCmdlistMode useImmediateCommandLists();
145145

146+
// Whether Adapter uses driver's implementation of in-order lists or not
147+
bool useDriverInOrderLists();
148+
146149
// Returns whether immediate command lists are used on this device.
147150
ImmCmdlistMode ImmCommandListUsed{};
148151

source/adapters/level_zero/event.cpp

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,20 @@ static const bool UseMultipleCmdlistBarriers = [] {
4343
return std::atoi(UseMultipleCmdlistBarriersFlag) > 0;
4444
}();
4545

46+
// Returns true when the wait list is empty or when every event in it has
// UrQueue equal to Queue; returns false as soon as an event belonging to a
// different queue is found.
//
// The first NumEventsInWaitList entries of EventWaitList are dereferenced
// without a null check, so they must all be valid event handles.
bool WaitListEmptyOrAllEventsFromSameQueue(
    ur_queue_handle_t Queue, uint32_t NumEventsInWaitList,
    const ur_event_handle_t *EventWaitList) {
  // With a zero count the range is empty and the loop body never runs, which
  // covers the "empty wait list" case.
  const ur_event_handle_t *const End = EventWaitList + NumEventsInWaitList;
  for (const ur_event_handle_t *It = EventWaitList; It != End; ++It) {
    if ((*It)->UrQueue != Queue)
      return false;
  }
  return true;
}
59+
4660
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
4761
ur_queue_handle_t Queue, ///< [in] handle of the queue object
4862
uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
@@ -206,21 +220,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
206220
bool IsInternal = OutEvent == nullptr;
207221
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
208222

209-
auto WaitListEmptyOrAllEventsFromSameQueue = [Queue, NumEventsInWaitList,
210-
EventWaitList]() {
211-
if (!NumEventsInWaitList)
212-
return true;
213-
214-
for (uint32_t I = 0; I < NumEventsInWaitList; ++I)
215-
if (Queue != EventWaitList[I]->UrQueue)
216-
return false;
217-
218-
return true;
219-
};
220-
221223
// For in-order queue and wait-list which is empty or has events from
222224
// the same queue just use the last command event as the barrier event.
223-
if (Queue->isInOrderQueue() && WaitListEmptyOrAllEventsFromSameQueue() &&
225+
if (Queue->isInOrderQueue() &&
226+
WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList,
227+
EventWaitList) &&
224228
Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) {
225229
UR_CALL(urEventRetain(Queue->LastCommandEvent));
226230
*Event = Queue->LastCommandEvent;
@@ -1053,7 +1057,8 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked,
10531057
//
10541058
ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
10551059
bool IsMultiDevice, bool HostVisible,
1056-
ur_event_handle_t *RetEvent) {
1060+
ur_event_handle_t *RetEvent,
1061+
bool CounterBasedEventEnabled) {
10571062

10581063
bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled();
10591064

@@ -1075,14 +1080,15 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
10751080
size_t Index = 0;
10761081

10771082
if (auto Res = Context->getFreeSlotInExistingOrNewPool(
1078-
ZeEventPool, Index, HostVisible, ProfilingEnabled, Device))
1083+
ZeEventPool, Index, HostVisible, ProfilingEnabled, Device,
1084+
CounterBasedEventEnabled, Queue->UsingImmCmdLists))
10791085
return Res;
10801086

10811087
ZeStruct<ze_event_desc_t> ZeEventDesc;
10821088
ZeEventDesc.index = Index;
10831089
ZeEventDesc.wait = 0;
10841090

1085-
if (HostVisible) {
1091+
if (HostVisible || CounterBasedEventEnabled) {
10861092
ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
10871093
} else {
10881094
//
@@ -1107,7 +1113,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
11071113
} catch (...) {
11081114
return UR_RESULT_ERROR_UNKNOWN;
11091115
}
1110-
1116+
(*RetEvent)->CounterBasedEventsEnabled = CounterBasedEventEnabled;
11111117
if (HostVisible)
11121118
(*RetEvent)->HostVisibleEvent =
11131119
reinterpret_cast<ur_event_handle_t>(*RetEvent);
@@ -1128,8 +1134,8 @@ ur_result_t ur_event_handle_t_::reset() {
11281134

11291135
if (!isHostVisible())
11301136
HostVisibleEvent = nullptr;
1131-
1132-
ZE2UR_CALL(zeEventHostReset, (ZeEvent));
1137+
if (!usingCounterBasedEvents())
1138+
ZE2UR_CALL(zeEventHostReset, (ZeEvent));
11331139
return UR_RESULT_SUCCESS;
11341140
}
11351141

@@ -1189,6 +1195,23 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
11891195
CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded)
11901196
IncludeLastCommandEvent = false;
11911197

1198+
// If we are using L0 native implementation for handling in-order queues,
1199+
// then we don't need to add the last enqueued event into the waitlist, as
1200+
// the native driver implementation will already ensure in-order semantics.
1201+
// The only exception is when a different immediate command list was last
1202+
// used on the same UR Queue.
1203+
if (CurQueue->Device->useDriverInOrderLists() && CurQueue->isInOrderQueue() &&
1204+
CurQueue->UsingImmCmdLists) {
1205+
auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine);
1206+
uint32_t QueueGroupOrdinal, QueueIndex;
1207+
auto NextIndex = QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex,
1208+
/*QueryOnly */ true);
1209+
auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex];
1210+
IncludeLastCommandEvent &=
1211+
CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() &&
1212+
NextImmCmdList != CurQueue->LastUsedCommandList;
1213+
}
1214+
11921215
try {
11931216
uint32_t TmpListLength = 0;
11941217

@@ -1205,6 +1228,16 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
12051228
this->UrEventList = new ur_event_handle_t[EventListLength];
12061229
}
12071230

1231+
// For an in-order queue whose wait-list is empty or has events only from
1232+
// the same queue, we don't need to wait on any additional events.
1233+
if (CurQueue->Device->useDriverInOrderLists() &&
1234+
CurQueue->isInOrderQueue() &&
1235+
WaitListEmptyOrAllEventsFromSameQueue(CurQueue, EventListLength,
1236+
EventList)) {
1237+
this->Length = TmpListLength;
1238+
return UR_RESULT_SUCCESS;
1239+
}
1240+
12081241
if (EventListLength > 0) {
12091242
for (uint32_t I = 0; I < EventListLength; I++) {
12101243
{

source/adapters/level_zero/event.hpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ extern "C" {
3131
ur_result_t urEventReleaseInternal(ur_event_handle_t Event);
3232
ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
3333
bool IsMultiDevice, bool HostVisible,
34-
ur_event_handle_t *RetEvent);
34+
ur_event_handle_t *RetEvent,
35+
bool CounterBasedEventEnabled = false);
3536
} // extern "C"
3637

3738
// This is an experimental option that allows to disable caching of events in
@@ -222,6 +223,11 @@ struct ur_event_handle_t_ : _ur_object {
222223

223224
// Get the host-visible event or create one and enqueue its signal.
224225
ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent);
226+
227+
// Keeps track of whether we are using Counter-based Events.
228+
bool CounterBasedEventsEnabled = false;
229+
230+
// Returns the CounterBasedEventsEnabled flag of this event.
bool usingCounterBasedEvents() const { return CounterBasedEventsEnabled; }
225231
};
226232

227233
// Helper function to implement zeHostSynchronize.

source/adapters/level_zero/kernel.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
214214
// the code can do a urKernelRelease on this kernel.
215215
(*Event)->CommandData = (void *)Kernel;
216216

217-
// Increment the reference count of the Kernel and indicate that the Kernel is
218-
// in use. Once the event has been signalled, the code in
217+
// Increment the reference count of the Kernel and indicate that the Kernel
218+
// is in use. Once the event has been signalled, the code in
219219
// CleanupCompletedEvent(Event) will do a urKernelRelease to update the
220220
// reference count on the kernel, using the kernel saved in CommandData.
221221
UR_CALL(urKernelRetain(Kernel));

0 commit comments

Comments
 (0)