@@ -184,6 +184,14 @@ static void zePrint(const char *Format, ...) {
184
184
}
185
185
}
186
186
187
// Interprets the raw value of SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS.
//
// DeviceEventsStr is the environment value, or nullptr when the variable is
// not set. Returns true when every event must be host-visible: the variable
// is unset, or its value parses to 0 (non-numeric text also parses to 0).
// Returns false (device-scope events are used) for any non-zero value.
//
// std::strtol is used rather than std::atoi because atoi has undefined
// behavior when the value is out of range; strtol saturates instead, which
// still reads as "non-zero" here.
static bool zeAllEventsHostVisibleFromEnv(const char *DeviceEventsStr) {
  if (!DeviceEventsStr)
    return true;
  return std::strtol(DeviceEventsStr, nullptr, 10) == 0;
}

// Controls whether device-scope events are used.
// Evaluated once, during static initialization, from the environment.
static const bool ZeAllHostVisibleEvents = zeAllEventsHostVisibleFromEnv(
    std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"));
194
+
187
195
// Helper function to implement zeHostSynchronize.
188
196
// The behavior is to avoid infinite wait during host sync under ZE_DEBUG.
189
197
// This allows for a much more responsive debugging of hangs.
@@ -382,8 +390,8 @@ pi_result _pi_mem::removeMapping(void *MappedTo, Mapping &MapInfo) {
382
390
}
383
391
384
392
pi_result
385
- _pi_context::getFreeSlotInExistingOrNewPool (ze_event_pool_handle_t &ZePool ,
386
- size_t &Index) {
393
+ _pi_context::getFreeSlotInExistingOrNewPool (ze_event_pool_handle_t &Pool ,
394
+ size_t &Index, bool HostVisible ) {
387
395
// Maximum number of events that can be present in an event ZePool is captured
388
396
// here. Setting it to 256 gave best possible performance for several
389
397
// benchmarks.
@@ -399,10 +407,23 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
399
407
return PI_INVALID_VALUE;
400
408
}
401
409
410
+ // Setup for host-visible pool as needed.
411
+ ze_event_pool_flag_t ZePoolFlag = {};
412
+ ze_event_pool_handle_t *ZePool = [&] {
413
+ if (ZeAllHostVisibleEvents) {
414
+ ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
415
+ return &ZeEventPool;
416
+ } else if (HostVisible) {
417
+ ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
418
+ return &ZeHostVisibleEventPool;
419
+ } else {
420
+ return &ZeEventPool;
421
+ }
422
+ }();
423
+
402
424
Index = 0 ;
403
425
// Create one event ZePool per MaxNumEventsPerPool events
404
- if ((ZeEventPool == nullptr ) ||
405
- (NumEventsAvailableInEventPool[ZeEventPool] == 0 )) {
426
+ if ((*ZePool == nullptr ) || (NumEventsAvailableInEventPool[*ZePool] == 0 )) {
406
427
// Creation of the new ZePool with record in NumEventsAvailableInEventPool
407
428
// and initialization of the record in NumEventsUnreleasedInEventPool must
408
429
// be done atomically. Otherwise it is possible that
@@ -417,34 +438,28 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
417
438
418
439
ZeStruct<ze_event_pool_desc_t > ZeEventPoolDesc;
419
440
ZeEventPoolDesc.count = MaxNumEventsPerPool;
420
-
421
- // Make all events visible on the host.
422
- // TODO: events that are used only on device side APIs can be optimized
423
- // to not be from the host-visible pool.
424
- //
425
- ZeEventPoolDesc.flags =
426
- ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
441
+ ZeEventPoolDesc.flags = ZePoolFlag | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
427
442
428
443
std::vector<ze_device_handle_t > ZeDevices;
429
444
std::for_each (Devices.begin (), Devices.end (),
430
445
[&](pi_device &D) { ZeDevices.push_back (D->ZeDevice ); });
431
446
432
447
ZE_CALL (zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size (),
433
- &ZeDevices[0 ], &ZeEventPool ));
434
- NumEventsAvailableInEventPool[ZeEventPool ] = MaxNumEventsPerPool - 1 ;
435
- NumEventsUnreleasedInEventPool[ZeEventPool ] = MaxNumEventsPerPool;
448
+ &ZeDevices[0 ], ZePool ));
449
+ NumEventsAvailableInEventPool[*ZePool ] = MaxNumEventsPerPool - 1 ;
450
+ NumEventsUnreleasedInEventPool[*ZePool ] = MaxNumEventsPerPool;
436
451
} else {
437
452
std::lock_guard<std::mutex> NumEventsAvailableInEventPoolGuard (
438
453
NumEventsAvailableInEventPoolMutex);
439
- Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[ZeEventPool ];
440
- --NumEventsAvailableInEventPool[ZeEventPool ];
454
+ Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool ];
455
+ --NumEventsAvailableInEventPool[*ZePool ];
441
456
}
442
- ZePool = ZeEventPool ;
457
+ Pool = *ZePool ;
443
458
return PI_SUCCESS;
444
459
}
445
460
446
- pi_result _pi_context::decrementUnreleasedEventsInPool (pi_event Event) {
447
- ze_event_pool_handle_t ZePool = Event-> ZeEventPool ;
461
+ pi_result
462
+ _pi_context::decrementUnreleasedEventsInPool ( ze_event_pool_handle_t & ZePool) {
448
463
if (!ZePool) {
449
464
// This must be an interop event created on a user's pool.
450
465
// Do nothing.
@@ -463,9 +478,9 @@ pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
463
478
// multiple pi_context::ZeEventPool can be created if all slots in the pool
464
479
// are already used up. So nullifying pi_context::ZeEventPool may point
465
480
// a different EventPool than Event->ZeEventPool.
466
- if (ZeEventPool == Event-> ZeEventPool )
481
+ if (ZeEventPool == ZePool )
467
482
ZeEventPool = nullptr ;
468
- Event-> ZeEventPool = nullptr ;
483
+ ZePool = nullptr ;
469
484
}
470
485
return PI_SUCCESS;
471
486
}
@@ -764,6 +779,8 @@ pi_result _pi_context::finalize() {
764
779
NumEventsUnreleasedInEventPoolMutex);
765
780
if (ZeEventPool)
766
781
ZE_CALL (zeEventPoolDestroy, (ZeEventPool));
782
+ if (ZeHostVisibleEventPool)
783
+ ZE_CALL (zeEventPoolDestroy, (ZeHostVisibleEventPool));
767
784
768
785
// Destroy the command list used for initializations
769
786
ZE_CALL (zeCommandListDestroy, (ZeCommandListInit));
@@ -1134,7 +1151,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
1134
1151
// therefore that this Queue is idle.
1135
1152
bool CurrentlyEmpty = this ->LastCommandEvent == nullptr ;
1136
1153
1137
- this ->LastCommandEvent = CommandList->second .EventList .back ();
1154
+ // The list can be empty if command-list only contains signals of proxy
1155
+ // events.
1156
+ if (!CommandList->second .EventList .empty ())
1157
+ this ->LastCommandEvent = CommandList->second .EventList .back ();
1138
1158
1139
1159
// Batch if allowed to, but don't batch if we know there are no kernels
1140
1160
// from this queue that are currently executing. This is intended to get
@@ -1329,7 +1349,9 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList(
1329
1349
PI_ASSERT (EventList[I] != nullptr , PI_INVALID_VALUE);
1330
1350
auto ZeEvent = EventList[I]->ZeEvent ;
1331
1351
1332
- if (FilterEventWaitList) {
1352
+ // Avoid polling of the device-scope events.
1353
+ // TODO: be more fine-grain and check individual events.
1354
+ if (FilterEventWaitList && ZeAllHostVisibleEvents) {
1333
1355
auto Res = ZE_CALL_NOCHECK (zeEventQueryStatus, (ZeEvent));
1334
1356
if (Res == ZE_RESULT_SUCCESS) {
1335
1357
// Event has already completed, don't put it into the list
@@ -1629,6 +1651,8 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
1629
1651
if (NumPlatforms)
1630
1652
*NumPlatforms = PiPlatformsCache->size ();
1631
1653
1654
+ zePrint (" Using %s events\n " ,
1655
+ ZeAllHostVisibleEvents ? " all host-visible" : " device-only" );
1632
1656
return PI_SUCCESS;
1633
1657
}
1634
1658
@@ -4477,6 +4501,74 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
4477
4501
//
4478
4502
// Events
4479
4503
//
4504
+ ze_event_handle_t _pi_event::getHostVisibleEvent () const {
4505
+ if (ZeAllHostVisibleEvents) {
4506
+ return ZeEvent;
4507
+ } else if (ZeHostVisibleEvent) {
4508
+ return ZeHostVisibleEvent;
4509
+ } else {
4510
+ die (" The host-visible proxy event missing" );
4511
+ }
4512
+ }
4513
+
4514
+ pi_result
4515
+ _pi_event::getOrCreateHostVisibleEvent (ze_event_handle_t &HostVisibleEvent) {
4516
+
4517
+ if (ZeAllHostVisibleEvents) {
4518
+ HostVisibleEvent = ZeEvent;
4519
+ } else if (ZeHostVisibleEvent) {
4520
+ HostVisibleEvent = ZeHostVisibleEvent;
4521
+ } else {
4522
+ size_t Index;
4523
+ ze_event_pool_handle_t ZeEventPool = {};
4524
+ if (auto Res =
4525
+ Context->getFreeSlotInExistingOrNewPool (ZeEventPool, Index, true ))
4526
+ return Res;
4527
+
4528
+ // Create a "proxy" host-visible event.
4529
+ //
4530
+ // TODO: consider creating just single host-visible proxy event to
4531
+ // represent multiple device-scope events. E.g. have a host-visible
4532
+ // event at the end of each command-list to represent device-scope
4533
+ // events from every command in that command-list.
4534
+ //
4535
+ ZeStruct<ze_event_desc_t > ZeEventDesc;
4536
+ ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4537
+ ZeEventDesc.wait = 0 ;
4538
+ ZeEventDesc.index = Index;
4539
+
4540
+ ZE_CALL (zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeHostVisibleEvent));
4541
+ ZeHostVisibleEventPool = ZeEventPool;
4542
+ HostVisibleEvent = ZeHostVisibleEvent;
4543
+
4544
+ // Submit the command(s) signalling the proxy event to the queue.
4545
+ // We have to first submit a wait for the device-only event for which this
4546
+ // proxy is created.
4547
+ //
4548
+ // Get a new command list to be used on this call
4549
+ {
4550
+ std::lock_guard<std::mutex> Lock (Queue->PiQueueMutex );
4551
+
4552
+ // We want to batch these commands to avoid extra submissions (costly)
4553
+ bool OkToBatch = true ;
4554
+
4555
+ pi_command_list_ptr_t CommandList{};
4556
+ if (auto Res = Queue->Context ->getAvailableCommandList (Queue, CommandList,
4557
+ false , OkToBatch))
4558
+ return Res;
4559
+
4560
+ ZE_CALL (zeCommandListAppendWaitOnEvents,
4561
+ (CommandList->first , 1 , &ZeEvent));
4562
+ ZE_CALL (zeCommandListAppendSignalEvent,
4563
+ (CommandList->first , ZeHostVisibleEvent));
4564
+
4565
+ if (auto Res = Queue->executeCommandList (CommandList, false , OkToBatch))
4566
+ return Res;
4567
+ }
4568
+ }
4569
+ return PI_SUCCESS;
4570
+ }
4571
+
4480
4572
pi_result piEventCreate (pi_context Context, pi_event *RetEvent) {
4481
4573
size_t Index = 0 ;
4482
4574
ze_event_pool_handle_t ZeEventPool = {};
@@ -4485,12 +4577,21 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
4485
4577
4486
4578
ze_event_handle_t ZeEvent;
4487
4579
ZeStruct<ze_event_desc_t > ZeEventDesc;
4488
- // We have to set the SIGNAL flag as HOST scope because the
4489
- // Level-Zero plugin implementation waits for the events to complete
4490
- // on the host.
4491
- ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4492
- ZeEventDesc.wait = 0 ;
4493
4580
ZeEventDesc.index = Index;
4581
+ ZeEventDesc.wait = 0 ;
4582
+ //
4583
+ // Set the scope to "device" for every event. This is sufficient for global
4584
+ // device access and peer device access. If needed to be waited on the host
4585
+ // we are doing special handling, see piEventsWait.
4586
+ //
4587
+ // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
4588
+ // used in some circumstances.
4589
+ //
4590
+ if (ZeAllHostVisibleEvents) {
4591
+ ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4592
+ } else {
4593
+ ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;
4594
+ }
4494
4595
4495
4596
ZE_CALL (zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));
4496
4597
@@ -4541,13 +4642,18 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
4541
4642
}
4542
4643
}
4543
4644
4645
+ // Make sure that we query the host-visible event.
4646
+ ze_event_handle_t ZeHostVisibleEvent;
4647
+ if (auto Res = Event->getOrCreateHostVisibleEvent (ZeHostVisibleEvent))
4648
+ return Res;
4649
+
4544
4650
ze_result_t ZeResult;
4545
- ZeResult = ZE_CALL_NOCHECK (zeEventQueryStatus, (Event-> ZeEvent ));
4651
+ ZeResult = ZE_CALL_NOCHECK (zeEventQueryStatus, (ZeHostVisibleEvent ));
4546
4652
if (ZeResult == ZE_RESULT_SUCCESS) {
4547
4653
return getInfo (ParamValueSize, ParamValue, ParamValueSizeRet,
4548
4654
pi_int32{CL_COMPLETE}); // Untie from OpenCL
4549
4655
}
4550
- // TODO: We don't know if the status is queueed , submitted or running.
4656
+ // TODO: We don't know if the status is queued , submitted or running.
4551
4657
// For now return "running", as others are unlikely to be of
4552
4658
// interest.
4553
4659
return getInfo (ParamValueSize, ParamValue, ParamValueSizeRet,
@@ -4750,6 +4856,17 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
4750
4856
return PI_INVALID_EVENT;
4751
4857
}
4752
4858
4859
+ // Make sure to add all host-visible "proxy" event signals if needed.
4860
+ // This ensures that all signalling commands are submitted below and
4861
+ // thus proxy events can be waited without a deadlock.
4862
+ //
4863
+ for (uint32_t I = 0 ; I < NumEvents; I++) {
4864
+ ze_event_handle_t ZeHostVisibleEvent;
4865
+ if (auto Res =
4866
+ EventList[I]->getOrCreateHostVisibleEvent (ZeHostVisibleEvent))
4867
+ return Res;
4868
+ }
4869
+
4753
4870
// Submit dependent open command lists for execution, if any
4754
4871
for (uint32_t I = 0 ; I < NumEvents; I++) {
4755
4872
auto Queue = EventList[I]->Queue ;
@@ -4765,7 +4882,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
4765
4882
}
4766
4883
4767
4884
for (uint32_t I = 0 ; I < NumEvents; I++) {
4768
- ze_event_handle_t ZeEvent = EventList[I]->ZeEvent ;
4885
+ ze_event_handle_t ZeEvent = EventList[I]->getHostVisibleEvent () ;
4769
4886
zePrint (" ZeEvent = %#lx\n " , pi_cast<std::uintptr_t >(ZeEvent));
4770
4887
ZE_CALL (zeHostSynchronize, (ZeEvent));
4771
4888
@@ -4831,11 +4948,20 @@ static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) {
4831
4948
if (Event->OwnZeEvent ) {
4832
4949
ZE_CALL (zeEventDestroy, (Event->ZeEvent ));
4833
4950
}
4951
+ if (Event->ZeHostVisibleEvent ) {
4952
+ ZE_CALL (zeEventDestroy, (Event->ZeHostVisibleEvent ));
4953
+ }
4834
4954
4835
4955
auto Context = Event->Context ;
4836
- if (auto Res = Context->decrementUnreleasedEventsInPool (Event))
4956
+ if (auto Res = Context->decrementUnreleasedEventsInPool (Event-> ZeEventPool ))
4837
4957
return Res;
4838
4958
4959
+ if (Event->ZeHostVisibleEvent ) {
4960
+ if (auto Res = Context->decrementUnreleasedEventsInPool (
4961
+ Event->ZeHostVisibleEventPool ))
4962
+ return Res;
4963
+ }
4964
+
4839
4965
// We intentionally incremented the reference counter when an event is
4840
4966
// created so that we can avoid pi_queue is released before the associated
4841
4967
// pi_event is released. Here we have to decrement it so pi_queue
0 commit comments