@@ -283,14 +283,46 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait(
283
283
return UR_RESULT_SUCCESS;
284
284
}
285
285
286
+ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl (
287
+ uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
288
+ ur_event_handle_t *phEvent) {
289
+ TRACK_SCOPE_LATENCY (
290
+ " ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier" );
291
+
292
+ std::scoped_lock<ur_shared_mutex> lock (this ->Mutex );
293
+
294
+ if (!numEventsInWaitList && !phEvent) {
295
+ // nop
296
+ return UR_RESULT_SUCCESS;
297
+ }
298
+
299
+ auto signalEvent =
300
+ getSignalEvent (phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER);
301
+ auto [pWaitEvents, numWaitEvents] =
302
+ getWaitListView (phEventWaitList, numEventsInWaitList);
303
+
304
+ ZE2UR_CALL (zeCommandListAppendBarrier,
305
+ (handler.commandList .get (), signalEvent->getZeEvent (),
306
+ numWaitEvents, pWaitEvents));
307
+
308
+ return UR_RESULT_SUCCESS;
309
+ }
310
+
286
311
// Barrier entry point for the in-order immediate queue.
//
// An in-order queue already guarantees that earlier commands have completed
// by the time a later signal starts, so normally it is enough to wait for the
// requested events (which may belong to other queues) and signal a "barrier"
// event. With profiling enabled a real barrier is needed instead: see
// zeCommandListAppendWaitOnEvents.
ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier(
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_event_handle_t *phEvent) {
  const bool profilingEnabled = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0;

  if (!profilingEnabled) {
    // Common case: plain wait + signal.
    return enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent);
  }

  return enqueueEventsWaitWithBarrierImpl(numEventsInWaitList, phEventWaitList,
                                          phEvent);
}
295
327
296
328
ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierExt (
@@ -757,8 +789,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch(
757
789
getWaitListView (phEventWaitList, numEventsInWaitList);
758
790
759
791
if (pWaitEvents) {
760
- ZE2UR_CALL (zeCommandListAppendBarrier, (handler. commandList . get (), nullptr ,
761
- numWaitEvents, pWaitEvents));
792
+ ZE2UR_CALL (zeCommandListAppendWaitOnEvents ,
793
+ (handler. commandList . get (), numWaitEvents, pWaitEvents));
762
794
}
763
795
// TODO: figure out how to translate "flags"
764
796
ZE2UR_CALL (zeCommandListAppendMemoryPrefetch,
@@ -789,8 +821,8 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size,
789
821
auto [pWaitEvents, numWaitEvents] = getWaitListView (nullptr , 0 );
790
822
791
823
if (pWaitEvents) {
792
- ZE2UR_CALL (zeCommandListAppendBarrier, (handler. commandList . get (), nullptr ,
793
- numWaitEvents, pWaitEvents));
824
+ ZE2UR_CALL (zeCommandListAppendWaitOnEvents ,
825
+ (handler. commandList . get (), numWaitEvents, pWaitEvents));
794
826
}
795
827
796
828
// TODO: figure out how to translate "flags"
0 commit comments