@@ -282,18 +282,23 @@ event queue_impl::memcpyFromDeviceGlobal(
282
282
DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
283
283
}
284
284
285
- sycl::detail::optional<event> queue_impl::getLastEvent () {
285
+ sycl::detail::optional<event>
286
+ queue_impl::getLastEvent (const std::shared_ptr<queue_impl> &Self) {
286
287
// The external event is required to finish last if set, so it is considered
287
288
// the last event if present.
288
289
if (std::optional<event> ExternalEvent = MInOrderExternalEvent.read ())
289
290
return ExternalEvent;
290
291
291
292
std::lock_guard<std::mutex> Lock{MMutex};
292
- if (MGraph. expired () && !MDefaultGraphDeps. LastEventPtr )
293
+ if (MEmpty )
293
294
return std::nullopt;
294
- if (!MGraph.expired () && MExtGraphDeps.LastEventPtr )
295
- return detail::createSyclObjFromImpl<event>(MExtGraphDeps.LastEventPtr );
296
- return detail::createSyclObjFromImpl<event>(MDefaultGraphDeps.LastEventPtr );
295
+ auto &LastEvent = MGraph.expired () ? MDefaultGraphDeps.LastEventPtr
296
+ : MExtGraphDeps.LastEventPtr ;
297
+ // If the event comes from a graph, we must return it.
298
+ if (LastEvent)
299
+ return detail::createSyclObjFromImpl<event>(LastEvent);
300
+ // We insert a marker to represent an event at end.
301
+ return detail::createSyclObjFromImpl<event>(insertMarkerEvent (Self));
297
302
}
298
303
299
304
void queue_impl::addEvent (const event &Event) {
@@ -344,9 +349,49 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
344
349
345
350
HandlerImpl->MEventMode = SubmitInfo.EventMode ();
346
351
347
- auto Event = finalizeHandler (Handler, SubmitInfo.PostProcessorFunc ());
352
+ auto isHostTask = Type == CGType::CodeplayHostTask;
353
+
354
+ // TODO: this shouldn't be needed but without this
355
+ // the legacy adapter doesn't synchronize the operations properly
356
+ // when non-immediate command lists are used.
357
+ auto isGraphSubmission = Type == CGType::ExecCommandBuffer;
358
+
359
+ auto requiresPostProcess = SubmitInfo.PostProcessorFunc () || Streams.size ();
360
+ auto noLastEventPath = !isHostTask && !isGraphSubmission &&
361
+ MNoEventMode.load (std::memory_order_relaxed) &&
362
+ !requiresPostProcess;
348
363
349
- addEvent (Event);
364
+ if (noLastEventPath) {
365
+ std::unique_lock<std::mutex> Lock (MMutex);
366
+
367
+ // Check if we are still in no event mode. There could
368
+ // have been a concurrent submit.
369
+ if (MNoEventMode.load (std::memory_order_relaxed)) {
370
+ return finalizeHandlerInOrderNoEventsUnlocked (Handler);
371
+ }
372
+ }
373
+
374
+ event Event;
375
+ if (!isInOrder ()) {
376
+ Event = finalizeHandlerOutOfOrder (Handler);
377
+ addEvent (Event);
378
+ } else {
379
+ if (isHostTask) {
380
+ std::unique_lock<std::mutex> Lock (MMutex);
381
+ Event = finalizeHandlerInOrderHostTaskUnlocked (Handler);
382
+ } else {
383
+ std::unique_lock<std::mutex> Lock (MMutex);
384
+
385
+ if (!isGraphSubmission && trySwitchingToNoEventsMode ()) {
386
+ Event = finalizeHandlerInOrderNoEventsUnlocked (Handler);
387
+ } else {
388
+ Event = finalizeHandlerInOrderWithDepsUnlocked (Handler);
389
+ }
390
+ }
391
+ }
392
+
393
+ if (SubmitInfo.PostProcessorFunc ())
394
+ handlerPostProcess (Handler, SubmitInfo.PostProcessorFunc (), Event);
350
395
351
396
const auto &EventImpl = detail::getSyclObjImpl (Event);
352
397
for (auto &Stream : Streams) {
@@ -370,63 +415,14 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
370
415
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
371
416
event queue_impl::submit_impl (const detail::type_erased_cgfo_ty &CGF,
372
417
const std::shared_ptr<queue_impl> &Self,
373
- const std::shared_ptr<queue_impl> &PrimaryQueue ,
418
+ const std::shared_ptr<queue_impl> &,
374
419
const std::shared_ptr<queue_impl> &SecondaryQueue,
375
420
bool CallerNeedsEvent,
376
421
const detail::code_location &Loc,
377
422
bool IsTopCodeLoc,
378
423
const SubmissionInfo &SubmitInfo) {
379
- #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
380
- detail::handler_impl HandlerImplVal (PrimaryQueue.get (), CallerNeedsEvent);
381
- detail::handler_impl *HandlerImpl = &HandlerImplVal;
382
- handler Handler (HandlerImpl, Self);
383
- #else
384
- handler Handler (Self, CallerNeedsEvent);
385
- auto &HandlerImpl = detail::getSyclObjImpl (Handler);
386
- #endif
387
-
388
- #if XPTI_ENABLE_INSTRUMENTATION
389
- if (xptiTraceEnabled ()) {
390
- Handler.saveCodeLoc (Loc, IsTopCodeLoc);
391
- }
392
- #endif
393
-
394
- {
395
- NestedCallsTracker tracker;
396
- CGF (Handler);
397
- }
398
-
399
- // Scheduler will later omit events, that are not required to execute tasks.
400
- // Host and interop tasks, however, are not submitted to low-level runtimes
401
- // and require separate dependency management.
402
- const CGType Type = HandlerImpl->MCGType ;
403
- std::vector<StreamImplPtr> Streams;
404
- if (Type == CGType::Kernel)
405
- Streams = std::move (Handler.MStreamStorage );
406
-
407
- HandlerImpl->MEventMode = SubmitInfo.EventMode ();
408
-
409
- auto Event = finalizeHandler (Handler, SubmitInfo.PostProcessorFunc ());
410
-
411
- addEvent (Event);
412
-
413
- const auto &EventImpl = detail::getSyclObjImpl (Event);
414
- for (auto &Stream : Streams) {
415
- // We don't want stream flushing to be blocking operation that is why submit
416
- // a host task to print stream buffer. It will fire up as soon as the kernel
417
- // finishes execution.
418
- auto L = [&](handler &ServiceCGH) {
419
- Stream->generateFlushCommand (ServiceCGH);
420
- };
421
- detail::type_erased_cgfo_ty CGF{L};
422
- event FlushEvent =
423
- submit_impl (CGF, Self, PrimaryQueue, SecondaryQueue,
424
- /* CallerNeedsEvent*/ true , Loc, IsTopCodeLoc, {});
425
- EventImpl->attachEventToCompleteWeak (detail::getSyclObjImpl (FlushEvent));
426
- registerStreamServiceEvent (detail::getSyclObjImpl (FlushEvent));
427
- }
428
-
429
- return Event;
424
+ return submit_impl (CGF, Self, SecondaryQueue.get (), CallerNeedsEvent, Loc,
425
+ IsTopCodeLoc, SubmitInfo);
430
426
}
431
427
#endif
432
428
@@ -467,24 +463,19 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
467
463
const std::vector<event> &ExpandedDepEvents =
468
464
getExtendDependencyList (DepEvents, MutableDepEvents, Lock);
469
465
466
+ MEmpty = false ;
467
+
470
468
// If we have a command graph set we need to capture the op through the
471
469
// handler rather than by-passing the scheduler.
472
470
if (MGraph.expired () && Scheduler::areEventsSafeForSchedulerBypass (
473
471
ExpandedDepEvents, MContext)) {
474
- if (!CallerNeedsEvent && supportsDiscardingPiEvents ()) {
472
+ auto isNoEventsMode = trySwitchingToNoEventsMode ();
473
+ if (!CallerNeedsEvent && isNoEventsMode) {
475
474
NestedCallsTracker tracker;
476
475
MemOpFunc (MemOpArgs..., getUrEvents (ExpandedDepEvents),
477
476
/* PiEvent*/ nullptr );
478
477
479
- event DiscardedEvent = createDiscardedEvent ();
480
- if (isInOrder ()) {
481
- // Store the discarded event for proper in-order dependency tracking.
482
- auto &EventToStoreIn = MGraph.expired ()
483
- ? MDefaultGraphDeps.LastEventPtr
484
- : MExtGraphDeps.LastEventPtr ;
485
- EventToStoreIn = detail::getSyclObjImpl (DiscardedEvent);
486
- }
487
- return DiscardedEvent;
478
+ return createDiscardedEvent ();
488
479
}
489
480
490
481
event ResEvent = prepareSYCLEventAssociatedWithQueue (Self);
@@ -509,7 +500,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
509
500
}
510
501
}
511
502
512
- if (isInOrder ()) {
503
+ if (isInOrder () &&
504
+ (!isNoEventsMode || MContext->getBackend () == backend::opencl)) {
513
505
auto &EventToStoreIn = MGraph.expired () ? MDefaultGraphDeps.LastEventPtr
514
506
: MExtGraphDeps.LastEventPtr ;
515
507
EventToStoreIn = EventImpl;
@@ -637,9 +629,11 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
637
629
}
638
630
639
631
std::vector<std::weak_ptr<event_impl>> WeakEvents;
632
+ EventImplPtr LastEvent;
640
633
{
641
634
std::lock_guard<std::mutex> Lock (MMutex);
642
635
WeakEvents.swap (MEventsWeak);
636
+ LastEvent = MDefaultGraphDeps.LastEventPtr ;
643
637
644
638
MMissedCleanupRequests.unset (
645
639
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
@@ -664,6 +658,11 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
664
658
}
665
659
}
666
660
}
661
+
662
+ if (LastEvent) {
663
+ LastEvent->wait (LastEvent);
664
+ }
665
+
667
666
const AdapterPtr &Adapter = getAdapter ();
668
667
Adapter->call <UrApiKind::urQueueFinish>(getHandleRef ());
669
668
@@ -755,18 +754,14 @@ ur_native_handle_t queue_impl::getNative(int32_t &NativeHandleDesc) const {
755
754
}
756
755
757
756
bool queue_impl::queue_empty () const {
758
- // If we have in-order queue where events are not discarded then just check
759
- // the status of the last event.
757
+ // If we have in-order queue with non-empty last event, just check its status.
760
758
if (isInOrder ()) {
761
759
std::lock_guard<std::mutex> Lock (MMutex);
762
- // If there is no last event we know that no work has been submitted, so it
763
- // must be trivially empty.
764
- if (!MDefaultGraphDeps.LastEventPtr )
760
+ if (MEmpty)
765
761
return true ;
766
- // Otherwise, check if the last event is finished.
767
- // Note that we fall back to the backend query if the event was discarded,
768
- // which may happend despite the queue not being a discard event queue.
769
- if (!MDefaultGraphDeps.LastEventPtr ->isDiscarded ())
762
+
763
+ if (MDefaultGraphDeps.LastEventPtr &&
764
+ !MDefaultGraphDeps.LastEventPtr ->isDiscarded ())
770
765
return MDefaultGraphDeps.LastEventPtr
771
766
->get_info <info::event::command_execution_status>() ==
772
767
info::event_command_status::complete;
@@ -779,6 +774,11 @@ bool queue_impl::queue_empty() const {
779
774
if (!IsReady)
780
775
return false ;
781
776
777
+ // If got here, it means that LastEventPtr is nullptr (so no possible Host
778
+ // Tasks) and there is nothing executing on the device.
779
+ if (isInOrder ())
780
+ return true ;
781
+
782
782
// We may have events like host tasks which are not submitted to the backend
783
783
// queue so we need to get their status separately.
784
784
std::lock_guard<std::mutex> Lock (MMutex);
0 commit comments