Commit 7cac7a1

[SYCL] Do not store last event for in-order queues (#18277)
unless Host Tasks are used. Without Host Tasks we can rely on UR alone for ordering. Having no last event means that ext_oneapi_get_last_event() needs to submit a barrier in order to return an event to the user. Similarly, ext_oneapi_submit_barrier() now always submits a barrier, even for in-order queues.

Whenever Host Tasks are used we need to start recording all events. This is needed because of how kernel submission synchronizes with Host Tasks. Consider the following scenario:

q.host_task();
q.submit_kernel();
q.host_task();

The kernel is not even submitted to UR until the first Host Task completes. To properly synchronize the second Host Task, we need to keep the event describing the kernel submission.
1 parent 21c7289 commit 7cac7a1

16 files changed (+276, -435 lines)
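
The scenario described in the commit message can be illustrated with a minimal sketch (not part of the commit), assuming an in-order queue and shared USM; the kernel body and variable names are purely illustrative:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q{sycl::property::queue::in_order{}};

  int *data = sycl::malloc_shared<int>(1, q);
  *data = 0;

  // First host task: runs on the host, outside the UR in-order stream.
  q.submit([&](sycl::handler &cgh) { cgh.host_task([=] { *data += 1; }); });

  // Kernel: not handed to UR until the host task above completes, so the
  // runtime must keep the event describing this submission.
  q.submit([&](sycl::handler &cgh) { cgh.single_task([=] { *data += 1; }); });

  // Second host task: must wait for the kernel, which requires the recorded
  // kernel-submission event.
  q.submit([&](sycl::handler &cgh) { cgh.host_task([=] { *data += 1; }); });

  q.wait();
  sycl::free(data, q);
  return 0;
}

Per the commit message, the host tasks force the queue to keep recording events, since the kernel only reaches UR after the first host task finishes; a queue that never uses host tasks stays on the no-last-event path, and ext_oneapi_get_last_event() then has to enqueue a marker/barrier to have an event to return.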

sycl/source/detail/queue_impl.cpp

Lines changed: 79 additions & 79 deletions
@@ -282,18 +282,23 @@ event queue_impl::memcpyFromDeviceGlobal(
       DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
 }
 
-sycl::detail::optional<event> queue_impl::getLastEvent() {
+sycl::detail::optional<event>
+queue_impl::getLastEvent(const std::shared_ptr<queue_impl> &Self) {
   // The external event is required to finish last if set, so it is considered
   // the last event if present.
   if (std::optional<event> ExternalEvent = MInOrderExternalEvent.read())
     return ExternalEvent;
 
   std::lock_guard<std::mutex> Lock{MMutex};
-  if (MGraph.expired() && !MDefaultGraphDeps.LastEventPtr)
+  if (MEmpty)
     return std::nullopt;
-  if (!MGraph.expired() && MExtGraphDeps.LastEventPtr)
-    return detail::createSyclObjFromImpl<event>(MExtGraphDeps.LastEventPtr);
-  return detail::createSyclObjFromImpl<event>(MDefaultGraphDeps.LastEventPtr);
+  auto &LastEvent = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
+                                     : MExtGraphDeps.LastEventPtr;
+  // If the event comes from a graph, we must return it.
+  if (LastEvent)
+    return detail::createSyclObjFromImpl<event>(LastEvent);
+  // We insert a marker to represent an event at end.
+  return detail::createSyclObjFromImpl<event>(insertMarkerEvent(Self));
 }
 
 void queue_impl::addEvent(const event &Event) {
@@ -344,9 +349,49 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
 
   HandlerImpl->MEventMode = SubmitInfo.EventMode();
 
-  auto Event = finalizeHandler(Handler, SubmitInfo.PostProcessorFunc());
+  auto isHostTask = Type == CGType::CodeplayHostTask;
+
+  // TODO: this shouldn't be needed but without this
+  // the legacy adapter doesn't synchronize the operations properly
+  // when non-immediate command lists are used.
+  auto isGraphSubmission = Type == CGType::ExecCommandBuffer;
+
+  auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size();
+  auto noLastEventPath = !isHostTask && !isGraphSubmission &&
+                         MNoEventMode.load(std::memory_order_relaxed) &&
+                         !requiresPostProcess;
 
-  addEvent(Event);
+  if (noLastEventPath) {
+    std::unique_lock<std::mutex> Lock(MMutex);
+
+    // Check if we are still in no event mode. There could
+    // have been a concurrent submit.
+    if (MNoEventMode.load(std::memory_order_relaxed)) {
+      return finalizeHandlerInOrderNoEventsUnlocked(Handler);
+    }
+  }
+
+  event Event;
+  if (!isInOrder()) {
+    Event = finalizeHandlerOutOfOrder(Handler);
+    addEvent(Event);
+  } else {
+    if (isHostTask) {
+      std::unique_lock<std::mutex> Lock(MMutex);
+      Event = finalizeHandlerInOrderHostTaskUnlocked(Handler);
+    } else {
+      std::unique_lock<std::mutex> Lock(MMutex);
+
+      if (!isGraphSubmission && trySwitchingToNoEventsMode()) {
+        Event = finalizeHandlerInOrderNoEventsUnlocked(Handler);
+      } else {
+        Event = finalizeHandlerInOrderWithDepsUnlocked(Handler);
+      }
+    }
+  }
+
+  if (SubmitInfo.PostProcessorFunc())
+    handlerPostProcess(Handler, SubmitInfo.PostProcessorFunc(), Event);
 
   const auto &EventImpl = detail::getSyclObjImpl(Event);
   for (auto &Stream : Streams) {
@@ -370,63 +415,14 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
                               const std::shared_ptr<queue_impl> &Self,
-                              const std::shared_ptr<queue_impl> &PrimaryQueue,
+                              const std::shared_ptr<queue_impl> &,
                               const std::shared_ptr<queue_impl> &SecondaryQueue,
                               bool CallerNeedsEvent,
                               const detail::code_location &Loc,
                               bool IsTopCodeLoc,
                               const SubmissionInfo &SubmitInfo) {
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  detail::handler_impl HandlerImplVal(PrimaryQueue.get(), CallerNeedsEvent);
-  detail::handler_impl *HandlerImpl = &HandlerImplVal;
-  handler Handler(HandlerImpl, Self);
-#else
-  handler Handler(Self, CallerNeedsEvent);
-  auto &HandlerImpl = detail::getSyclObjImpl(Handler);
-#endif
-
-#if XPTI_ENABLE_INSTRUMENTATION
-  if (xptiTraceEnabled()) {
-    Handler.saveCodeLoc(Loc, IsTopCodeLoc);
-  }
-#endif
-
-  {
-    NestedCallsTracker tracker;
-    CGF(Handler);
-  }
-
-  // Scheduler will later omit events, that are not required to execute tasks.
-  // Host and interop tasks, however, are not submitted to low-level runtimes
-  // and require separate dependency management.
-  const CGType Type = HandlerImpl->MCGType;
-  std::vector<StreamImplPtr> Streams;
-  if (Type == CGType::Kernel)
-    Streams = std::move(Handler.MStreamStorage);
-
-  HandlerImpl->MEventMode = SubmitInfo.EventMode();
-
-  auto Event = finalizeHandler(Handler, SubmitInfo.PostProcessorFunc());
-
-  addEvent(Event);
-
-  const auto &EventImpl = detail::getSyclObjImpl(Event);
-  for (auto &Stream : Streams) {
-    // We don't want stream flushing to be blocking operation that is why submit
-    // a host task to print stream buffer. It will fire up as soon as the kernel
-    // finishes execution.
-    auto L = [&](handler &ServiceCGH) {
-      Stream->generateFlushCommand(ServiceCGH);
-    };
-    detail::type_erased_cgfo_ty CGF{L};
-    event FlushEvent =
-        submit_impl(CGF, Self, PrimaryQueue, SecondaryQueue,
-                    /*CallerNeedsEvent*/ true, Loc, IsTopCodeLoc, {});
-    EventImpl->attachEventToCompleteWeak(detail::getSyclObjImpl(FlushEvent));
-    registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent));
-  }
-
-  return Event;
+  return submit_impl(CGF, Self, SecondaryQueue.get(), CallerNeedsEvent, Loc,
+                     IsTopCodeLoc, SubmitInfo);
 }
 #endif
 
@@ -467,24 +463,19 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
     const std::vector<event> &ExpandedDepEvents =
         getExtendDependencyList(DepEvents, MutableDepEvents, Lock);
 
+    MEmpty = false;
+
     // If we have a command graph set we need to capture the op through the
     // handler rather than by-passing the scheduler.
     if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
                                 ExpandedDepEvents, MContext)) {
-      if (!CallerNeedsEvent && supportsDiscardingPiEvents()) {
+      auto isNoEventsMode = trySwitchingToNoEventsMode();
+      if (!CallerNeedsEvent && isNoEventsMode) {
         NestedCallsTracker tracker;
         MemOpFunc(MemOpArgs..., getUrEvents(ExpandedDepEvents),
                   /*PiEvent*/ nullptr);
 
-        event DiscardedEvent = createDiscardedEvent();
-        if (isInOrder()) {
-          // Store the discarded event for proper in-order dependency tracking.
-          auto &EventToStoreIn = MGraph.expired()
-                                     ? MDefaultGraphDeps.LastEventPtr
-                                     : MExtGraphDeps.LastEventPtr;
-          EventToStoreIn = detail::getSyclObjImpl(DiscardedEvent);
-        }
-        return DiscardedEvent;
+        return createDiscardedEvent();
       }
 
       event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
@@ -509,7 +500,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
       }
     }
 
-      if (isInOrder()) {
+      if (isInOrder() &&
+          (!isNoEventsMode || MContext->getBackend() == backend::opencl)) {
         auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                                 : MExtGraphDeps.LastEventPtr;
         EventToStoreIn = EventImpl;
@@ -637,9 +629,11 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   }
 
   std::vector<std::weak_ptr<event_impl>> WeakEvents;
+  EventImplPtr LastEvent;
   {
     std::lock_guard<std::mutex> Lock(MMutex);
     WeakEvents.swap(MEventsWeak);
+    LastEvent = MDefaultGraphDeps.LastEventPtr;
 
     MMissedCleanupRequests.unset(
         [&](MissedCleanupRequestsType &MissedCleanupRequests) {
@@ -664,6 +658,11 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
       }
     }
   }
+
+  if (LastEvent) {
+    LastEvent->wait(LastEvent);
+  }
+
   const AdapterPtr &Adapter = getAdapter();
   Adapter->call<UrApiKind::urQueueFinish>(getHandleRef());
 
@@ -755,18 +754,14 @@ ur_native_handle_t queue_impl::getNative(int32_t &NativeHandleDesc) const {
 }
 
 bool queue_impl::queue_empty() const {
-  // If we have in-order queue where events are not discarded then just check
-  // the status of the last event.
+  // If we have in-order queue with non-empty last event, just check its status.
   if (isInOrder()) {
     std::lock_guard<std::mutex> Lock(MMutex);
-    // If there is no last event we know that no work has been submitted, so it
-    // must be trivially empty.
-    if (!MDefaultGraphDeps.LastEventPtr)
+    if (MEmpty)
       return true;
-    // Otherwise, check if the last event is finished.
-    // Note that we fall back to the backend query if the event was discarded,
-    // which may happend despite the queue not being a discard event queue.
-    if (!MDefaultGraphDeps.LastEventPtr->isDiscarded())
+
+    if (MDefaultGraphDeps.LastEventPtr &&
+        !MDefaultGraphDeps.LastEventPtr->isDiscarded())
       return MDefaultGraphDeps.LastEventPtr
                  ->get_info<info::event::command_execution_status>() ==
              info::event_command_status::complete;
@@ -779,6 +774,11 @@ bool queue_impl::queue_empty() const {
   if (!IsReady)
     return false;
 
+  // If got here, it means that LastEventPtr is nullptr (so no possible Host
+  // Tasks) and there is nothing executing on the device.
+  if (isInOrder())
+    return true;
+
   // We may have events like host tasks which are not submitted to the backend
   // queue so we need to get their status separately.
   std::lock_guard<std::mutex> Lock(MMutex);
