Skip to content

Commit 6e3be6f

Browse files
[L0] Fix hang on counter based events when regular commandlist from cache resets.
Because a commandlist depends on its counter-based events, when a commandlist is reset there is still a chance that an associated counter-based event is still linked to another commandlist. This causes a hang. For now, we will disable the regular commandlist cache and create a new commandlist instead. Signed-off-by: Zhang, Winston <[email protected]>
1 parent 7936bf3 commit 6e3be6f

File tree

1 file changed

+98
-88
lines changed

1 file changed

+98
-88
lines changed

source/adapters/level_zero/context.cpp

Lines changed: 98 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -716,100 +716,110 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
716716
// command list is available for reuse.
717717
ur_result_t ur_result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
718718

719-
// Initally, we need to check if a command list has already been created
720-
// on this device that is available for use. If so, then reuse that
721-
// Level-Zero Command List and Fence for this PI call.
722-
{
723-
// Make sure to acquire the lock before checking the size, or there
724-
// will be a race condition.
725-
std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
726-
// Under mutex since operator[] does insertion on the first usage for every
727-
// unique ZeDevice.
728-
auto &ZeCommandListCache =
729-
UseCopyEngine
730-
? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
731-
: Queue->Context
732-
->ZeComputeCommandListCache[Queue->Device->ZeDevice];
733-
734-
for (auto ZeCommandListIt = ZeCommandListCache.begin();
735-
ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
719+
// As a limitation of regular command list, when counter based events are used
720+
// the command list in the cache still has the potential of having events that
721+
// are still associated with another command list. We disable cache for
722+
// regular command list when counter based events are enabled to avoid race
723+
// condition.
724+
if (!Queue->CounterBasedEventsEnabled) {
725+
// Initally, we need to check if a command list has already been created
726+
// on this device that is available for use. If so, then reuse that
727+
// Level-Zero Command List and Fence for this PI call.
728+
{
729+
// Make sure to acquire the lock before checking the size, or there
730+
// will be a race condition.
731+
std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
732+
// Under mutex since operator[] does insertion on the first usage for
733+
// every unique ZeDevice.
734+
auto &ZeCommandListCache =
735+
UseCopyEngine
736+
? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
737+
: Queue->Context
738+
->ZeComputeCommandListCache[Queue->Device->ZeDevice];
739+
740+
for (auto ZeCommandListIt = ZeCommandListCache.begin();
741+
ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
742+
// If this is an InOrder Queue, then only allow lists which are in
743+
// order.
744+
if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue() &&
745+
!(ZeCommandListIt->second.InOrderList)) {
746+
continue;
747+
}
748+
auto &ZeCommandList = ZeCommandListIt->first;
749+
auto it = Queue->CommandListMap.find(ZeCommandList);
750+
if (it != Queue->CommandListMap.end()) {
751+
if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue)
752+
continue;
753+
CommandList = it;
754+
if (CommandList->second.ZeFence != nullptr)
755+
CommandList->second.ZeFenceInUse = true;
756+
} else {
757+
// If there is a command list available on this context, but it
758+
// wasn't yet used in this queue then create a new entry in this
759+
// queue's map to hold the fence and other associated command
760+
// list information.
761+
auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
762+
uint32_t QueueGroupOrdinal;
763+
auto &ZeCommandQueue = ForcedCmdQueue
764+
? *ForcedCmdQueue
765+
: QGroup.getZeQueue(&QueueGroupOrdinal);
766+
if (ForcedCmdQueue)
767+
QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
768+
769+
ze_fence_handle_t ZeFence;
770+
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
771+
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
772+
ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
773+
ZeQueueDesc.ordinal = QueueGroupOrdinal;
774+
CommandList =
775+
Queue->CommandListMap
776+
.emplace(ZeCommandList,
777+
ur_command_list_info_t{ZeFence, true, false,
778+
ZeCommandQueue, ZeQueueDesc})
779+
.first;
780+
}
781+
ZeCommandListCache.erase(ZeCommandListIt);
782+
if (auto Res =
783+
Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
784+
return Res;
785+
if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
786+
return Res;
787+
return UR_RESULT_SUCCESS;
788+
}
789+
}
790+
791+
// If there are no available command lists in the cache, then we check for
792+
// command lists that have already signalled, but have not been added to the
793+
// available list yet. Each command list has a fence associated which tracks
794+
// if a command list has completed dispatch of its commands and is ready for
795+
// reuse. If a command list is found to have been signalled, then the
796+
// command list & fence are reset and we return.
797+
for (auto it = Queue->CommandListMap.begin();
798+
it != Queue->CommandListMap.end(); ++it) {
799+
// Make sure this is the command list type needed.
800+
if (UseCopyEngine != it->second.isCopy(Queue))
801+
continue;
802+
736803
// If this is an InOrder Queue, then only allow lists which are in order.
737804
if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue() &&
738-
!(ZeCommandListIt->second.InOrderList)) {
805+
!(it->second.IsInOrderList)) {
739806
continue;
740807
}
741-
auto &ZeCommandList = ZeCommandListIt->first;
742-
auto it = Queue->CommandListMap.find(ZeCommandList);
743-
if (it != Queue->CommandListMap.end()) {
744-
if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue)
745-
continue;
808+
809+
ze_result_t ZeResult =
810+
ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
811+
if (ZeResult == ZE_RESULT_SUCCESS) {
812+
std::vector<ur_event_handle_t> EventListToCleanup;
813+
Queue->resetCommandList(it, false, EventListToCleanup);
814+
CleanupEventListFromResetCmdList(EventListToCleanup,
815+
true /* QueueLocked */);
746816
CommandList = it;
747-
if (CommandList->second.ZeFence != nullptr)
748-
CommandList->second.ZeFenceInUse = true;
749-
} else {
750-
// If there is a command list available on this context, but it
751-
// wasn't yet used in this queue then create a new entry in this
752-
// queue's map to hold the fence and other associated command
753-
// list information.
754-
auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
755-
uint32_t QueueGroupOrdinal;
756-
auto &ZeCommandQueue = ForcedCmdQueue
757-
? *ForcedCmdQueue
758-
: QGroup.getZeQueue(&QueueGroupOrdinal);
759-
if (ForcedCmdQueue)
760-
QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
761-
762-
ze_fence_handle_t ZeFence;
763-
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
764-
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
765-
ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
766-
ZeQueueDesc.ordinal = QueueGroupOrdinal;
767-
CommandList =
768-
Queue->CommandListMap
769-
.emplace(ZeCommandList,
770-
ur_command_list_info_t{ZeFence, true, false,
771-
ZeCommandQueue, ZeQueueDesc})
772-
.first;
817+
CommandList->second.ZeFenceInUse = true;
818+
if (auto Res =
819+
Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
820+
return Res;
821+
return UR_RESULT_SUCCESS;
773822
}
774-
ZeCommandListCache.erase(ZeCommandListIt);
775-
if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
776-
return Res;
777-
if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
778-
return Res;
779-
return UR_RESULT_SUCCESS;
780-
}
781-
}
782-
783-
// If there are no available command lists in the cache, then we check for
784-
// command lists that have already signalled, but have not been added to the
785-
// available list yet. Each command list has a fence associated which tracks
786-
// if a command list has completed dispatch of its commands and is ready for
787-
// reuse. If a command list is found to have been signalled, then the
788-
// command list & fence are reset and we return.
789-
for (auto it = Queue->CommandListMap.begin();
790-
it != Queue->CommandListMap.end(); ++it) {
791-
// Make sure this is the command list type needed.
792-
if (UseCopyEngine != it->second.isCopy(Queue))
793-
continue;
794-
795-
// If this is an InOrder Queue, then only allow lists which are in order.
796-
if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue() &&
797-
!(it->second.IsInOrderList)) {
798-
continue;
799-
}
800-
801-
ze_result_t ZeResult =
802-
ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
803-
if (ZeResult == ZE_RESULT_SUCCESS) {
804-
std::vector<ur_event_handle_t> EventListToCleanup;
805-
Queue->resetCommandList(it, false, EventListToCleanup);
806-
CleanupEventListFromResetCmdList(EventListToCleanup,
807-
true /* QueueLocked */);
808-
CommandList = it;
809-
CommandList->second.ZeFenceInUse = true;
810-
if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
811-
return Res;
812-
return UR_RESULT_SUCCESS;
813823
}
814824
}
815825

0 commit comments

Comments
 (0)