diff --git a/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp b/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp
index 49345bb57e159..5dcf0c9123045 100644
--- a/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp
+++ b/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp
@@ -56,3 +56,25 @@ ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice,
                                       ze_kernel_handle_t hZeKernel,
                                       size_t GlobalWorkSize3D[3],
                                       uint32_t SuggestedLocalWorkSize3D[3]);
+
+/**
+ * Handle uncommon conditions after kernel submission.
+ *
+ * If the launch used a global work offset, the offset stored on the kernel
+ * handle is reset to {0, 0, 0} so that it does not leak into the next launch
+ * of the same kernel. Launches without an offset pay only for a pointer test.
+ *
+ * @param[in] hZeKernel The kernel handle that was just submitted.
+ * @param[in] pGlobalWorkOffset Offset array used for the launch, or nullptr
+ *            if the launch had no offset.
+ */
+inline void postSubmit(ze_kernel_handle_t hZeKernel,
+                       const size_t *pGlobalWorkOffset) {
+  // If this kernel was launched with an offset, clear it for the next launch.
+  // This slows down kernels with offsets but keeps the common case fast.
+  if (pGlobalWorkOffset != nullptr) {
+    // NOTE(review): the result of this call is discarded; confirm that a
+    // failed reset is acceptable to ignore, or propagate it to the callers.
+    zeKernelSetGlobalOffsetExp(hZeKernel, 0, 0, 0);
+  }
+}
diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
index 397aa022bcee7..d6f865d80b5c3 100644
--- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
@@ -219,6 +219,17 @@ ur_result_t ur_command_list_manager::appendKernelLaunch(
     waitListView.clear();
   };
 
+  // Normalize the offset: a null pointer or an all-zero offset both mean
+  // "no offset". Passing nullptr downstream lets us skip setting the offset.
+  // The caller may supply a null pointer, so guard every access to it.
+  bool hasOffset = false;
+  for (uint32_t i = 0; pGlobalWorkOffset != nullptr && i < workDim; ++i) {
+    hasOffset |= pGlobalWorkOffset[i] != 0;
+  }
+  if (!hasOffset) {
+    pGlobalWorkOffset = nullptr;
+  }
+
   UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset,
                                         workDim, WG[0], WG[1], WG[2],
                                         memoryMigrate));
@@ -229,6 +240,8 @@ ur_result_t ur_command_list_manager::appendKernelLaunch(
              (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions,
               zeSignalEvent, waitListView.num, waitListView.handles));
 
+  postSubmit(hZeKernel, pGlobalWorkOffset);
+
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
index 0c98147b53582..286623277dc47 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -809,6 +809,17 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
     waitListView.clear();
   };
 
+  // Normalize the offset: a null pointer or an all-zero offset both mean
+  // "no offset". Passing nullptr downstream lets us skip setting the offset.
+  // The caller may supply a null pointer, so guard every access to it.
+  bool hasOffset = false;
+  for (uint32_t i = 0; pGlobalWorkOffset != nullptr && i < workDim; ++i) {
+    hasOffset |= pGlobalWorkOffset[i] != 0;
+  }
+  if (!hasOffset) {
+    pGlobalWorkOffset = nullptr;
+  }
+
   UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset,
                                         workDim, WG[0], WG[1], WG[2],
                                         memoryMigrate));
@@ -822,6 +833,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
 
   recordSubmittedKernel(hKernel);
 
+  postSubmit(hZeKernel, pGlobalWorkOffset);
+
   return UR_RESULT_SUCCESS;
 }
 