From 489c9fe9ab9d069522a09e4fd0f7241de920700d Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 29 Apr 2025 16:02:09 +0200 Subject: [PATCH 1/5] [SYCL] Pass NULL instead of {0, 0, 0} offset UR adapters check against NULL to avoid setting an offset, so passing NULL should improve performance. Signed-off-by: John Pennycook --- sycl/source/detail/cg.hpp | 7 +++++++ sycl/source/detail/scheduler/commands.cpp | 18 +++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index 3b771fb079ad5..ddf1b2a74d928 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -134,6 +134,13 @@ class NDRDescT { ClusterDimensions[I] = (I < Dims) ? N[I] : 1; } + bool hasOffset() const { + bool HasOffset = false; + for (int I = 0; I < 3; ++I) + HasOffset |= GlobalOffset[I]; + return HasOffset; + } + NDRDescT &operator=(const NDRDescT &Desc) = default; NDRDescT &operator=(NDRDescT &&Desc) = default; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 9201be0a318b0..f2712bb64f6b4 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2431,6 +2431,10 @@ static ur_result_t SetKernelParamsAndLaunch( size_t RequiredWGSize[3] = {0, 0, 0}; size_t *LocalSize = nullptr; + size_t *GlobalOffset = nullptr; + + if (NDRDesc.hasOffset()) + GlobalOffset = &NDRDesc.GlobalOffset[0]; if (HasLocalSize) LocalSize = &NDRDesc.LocalSize[0]; @@ -2478,9 +2482,9 @@ static ur_result_t SetKernelParamsAndLaunch( ur_event_handle_t UREvent = nullptr; ur_result_t Error = Adapter->call_nocheck( - Queue->getHandleRef(), Kernel, NDRDesc.Dims, - &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, - property_list.size(), property_list.data(), RawEvents.size(), + Queue->getHandleRef(), Kernel, NDRDesc.Dims, GlobalOffset, + &NDRDesc.GlobalSize[0], LocalSize, property_list.size(), + property_list.data(), RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? &UREvent : nullptr); if ((Error == UR_RESULT_SUCCESS) && OutEventImpl) { @@ -2497,7 +2501,7 @@ static ur_result_t SetKernelParamsAndLaunch( Args...); } return Adapter->call_nocheck(Args...); - }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], + }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, GlobalOffset, &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? &UREvent : nullptr); @@ -2605,6 +2609,10 @@ ur_result_t enqueueImpCommandBufferKernel( size_t RequiredWGSize[3] = {0, 0, 0}; size_t *LocalSize = nullptr; + size_t *GlobalOffset = nullptr; + + if (NDRDesc.hasOffset()) + GlobalOffset = &NDRDesc.GlobalOffset[0]; if (HasLocalSize) LocalSize = &NDRDesc.LocalSize[0]; @@ -2633,7 +2641,7 @@ ur_result_t enqueueImpCommandBufferKernel( ur_result_t Res = Adapter->call_nocheck( - CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], + CommandBuffer, UrKernel, NDRDesc.Dims, GlobalOffset, &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(), AltUrKernels.size() ? AltUrKernels.data() : nullptr, SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0, From 7c960fabaf6d79e848388ff06a3465b7fd045c31 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 29 Apr 2025 17:45:14 +0200 Subject: [PATCH 2/5] [UR][L0] Track when offsets are used by kernels The vast majority of kernels are expected to never have an offset, because the offset feature was deprecated in SYCL 2020. A small number of kernels will still have an offset. Since this case is uncommon, it is less important to optimize. There is a pathological case where the same kernel alternates between different overloads of parallel_for with and without an offset. Keeping track of whether the last submission had an offset is intended to address this case, while still allowing us to skip the L0 call. Signed-off-by: John Pennycook --- .../source/adapters/level_zero/v2/kernel.cpp | 10 ++++++++++ .../source/adapters/level_zero/v2/kernel.hpp | 3 +++ 2 files changed, 13 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp index 80c43d301cc02..519942a380ab0 100644 --- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp @@ -280,9 +280,19 @@ ur_result_t ur_kernel_handle_t_::prepareForSubmission( std::function migrate) { auto hZeKernel = getZeHandle(hDevice); + // If we have a new offset, update the zeKernel object if (pGlobalWorkOffset != NULL) { UR_CALL( setKernelGlobalOffset(hContext, hZeKernel, workDim, pGlobalWorkOffset)); + hadOffset = true; + } + + // If we have no offset now but the last submission had one, clear it + else if (hadOffset) { + size_t zeroOffset[3] = {0, 0, 0}; + UR_CALL( + setKernelGlobalOffset(hContext, hZeKernel, workDim, zeroOffset)); + hadOffset = false; } ZE2UR_CALL(zeKernelSetGroupSize, diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.hpp b/unified-runtime/source/adapters/level_zero/v2/kernel.hpp index 950e3ea70c2dc..6e085c11b198a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/kernel.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/kernel.hpp @@ -115,4 +115,7 @@ struct ur_kernel_handle_t_ : ur_object { // pointer to any non-null kernel in deviceKernels ur_single_device_kernel_t *nonEmptyKernel; + + // keep track of whether the offset is non-zero + bool hadOffset = false; }; From 0a2765274301ed1b35a485d35d69c33aec23a94d Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 30 Apr 2025 10:04:14 +0200 Subject: [PATCH 3/5] [UR][L0] Switch to a smarter post-submit design Avoids tracking the last global offset at the expense of making offset kernels slower. Signed-off-by: John Pennycook --- .../level_zero/helpers/kernel_helpers.hpp | 15 +++++++++++++++ .../level_zero/v2/command_list_manager.cpp | 2 ++ .../source/adapters/level_zero/v2/kernel.cpp | 10 ---------- .../source/adapters/level_zero/v2/kernel.hpp | 3 --- .../level_zero/v2/queue_immediate_in_order.cpp | 2 ++ 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp b/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp index 49345bb57e159..5dcf0c9123045 100644 --- a/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp +++ b/unified-runtime/source/adapters/level_zero/helpers/kernel_helpers.hpp @@ -56,3 +56,18 @@ ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice, ze_kernel_handle_t hZeKernel, size_t GlobalWorkSize3D[3], uint32_t SuggestedLocalWorkSize3D[3]); + +/** + * Handle uncommon conditions after kernel submission. + * Resets the offset to {0, 0, 0} if one was supplied. + * @param[in] hZeKernel The kernel handle. + * @param[in] pGlobalWorkOffset Pointer to offset array. + */ +inline void postSubmit(ze_kernel_handle_t hZeKernel, + const size_t *pGlobalWorkOffset) { + // If this kernel was launched with an offset, clear it for the next launch. + // This slows down kernels with offsets but keeps the common case fast. + if (pGlobalWorkOffset != NULL) { + zeKernelSetGlobalOffsetExp(hZeKernel, 0, 0, 0); + } +} diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 397aa022bcee7..ddac491039b0f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -229,6 +229,8 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, zeSignalEvent, waitListView.num, waitListView.handles)); + postSubmit(hZeKernel, pGlobalWorkOffset); + return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp index 519942a380ab0..80c43d301cc02 100644 --- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp @@ -280,19 +280,9 @@ ur_result_t ur_kernel_handle_t_::prepareForSubmission( std::function migrate) { auto hZeKernel = getZeHandle(hDevice); - // If we have a new offset, update the zeKernel object if (pGlobalWorkOffset != NULL) { UR_CALL( setKernelGlobalOffset(hContext, hZeKernel, workDim, pGlobalWorkOffset)); - hadOffset = true; - } - - // If we have no offset now but the last submission had one, clear it - else if (hadOffset) { - size_t zeroOffset[3] = {0, 0, 0}; - UR_CALL( - setKernelGlobalOffset(hContext, hZeKernel, workDim, zeroOffset)); - hadOffset = false; } ZE2UR_CALL(zeKernelSetGroupSize, diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.hpp b/unified-runtime/source/adapters/level_zero/v2/kernel.hpp index 6e085c11b198a..950e3ea70c2dc 100644 --- a/unified-runtime/source/adapters/level_zero/v2/kernel.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/kernel.hpp @@ -115,7 +115,4 @@ struct ur_kernel_handle_t_ : ur_object { // pointer to any non-null kernel in deviceKernels ur_single_device_kernel_t *nonEmptyKernel; - - // keep track of whether the offset is non-zero - bool hadOffset = false; }; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 0c98147b53582..3e78772e7c3b3 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -822,6 +822,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( recordSubmittedKernel(hKernel); + postSubmit(hZeKernel, pGlobalWorkOffset); + return UR_RESULT_SUCCESS; } From fcf96814fe94bf8986cc72cb894e29198453af6e Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 30 Apr 2025 12:36:36 +0200 Subject: [PATCH 4/5] Revert "[SYCL] Pass NULL instead of {0, 0, 0} offset" This reverts commit 489c9fe9ab9d069522a09e4fd0f7241de920700d. --- sycl/source/detail/cg.hpp | 7 ------- sycl/source/detail/scheduler/commands.cpp | 18 +++++------------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index ddf1b2a74d928..3b771fb079ad5 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -134,13 +134,6 @@ class NDRDescT { ClusterDimensions[I] = (I < Dims) ? N[I] : 1; } - bool hasOffset() const { - bool HasOffset = false; - for (int I = 0; I < 3; ++I) - HasOffset |= GlobalOffset[I]; - return HasOffset; - } - NDRDescT &operator=(const NDRDescT &Desc) = default; NDRDescT &operator=(NDRDescT &&Desc) = default; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f2712bb64f6b4..9201be0a318b0 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2431,10 +2431,6 @@ static ur_result_t SetKernelParamsAndLaunch( size_t RequiredWGSize[3] = {0, 0, 0}; size_t *LocalSize = nullptr; - size_t *GlobalOffset = nullptr; - - if (NDRDesc.hasOffset()) - GlobalOffset = &NDRDesc.GlobalOffset[0]; if (HasLocalSize) LocalSize = &NDRDesc.LocalSize[0]; @@ -2482,9 +2478,9 @@ static ur_result_t SetKernelParamsAndLaunch( ur_event_handle_t UREvent = nullptr; ur_result_t Error = Adapter->call_nocheck( - Queue->getHandleRef(), Kernel, NDRDesc.Dims, GlobalOffset, - &NDRDesc.GlobalSize[0], LocalSize, property_list.size(), - property_list.data(), RawEvents.size(), + Queue->getHandleRef(), Kernel, NDRDesc.Dims, + &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, + property_list.size(), property_list.data(), RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? &UREvent : nullptr); if ((Error == UR_RESULT_SUCCESS) && OutEventImpl) { @@ -2501,7 +2497,7 @@ static ur_result_t SetKernelParamsAndLaunch( Args...); } return Adapter->call_nocheck(Args...); - }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, GlobalOffset, + }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? &UREvent : nullptr); @@ -2609,10 +2605,6 @@ ur_result_t enqueueImpCommandBufferKernel( size_t RequiredWGSize[3] = {0, 0, 0}; size_t *LocalSize = nullptr; - size_t *GlobalOffset = nullptr; - - if (NDRDesc.hasOffset()) - GlobalOffset = &NDRDesc.GlobalOffset[0]; if (HasLocalSize) LocalSize = &NDRDesc.LocalSize[0]; @@ -2641,7 +2633,7 @@ ur_result_t enqueueImpCommandBufferKernel( ur_result_t Res = Adapter->call_nocheck( - CommandBuffer, UrKernel, NDRDesc.Dims, GlobalOffset, + CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(), AltUrKernels.size() ? AltUrKernels.data() : nullptr, SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0, From 69306718fa5234b3f97d50971fada1c75ff7deaa Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 30 Apr 2025 12:50:14 +0200 Subject: [PATCH 5/5] [UR][L0] Treat {0, 0, 0} as NULL offset Signed-off-by: John Pennycook --- .../adapters/level_zero/v2/command_list_manager.cpp | 10 ++++++++++ .../level_zero/v2/queue_immediate_in_order.cpp | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index ddac491039b0f..d6f865d80b5c3 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -219,6 +219,16 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( waitListView.clear(); }; + // If the offset is {0, 0, 0}, pass NULL instead. + // This allows us to skip setting the offset. + bool hasOffset = false; + for (uint32_t i = 0; i < workDim; ++i) { + hasOffset |= pGlobalWorkOffset[i]; + } + if (!hasOffset) { + pGlobalWorkOffset = NULL; + } + UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset, workDim, WG[0], WG[1], WG[2], memoryMigrate)); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 3e78772e7c3b3..286623277dc47 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -809,6 +809,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( waitListView.clear(); }; + // If the offset is {0, 0, 0}, pass NULL instead. + // This allows us to skip setting the offset. + bool hasOffset = false; + for (uint32_t i = 0; i < workDim; ++i) { + hasOffset |= pGlobalWorkOffset[i]; + } + if (!hasOffset) { + pGlobalWorkOffset = NULL; + } + UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, workDim, WG[0], WG[1], WG[2], memoryMigrate));