diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 5046f4c865..f21f5b00ee 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -169,18 +169,10 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, ThreadsPerBlock[0] = std::min( MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0])); - static auto IsPowerOf2 = [](size_t Value) -> bool { - return Value && !(Value & (Value - 1)); - }; - // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. - // Additionally, for best compute utilisation, the local size has - // to be a power of two. - while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) || - !IsPowerOf2(ThreadsPerBlock[0])) { + while (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) --ThreadsPerBlock[0]; - } } // Helper to verify out-of-registers case (exceeded block max registers). diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 80ec41f984..ce048d05d0 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -51,13 +51,9 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream, void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock, const size_t *GlobalWorkSize, - const size_t MaxThreadsPerBlock[3], - ur_kernel_handle_t Kernel) { + const size_t MaxThreadsPerBlock[3]) { assert(ThreadsPerBlock != nullptr); assert(GlobalWorkSize != nullptr); - assert(Kernel != nullptr); - - std::ignore = Kernel; ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]); @@ -345,7 +341,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } else { simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize, - MaxThreadsPerBlock, hKernel); + MaxThreadsPerBlock); } }