Fix bug in CUDA range calculation

hdelan · hdelan · commit bed33ec6588b · 2024-02-23T09:32:38.000Z
A bug in the CUDA adapter was sometimes generating Y and Z ranges that did not divide the
global Y or Z dimension. This fixes that.

Also moves some helper functions into ur/ur.hpp that may be reused by other adapters.
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -18,6 +18,7 @@
 
 #include <cmath>
 #include <cuda.h>
+#include <ur/ur.hpp>
 
 ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
                               uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
                         const size_t *GlobalWorkSize, const uint32_t WorkDim,
                         const size_t MaxThreadsPerBlock[3],
-                        ur_kernel_handle_t Kernel, uint32_t LocalSize) {
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
   assert(Kernel != nullptr);
-  int MinGrid, MaxBlockSize;
-  size_t MaxBlockDim[3];
 
   // The below assumes a three dimensional range but this is not guaranteed by
   // UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
     GlobalSizeNormalized[i] = GlobalWorkSize[i];
   }
 
+  size_t MaxBlockDim[3];
+  MaxBlockDim[0] = MaxThreadsPerBlock[0];
   MaxBlockDim[1] = Device->getMaxBlockDimY();
   MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
-  UR_CHECK_ERROR(
-      cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
-                                       NULL, LocalSize, MaxThreadsPerBlock[0]));
-
-  ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
-  ThreadsPerBlock[1] =
-      std::min(GlobalSizeNormalized[1],
-               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
-  MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
-  ThreadsPerBlock[0] = std::min(
-      MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
-
-  static auto IsPowerOf2 = [](size_t Value) -> bool {
-    return Value && !(Value & (Value - 1));
-  };
-
-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  // Additionally, for best compute utilisation, the local size has
-  // to be a power of two.
-  while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
-         !IsPowerOf2(ThreadsPerBlock[0])) {
-    --ThreadsPerBlock[0];
-  }
+  int MinGrid, MaxBlockSize;
+  UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
+      &MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
+      MaxThreadsPerBlock[0]));
+
+  roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+                                       MaxBlockDim, MaxBlockSize);
 }
 
 // Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
         }
       } else {
         guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
-                           MaxThreadsPerBlock, Kernel, LocalSize);
+                           MaxThreadsPerBlock, Kernel);
       }
     }
 
diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
@@ -321,3 +321,46 @@ template <typename T> class Result {
 private:
   std::variant<ur_result_t, T> value_or_err;
 };
+
+// Rounds ThreadsPerBlockInDim (in/out, one dimension of the work-group) down
+// to the largest divisor of GlobalWorkSizeInDim (in, the global size in that
+// dimension). A zero work-group size is raised to 1 to avoid a modulo by zero.
+static inline void
+roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
+                                 const size_t GlobalWorkSizeInDim) {
+  if (ThreadsPerBlockInDim == 0)
+    ThreadsPerBlockInDim = 1;
+  while (GlobalWorkSizeInDim % ThreadsPerBlockInDim)
+    --ThreadsPerBlockInDim;
+}
+
+// True iff Value is non-zero and clearing its lowest set bit leaves nothing
+template <typename T> inline bool isPowerOf2(const T &Value) {
+  return Value != 0 && (Value & (Value - 1)) == 0;
+}
+
+// Picks a 3d work-group size whose x, y and z each divide the global size. Z
+// and Y are fitted first; X gets the rest, rounded down to a power of 2.
+//
+// In/Out: ThreadsPerBlock - The size of wg in 3d
+// In:     GlobalSize      - The global size in 3d (outer dims == 1 if dim < 3)
+// In:     MaxBlockDim     - The max size of block in 3d
+// In:     MaxBlockSize    - The max total size of block in all dimensions
+static inline void roundToHighestFactorOfGlobalSizeIn3d(
+    size_t *ThreadsPerBlock, const size_t *GlobalSize,
+    const size_t *MaxBlockDim, const size_t MaxBlockSize) {
+  ThreadsPerBlock[2] = std::min(GlobalSize[2], MaxBlockDim[2]);
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
+  ThreadsPerBlock[1] =
+      std::min(GlobalSize[1],
+               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
+  // X takes the leftover budget (>= 1 keeps the modulo defined), then steps
+  // down to the largest power of 2 dividing GlobalSize[0]; 1 always qualifies.
+  const size_t Rest = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
+  size_t &X = ThreadsPerBlock[0];
+  X = std::max<size_t>(1, std::min({GlobalSize[0], MaxBlockDim[0], Rest}));
+  while (GlobalSize[0] % X != 0 || !isPowerOf2(X)) {
+    --X;
+  }
+}