Add local worksize calculator for HIP adapter

hdelan · hdelan · commit 96c44daff896 · 2024-02-16T15:35:13.000Z
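Replace the HIP adapter's simpleGuessLocalWorkSize with a guessLocalWorkSize that mirrors the
CUDA adapter: it queries hipOccupancyMaxPotentialBlockSize and picks, in each dimension, a
local size that evenly divides the global size.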

Also drop the LocalSize parameter from the CUDA adapter's guessLocalWorkSize: the value is
available via Kernel->getLocalSize() on the ur_kernel_handle_t that is already passed in.
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -140,7 +140,7 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
                         const size_t *GlobalWorkSize, const uint32_t WorkDim,
                         const size_t MaxThreadsPerBlock[3],
-                        ur_kernel_handle_t Kernel, uint32_t LocalSize) {
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
   assert(Kernel != nullptr);
@@ -157,9 +157,9 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
   MaxBlockDim[1] = Device->getMaxBlockDimY();
   MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
-  UR_CHECK_ERROR(
-      cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
-                                       NULL, LocalSize, MaxThreadsPerBlock[0]));
+  UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
+      &MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
+      MaxThreadsPerBlock[0]));
 
   // Helper lambda to make sure each x, y, z dim divide the global dimension.
   // Can optionally specify that we want the wg size to be a power of 2 in a
@@ -266,7 +266,7 @@ setKernelParams(const ur_context_handle_t Context,
         }
       } else {
         guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
-                           MaxThreadsPerBlock, Kernel, LocalSize);
+                           MaxThreadsPerBlock, Kernel);
       }
     }
 
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
@@ -49,19 +49,59 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
   }
 }
 
-void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
-                              const size_t *GlobalWorkSize,
-                              const size_t MaxThreadsPerBlock[3]) {
+// Pick a local work size giving uniform work groups: ThreadsPerBlock[i] evenly
+// divides the global size in dim i. Writes all three ThreadsPerBlock entries;
+// reads only the first WorkDim entries of GlobalWorkSize (rest count as 1).
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3],
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
+  assert(Kernel != nullptr);
+  int MinGrid, MaxBlockSize;
+  size_t MaxBlockDim[3];
+  // UR does not guarantee a 3D range, so pad the unused dimensions with 1.
+  size_t GlobalSizeNormalized[3] = {1, 1, 1};
+  for (uint32_t i = 0; i < WorkDim; i++) {
+    GlobalSizeNormalized[i] = GlobalWorkSize[i];
+  }
 
-  ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
+  MaxBlockDim[1] = Device->getMaxBlockDimY();
+  MaxBlockDim[2] = Device->getMaxBlockDimZ();
+
+  UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
+      &MinGrid, &MaxBlockSize, Kernel->get(), Kernel->getLocalSize(),
+      MaxThreadsPerBlock[0]));
+  // Shrinks ThreadsPerBlockInDim until it divides GlobalWorkSizeInDim and, if
+  // MakePowerOfTwo is set, is also a power of two (used for the X dim below).
+  static auto roundToHighestFactorOfGlobalSize =
+      [](size_t &ThreadsPerBlockInDim, const size_t GlobalWorkSizeInDim,
+         bool MakePowerOfTwo) {
+        auto IsPowerOf2 = [](size_t Value) -> bool {
+          return Value && !(Value & (Value - 1));
+        };
+        while (GlobalWorkSizeInDim % ThreadsPerBlockInDim ||
+               (MakePowerOfTwo && !IsPowerOf2(ThreadsPerBlockInDim)))
+          --ThreadsPerBlockInDim;
+      };
 
-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
-    --ThreadsPerBlock[0];
-  }
+  // Use GlobalSizeNormalized below: GlobalWorkSize[1..2] may not exist. Z is
+  // also capped by MaxBlockSize so MaxBlockSize / ThreadsPerBlock[2] is >= 1.
+  MaxBlockDim[2] = std::min<size_t>(MaxBlockDim[2], MaxBlockSize);
+  ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSizeNormalized[2],
+                                   false);
+  ThreadsPerBlock[1] =
+      std::min(GlobalSizeNormalized[1],
+               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSizeNormalized[1],
+                                   false);
+  MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
+  ThreadsPerBlock[0] = std::min(
+      MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSizeNormalized[0],
+                                   true);
 }
 
 ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
@@ -340,8 +380,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
           return err;
       }
     } else {
-      simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize,
-                               MaxThreadsPerBlock);
+      guessLocalWorkSize(hQueue->getDevice(), ThreadsPerBlock, pGlobalWorkSize,
+                         workDim, MaxThreadsPerBlock, hKernel);
     }
   }