Merge pull request #1326 from hdelan/refactor-guess-local-worksize

kbenzie · web-flow · commit ed1f8bf618c8 · 2024-03-19T21:00:20.000Z
[CUDA][HIP] Fix bug in guess local worksize funcs and improve local worksize guessing in HIP adapter
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -18,6 +18,7 @@
 
 #include <cmath>
 #include <cuda.h>
+#include <ur/ur.hpp>
 
 ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
                               uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
                         const size_t *GlobalWorkSize, const uint32_t WorkDim,
                         const size_t MaxThreadsPerBlock[3],
-                        ur_kernel_handle_t Kernel, uint32_t LocalSize) {
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
   assert(Kernel != nullptr);
-  int MinGrid, MaxBlockSize;
-  size_t MaxBlockDim[3];
 
   // The below assumes a three dimensional range but this is not guaranteed by
   // UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
     GlobalSizeNormalized[i] = GlobalWorkSize[i];
   }
 
+  size_t MaxBlockDim[3];
+  MaxBlockDim[0] = MaxThreadsPerBlock[0];
   MaxBlockDim[1] = Device->getMaxBlockDimY();
   MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
-  UR_CHECK_ERROR(
-      cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
-                                       NULL, LocalSize, MaxThreadsPerBlock[0]));
-
-  ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
-  ThreadsPerBlock[1] =
-      std::min(GlobalSizeNormalized[1],
-               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
-  MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
-  ThreadsPerBlock[0] = std::min(
-      MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
-
-  static auto IsPowerOf2 = [](size_t Value) -> bool {
-    return Value && !(Value & (Value - 1));
-  };
-
-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  // Additionally, for best compute utilisation, the local size has
-  // to be a power of two.
-  while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
-         !IsPowerOf2(ThreadsPerBlock[0])) {
-    --ThreadsPerBlock[0];
-  }
+  int MinGrid, MaxBlockSize;
+  UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
+      &MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
+      MaxThreadsPerBlock[0]));
+
+  roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+                                       MaxBlockDim, MaxBlockSize);
 }
 
 // Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
         }
       } else {
         guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
-                           MaxThreadsPerBlock, Kernel, LocalSize);
+                           MaxThreadsPerBlock, Kernel);
       }
     }
 
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
@@ -16,6 +16,8 @@
 #include "memory.hpp"
 #include "queue.hpp"
 
+#include <ur/ur.hpp>
+
 extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
 
 ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,29 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
   }
 }
 
-void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
-                              const size_t *GlobalWorkSize,
-                              const size_t MaxThreadsPerBlock[3],
-                              ur_kernel_handle_t Kernel) {
+// Determine local work sizes that result in uniform work groups.
+// The default threadsPerBlock only requires handling the first work_dim
+// dimensions.
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3]) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
-  assert(Kernel != nullptr);
 
-  std::ignore = Kernel;
+  // FIXME: The below assumes a three dimensional range but this is not
+  // guaranteed by UR.
+  size_t GlobalSizeNormalized[3] = {1, 1, 1};
+  for (uint32_t i = 0; i < WorkDim; i++) {
+    GlobalSizeNormalized[i] = GlobalWorkSize[i];
+  }
 
-  ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
+  size_t MaxBlockDim[3];
+  MaxBlockDim[0] = MaxThreadsPerBlock[0];
+  MaxBlockDim[1] = Device->getMaxBlockDimY();
+  MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
-    --ThreadsPerBlock[0];
-  }
+  roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+                                       MaxBlockDim, MaxThreadsPerBlock[0]);
 }
 
 namespace {
@@ -1786,8 +1794,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
             return err;
         }
       } else {
-        simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
-                                 MaxThreadsPerBlock, Kernel);
+        guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
+                           MaxThreadsPerBlock);
       }
     }
 
diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
@@ -321,3 +321,56 @@ template <typename T> class Result {
 private:
   std::variant<ur_result_t, T> value_or_err;
 };
+
+// Helper that shrinks one dim's block size until it divides the global size.
+//
+// In/Out: ThreadsPerBlockInDim - The dimension of workgroup in some dimension
+// In:     GlobalWorkSizeInDim  - The global size in some dimension
+static inline void
+roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
+                                 const size_t GlobalWorkSizeInDim) {
+  while (ThreadsPerBlockInDim > 1 &&
+         GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
+    --ThreadsPerBlockInDim;
+  }
+}
+
+// Returns whether or not Value is a power of 2
+template <typename T> inline bool isPowerOf2(const T &Value) {
+  return Value && !(Value & (Value - 1));
+}
+
+// Helper to make sure each x, y, z dim divide the global dimension.
+// Additionally the inner dimension is made a power of 2 unless it is <= 32
+//
+// In/Out: ThreadsPerBlock      - The size of wg in 3d
+// In:     GlobalSize           - The global size in 3d (if dim < 3 then outer
+//                                                       dims == 1)
+// In:     MaxBlockDim          - The max size of block in 3d
+// In:     MaxBlockSize         - The max total size of block in all dimensions
+// Note:   There is no WorkDim parameter; unused dims must be passed as 1
+static inline void roundToHighestFactorOfGlobalSizeIn3d(
+    size_t *ThreadsPerBlock, const size_t *GlobalSize,
+    const size_t *MaxBlockDim, const size_t MaxBlockSize) {
+  assert(GlobalSize[0] && "GlobalSize[0] cannot be zero");
+  assert(GlobalSize[1] && "GlobalSize[1] cannot be zero");
+  assert(GlobalSize[2] && "GlobalSize[2] cannot be zero");
+
+  ThreadsPerBlock[0] =
+      std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0]));
+  do {
+    roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
+  } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
+           --ThreadsPerBlock[0]);
+
+  ThreadsPerBlock[1] =
+      std::min(GlobalSize[1],
+               std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
+
+  ThreadsPerBlock[2] = std::min(
+      GlobalSize[2],
+      std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
+               MaxBlockDim[2]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
+}
diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
@@ -77,53 +77,93 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) {
                      UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 }
 
-struct urEnqueueKernelLaunch2DTest : uur::urKernelExecutionTest {
-    void SetUp() override {
-        program_name = "fill_2d";
-        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
-    }
-
-    uint32_t val = 42;
-    size_t global_size[2] = {8, 8};
-    size_t global_offset[2] = {0, 0};
-    size_t buffer_size = sizeof(val) * global_size[0] * global_size[1];
-    size_t n_dimensions = 2;
+struct testParametersEnqueueKernel {
+    size_t X, Y, Z;
+    size_t Dims;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch2DTest);
 
-TEST_P(urEnqueueKernelLaunch2DTest, Success) {
-    ur_mem_handle_t buffer = nullptr;
-    AddBuffer1DArg(buffer_size, &buffer);
-    AddPodArg(val);
-    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
-                                         global_offset, global_size, nullptr, 0,
-                                         nullptr, nullptr));
-    ASSERT_SUCCESS(urQueueFinish(queue));
-    ValidateBuffer(buffer, buffer_size, val);
+template <typename T>
+inline std::string printKernelLaunchTestString(
+    const testing::TestParamInfo<typename T::ParamType> &info) {
+    const auto device_handle = std::get<0>(info.param);
+    const auto platform_device_name =
+        uur::GetPlatformAndDeviceName(device_handle);
+    std::stringstream test_name;
+    test_name << platform_device_name << "__" << std::get<1>(info.param).Dims
+              << "D_" << std::get<1>(info.param).X;
+    if (std::get<1>(info.param).Dims > 1) {
+        test_name << "_" << std::get<1>(info.param).Y;
+    }
+    if (std::get<1>(info.param).Dims > 2) {
+        test_name << "_" << std::get<1>(info.param).Z;
+    }
+    test_name << "";
+    return test_name.str();
 }
 
-struct urEnqueueKernelLaunch3DTest : uur::urKernelExecutionTest {
+struct urEnqueueKernelLaunchTestWithParam
+    : uur::urBaseKernelExecutionTestWithParam<testParametersEnqueueKernel> {
     void SetUp() override {
-        program_name = "fill_3d";
-        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+        global_range[0] = std::get<1>(GetParam()).X;
+        global_range[1] = std::get<1>(GetParam()).Y;
+        global_range[2] = std::get<1>(GetParam()).Z;
+        buffer_size = sizeof(val) * global_range[0];
+        n_dimensions = std::get<1>(GetParam()).Dims;
+        if (n_dimensions == 1) {
+            program_name = "fill";
+        } else if (n_dimensions == 2) {
+            program_name = "fill_2d";
+            buffer_size *= global_range[1];
+        } else {
+            assert(n_dimensions == 3);
+            program_name = "fill_3d";
+            buffer_size *= global_range[1] * global_range[2];
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(
+            urBaseKernelExecutionTestWithParam::SetUp());
+    }
+
+    void TearDown() override {
+        UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam<
+                                    testParametersEnqueueKernel>::TearDown());
     }
 
     uint32_t val = 42;
-    size_t global_size[3] = {4, 4, 4};
+    size_t global_range[3];
     size_t global_offset[3] = {0, 0, 0};
-    size_t buffer_size =
-        sizeof(val) * global_size[0] * global_size[1] * global_size[2];
-    size_t n_dimensions = 3;
+    size_t n_dimensions;
+    size_t buffer_size;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch3DTest);
 
-TEST_P(urEnqueueKernelLaunch3DTest, Success) {
+static std::vector<testParametersEnqueueKernel> test_cases{// 1D
+                                                           {1, 1, 1, 1},
+                                                           {31, 1, 1, 1},
+                                                           {1027, 1, 1, 1},
+                                                           {32, 1, 1, 1},
+                                                           {256, 1, 1, 1},
+                                                           // 2D
+                                                           {1, 1, 1, 2},
+                                                           {31, 7, 1, 2},
+                                                           {1027, 1, 1, 2},
+                                                           {1, 32, 1, 2},
+                                                           {256, 79, 1, 2},
+                                                           // 3D
+                                                           {1, 1, 1, 3},
+                                                           {31, 7, 1, 3},
+                                                           {1027, 1, 19, 3},
+                                                           {1, 53, 19, 3},
+                                                           {256, 79, 8, 3}};
+UUR_TEST_SUITE_P(
+    urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases),
+    printKernelLaunchTestString<urEnqueueKernelLaunchTestWithParam>);
+
+TEST_P(urEnqueueKernelLaunchTestWithParam, Success) {
     ur_mem_handle_t buffer = nullptr;
     AddBuffer1DArg(buffer_size, &buffer);
     AddPodArg(val);
     ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
-                                         global_offset, global_size, nullptr, 0,
-                                         nullptr, nullptr));
+                                         global_offset, global_range, nullptr,
+                                         0, nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, buffer_size, val);
 }
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h