Make the HIP adapter use complex subgroup size calculation

hdelan · hdelan · commit 3e2effd60d6f · 2024-03-14T16:11:01.000Z
The HIP adapter was only finding a good sg size in the X dim. This
    changes it so that it now chooses a sg size that divides the global dim
    in X, Y and Z dimensions. It also chooses a power of 2 sg size in the X
    dim, which is the same that the CUDA adapter does. This may give some
    performance improvements.
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
@@ -16,6 +16,8 @@
 #include "memory.hpp"
 #include "queue.hpp"
 
+#include <ur/ur.hpp>
+
 extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
 
 ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,31 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
   }
 }
 
-void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
-                              const size_t *GlobalWorkSize,
-                              const size_t MaxThreadsPerBlock[3],
-                              ur_kernel_handle_t Kernel) {
+// Determine local work sizes that result in uniform work groups.
+// The default threadsPerBlock only require handling the first work_dim
+// dimension.
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3],
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
   assert(Kernel != nullptr);
 
-  std::ignore = Kernel;
+  // FIXME: The below assumes a three dimensional range but this is not
+  // guaranteed by UR.
+  size_t GlobalSizeNormalized[3] = {1, 1, 1};
+  for (uint32_t i = 0; i < WorkDim; i++) {
+    GlobalSizeNormalized[i] = GlobalWorkSize[i];
+  }
 
-  ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
+  size_t MaxBlockDim[3];
+  MaxBlockDim[0] = MaxThreadsPerBlock[0];
+  MaxBlockDim[1] = Device->getMaxBlockDimY();
+  MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
-    --ThreadsPerBlock[0];
-  }
+  roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+                                       MaxBlockDim, MaxThreadsPerBlock[0]);
 }
 
 namespace {
@@ -1793,8 +1803,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
             return err;
         }
       } else {
-        simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
-                                 MaxThreadsPerBlock, Kernel);
+        guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
+                           MaxThreadsPerBlock, Kernel);
       }
     }
 
diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
@@ -329,7 +329,8 @@ template <typename T> class Result {
 static inline void
 roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
                                  const size_t GlobalWorkSizeInDim) {
-  while (GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
+  while (ThreadsPerBlockInDim > 1 &&
+         GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
     --ThreadsPerBlockInDim;
   }
 }
@@ -349,18 +350,20 @@ template <typename T> inline bool isPowerOf2(const T &Value) {
 static inline void roundToHighestFactorOfGlobalSizeIn3d(
     size_t *ThreadsPerBlock, const size_t *GlobalSize,
     const size_t *MaxBlockDim, const size_t MaxBlockSize) {
-  ThreadsPerBlock[2] = std::min(GlobalSize[2], MaxBlockDim[2]);
-  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
+  ThreadsPerBlock[0] = std::min(GlobalSize[0], MaxBlockDim[0]);
+  // Make the X dim a factor of 2
+  do {
+    roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
+  } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
+           --ThreadsPerBlock[0]);
 
   ThreadsPerBlock[1] =
       std::min(GlobalSize[1],
-               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
+               std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
   roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
 
-  ThreadsPerBlock[0] = std::min(
-      GlobalSize[0], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]));
-  // Make the X dim a factor of 2
-  do {
-    roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
-  } while (!isPowerOf2(ThreadsPerBlock[0]));
+  ThreadsPerBlock[2] = std::min(
+      GlobalSize[2], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
+
 }
diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
@@ -77,53 +77,94 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) {
                      UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 }
 
-struct urEnqueueKernelLaunch2DTest : uur::urKernelExecutionTest {
-    void SetUp() override {
-        program_name = "fill_2d";
-        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
-    }
-
-    uint32_t val = 42;
-    size_t global_size[2] = {8, 8};
-    size_t global_offset[2] = {0, 0};
-    size_t buffer_size = sizeof(val) * global_size[0] * global_size[1];
-    size_t n_dimensions = 2;
+struct testParametersEnqueueKernel {
+    size_t X, Y, Z;
+    size_t Dims;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch2DTest);
 
-TEST_P(urEnqueueKernelLaunch2DTest, Success) {
-    ur_mem_handle_t buffer = nullptr;
-    AddBuffer1DArg(buffer_size, &buffer);
-    AddPodArg(val);
-    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
-                                         global_offset, global_size, nullptr, 0,
-                                         nullptr, nullptr));
-    ASSERT_SUCCESS(urQueueFinish(queue));
-    ValidateBuffer(buffer, buffer_size, val);
+template <typename T>
+inline std::string printKernelLaunchTestString(
+    const testing::TestParamInfo<typename T::ParamType> &info) {
+    const auto device_handle = std::get<0>(info.param);
+    const auto platform_device_name =
+        uur::GetPlatformAndDeviceName(device_handle);
+    std::stringstream test_name;
+    test_name << platform_device_name << "__"
+              << std::get<1>(info.param).Dims << "D_"
+              << std::get<1>(info.param).X;
+    if (std::get<1>(info.param).Dims > 1) {
+        test_name << "_" << std::get<1>(info.param).Y;
+    }
+    if (std::get<1>(info.param).Dims > 2) {
+        test_name << "_" << std::get<1>(info.param).Z;
+    }
+    test_name << "";
+    return test_name.str();
 }
 
-struct urEnqueueKernelLaunch3DTest : uur::urKernelExecutionTest {
+struct urEnqueueKernelLaunchTestWithParam
+    : uur::urBaseKernelExecutionTestWithParam<testParametersEnqueueKernel> {
     void SetUp() override {
-        program_name = "fill_3d";
-        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+        global_range[0] = std::get<1>(GetParam()).X;
+        global_range[1] = std::get<1>(GetParam()).Y;
+        global_range[2] = std::get<1>(GetParam()).Z;
+        buffer_size = sizeof(val) * global_range[0];
+        n_dimensions = std::get<1>(GetParam()).Dims;
+        if (n_dimensions == 1) {
+            program_name = "fill";
+        } else if (n_dimensions == 2) {
+            program_name = "fill_2d";
+            buffer_size *= global_range[1];
+        } else {
+            assert(n_dimensions == 3);
+            program_name = "fill_3d";
+            buffer_size *= global_range[1] * global_range[2];
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(
+            urBaseKernelExecutionTestWithParam::SetUp());
+    }
+
+    void TearDown() override {
+        UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam<
+                                    testParametersEnqueueKernel>::TearDown());
     }
 
     uint32_t val = 42;
-    size_t global_size[3] = {4, 4, 4};
+    size_t global_range[3];
     size_t global_offset[3] = {0, 0, 0};
-    size_t buffer_size =
-        sizeof(val) * global_size[0] * global_size[1] * global_size[2];
-    size_t n_dimensions = 3;
+    size_t n_dimensions;
+    size_t buffer_size;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch3DTest);
 
-TEST_P(urEnqueueKernelLaunch3DTest, Success) {
+static std::vector<testParametersEnqueueKernel> test_cases{// 1D
+                                                           {1, 1, 1, 1},
+                                                           {31, 1, 1, 1},
+                                                           {1027, 1, 1, 1},
+                                                           {32, 1, 1, 1},
+                                                           {256, 1, 1, 1},
+                                                           // 2D
+                                                           {1, 1, 1, 2},
+                                                           {31, 7, 1, 2},
+                                                           {1027, 1, 1, 2},
+                                                           {1, 32, 1, 2},
+                                                           {256, 79, 1, 2},
+                                                           // 3D
+                                                           {1, 1, 1, 3},
+                                                           {31, 7, 1, 3},
+                                                           {1027, 1, 19, 3},
+                                                           {1, 53, 19, 3},
+                                                           {256, 79, 8, 3}};
+UUR_TEST_SUITE_P(
+    urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases),
+    printKernelLaunchTestString<urEnqueueKernelLaunchTestWithParam>);
+
+TEST_P(urEnqueueKernelLaunchTestWithParam, Success) {
     ur_mem_handle_t buffer = nullptr;
     AddBuffer1DArg(buffer_size, &buffer);
     AddPodArg(val);
     ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
-                                         global_offset, global_size, nullptr, 0,
-                                         nullptr, nullptr));
+                                         global_offset, global_range, nullptr,
+                                         0, nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, buffer_size, val);
 }
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h
@@ -1274,6 +1274,109 @@ struct urBaseKernelExecutionTest : urBaseKernelTest {
     uint32_t current_arg_index = 0;
 };
 
+template <typename T>
+struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam<T> {
+    void SetUp() override {
+        UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::SetUp());
+        UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::Build());
+        context = urBaseKernelTestWithParam<T>::context;
+        kernel = urBaseKernelTestWithParam<T>::kernel;
+        ASSERT_SUCCESS(urQueueCreate(
+            context, urBaseKernelTestWithParam<T>::device, 0, &queue));
+    }
+
+    void TearDown() override {
+        for (auto &buffer : buffer_args) {
+            ASSERT_SUCCESS(urMemRelease(buffer));
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::TearDown());
+        if (queue) {
+            EXPECT_SUCCESS(urQueueRelease(queue));
+        }
+    }
+
+    // Adds a kernel arg representing a sycl buffer constructed with a 1D range.
+    void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) {
+        ur_mem_handle_t mem_handle = nullptr;
+        ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size,
+                                         nullptr, &mem_handle));
+        char zero = 0;
+        ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem_handle, &zero,
+                                              sizeof(zero), 0, size, 0, nullptr,
+                                              nullptr));
+        ASSERT_SUCCESS(urQueueFinish(queue));
+        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr,
+                                            mem_handle));
+
+        // SYCL device kernels have different interfaces depending on the
+        // backend being used. Typically a kernel which takes a buffer argument
+        // will take a pointer to the start of the buffer and a sycl::id param
+        // which is a struct that encodes the accessor to the buffer. However
+        // the AMD backend handles this differently and uses three separate
+        // arguments for each of the three dimensions of the accessor.
+
+        ur_platform_backend_t backend;
+        ASSERT_SUCCESS(urPlatformGetInfo(urBaseKernelTestWithParam<T>::platform,
+                                         UR_PLATFORM_INFO_BACKEND,
+                                         sizeof(backend), &backend, nullptr));
+        if (backend == UR_PLATFORM_BACKEND_HIP) {
+            // this emulates the three offset params for buffer accessor on AMD.
+            size_t val = 0;
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
+                                               sizeof(size_t), nullptr, &val));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2,
+                                               sizeof(size_t), nullptr, &val));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3,
+                                               sizeof(size_t), nullptr, &val));
+            current_arg_index += 4;
+        } else {
+            // This emulates the offset struct sycl adds for a 1D buffer accessor.
+            struct {
+                size_t offsets[1] = {0};
+            } accessor;
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
+                                               sizeof(accessor), nullptr,
+                                               &accessor));
+            current_arg_index += 2;
+        }
+
+        buffer_args.push_back(mem_handle);
+        *out_buffer = mem_handle;
+    }
+
+    template <class U> void AddPodArg(U data) {
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index,
+                                           sizeof(data), nullptr, &data));
+        current_arg_index++;
+    }
+
+    // Validate the contents of `buffer` according to the given validator.
+    template <class U>
+    void ValidateBuffer(ur_mem_handle_t buffer, size_t size,
+                        std::function<bool(U &)> validator) {
+        std::vector<U> read_buffer(size / sizeof(U));
+        ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, true, 0, size,
+                                              read_buffer.data(), 0, nullptr,
+                                              nullptr));
+        ASSERT_TRUE(
+            std::all_of(read_buffer.begin(), read_buffer.end(), validator));
+    }
+
+    // Helper that uses the generic validate function to check for a given value.
+    template <class U>
+    void ValidateBuffer(ur_mem_handle_t buffer, size_t size, U value) {
+        auto validator = [&value](U result) -> bool { return result == value; };
+
+        ValidateBuffer<U>(buffer, size, validator);
+    }
+
+    std::vector<ur_mem_handle_t> buffer_args;
+    uint32_t current_arg_index = 0;
+    ur_context_handle_t context;
+    ur_kernel_handle_t kernel;
+    ur_queue_handle_t queue;
+};
+
 struct urKernelExecutionTest : urBaseKernelExecutionTest {
     void SetUp() {
         UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelExecutionTest::SetUp());