Skip to content

Commit 56e05af

Browse files
authored
[SYCL][CUDA] Improve function to guess local work size more efficiently. (#9787)
* The `threadsPerBlock` values computed by `guessLocalWorkSize` are not optimal. In particular, the `threadsPerBlock` values for `Y` and `Z` were much lower than the possible values. * When the Y/Z values of the range are prime, very poor performance is observed, as shown in the associated [issue](#8018). * This PR computes `threadsPerBlock` for X/Y/Z so as to reduce the corresponding `BlocksPerGrid` values. * Below is the output of the code from the associated issue without the changes in this PR. Device = NVIDIA GeForce GTX 1050 Ti N, elapsed(ms) - 1009,4.61658 - 2003,45.6869 - 3001,67.5192 - 4001,88.1543 - 5003,111.338 - 6007,132.848 - 7001,154.697 - 8009,175.452 - 9001,196.237 - 10007,219.39 - 1000,4.59423 - 2000,4.61525 - 3000,4.61935 - 4000,4.62526 - 5000,4.64623 - 6000,4.78904 - 7000,8.92251 - 8000,8.97263 - 9000,9.06992 - 10000,9.03802 * And below is the output with this PR's updates. Device = NVIDIA GeForce GTX 1050 Ti N, elapsed(ms) - 1009,4.58252 - 2003,4.60139 - 3001,3.47269 - 4001,3.62314 - 5003,4.15179 - 6007,7.07976 - 7001,7.49027 - 8009,8.00097 - 9001,9.08756 - 10007,8.0005 - 1000,4.56335 - 2000,4.60376 - 3000,4.76395 - 4000,4.63283 - 5000,4.64732 - 6000,4.63936 - 7000,8.97499 - 8000,8.9941 - 9000,9.01531 - 10000,9.00935
1 parent b7f09d8 commit 56e05af

File tree

1 file changed

+37
-11
lines changed

1 file changed

+37
-11
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <algorithm>
2020
#include <cassert>
2121
#include <chrono>
22+
#include <cmath>
2223
#include <cuda.h>
2324
#include <cuda_device_runtime_api.h>
2425
#include <limits>
@@ -305,25 +306,49 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock,
305306
assert(threadsPerBlock != nullptr);
306307
assert(global_work_size != nullptr);
307308
assert(kernel != nullptr);
308-
int minGrid, maxBlockSize, gridDim[3];
309+
int minGrid, maxBlockSize, maxBlockDim[3];
309310

310-
cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
311+
static auto isPrime = [](size_t number) -> bool {
312+
auto lastNumToCheck = ceil(sqrt(number));
313+
if (number < 2)
314+
return false;
315+
if (number == 2)
316+
return true;
317+
if (number % 2 == 0)
318+
return false;
319+
for (int i = 3; i <= lastNumToCheck; i += 2) {
320+
if (number % i == 0)
321+
return false;
322+
}
323+
return true;
324+
};
325+
326+
cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
311327
device->get());
312-
cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
328+
cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
313329
device->get());
314330

315-
threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1;
316-
threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1;
317-
318331
PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
319332
&minGrid, &maxBlockSize, kernel->get(), NULL, local_size,
320333
maxThreadsPerBlock[0]));
321334

322-
gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]);
323-
335+
threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2]));
336+
threadsPerBlock[1] =
337+
std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2],
338+
size_t(maxBlockDim[1])));
339+
maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]);
324340
threadsPerBlock[0] =
325341
std::min(maxThreadsPerBlock[0],
326-
std::min(global_work_size[0], static_cast<size_t>(gridDim[0])));
342+
std::min(global_work_size[0], size_t(maxBlockDim[0])));
343+
344+
// When global_work_size[0] is prime, threadsPerBlock[0] will later be
345+
// computed as 1, which is not an efficient configuration. In such cases we
346+
// use global_work_size[0] + 1 to compute threadsPerBlock[0].
347+
int adjusted_0_dim_global_work_size =
348+
(isPrime(global_work_size[0]) &&
349+
(threadsPerBlock[0] != global_work_size[0]))
350+
? global_work_size[0] + 1
351+
: global_work_size[0];
327352

328353
static auto isPowerOf2 = [](size_t value) -> bool {
329354
return value && !(value & (value - 1));
@@ -333,7 +358,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock,
333358
// work group size to produce uniform work groups.
334359
// Additionally, for best compute utilisation, the local size has
335360
// to be a power of two.
336-
while (0u != (global_work_size[0] % threadsPerBlock[0]) ||
361+
while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) ||
337362
!isPowerOf2(threadsPerBlock[0])) {
338363
--threadsPerBlock[0];
339364
}
@@ -2161,7 +2186,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
21612186
cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) ==
21622187
CUDA_SUCCESS);
21632188
// CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written
2164-
sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12);
2189+
sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) ==
2190+
12);
21652191
return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1,
21662192
param_value_size, param_value, param_value_size_ret,
21672193
AddressBuffer);

0 commit comments

Comments
 (0)