Skip to content

Commit 56e05af

Browse files
authored
[SYCL][CUDA] Improve function to guess local work size more efficiently. (#9787)
* The `threadsPerBlock` values computed by `guessLocalWorkSize` are not optimal. In particular, the `threadsPerBlock` values for `Y` and `Z` were much lower than the possible values. * When the Y/Z values of the range are prime, very poor performance is observed, as shown in the associated [issue](#8018). * This PR computes `threadsPerBlock` for X/Y/Z so as to reduce the corresponding `BlocksPerGrid` values. * Below is the output of the code from the associated issue without the changes in this PR. Device = NVIDIA GeForce GTX 1050 Ti N, elapsed(ms) - 1009,4.61658 - 2003,45.6869 - 3001,67.5192 - 4001,88.1543 - 5003,111.338 - 6007,132.848 - 7001,154.697 - 8009,175.452 - 9001,196.237 - 10007,219.39 - 1000,4.59423 - 2000,4.61525 - 3000,4.61935 - 4000,4.62526 - 5000,4.64623 - 6000,4.78904 - 7000,8.92251 - 8000,8.97263 - 9000,9.06992 - 10000,9.03802 * And below is the output with this PR's updates. Device = NVIDIA GeForce GTX 1050 Ti N, elapsed(ms) - 1009,4.58252 - 2003,4.60139 - 3001,3.47269 - 4001,3.62314 - 5003,4.15179 - 6007,7.07976 - 7001,7.49027 - 8009,8.00097 - 9001,9.08756 - 10007,8.0005 - 1000,4.56335 - 2000,4.60376 - 3000,4.76395 - 4000,4.63283 - 5000,4.64732 - 6000,4.63936 - 7000,8.97499 - 8000,8.9941 - 9000,9.01531 - 10000,9.00935
1 parent b7f09d8 commit 56e05af

File tree

1 file changed

+37
-11
lines changed

1 file changed

+37
-11
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <algorithm>
2020
#include <cassert>
2121
#include <chrono>
22+
#include <cmath>
2223
#include <cuda.h>
2324
#include <cuda_device_runtime_api.h>
2425
#include <limits>
@@ -305,25 +306,49 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock,
305306
assert(threadsPerBlock != nullptr);
306307
assert(global_work_size != nullptr);
307308
assert(kernel != nullptr);
308-
int minGrid, maxBlockSize, gridDim[3];
309+
int minGrid, maxBlockSize, maxBlockDim[3];
309310

310-
cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
311+
static auto isPrime = [](size_t number) -> bool {
312+
auto lastNumToCheck = ceil(sqrt(number));
313+
if (number < 2)
314+
return false;
315+
if (number == 2)
316+
return true;
317+
if (number % 2 == 0)
318+
return false;
319+
for (int i = 3; i <= lastNumToCheck; i += 2) {
320+
if (number % i == 0)
321+
return false;
322+
}
323+
return true;
324+
};
325+
326+
cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
311327
device->get());
312-
cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
328+
cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
313329
device->get());
314330

315-
threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1;
316-
threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1;
317-
318331
PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
319332
&minGrid, &maxBlockSize, kernel->get(), NULL, local_size,
320333
maxThreadsPerBlock[0]));
321334

322-
gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]);
323-
335+
threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2]));
336+
threadsPerBlock[1] =
337+
std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2],
338+
size_t(maxBlockDim[1])));
339+
maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]);
324340
threadsPerBlock[0] =
325341
std::min(maxThreadsPerBlock[0],
326-
std::min(global_work_size[0], static_cast<size_t>(gridDim[0])));
342+
std::min(global_work_size[0], size_t(maxBlockDim[0])));
343+
344+
// When global_work_size[0] is prime, threadsPerBlock[0] will later be
345+
// computed as 1, which is not an efficient configuration. In such cases we
346+
// use global_work_size[0] + 1 to compute threadsPerBlock[0].
347+
int adjusted_0_dim_global_work_size =
348+
(isPrime(global_work_size[0]) &&
349+
(threadsPerBlock[0] != global_work_size[0]))
350+
? global_work_size[0] + 1
351+
: global_work_size[0];
327352

328353
static auto isPowerOf2 = [](size_t value) -> bool {
329354
return value && !(value & (value - 1));
@@ -333,7 +358,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock,
333358
// work group size to produce uniform work groups.
334359
// Additionally, for best compute utilisation, the local size has
335360
// to be a power of two.
336-
while (0u != (global_work_size[0] % threadsPerBlock[0]) ||
361+
while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) ||
337362
!isPowerOf2(threadsPerBlock[0])) {
338363
--threadsPerBlock[0];
339364
}
@@ -2161,7 +2186,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
21612186
cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) ==
21622187
CUDA_SUCCESS);
21632188
// CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written
2164-
sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12);
2189+
sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) ==
2190+
12);
21652191
return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1,
21662192
param_value_size, param_value, param_value_size_ret,
21672193
AddressBuffer);

0 commit comments

Comments
 (0)