Skip to content

Commit d666b95

Browse files
authored
[SYCL][L0][CUDA][HIP] Fix PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE queries (#8769)
Address kernel query `global_work_size` for L0, CUDA, HIP from `PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE` Fixes #8766 For instance (for X-dimension) L0: `maxGroupSizeX * maxGroupCountX` CUDA: `CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X`
1 parent ace0ac5 commit d666b95

File tree

4 files changed

+84
-11
lines changed

4 files changed

+84
-11
lines changed

sycl/plugins/cuda/pi_cuda.cpp

+38-1
Original file line numberDiff line numberDiff line change
@@ -2957,6 +2957,43 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
29572957
if (kernel != nullptr) {
29582958

29592959
switch (param_name) {
2960+
case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
2961+
size_t global_work_size[3] = {0, 0, 0};
2962+
2963+
int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
2964+
sycl::detail::pi::assertion(
2965+
cuDeviceGetAttribute(&max_block_dimX,
2966+
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
2967+
device->get()) == CUDA_SUCCESS);
2968+
sycl::detail::pi::assertion(
2969+
cuDeviceGetAttribute(&max_block_dimY,
2970+
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
2971+
device->get()) == CUDA_SUCCESS);
2972+
sycl::detail::pi::assertion(
2973+
cuDeviceGetAttribute(&max_block_dimZ,
2974+
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
2975+
device->get()) == CUDA_SUCCESS);
2976+
2977+
int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
2978+
sycl::detail::pi::assertion(
2979+
cuDeviceGetAttribute(&max_grid_dimX,
2980+
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
2981+
device->get()) == CUDA_SUCCESS);
2982+
sycl::detail::pi::assertion(
2983+
cuDeviceGetAttribute(&max_grid_dimY,
2984+
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
2985+
device->get()) == CUDA_SUCCESS);
2986+
sycl::detail::pi::assertion(
2987+
cuDeviceGetAttribute(&max_grid_dimZ,
2988+
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
2989+
device->get()) == CUDA_SUCCESS);
2990+
2991+
global_work_size[0] = max_block_dimX * max_grid_dimX;
2992+
global_work_size[1] = max_block_dimY * max_grid_dimY;
2993+
global_work_size[2] = max_block_dimZ * max_grid_dimZ;
2994+
return getInfoArray(3, param_value_size, param_value,
2995+
param_value_size_ret, global_work_size);
2996+
}
29602997
case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
29612998
int max_threads = 0;
29622999
sycl::detail::pi::assertion(
@@ -5629,7 +5666,7 @@ pi_result cuda_piextEnqueueWriteHostPipe(
56295666
// Windows: dynamically loaded plugins might have been unloaded already
56305667
// when this is called. Sycl RT holds onto the PI plugin so it can be
56315668
// called safely. But this is not transitive. If the PI plugin in turn
5632-
// dynamically loaded a different DLL, that may have been unloaded.
5669+
// dynamically loaded a different DLL, that may have been unloaded.
56335670
// TODO: add a global variable lifetime management code here (see
56345671
// pi_level_zero.cpp for reference) Currently this is just a NOOP.
56355672
pi_result cuda_piTearDown(void *) {

sycl/plugins/hip/pi_hip.cpp

+32-1
Original file line numberDiff line numberDiff line change
@@ -3603,6 +3603,37 @@ pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
36033603
if (kernel != nullptr) {
36043604

36053605
switch (param_name) {
3606+
case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
3607+
size_t global_work_size[3] = {0, 0, 0};
3608+
3609+
int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
3610+
sycl::detail::pi::assertion(
3611+
hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX,
3612+
device->get()) == hipSuccess);
3613+
sycl::detail::pi::assertion(
3614+
hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY,
3615+
device->get()) == hipSuccess);
3616+
sycl::detail::pi::assertion(
3617+
hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ,
3618+
device->get()) == hipSuccess);
3619+
3620+
int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
3621+
sycl::detail::pi::assertion(
3622+
hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX,
3623+
device->get()) == hipSuccess);
3624+
sycl::detail::pi::assertion(
3625+
hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY,
3626+
device->get()) == hipSuccess);
3627+
sycl::detail::pi::assertion(
3628+
hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ,
3629+
device->get()) == hipSuccess);
3630+
3631+
global_work_size[0] = max_block_dimX * max_grid_dimX;
3632+
global_work_size[1] = max_block_dimY * max_grid_dimY;
3633+
global_work_size[2] = max_block_dimZ * max_grid_dimZ;
3634+
return getInfoArray(3, param_value_size, param_value,
3635+
param_value_size_ret, global_work_size);
3636+
}
36063637
case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
36073638
int max_threads = 0;
36083639
sycl::detail::pi::assertion(
@@ -5448,7 +5479,7 @@ pi_result hip_piextEnqueueWriteHostPipe(
54485479
// Windows: dynamically loaded plugins might have been unloaded already
54495480
// when this is called. Sycl RT holds onto the PI plugin so it can be
54505481
// called safely. But this is not transitive. If the PI plugin in turn
5451-
// dynamically loaded a different DLL, that may have been unloaded.
5482+
// dynamically loaded a different DLL, that may have been unloaded.
54525483
// TODO: add a global variable lifetime management code here (see
54535484
// pi_level_zero.cpp for reference) Currently this is just a NOOP.
54545485
pi_result hip_piTearDown(void *PluginParameter) {

sycl/plugins/level_zero/pi_level_zero.cpp

+7-5
Original file line numberDiff line numberDiff line change
@@ -4386,13 +4386,15 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
43864386
std::shared_lock<ur_shared_mutex> Guard(Kernel->Mutex);
43874387
switch (ParamName) {
43884388
case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
4389-
// TODO: To revisit after level_zero/issues/262 is resolved
43904389
struct {
43914390
size_t Arr[3];
4392-
} WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
4393-
Device->ZeDeviceComputeProperties->maxGroupSizeY,
4394-
Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
4395-
return ReturnValue(WorkSize);
4391+
} GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX *
4392+
Device->ZeDeviceComputeProperties->maxGroupCountX),
4393+
(Device->ZeDeviceComputeProperties->maxGroupSizeY *
4394+
Device->ZeDeviceComputeProperties->maxGroupCountY),
4395+
(Device->ZeDeviceComputeProperties->maxGroupSizeZ *
4396+
Device->ZeDeviceComputeProperties->maxGroupCountZ)}};
4397+
return ReturnValue(GlobalWorkSize);
43964398
}
43974399
case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
43984400
// As of right now, L0 is missing API to query kernel and device specific

sycl/test-e2e/Basic/kernel_info.cpp

+7-4
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
// Fail is flaky for level_zero, enable when fixed.
77
// UNSUPPORTED: level_zero
88
//
9-
// CUDA and HIP do not currently implement global_work_size
10-
// UNSUPPORTED: cuda, hip
119

1210
//==--- kernel_info.cpp - SYCL kernel info test ----------------------------==//
1311
//
@@ -68,9 +66,14 @@ int main() {
6866
assert(compileNumSg <= maxNumSg);
6967

7068
try {
69+
// To check (a) first if the kernel is device built-in, (b) then check if
70+
// the device type is custom
71+
if (!sycl::is_compatible({KernelID}, q.get_device())) {
72+
assert(dev.get_info<sycl::info::device::device_type>() ==
73+
sycl::info::device_type::custom);
74+
}
75+
7176
krn.get_info<sycl::info::kernel_device_specific::global_work_size>(dev);
72-
assert(dev.get_info<sycl::info::device::device_type>() ==
73-
sycl::info::device_type::custom);
7477
} catch (sycl::exception &e) {
7578
assert(e.code() == sycl::errc::invalid);
7679
}

0 commit comments

Comments
 (0)