Skip to content

Commit ed1f8bf

Browse files
authored
Merge pull request #1326 from hdelan/refactor-guess-local-worksize
[CUDA][HIP] Fix bug in guess local worksize funcs and improve local worksize guessing in HIP adapter
2 parents ca5c342 + 69c43b4 commit ed1f8bf

File tree

5 files changed

+262
-74
lines changed

5 files changed

+262
-74
lines changed

source/adapters/cuda/enqueue.cpp

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <cmath>
2020
#include <cuda.h>
21+
#include <ur/ur.hpp>
2122

2223
ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
2324
uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
140141
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
141142
const size_t *GlobalWorkSize, const uint32_t WorkDim,
142143
const size_t MaxThreadsPerBlock[3],
143-
ur_kernel_handle_t Kernel, uint32_t LocalSize) {
144+
ur_kernel_handle_t Kernel) {
144145
assert(ThreadsPerBlock != nullptr);
145146
assert(GlobalWorkSize != nullptr);
146147
assert(Kernel != nullptr);
147-
int MinGrid, MaxBlockSize;
148-
size_t MaxBlockDim[3];
149148

150149
// The below assumes a three dimensional range but this is not guaranteed by
151150
// UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
154153
GlobalSizeNormalized[i] = GlobalWorkSize[i];
155154
}
156155

156+
size_t MaxBlockDim[3];
157+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
157158
MaxBlockDim[1] = Device->getMaxBlockDimY();
158159
MaxBlockDim[2] = Device->getMaxBlockDimZ();
159160

160-
UR_CHECK_ERROR(
161-
cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
162-
NULL, LocalSize, MaxThreadsPerBlock[0]));
163-
164-
ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
165-
ThreadsPerBlock[1] =
166-
std::min(GlobalSizeNormalized[1],
167-
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
168-
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
169-
ThreadsPerBlock[0] = std::min(
170-
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
171-
172-
static auto IsPowerOf2 = [](size_t Value) -> bool {
173-
return Value && !(Value & (Value - 1));
174-
};
175-
176-
// Find a local work group size that is a divisor of the global
177-
// work group size to produce uniform work groups.
178-
// Additionally, for best compute utilisation, the local size has
179-
// to be a power of two.
180-
while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
181-
!IsPowerOf2(ThreadsPerBlock[0])) {
182-
--ThreadsPerBlock[0];
183-
}
161+
int MinGrid, MaxBlockSize;
162+
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
163+
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
164+
MaxThreadsPerBlock[0]));
165+
166+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
167+
MaxBlockDim, MaxBlockSize);
184168
}
185169

186170
// Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
261245
}
262246
} else {
263247
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
264-
MaxThreadsPerBlock, Kernel, LocalSize);
248+
MaxThreadsPerBlock, Kernel);
265249
}
266250
}
267251

source/adapters/hip/enqueue.cpp

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "memory.hpp"
1717
#include "queue.hpp"
1818

19+
#include <ur/ur.hpp>
20+
1921
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
2022

2123
ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,29 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
4850
}
4951
}
5052

51-
void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
52-
const size_t *GlobalWorkSize,
53-
const size_t MaxThreadsPerBlock[3],
54-
ur_kernel_handle_t Kernel) {
53+
// Determine local work sizes that result in uniform work groups.
54+
// The default threadsPerBlock only require handling the first work_dim
55+
// dimension.
56+
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
57+
const size_t *GlobalWorkSize, const uint32_t WorkDim,
58+
const size_t MaxThreadsPerBlock[3]) {
5559
assert(ThreadsPerBlock != nullptr);
5660
assert(GlobalWorkSize != nullptr);
57-
assert(Kernel != nullptr);
5861

59-
std::ignore = Kernel;
62+
// FIXME: The below assumes a three dimensional range but this is not
63+
// guaranteed by UR.
64+
size_t GlobalSizeNormalized[3] = {1, 1, 1};
65+
for (uint32_t i = 0; i < WorkDim; i++) {
66+
GlobalSizeNormalized[i] = GlobalWorkSize[i];
67+
}
6068

61-
ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
69+
size_t MaxBlockDim[3];
70+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
71+
MaxBlockDim[1] = Device->getMaxBlockDimY();
72+
MaxBlockDim[2] = Device->getMaxBlockDimZ();
6273

63-
// Find a local work group size that is a divisor of the global
64-
// work group size to produce uniform work groups.
65-
while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
66-
--ThreadsPerBlock[0];
67-
}
74+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
75+
MaxBlockDim, MaxThreadsPerBlock[0]);
6876
}
6977

7078
namespace {
@@ -1786,8 +1794,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
17861794
return err;
17871795
}
17881796
} else {
1789-
simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
1790-
MaxThreadsPerBlock, Kernel);
1797+
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
1798+
MaxThreadsPerBlock);
17911799
}
17921800
}
17931801

source/ur/ur.hpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,56 @@ template <typename T> class Result {
321321
private:
322322
std::variant<ur_result_t, T> value_or_err;
323323
};
324+
325+
// Helper that shrinks one work-group dimension until it evenly divides the global size in that dimension (or reaches 1).
326+
//
327+
// In/Out: ThreadsPerBlockInDim - The size of the workgroup in some dimension
328+
// In: GlobalWorkSizeInDim - The global size in some dimension
329+
static inline void
330+
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
331+
const size_t GlobalWorkSizeInDim) {
332+
while (ThreadsPerBlockInDim > 1 &&
333+
GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
334+
--ThreadsPerBlockInDim;
335+
}
336+
}
337+
338+
// Returns whether or not Value is a power of 2
339+
template <typename T> inline bool isPowerOf2(const T &Value) {
340+
return Value && !(Value & (Value - 1));
341+
}
342+
343+
// Helper to make sure each of the x, y, z work-group dims divides the corresponding global dimension.
344+
// Additionally it steers the inner dimension towards a power of 2: the result is either a power of 2 or a value no larger than 32
345+
//
346+
// In/Out: ThreadsPerBlock - The size of wg in 3d
347+
// In: GlobalSize - The global size in 3d (if dim < 3 then outer
348+
// dims == 1)
349+
// In: MaxBlockDim - The max size of block in 3d
350+
// In: MaxBlockSize - The max total size of block in all dimensions
351+
// Note: there is no WorkDim parameter; for fewer than 3 dims the unused GlobalSize entries are expected to be 1
352+
static inline void roundToHighestFactorOfGlobalSizeIn3d(
353+
size_t *ThreadsPerBlock, const size_t *GlobalSize,
354+
const size_t *MaxBlockDim, const size_t MaxBlockSize) {
355+
assert(GlobalSize[0] && "GlobalSize[0] cannot be zero");
356+
assert(GlobalSize[1] && "GlobalSize[1] cannot be zero");
357+
assert(GlobalSize[2] && "GlobalSize[2] cannot be zero");
358+
359+
ThreadsPerBlock[0] =
360+
std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0]));
361+
do {
362+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
363+
} while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
364+
--ThreadsPerBlock[0]);
365+
366+
ThreadsPerBlock[1] =
367+
std::min(GlobalSize[1],
368+
std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
369+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
370+
371+
ThreadsPerBlock[2] = std::min(
372+
GlobalSize[2],
373+
std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
374+
MaxBlockDim[2]));
375+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
376+
}

test/conformance/enqueue/urEnqueueKernelLaunch.cpp

Lines changed: 72 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -77,53 +77,93 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) {
7777
UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
7878
}
7979

80-
struct urEnqueueKernelLaunch2DTest : uur::urKernelExecutionTest {
81-
void SetUp() override {
82-
program_name = "fill_2d";
83-
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
84-
}
85-
86-
uint32_t val = 42;
87-
size_t global_size[2] = {8, 8};
88-
size_t global_offset[2] = {0, 0};
89-
size_t buffer_size = sizeof(val) * global_size[0] * global_size[1];
90-
size_t n_dimensions = 2;
80+
struct testParametersEnqueueKernel {
81+
size_t X, Y, Z;
82+
size_t Dims;
9183
};
92-
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch2DTest);
9384

94-
TEST_P(urEnqueueKernelLaunch2DTest, Success) {
95-
ur_mem_handle_t buffer = nullptr;
96-
AddBuffer1DArg(buffer_size, &buffer);
97-
AddPodArg(val);
98-
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
99-
global_offset, global_size, nullptr, 0,
100-
nullptr, nullptr));
101-
ASSERT_SUCCESS(urQueueFinish(queue));
102-
ValidateBuffer(buffer, buffer_size, val);
85+
template <typename T>
86+
inline std::string printKernelLaunchTestString(
87+
const testing::TestParamInfo<typename T::ParamType> &info) {
88+
const auto device_handle = std::get<0>(info.param);
89+
const auto platform_device_name =
90+
uur::GetPlatformAndDeviceName(device_handle);
91+
std::stringstream test_name;
92+
test_name << platform_device_name << "__" << std::get<1>(info.param).Dims
93+
<< "D_" << std::get<1>(info.param).X;
94+
if (std::get<1>(info.param).Dims > 1) {
95+
test_name << "_" << std::get<1>(info.param).Y;
96+
}
97+
if (std::get<1>(info.param).Dims > 2) {
98+
test_name << "_" << std::get<1>(info.param).Z;
99+
}
100+
test_name << "";
101+
return test_name.str();
103102
}
104103

105-
struct urEnqueueKernelLaunch3DTest : uur::urKernelExecutionTest {
104+
struct urEnqueueKernelLaunchTestWithParam
105+
: uur::urBaseKernelExecutionTestWithParam<testParametersEnqueueKernel> {
106106
void SetUp() override {
107-
program_name = "fill_3d";
108-
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
107+
global_range[0] = std::get<1>(GetParam()).X;
108+
global_range[1] = std::get<1>(GetParam()).Y;
109+
global_range[2] = std::get<1>(GetParam()).Z;
110+
buffer_size = sizeof(val) * global_range[0];
111+
n_dimensions = std::get<1>(GetParam()).Dims;
112+
if (n_dimensions == 1) {
113+
program_name = "fill";
114+
} else if (n_dimensions == 2) {
115+
program_name = "fill_2d";
116+
buffer_size *= global_range[1];
117+
} else {
118+
assert(n_dimensions == 3);
119+
program_name = "fill_3d";
120+
buffer_size *= global_range[1] * global_range[2];
121+
}
122+
UUR_RETURN_ON_FATAL_FAILURE(
123+
urBaseKernelExecutionTestWithParam::SetUp());
124+
}
125+
126+
void TearDown() override {
127+
UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam<
128+
testParametersEnqueueKernel>::TearDown());
109129
}
110130

111131
uint32_t val = 42;
112-
size_t global_size[3] = {4, 4, 4};
132+
size_t global_range[3];
113133
size_t global_offset[3] = {0, 0, 0};
114-
size_t buffer_size =
115-
sizeof(val) * global_size[0] * global_size[1] * global_size[2];
116-
size_t n_dimensions = 3;
134+
size_t n_dimensions;
135+
size_t buffer_size;
117136
};
118-
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch3DTest);
119137

120-
TEST_P(urEnqueueKernelLaunch3DTest, Success) {
138+
static std::vector<testParametersEnqueueKernel> test_cases{// 1D
139+
{1, 1, 1, 1},
140+
{31, 1, 1, 1},
141+
{1027, 1, 1, 1},
142+
{32, 1, 1, 1},
143+
{256, 1, 1, 1},
144+
// 2D
145+
{1, 1, 1, 2},
146+
{31, 7, 1, 2},
147+
{1027, 1, 1, 2},
148+
{1, 32, 1, 2},
149+
{256, 79, 1, 2},
150+
// 3D
151+
{1, 1, 1, 3},
152+
{31, 7, 1, 3},
153+
{1027, 1, 19, 3},
154+
{1, 53, 19, 3},
155+
{256, 79, 8, 3}};
156+
UUR_TEST_SUITE_P(
157+
urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases),
158+
printKernelLaunchTestString<urEnqueueKernelLaunchTestWithParam>);
159+
160+
TEST_P(urEnqueueKernelLaunchTestWithParam, Success) {
121161
ur_mem_handle_t buffer = nullptr;
122162
AddBuffer1DArg(buffer_size, &buffer);
123163
AddPodArg(val);
124164
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
125-
global_offset, global_size, nullptr, 0,
126-
nullptr, nullptr));
165+
global_offset, global_range, nullptr,
166+
0, nullptr, nullptr));
127167
ASSERT_SUCCESS(urQueueFinish(queue));
128168
ValidateBuffer(buffer, buffer_size, val);
129169
}

0 commit comments

Comments
 (0)