Commit 254fdeb

Add local worksize calculator for HIP adapter
Make the logic of the HIP adapter's guessLocalWorkSize function more sophisticated so that it mimics the CUDA adapter. Also remove the LocalSize parameter from the CUDA adapter's guessLocalWorkSize, since the local size is already available from the ur_kernel_handle_t that is passed to the function.
1 parent 4aceeda commit 254fdeb
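
The point of picking a local size that divides the global size is that the launch can then use an exact quotient for the grid dimensions, with no partially filled trailing work group. A minimal sketch of that arithmetic, using illustrative names rather than the adapter's own, and assuming the divisibility that guessLocalWorkSize arranges:

```cpp
#include <cstddef>

// Hypothetical helper for illustration only: with uniform work groups,
// ThreadsPerBlock[i] divides GlobalWorkSize[i] exactly, so the grid size is a
// plain quotient and every launched block is fully populated.
void computeBlocksPerGrid(const size_t GlobalWorkSize[3],
                          const size_t ThreadsPerBlock[3],
                          size_t BlocksPerGrid[3]) {
  for (int i = 0; i < 3; ++i)
    BlocksPerGrid[i] = GlobalWorkSize[i] / ThreadsPerBlock[i];
}
```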

2 files changed: 56 additions, 16 deletions

source/adapters/cuda/enqueue.cpp (5 additions, 5 deletions)

```diff
@@ -140,7 +140,7 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
                         const size_t *GlobalWorkSize, const uint32_t WorkDim,
                         const size_t MaxThreadsPerBlock[3],
-                        ur_kernel_handle_t Kernel, uint32_t LocalSize) {
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
   assert(Kernel != nullptr);
@@ -157,9 +157,9 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
   MaxBlockDim[1] = Device->getMaxBlockDimY();
   MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
-  UR_CHECK_ERROR(
-      cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
-                                       NULL, LocalSize, MaxThreadsPerBlock[0]));
+  UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
+      &MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
+      MaxThreadsPerBlock[0]));
 
   // Helper lambda to make sure each x, y, z dim divide the global dimension.
   // Can optionally specify that we want the wg size to be a power of 2 in a
@@ -266,7 +266,7 @@ setKernelParams(const ur_context_handle_t Context,
     }
   } else {
     guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
-                       MaxThreadsPerBlock, Kernel, LocalSize);
+                       MaxThreadsPerBlock, Kernel);
   }
 }
```
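
For context, cuOccupancyMaxPotentialBlockSize asks the driver for the block size that maximizes occupancy given the kernel's dynamic shared-memory usage, which is why the kernel's own local size (now queried via Kernel->getLocalSize() instead of a separate parameter) feeds directly into the call. A standalone sketch of the same idea using the CUDA runtime equivalent, cudaOccupancyMaxPotentialBlockSize, with a placeholder kernel and made-up sizes standing in for the adapter's values:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummyKernel(float *Out) { Out[threadIdx.x] = 0.0f; }

int main() {
  int MinGridSize = 0, BlockSize = 0;
  size_t DynamicSharedMem = 1024; // stand-in for Kernel->getLocalSize()
  int BlockSizeLimit = 1024;      // stand-in for MaxThreadsPerBlock[0]

  // Ask the runtime which block size maximizes occupancy for this kernel,
  // accounting for its dynamic shared-memory footprint.
  cudaError_t Err = cudaOccupancyMaxPotentialBlockSize(
      &MinGridSize, &BlockSize, dummyKernel, DynamicSharedMem, BlockSizeLimit);
  if (Err != cudaSuccess)
    return 1;
  std::printf("suggested block size: %d (min grid size: %d)\n", BlockSize,
              MinGridSize);
  return 0;
}
```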

source/adapters/hip/enqueue.cpp (51 additions, 11 deletions)

```diff
@@ -49,19 +49,59 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
   }
 }
 
-void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
-                              const size_t *GlobalWorkSize,
-                              const size_t MaxThreadsPerBlock[3]) {
+// Determine local work sizes that result in uniform work groups.
+// The default threadsPerBlock only require handling the first work_dim
+// dimension.
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3],
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
+  assert(Kernel != nullptr);
+  int MinGrid, MaxBlockSize;
+  size_t MaxBlockDim[3];
+
+  // The below assumes a three dimensional range but this is not guaranteed by
+  // UR.
+  size_t GlobalSizeNormalized[3] = {1, 1, 1};
+  for (uint32_t i = 0; i < WorkDim; i++) {
+    GlobalSizeNormalized[i] = GlobalWorkSize[i];
+  }
 
-  ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
+  MaxBlockDim[1] = Device->getMaxBlockDimY();
+  MaxBlockDim[2] = Device->getMaxBlockDimZ();
+
+  UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
+      &MinGrid, &MaxBlockSize, Kernel->get(), Kernel->getLocalSize(),
+      MaxThreadsPerBlock[0]));
+
+  // Helper lambda to make sure each x, y, z dim divide the global dimension.
+  // Can optionally specify that we want the wg size to be a power of 2 in a
+  // given dimension, which is useful for the X dim for performance reasons.
+  static auto roundToHighestFactorOfGlobalSize =
+      [](size_t &ThreadsPerBlockInDim, const size_t GlobalWorkSizeInDim,
+         bool MakePowerOfTwo) {
+        auto IsPowerOf2 = [](size_t Value) -> bool {
+          return Value && !(Value & (Value - 1));
+        };
+        while (GlobalWorkSizeInDim % ThreadsPerBlockInDim ||
+               (MakePowerOfTwo && !IsPowerOf2(ThreadsPerBlockInDim)))
+          --ThreadsPerBlockInDim;
+      };
 
-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
-    --ThreadsPerBlock[0];
-  }
+  ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2],
+                                   false);
+  ThreadsPerBlock[1] =
+      std::min(GlobalSizeNormalized[1],
+               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1],
+                                   false);
+  MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
+  ThreadsPerBlock[0] = std::min(
+      MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0], true);
 }
 
 ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
@@ -340,8 +380,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       return err;
     }
   } else {
-    simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize,
-                             MaxThreadsPerBlock);
+    guessLocalWorkSize(hQueue->getDevice(), ThreadsPerBlock, pGlobalWorkSize,
+                       workDim, MaxThreadsPerBlock, hKernel);
  }
 }
```
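
The resulting strategy fills the z dimension first, then y, then x, capping each by the device's per-dimension limits and by the occupancy-suggested MaxBlockSize, and rounds each candidate down to a divisor of the global size. A host-only sketch of that selection on made-up inputs (the occupancy value, device limits, and global range below are illustrative, not queried from a real device):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// Mirrors roundToHighestFactorOfGlobalSize: shrink the candidate until it
// divides the global size (and, optionally, is also a power of two).
static void roundToHighestFactor(size_t &ThreadsPerBlockInDim,
                                 size_t GlobalWorkSizeInDim,
                                 bool MakePowerOfTwo) {
  auto IsPowerOf2 = [](size_t V) { return V && !(V & (V - 1)); };
  while (GlobalWorkSizeInDim % ThreadsPerBlockInDim ||
         (MakePowerOfTwo && !IsPowerOf2(ThreadsPerBlockInDim)))
    --ThreadsPerBlockInDim;
}

int main() {
  // Illustrative inputs: a 600 x 30 x 4 global range, an occupancy-suggested
  // block size of 256, and typical per-dimension device limits.
  size_t Global[3] = {600, 30, 4};
  size_t MaxBlockSize = 256;
  size_t MaxBlockDim[3] = {1024, 1024, 64};
  size_t TPB[3];

  // z first, then y, then x; each capped by what remains of MaxBlockSize.
  TPB[2] = std::min(Global[2], MaxBlockDim[2]);
  roundToHighestFactor(TPB[2], Global[2], false);
  TPB[1] = std::min(Global[1], std::min(MaxBlockSize / TPB[2], MaxBlockDim[1]));
  roundToHighestFactor(TPB[1], Global[1], false);
  size_t MaxX = MaxBlockSize / (TPB[1] * TPB[2]);
  TPB[0] = std::min(Global[0], MaxX);
  roundToHighestFactor(TPB[0], Global[0], /*MakePowerOfTwo=*/true);

  // Prints {2, 30, 4}: every dimension divides the global size, so all work
  // groups are uniform and 240 of the 256 available threads are used.
  std::printf("ThreadsPerBlock = {%zu, %zu, %zu}\n", TPB[0], TPB[1], TPB[2]);
  return 0;
}
```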
