Skip to content

Commit 86a14fe

Browse files
committed
Make the HIP adapter use complex subgroup size calculation
Previously, the HIP adapter only looked for a good sg size in the X dimension. With this change it chooses a sg size that divides the global size in each of the X, Y and Z dimensions. It also chooses a power-of-2 sg size in the X dimension, which is what the CUDA adapter does. This may give some performance improvements.
1 parent 878baba commit 86a14fe

File tree

1 file changed

+28
-13
lines changed

1 file changed

+28
-13
lines changed

source/adapters/hip/enqueue.cpp

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "memory.hpp"
1616
#include "queue.hpp"
1717

18+
#include <ur/ur.hpp>
19+
1820
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
1921

2022
namespace {
@@ -49,23 +51,36 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
4951
}
5052
}
5153

52-
void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
53-
const size_t *GlobalWorkSize,
54-
const size_t MaxThreadsPerBlock[3],
55-
ur_kernel_handle_t Kernel) {
54+
// Determine local work sizes that result in uniform work groups.
55+
// The default threadsPerBlock only requires handling the first work_dim
56+
// dimensions.
57+
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
58+
const size_t *GlobalWorkSize, const uint32_t WorkDim,
59+
const size_t MaxThreadsPerBlock[3],
60+
ur_kernel_handle_t Kernel) {
5661
assert(ThreadsPerBlock != nullptr);
5762
assert(GlobalWorkSize != nullptr);
5863
assert(Kernel != nullptr);
5964

60-
std::ignore = Kernel;
65+
// FIXME: The below assumes a three dimensional range but this is not
66+
// guaranteed by UR.
67+
size_t GlobalSizeNormalized[3] = {1, 1, 1};
68+
for (uint32_t i = 0; i < WorkDim; i++) {
69+
GlobalSizeNormalized[i] = GlobalWorkSize[i];
70+
}
71+
72+
size_t MaxBlockDim[3];
73+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
74+
MaxBlockDim[1] = Device->getMaxBlockDimY();
75+
MaxBlockDim[2] = Device->getMaxBlockDimZ();
6176

62-
ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
77+
int MinGrid, MaxBlockSize;
78+
UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
79+
&MinGrid, &MaxBlockSize, Kernel->get(), Kernel->getLocalSize(),
80+
MaxThreadsPerBlock[0]));
6381

64-
// Find a local work group size that is a divisor of the global
65-
// work group size to produce uniform work groups.
66-
while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
67-
--ThreadsPerBlock[0];
68-
}
82+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
83+
MaxBlockDim, MaxBlockSize);
6984
}
7085

7186
ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
@@ -344,8 +359,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
344359
return err;
345360
}
346361
} else {
347-
simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize,
348-
MaxThreadsPerBlock, hKernel);
362+
guessLocalWorkSize(hQueue->getDevice(), ThreadsPerBlock, pGlobalWorkSize,
363+
workDim, MaxThreadsPerBlock, hKernel);
349364
}
350365
}
351366

0 commit comments

Comments
 (0)