Skip to content

Commit 2447fe1

Browse files
committed
Make the HIP adapter use complex subgroup size calculation
The HIP adapter was only finding a good subgroup (sg) size in the X dimension. This change makes it choose an sg size that divides the global size in the X, Y and Z dimensions. It also chooses a power-of-2 sg size in the X dimension, which is the same as what the CUDA adapter does. This may give some performance improvements.
1 parent aa849f2 commit 2447fe1

File tree

1 file changed

+28
-13
lines changed

1 file changed

+28
-13
lines changed

source/adapters/hip/enqueue.cpp

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "memory.hpp"
1717
#include "queue.hpp"
1818

19+
#include <ur/ur.hpp>
20+
1921
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
2022

2123
ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,36 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
4850
}
4951
}
5052

51-
void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
52-
const size_t *GlobalWorkSize,
53-
const size_t MaxThreadsPerBlock[3],
54-
ur_kernel_handle_t Kernel) {
53+
// Determine local work sizes that result in uniform work groups.
54+
// The default threadsPerBlock only requires handling the first work_dim
55+
// dimension.
56+
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
57+
const size_t *GlobalWorkSize, const uint32_t WorkDim,
58+
const size_t MaxThreadsPerBlock[3],
59+
ur_kernel_handle_t Kernel) {
5560
assert(ThreadsPerBlock != nullptr);
5661
assert(GlobalWorkSize != nullptr);
5762
assert(Kernel != nullptr);
5863

59-
std::ignore = Kernel;
64+
// FIXME: The below assumes a three dimensional range but this is not
65+
// guaranteed by UR.
66+
size_t GlobalSizeNormalized[3] = {1, 1, 1};
67+
for (uint32_t i = 0; i < WorkDim; i++) {
68+
GlobalSizeNormalized[i] = GlobalWorkSize[i];
69+
}
70+
71+
size_t MaxBlockDim[3];
72+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
73+
MaxBlockDim[1] = Device->getMaxBlockDimY();
74+
MaxBlockDim[2] = Device->getMaxBlockDimZ();
6075

61-
ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
76+
int MinGrid, MaxBlockSize;
77+
UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
78+
&MinGrid, &MaxBlockSize, Kernel->get(), Kernel->getLocalSize(),
79+
MaxThreadsPerBlock[0]));
6280

63-
// Find a local work group size that is a divisor of the global
64-
// work group size to produce uniform work groups.
65-
while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
66-
--ThreadsPerBlock[0];
67-
}
81+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
82+
MaxBlockDim, MaxBlockSize);
6883
}
6984

7085
namespace {
@@ -1793,8 +1808,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
17931808
return err;
17941809
}
17951810
} else {
1796-
simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
1797-
MaxThreadsPerBlock, Kernel);
1811+
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
1812+
MaxThreadsPerBlock, Kernel);
17981813
}
17991814
}
18001815

0 commit comments

Comments
 (0)