Skip to content

Commit 3e2effd

Browse files
committed
Make the HIP adapter use complex subgroup size calculation
The HIP adapter was only finding a good sg size in the X dim. This changes it so that it now chooses a sg size that divides the global dim in X, Y and Z dimensions. It also chooses a power of 2 sg size in the X dim, which is the same that the CUDA adapter does. This may give some performance improvements.
1 parent d33dfed commit 3e2effd

File tree

4 files changed

+212
-55
lines changed

4 files changed

+212
-55
lines changed

source/adapters/hip/enqueue.cpp

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "memory.hpp"
1717
#include "queue.hpp"
1818

19+
#include <ur/ur.hpp>
20+
1921
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
2022

2123
ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,31 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
4850
}
4951
}
5052

51-
void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
52-
const size_t *GlobalWorkSize,
53-
const size_t MaxThreadsPerBlock[3],
54-
ur_kernel_handle_t Kernel) {
53+
// Determine local work sizes that result in uniform work groups.
54+
// The default threadsPerBlock only require handling the first work_dim
55+
// dimension.
56+
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
57+
const size_t *GlobalWorkSize, const uint32_t WorkDim,
58+
const size_t MaxThreadsPerBlock[3],
59+
ur_kernel_handle_t Kernel) {
5560
assert(ThreadsPerBlock != nullptr);
5661
assert(GlobalWorkSize != nullptr);
5762
assert(Kernel != nullptr);
5863

59-
std::ignore = Kernel;
64+
// FIXME: The below assumes a three dimensional range but this is not
65+
// guaranteed by UR.
66+
size_t GlobalSizeNormalized[3] = {1, 1, 1};
67+
for (uint32_t i = 0; i < WorkDim; i++) {
68+
GlobalSizeNormalized[i] = GlobalWorkSize[i];
69+
}
6070

61-
ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
71+
size_t MaxBlockDim[3];
72+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
73+
MaxBlockDim[1] = Device->getMaxBlockDimY();
74+
MaxBlockDim[2] = Device->getMaxBlockDimZ();
6275

63-
// Find a local work group size that is a divisor of the global
64-
// work group size to produce uniform work groups.
65-
while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
66-
--ThreadsPerBlock[0];
67-
}
76+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
77+
MaxBlockDim, MaxThreadsPerBlock[0]);
6878
}
6979

7080
namespace {
@@ -1793,8 +1803,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
17931803
return err;
17941804
}
17951805
} else {
1796-
simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
1797-
MaxThreadsPerBlock, Kernel);
1806+
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
1807+
MaxThreadsPerBlock, Kernel);
17981808
}
17991809
}
18001810

source/ur/ur.hpp

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,8 @@ template <typename T> class Result {
329329
static inline void
330330
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
331331
const size_t GlobalWorkSizeInDim) {
332-
while (GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
332+
while (ThreadsPerBlockInDim > 1 &&
333+
GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
333334
--ThreadsPerBlockInDim;
334335
}
335336
}
@@ -349,18 +350,20 @@ template <typename T> inline bool isPowerOf2(const T &Value) {
349350
static inline void roundToHighestFactorOfGlobalSizeIn3d(
350351
size_t *ThreadsPerBlock, const size_t *GlobalSize,
351352
const size_t *MaxBlockDim, const size_t MaxBlockSize) {
352-
ThreadsPerBlock[2] = std::min(GlobalSize[2], MaxBlockDim[2]);
353-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
353+
ThreadsPerBlock[0] = std::min(GlobalSize[0], MaxBlockDim[0]);
354+
// Make the X dim a factor of 2
355+
do {
356+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
357+
} while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
358+
--ThreadsPerBlock[0]);
354359

355360
ThreadsPerBlock[1] =
356361
std::min(GlobalSize[1],
357-
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
362+
std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
358363
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
359364

360-
ThreadsPerBlock[0] = std::min(
361-
GlobalSize[0], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]));
362-
// Make the X dim a factor of 2
363-
do {
364-
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
365-
} while (!isPowerOf2(ThreadsPerBlock[0]));
365+
ThreadsPerBlock[2] = std::min(
366+
GlobalSize[2], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]));
367+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
368+
366369
}

test/conformance/enqueue/urEnqueueKernelLaunch.cpp

Lines changed: 73 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -77,53 +77,94 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) {
7777
UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
7878
}
7979

80-
struct urEnqueueKernelLaunch2DTest : uur::urKernelExecutionTest {
81-
void SetUp() override {
82-
program_name = "fill_2d";
83-
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
84-
}
85-
86-
uint32_t val = 42;
87-
size_t global_size[2] = {8, 8};
88-
size_t global_offset[2] = {0, 0};
89-
size_t buffer_size = sizeof(val) * global_size[0] * global_size[1];
90-
size_t n_dimensions = 2;
80+
struct testParametersEnqueueKernel {
81+
size_t X, Y, Z;
82+
size_t Dims;
9183
};
92-
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch2DTest);
9384

94-
TEST_P(urEnqueueKernelLaunch2DTest, Success) {
95-
ur_mem_handle_t buffer = nullptr;
96-
AddBuffer1DArg(buffer_size, &buffer);
97-
AddPodArg(val);
98-
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
99-
global_offset, global_size, nullptr, 0,
100-
nullptr, nullptr));
101-
ASSERT_SUCCESS(urQueueFinish(queue));
102-
ValidateBuffer(buffer, buffer_size, val);
85+
template <typename T>
86+
inline std::string printKernelLaunchTestString(
87+
const testing::TestParamInfo<typename T::ParamType> &info) {
88+
const auto device_handle = std::get<0>(info.param);
89+
const auto platform_device_name =
90+
uur::GetPlatformAndDeviceName(device_handle);
91+
std::stringstream test_name;
92+
test_name << platform_device_name << "__"
93+
<< std::get<1>(info.param).Dims << "D_"
94+
<< std::get<1>(info.param).X;
95+
if (std::get<1>(info.param).Dims > 1) {
96+
test_name << "_" << std::get<1>(info.param).Y;
97+
}
98+
if (std::get<1>(info.param).Dims > 2) {
99+
test_name << "_" << std::get<1>(info.param).Z;
100+
}
101+
test_name << "";
102+
return test_name.str();
103103
}
104104

105-
struct urEnqueueKernelLaunch3DTest : uur::urKernelExecutionTest {
105+
struct urEnqueueKernelLaunchTestWithParam
106+
: uur::urBaseKernelExecutionTestWithParam<testParametersEnqueueKernel> {
106107
void SetUp() override {
107-
program_name = "fill_3d";
108-
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
108+
global_range[0] = std::get<1>(GetParam()).X;
109+
global_range[1] = std::get<1>(GetParam()).Y;
110+
global_range[2] = std::get<1>(GetParam()).Z;
111+
buffer_size = sizeof(val) * global_range[0];
112+
n_dimensions = std::get<1>(GetParam()).Dims;
113+
if (n_dimensions == 1) {
114+
program_name = "fill";
115+
} else if (n_dimensions == 2) {
116+
program_name = "fill_2d";
117+
buffer_size *= global_range[1];
118+
} else {
119+
assert(n_dimensions == 3);
120+
program_name = "fill_3d";
121+
buffer_size *= global_range[1] * global_range[2];
122+
}
123+
UUR_RETURN_ON_FATAL_FAILURE(
124+
urBaseKernelExecutionTestWithParam::SetUp());
125+
}
126+
127+
void TearDown() override {
128+
UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam<
129+
testParametersEnqueueKernel>::TearDown());
109130
}
110131

111132
uint32_t val = 42;
112-
size_t global_size[3] = {4, 4, 4};
133+
size_t global_range[3];
113134
size_t global_offset[3] = {0, 0, 0};
114-
size_t buffer_size =
115-
sizeof(val) * global_size[0] * global_size[1] * global_size[2];
116-
size_t n_dimensions = 3;
135+
size_t n_dimensions;
136+
size_t buffer_size;
117137
};
118-
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch3DTest);
119138

120-
TEST_P(urEnqueueKernelLaunch3DTest, Success) {
139+
static std::vector<testParametersEnqueueKernel> test_cases{// 1D
140+
{1, 1, 1, 1},
141+
{31, 1, 1, 1},
142+
{1027, 1, 1, 1},
143+
{32, 1, 1, 1},
144+
{256, 1, 1, 1},
145+
// 2D
146+
{1, 1, 1, 2},
147+
{31, 7, 1, 2},
148+
{1027, 1, 1, 2},
149+
{1, 32, 1, 2},
150+
{256, 79, 1, 2},
151+
// 3D
152+
{1, 1, 1, 3},
153+
{31, 7, 1, 3},
154+
{1027, 1, 19, 3},
155+
{1, 53, 19, 3},
156+
{256, 79, 8, 3}};
157+
UUR_TEST_SUITE_P(
158+
urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases),
159+
printKernelLaunchTestString<urEnqueueKernelLaunchTestWithParam>);
160+
161+
TEST_P(urEnqueueKernelLaunchTestWithParam, Success) {
121162
ur_mem_handle_t buffer = nullptr;
122163
AddBuffer1DArg(buffer_size, &buffer);
123164
AddPodArg(val);
124165
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
125-
global_offset, global_size, nullptr, 0,
126-
nullptr, nullptr));
166+
global_offset, global_range, nullptr,
167+
0, nullptr, nullptr));
127168
ASSERT_SUCCESS(urQueueFinish(queue));
128169
ValidateBuffer(buffer, buffer_size, val);
129170
}

test/conformance/testing/include/uur/fixtures.h

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,109 @@ struct urBaseKernelExecutionTest : urBaseKernelTest {
12741274
uint32_t current_arg_index = 0;
12751275
};
12761276

1277+
template <typename T>
1278+
struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam<T> {
1279+
void SetUp() override {
1280+
UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::SetUp());
1281+
UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::Build());
1282+
context = urBaseKernelTestWithParam<T>::context;
1283+
kernel = urBaseKernelTestWithParam<T>::kernel;
1284+
ASSERT_SUCCESS(urQueueCreate(
1285+
context, urBaseKernelTestWithParam<T>::device, 0, &queue));
1286+
}
1287+
1288+
void TearDown() override {
1289+
for (auto &buffer : buffer_args) {
1290+
ASSERT_SUCCESS(urMemRelease(buffer));
1291+
}
1292+
UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::TearDown());
1293+
if (queue) {
1294+
EXPECT_SUCCESS(urQueueRelease(queue));
1295+
}
1296+
}
1297+
1298+
// Adds a kernel arg representing a sycl buffer constructed with a 1D range.
1299+
void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) {
1300+
ur_mem_handle_t mem_handle = nullptr;
1301+
ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size,
1302+
nullptr, &mem_handle));
1303+
char zero = 0;
1304+
ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem_handle, &zero,
1305+
sizeof(zero), 0, size, 0, nullptr,
1306+
nullptr));
1307+
ASSERT_SUCCESS(urQueueFinish(queue));
1308+
ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr,
1309+
mem_handle));
1310+
1311+
// SYCL device kernels have different interfaces depending on the
1312+
// backend being used. Typically a kernel which takes a buffer argument
1313+
// will take a pointer to the start of the buffer and a sycl::id param
1314+
// which is a struct that encodes the accessor to the buffer. However
1315+
// the AMD backend handles this differently and uses three separate
1316+
// arguments for each of the three dimensions of the accessor.
1317+
1318+
ur_platform_backend_t backend;
1319+
ASSERT_SUCCESS(urPlatformGetInfo(urBaseKernelTestWithParam<T>::platform,
1320+
UR_PLATFORM_INFO_BACKEND,
1321+
sizeof(backend), &backend, nullptr));
1322+
if (backend == UR_PLATFORM_BACKEND_HIP) {
1323+
// this emulates the three offset params for buffer accessor on AMD.
1324+
size_t val = 0;
1325+
ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
1326+
sizeof(size_t), nullptr, &val));
1327+
ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2,
1328+
sizeof(size_t), nullptr, &val));
1329+
ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3,
1330+
sizeof(size_t), nullptr, &val));
1331+
current_arg_index += 4;
1332+
} else {
1333+
// This emulates the offset struct sycl adds for a 1D buffer accessor.
1334+
struct {
1335+
size_t offsets[1] = {0};
1336+
} accessor;
1337+
ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
1338+
sizeof(accessor), nullptr,
1339+
&accessor));
1340+
current_arg_index += 2;
1341+
}
1342+
1343+
buffer_args.push_back(mem_handle);
1344+
*out_buffer = mem_handle;
1345+
}
1346+
1347+
template <class U> void AddPodArg(U data) {
1348+
ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index,
1349+
sizeof(data), nullptr, &data));
1350+
current_arg_index++;
1351+
}
1352+
1353+
// Validate the contents of `buffer` according to the given validator.
1354+
template <class U>
1355+
void ValidateBuffer(ur_mem_handle_t buffer, size_t size,
1356+
std::function<bool(U &)> validator) {
1357+
std::vector<U> read_buffer(size / sizeof(U));
1358+
ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, true, 0, size,
1359+
read_buffer.data(), 0, nullptr,
1360+
nullptr));
1361+
ASSERT_TRUE(
1362+
std::all_of(read_buffer.begin(), read_buffer.end(), validator));
1363+
}
1364+
1365+
// Helper that uses the generic validate function to check for a given value.
1366+
template <class U>
1367+
void ValidateBuffer(ur_mem_handle_t buffer, size_t size, U value) {
1368+
auto validator = [&value](U result) -> bool { return result == value; };
1369+
1370+
ValidateBuffer<U>(buffer, size, validator);
1371+
}
1372+
1373+
std::vector<ur_mem_handle_t> buffer_args;
1374+
uint32_t current_arg_index = 0;
1375+
ur_context_handle_t context;
1376+
ur_kernel_handle_t kernel;
1377+
ur_queue_handle_t queue;
1378+
};
1379+
12771380
struct urKernelExecutionTest : urBaseKernelExecutionTest {
12781381
void SetUp() {
12791382
UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelExecutionTest::SetUp());

0 commit comments

Comments
 (0)