Skip to content

Commit bed33ec

Browse files
committed
Fix bug in CUDA range calculation
A bug in the CUDA adapter sometimes generated Y and Z ranges that did not divide the global Y or Z dimension. This commit fixes that. It also moves some helper functions that may be reused by other adapters into ur/ur.hpp.
1 parent 588615e commit bed33ec

File tree

2 files changed

+55
-28
lines changed

2 files changed

+55
-28
lines changed

source/adapters/cuda/enqueue.cpp

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <cmath>
2020
#include <cuda.h>
21+
#include <ur/ur.hpp>
2122

2223
ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
2324
uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
140141
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
141142
const size_t *GlobalWorkSize, const uint32_t WorkDim,
142143
const size_t MaxThreadsPerBlock[3],
143-
ur_kernel_handle_t Kernel, uint32_t LocalSize) {
144+
ur_kernel_handle_t Kernel) {
144145
assert(ThreadsPerBlock != nullptr);
145146
assert(GlobalWorkSize != nullptr);
146147
assert(Kernel != nullptr);
147-
int MinGrid, MaxBlockSize;
148-
size_t MaxBlockDim[3];
149148

150149
// The below assumes a three dimensional range but this is not guaranteed by
151150
// UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
154153
GlobalSizeNormalized[i] = GlobalWorkSize[i];
155154
}
156155

156+
size_t MaxBlockDim[3];
157+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
157158
MaxBlockDim[1] = Device->getMaxBlockDimY();
158159
MaxBlockDim[2] = Device->getMaxBlockDimZ();
159160

160-
UR_CHECK_ERROR(
161-
cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
162-
NULL, LocalSize, MaxThreadsPerBlock[0]));
163-
164-
ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
165-
ThreadsPerBlock[1] =
166-
std::min(GlobalSizeNormalized[1],
167-
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
168-
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
169-
ThreadsPerBlock[0] = std::min(
170-
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
171-
172-
static auto IsPowerOf2 = [](size_t Value) -> bool {
173-
return Value && !(Value & (Value - 1));
174-
};
175-
176-
// Find a local work group size that is a divisor of the global
177-
// work group size to produce uniform work groups.
178-
// Additionally, for best compute utilisation, the local size has
179-
// to be a power of two.
180-
while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
181-
!IsPowerOf2(ThreadsPerBlock[0])) {
182-
--ThreadsPerBlock[0];
183-
}
161+
int MinGrid, MaxBlockSize;
162+
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
163+
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
164+
MaxThreadsPerBlock[0]));
165+
166+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
167+
MaxBlockDim, MaxBlockSize);
184168
}
185169

186170
// Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
261245
}
262246
} else {
263247
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
264-
MaxThreadsPerBlock, Kernel, LocalSize);
248+
MaxThreadsPerBlock, Kernel);
265249
}
266250
}
267251

source/ur/ur.hpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,46 @@ template <typename T> class Result {
321321
private:
322322
std::variant<ur_result_t, T> value_or_err;
323323
};
324+
325+
// Helper to make sure each x, y, z dim divide the global dimension.
//
// Lowers ThreadsPerBlockInDim to the largest value, not exceeding its incoming
// value, that evenly divides GlobalWorkSizeInDim.
//
// In/Out: ThreadsPerBlockInDim - The dimension of workgroup in some dimension.
//                                An incoming value of 0 is replaced by 1.
// In: GlobalWorkSizeInDim - The global size in some dimension
static inline void
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
                                 const size_t GlobalWorkSizeInDim) {
  // A zero candidate would make the modulo below a division by zero. 1 divides
  // every global size, so it is the safe fallback.
  if (ThreadsPerBlockInDim == 0) {
    ThreadsPerBlockInDim = 1;
    return;
  }

  // Terminates at 1 at the latest, because anything modulo 1 is 0.
  while (GlobalWorkSizeInDim % ThreadsPerBlockInDim != 0) {
    --ThreadsPerBlockInDim;
  }
}
336+
337+
// Reports whether Value is a power of 2. Zero is not a power of 2; any other
// value qualifies when clearing its lowest set bit leaves nothing behind.
template <typename T> inline bool isPowerOf2(const T &Value) {
  if (!Value) {
    return false;
  }
  const auto WithoutLowestBit = Value & (Value - 1);
  return !WithoutLowestBit;
}
341+
342+
// Helper to make sure each x, y, z dim divide the global dimension.
343+
//
344+
// In/Out: ThreadsPerBlock - The size of wg in 3d
345+
// In: GlobalSize - The global size in 3d (if dim < 3 then outer
346+
// dims == 1)
347+
// In: MaxBlockDim - The max size of block in 3d
348+
// In: MaxBlockSize - The max total size of block in all dimensions
349+
static inline void roundToHighestFactorOfGlobalSizeIn3d(
350+
size_t *ThreadsPerBlock, const size_t *GlobalSize,
351+
const size_t *MaxBlockDim, const size_t MaxBlockSize) {
352+
ThreadsPerBlock[2] = std::min(GlobalSize[2], MaxBlockDim[2]);
353+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
354+
355+
ThreadsPerBlock[1] =
356+
std::min(GlobalSize[1],
357+
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
358+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
359+
360+
ThreadsPerBlock[0] = std::min(
361+
GlobalSize[0], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]));
362+
// Make the X dim a factor of 2
363+
do {
364+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
365+
} while (!isPowerOf2(ThreadsPerBlock[0]));
366+
}

0 commit comments

Comments
 (0)