diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index 40311648d5272..e794c89eff896 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -638,7 +638,29 @@ struct _pi_kernel {
 
+    // Adds a local argument of the given size at position index. The value
+    // passed on to add_arg is the offset returned by get_local_size(), rounded
+    // up to the argument's alignment; the padding that rounding introduces is
+    // added to the size passed on.
     void add_local_arg(size_t index, size_t size) {
       size_t localOffset = this->get_local_size();
-      add_arg(index, sizeof(size_t), (const void *)&(localOffset), size);
+
+      // maximum required alignment is the size of the largest vector type
+      const size_t max_alignment = sizeof(double) * 16;
+
+      // for arguments smaller than the maximum alignment simply align to the
+      // size of the argument; clamp to 1 so that a zero-sized argument does
+      // not make the modulo below divide by zero
+      const size_t alignment =
+          std::max<size_t>(1, std::min(max_alignment, size));
+
+      // padding needed to round localOffset up to a multiple of alignment
+      const size_t misalignment = localOffset % alignment;
+      const size_t padding = misalignment == 0 ? 0 : alignment - misalignment;
+      const size_t alignedLocalOffset = localOffset + padding;
+
+      // pass the aligned offset as the argument value; the padding is added
+      // to the size handed to add_arg
+      add_arg(index, sizeof(size_t),
+              static_cast<const void *>(&alignedLocalOffset), size + padding);
     }
 
     void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) {