Skip to content

Commit e2df8ac

Browse files
authored
Merge pull request #2575 from DBDuncan/duncan/extend-copies
[CUDA][Bindless] Add support for device to device pitched copies and host to host copies
2 parents 992ff37 + 3a31ffe commit e2df8ac

File tree

7 files changed

+108
-47
lines changed

7 files changed

+108
-47
lines changed

include/ur_api.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8975,13 +8975,15 @@ typedef enum ur_exp_image_copy_flag_t {
89758975
UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST = UR_BIT(1),
89768976
/// Device to device
89778977
UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE = UR_BIT(2),
8978+
/// Host to host
8979+
UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST = UR_BIT(3),
89788980
/// @cond
89798981
UR_EXP_IMAGE_COPY_FLAG_FORCE_UINT32 = 0x7fffffff
89808982
/// @endcond
89818983

89828984
} ur_exp_image_copy_flag_t;
89838985
/// @brief Bit Mask for validating ur_exp_image_copy_flags_t
8984-
#define UR_EXP_IMAGE_COPY_FLAGS_MASK 0xfffffff8
8986+
#define UR_EXP_IMAGE_COPY_FLAGS_MASK 0xfffffff0
89858987

89868988
///////////////////////////////////////////////////////////////////////////////
89878989
/// @brief Sampler cubemap seamless filtering mode.

include/ur_print.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10405,6 +10405,9 @@ inline std::ostream &operator<<(std::ostream &os,
1040510405
case UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE:
1040610406
os << "UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE";
1040710407
break;
10408+
case UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST:
10409+
os << "UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST";
10410+
break;
1040810411
default:
1040910412
os << "unknown enumerator";
1041010413
break;
@@ -10453,6 +10456,17 @@ inline ur_result_t printFlag<ur_exp_image_copy_flag_t>(std::ostream &os,
1045310456
}
1045410457
os << UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE;
1045510458
}
10459+
10460+
if ((val & UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST) ==
10461+
(uint32_t)UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST) {
10462+
val ^= (uint32_t)UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST;
10463+
if (!first) {
10464+
os << " | ";
10465+
} else {
10466+
first = false;
10467+
}
10468+
os << UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST;
10469+
}
1045610470
if (val != 0) {
1045710471
std::bitset<32> bits(val);
1045810472
if (!first) {

scripts/core/EXP-BINDLESS-IMAGES.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ Enums
108108
* ${X}_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE
109109
* ${X}_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST
110110
* ${X}_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
111+
* ${X}_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST
111112

112113
* ${x}_exp_sampler_cubemap_filter_mode_t
113114
* ${X}_EXP_SAMPLER_CUBEMAP_FILTER_MODE_SEAMLESS
@@ -253,6 +254,10 @@ Changelog
253254
+----------+-------------------------------------------------------------+
254255
| 18.0 | Added BindlessImagesMapExternalLinearMemoryExp function. |
255256
+----------+-------------------------------------------------------------+
257+
| 19.0 || Added ${X}_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST |
258+
| || Added support for DtoD usm pitch copies |
259+
| || Added support for HtoH copies |
260+
+----------+-------------------------------------------------------------+
256261

257262
Contributors
258263
--------------------------------------------------------------------------------

scripts/core/exp-bindless-images.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ etors:
167167
desc: "Device to host"
168168
- name: DEVICE_TO_DEVICE
169169
desc: "Device to device"
170+
- name: HOST_TO_HOST
171+
desc: "Host to host"
170172
--- #--------------------------------------------------------------------------
171173
type: enum
172174
extend: True

source/adapters/cuda/enqueue.cpp

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -46,34 +46,6 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
4646
}
4747
}
4848

49-
template <typename PtrT>
50-
void getUSMHostOrDevicePtr(PtrT USMPtr, CUmemorytype *OutMemType,
51-
CUdeviceptr *OutDevPtr, PtrT *OutHostPtr) {
52-
// do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE
53-
// checks with PI_CHECK_ERROR are not suggested
54-
CUresult Ret = cuPointerGetAttribute(
55-
OutMemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)USMPtr);
56-
// ARRAY, UNIFIED types are not supported!
57-
assert(*OutMemType != CU_MEMORYTYPE_ARRAY &&
58-
*OutMemType != CU_MEMORYTYPE_UNIFIED);
59-
60-
// pointer not known to the CUDA subsystem (possibly a system allocated ptr)
61-
if (Ret == CUDA_ERROR_INVALID_VALUE) {
62-
*OutMemType = CU_MEMORYTYPE_HOST;
63-
*OutDevPtr = 0;
64-
*OutHostPtr = USMPtr;
65-
66-
// todo: resets the above "non-stick" error
67-
} else if (Ret == CUDA_SUCCESS) {
68-
*OutDevPtr = (*OutMemType == CU_MEMORYTYPE_DEVICE)
69-
? reinterpret_cast<CUdeviceptr>(USMPtr)
70-
: 0;
71-
*OutHostPtr = (*OutMemType == CU_MEMORYTYPE_HOST) ? USMPtr : nullptr;
72-
} else {
73-
UR_CHECK_ERROR(Ret);
74-
}
75-
}
76-
7749
ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
7850
ur_usm_advice_flags_t URAdviceFlags,
7951
CUdevice Device) {

source/adapters/cuda/enqueue.hpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
//===----------------------------------------------------------------------===//
1010
#pragma once
1111

12+
#include "common.hpp"
1213
#include <cassert>
1314
#include <cuda.h>
1415
#include <ur_api.h>
@@ -17,6 +18,34 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
1718
uint32_t NumEventsInWaitList,
1819
const ur_event_handle_t *EventWaitList);
1920

21+
template <typename PtrT>
22+
void getUSMHostOrDevicePtr(PtrT USMPtr, CUmemorytype *OutMemType,
23+
CUdeviceptr *OutDevPtr, PtrT *OutHostPtr) {
24+
// do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE
25+
// checks with PI_CHECK_ERROR are not suggested
26+
CUresult Ret = cuPointerGetAttribute(
27+
OutMemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)USMPtr);
28+
// ARRAY, UNIFIED types are not supported!
29+
assert(*OutMemType != CU_MEMORYTYPE_ARRAY &&
30+
*OutMemType != CU_MEMORYTYPE_UNIFIED);
31+
32+
// pointer not known to the CUDA subsystem (possibly a system allocated ptr)
33+
if (Ret == CUDA_ERROR_INVALID_VALUE) {
34+
*OutMemType = CU_MEMORYTYPE_HOST;
35+
*OutDevPtr = 0;
36+
*OutHostPtr = USMPtr;
37+
38+
// todo: resets the above "non-stick" error
39+
} else if (Ret == CUDA_SUCCESS) {
40+
*OutDevPtr = (*OutMemType == CU_MEMORYTYPE_DEVICE)
41+
? reinterpret_cast<CUdeviceptr>(USMPtr)
42+
: 0;
43+
*OutHostPtr = (*OutMemType == CU_MEMORYTYPE_HOST) ? USMPtr : nullptr;
44+
} else {
45+
UR_CHECK_ERROR(Ret);
46+
}
47+
}
48+
2049
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
2150
const size_t *GlobalWorkSize, const uint32_t WorkDim,
2251
ur_kernel_handle_t Kernel);

source/adapters/cuda/image.cpp

Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
583583
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
584584
UR_ASSERT((imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE ||
585585
imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST ||
586-
imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE),
586+
imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE ||
587+
imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST),
587588
UR_RESULT_ERROR_INVALID_VALUE);
588589
UR_ASSERT(pSrcImageFormat->channelOrder == pDstImageFormat->channelOrder,
589590
UR_RESULT_ERROR_INVALID_ARGUMENT);
@@ -651,6 +652,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
651652
cpy_desc.srcY = pCopyRegion->srcOffset.y;
652653
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
653654
cpy_desc.dstY = pCopyRegion->dstOffset.y;
655+
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
656+
cpy_desc.Height = pCopyRegion->copyExtent.height;
654657
cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes;
655658
if (pDstImageDesc->rowPitch == 0) {
656659
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
@@ -661,8 +664,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
661664
cpy_desc.dstDevice = (CUdeviceptr)pDst;
662665
cpy_desc.dstPitch = pDstImageDesc->rowPitch;
663666
}
664-
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
665-
cpy_desc.Height = pCopyRegion->copyExtent.height;
666667
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
667668
} else if (pDstImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
668669
CUDA_MEMCPY3D cpy_desc = {};
@@ -740,22 +741,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
740741
cpy_desc.srcY = pCopyRegion->srcOffset.y;
741742
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
742743
cpy_desc.dstY = pCopyRegion->dstOffset.y;
744+
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
745+
cpy_desc.Height = pCopyRegion->copyExtent.height;
746+
cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes;
743747
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
744748
cpy_desc.dstHost = pDst;
745749
if (pSrcImageDesc->rowPitch == 0) {
746750
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
747751
cpy_desc.srcArray = as_CUArray(pSrc);
748752
} else {
749753
// Pitched memory
750-
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
751754
cpy_desc.srcPitch = pSrcImageDesc->rowPitch;
755+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
752756
cpy_desc.srcDevice = (CUdeviceptr)pSrc;
753757
}
754-
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
755-
cpy_desc.dstHost = pDst;
756-
cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes;
757-
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
758-
cpy_desc.Height = pCopyRegion->copyExtent.height;
759758
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
760759
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
761760
CUDA_MEMCPY3D cpy_desc = {};
@@ -797,7 +796,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
797796
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
798797
}
799798
} else {
800-
// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
799+
// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE ||
800+
// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_HOST
801801

802802
// we don't support copying between different image types.
803803
if (pSrcImageDesc->type != pDstImageDesc->type) {
@@ -810,30 +810,67 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
810810
// synchronous because of the explicit call to cuStreamSynchronize at
811811
// the end
812812
if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
813+
// Check what type of memory pSrc and pDst are to set the correct
814+
// attributes of cpy_desc.
815+
// If cuPointerGetAttribute returns something different from
816+
// CUDA_SUCCESS then we know that the memory type is a CuArray.
817+
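// Otherwise, it's USM memory whose host/device type is resolved below by
// getUSMHostOrDevicePtr.
// NOTE(review): a host pointer unknown to CUDA also makes
// cuPointerGetAttribute fail, so it would be treated as a CuArray here —
// confirm this is intended for HOST_TO_HOST copies.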
818+
CUmemorytype memType;
819+
bool isSrcCudaArray =
820+
cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
821+
(CUdeviceptr)pSrc) != CUDA_SUCCESS;
822+
bool isDstCudaArray =
823+
cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
824+
(CUdeviceptr)pDst) != CUDA_SUCCESS;
825+
813826
CUDA_MEMCPY2D cpy_desc = {};
814827
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
815828
cpy_desc.srcY = 0;
816829
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
817830
cpy_desc.dstY = 0;
818-
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
819-
cpy_desc.srcArray = as_CUArray(pSrc);
820-
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
821-
cpy_desc.dstArray = (CUarray)pDst;
822831
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
823832
cpy_desc.Height = 1;
833+
if (isSrcCudaArray) {
834+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
835+
cpy_desc.srcArray = as_CUArray(pSrc);
836+
} else {
837+
getUSMHostOrDevicePtr(pSrc, &cpy_desc.srcMemoryType,
838+
&cpy_desc.srcDevice, &cpy_desc.srcHost);
839+
}
840+
if (isDstCudaArray) {
841+
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
842+
cpy_desc.dstArray = (CUarray)pDst;
843+
} else {
844+
getUSMHostOrDevicePtr(pDst, &cpy_desc.dstMemoryType,
845+
&cpy_desc.dstDevice, &cpy_desc.dstHost);
846+
}
824847
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
825848
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
826849
CUDA_MEMCPY2D cpy_desc = {};
827850
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
828851
cpy_desc.srcY = pCopyRegion->srcOffset.y;
829852
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
830853
cpy_desc.dstY = pCopyRegion->dstOffset.y;
831-
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
832-
cpy_desc.srcArray = as_CUArray(pSrc);
833-
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
834-
cpy_desc.dstArray = (CUarray)pDst;
835854
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
836855
cpy_desc.Height = pCopyRegion->copyExtent.height;
856+
if (pSrcImageDesc->rowPitch == 0) {
857+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
858+
cpy_desc.srcArray = as_CUArray(pSrc);
859+
} else {
860+
// Pitched memory
861+
cpy_desc.srcPitch = pSrcImageDesc->rowPitch;
862+
getUSMHostOrDevicePtr(pSrc, &cpy_desc.srcMemoryType,
863+
&cpy_desc.srcDevice, &cpy_desc.srcHost);
864+
}
865+
if (pDstImageDesc->rowPitch == 0) {
866+
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
867+
cpy_desc.dstArray = (CUarray)pDst;
868+
} else {
869+
// Pitched memory
870+
cpy_desc.dstPitch = pDstImageDesc->rowPitch;
871+
getUSMHostOrDevicePtr(pDst, &cpy_desc.dstMemoryType,
872+
&cpy_desc.dstDevice, &cpy_desc.dstHost);
873+
}
837874
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
838875
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
839876
CUDA_MEMCPY3D cpy_desc = {};

0 commit comments

Comments
 (0)