From 4a71379381b4fe9bc73b623d1195cdbf31cb8e17 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 12 Aug 2021 15:22:24 +0100 Subject: [PATCH 01/20] Implemented P2P copies for the cuda backend using buffers. --- sycl/include/CL/sycl/detail/pi.def | 3 + sycl/include/CL/sycl/detail/pi.h | 26 +- sycl/include/CL/sycl/info/info_desc.hpp | 1 + sycl/include/CL/sycl/info/platform_traits.def | 1 + sycl/plugins/cuda/pi_cuda.cpp | 318 +++++++++++++++++- sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 25 ++ sycl/plugins/level_zero/pi_level_zero.cpp | 30 ++ sycl/plugins/rocm/pi_rocm.cpp | 32 ++ sycl/source/detail/memory_manager.cpp | 47 ++- sycl/source/detail/pi.cpp | 2 + sycl/source/detail/platform_info.hpp | 15 + sycl/source/detail/scheduler/commands.cpp | 7 +- .../source/detail/scheduler/graph_builder.cpp | 9 +- 13 files changed, 495 insertions(+), 21 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index 730b4afa50c0c..07ea8da8430d4 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -111,11 +111,14 @@ _PI_API(piEnqueueMemBufferReadRect) _PI_API(piEnqueueMemBufferWrite) _PI_API(piEnqueueMemBufferWriteRect) _PI_API(piEnqueueMemBufferCopy) +_PI_API(piextEnqueueMemBufferCopyPeer) _PI_API(piEnqueueMemBufferCopyRect) +_PI_API(piextEnqueueMemBufferCopyRectPeer) _PI_API(piEnqueueMemBufferFill) _PI_API(piEnqueueMemImageRead) _PI_API(piEnqueueMemImageWrite) _PI_API(piEnqueueMemImageCopy) +_PI_API(piextEnqueueMemImageCopyPeer) _PI_API(piEnqueueMemImageFill) _PI_API(piEnqueueMemBufferMap) _PI_API(piEnqueueMemUnmap) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 8349823df54ba..f4647eda7eaa2 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -126,7 +126,8 @@ typedef enum { PI_PLATFORM_INFO_NAME = CL_PLATFORM_NAME, PI_PLATFORM_INFO_PROFILE = CL_PLATFORM_PROFILE, PI_PLATFORM_INFO_VENDOR = CL_PLATFORM_VENDOR, - PI_PLATFORM_INFO_VERSION = CL_PLATFORM_VERSION + PI_PLATFORM_INFO_VERSION = CL_PLATFORM_VERSION, + PI_PLATFORM_INFO_P2P = 0x40110 } _pi_platform_info; typedef enum { @@ -1048,6 +1049,15 @@ __SYCL_EXPORT pi_result piQueueGetInfo(pi_queue command_queue, void *param_value, size_t *param_value_size_ret); + +__SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, + pi_mem dst_buffer, size_t src_offset, + size_t dst_offset, size_t size, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); + __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); __SYCL_EXPORT pi_result piQueueRelease(pi_queue command_queue); @@ -1452,6 +1462,14 @@ __SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); +__SYCL_EXPORT pi_result piextEnqueueMemBufferCopyRectPeer( + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event); + __SYCL_EXPORT pi_result piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, @@ -1477,6 +1495,12 @@ __SYCL_EXPORT pi_result piEnqueueMemImageCopy( pi_image_region region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); +__SYCL_EXPORT pi_result piextEnqueueMemImageCopyPeer( + pi_queue command_queue, pi_mem src_image, pi_queue dst_queue, pi_mem dst_image, + pi_image_offset src_origin, pi_image_offset dst_origin, + pi_image_region region, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + __SYCL_EXPORT pi_result piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, const size_t *origin, diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp index 2e7ad37c7547d..4476c4ea4acee 100644 --- a/sycl/include/CL/sycl/info/info_desc.hpp +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -30,6 +30,7 @@ enum class platform { name = PI_PLATFORM_INFO_NAME, vendor = PI_PLATFORM_INFO_VENDOR, extensions = PI_PLATFORM_INFO_EXTENSIONS, + P2P = PI_PLATFORM_INFO_P2P, }; // A.2 Context information desctiptors diff --git a/sycl/include/CL/sycl/info/platform_traits.def b/sycl/include/CL/sycl/info/platform_traits.def index 3dd2a319020d5..162792af6fe72 100644 --- a/sycl/include/CL/sycl/info/platform_traits.def +++ b/sycl/include/CL/sycl/info/platform_traits.def @@ -3,3 +3,4 @@ __SYCL_PARAM_TRAITS_SPEC(platform, version, std::string) __SYCL_PARAM_TRAITS_SPEC(platform, name, std::string) __SYCL_PARAM_TRAITS_SPEC(platform, vendor, std::string) __SYCL_PARAM_TRAITS_SPEC(platform, extensions, std::vector) +__SYCL_PARAM_TRAITS_SPEC(platform, P2P, bool) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 5ea3c70ece1bb..ed74ff083ffd2 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3635,6 +3635,19 @@ pi_result cuda_piSamplerRelease(pi_sampler sampler) { return PI_SUCCESS; } +void copyRectAsserts(const pi_buff_rect_region ®ion, + const pi_buff_rect_offset &src_offset, + const pi_buff_rect_offset &dst_offset, + const CUmemorytype_enum &src_type, + const CUmemorytype_enum &dst_type) { + assert(region != nullptr); + assert(src_offset != nullptr); + assert(dst_offset != nullptr); + + assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST); + assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST); +} + /// General 3D memory copy operation. /// This function requires the corresponding CUDA context to be at the top of /// the context stack @@ -3647,12 +3660,7 @@ static pi_result commonEnqueueMemBufferCopyRect( const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch) { - assert(region != nullptr); - assert(src_offset != nullptr); - assert(dst_offset != nullptr); - - assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST); - assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST); + copyRectAsserts(region, src_offset, dst_offset, src_type, dst_type); src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch; src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch) @@ -3692,6 +3700,60 @@ static pi_result commonEnqueueMemBufferCopyRect( return PI_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); } +/// General 3D memory Peer copy operation. +/// Similar to commonEnqueueMemBufferCopyRect with the addition that two +/// contexts must be specified. +static pi_result commonEnqueueMemBufferCopyRectPeer( + CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr, + const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, + const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch, CUcontext dst_context, + CUcontext src_context) { + + copyRectAsserts(region, src_offset, dst_offset, src_type, dst_type); + + src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch; + src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch) + : src_slice_pitch; + dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch; + dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch) + : dst_slice_pitch; + + CUDA_MEMCPY3D_PEER params = {}; + + params.WidthInBytes = region->width_bytes; + params.Height = region->height_scalar; + params.Depth = region->depth_scalar; + + params.srcMemoryType = src_type; + params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(src_ptr) + : 0; + params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; + params.srcXInBytes = src_offset->x_bytes; + params.srcY = src_offset->y_scalar; + params.srcZ = src_offset->z_scalar; + params.srcPitch = src_row_pitch; + params.srcHeight = src_slice_pitch / src_row_pitch; + + params.dstMemoryType = dst_type; + params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(dst_ptr) + : 0; + params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr; + params.dstXInBytes = dst_offset->x_bytes; + params.dstY = dst_offset->y_scalar; + params.dstZ = dst_offset->z_scalar; + params.dstPitch = dst_row_pitch; + params.dstHeight = dst_slice_pitch / dst_row_pitch; + + params.dstContext = dst_context; + params.srcContext = src_context; + + return PI_CHECK_ERROR(cuMemcpy3DPeerAsync(¶ms, cu_stream)); +} + pi_result cuda_piEnqueueMemBufferReadRect( pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, @@ -3841,6 +3903,61 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, } } +pi_result cuda_piextEnqueueMemBufferCopyPeer( + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, + pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event) { + + assert(src_buffer != nullptr); + assert(dst_buffer != nullptr); + + if (!dst_queue) { + return PI_INVALID_QUEUE; + } + + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(dst_queue->get_context()); + if (event_wait_list) { + cuda_piEnqueueEventsWait(src_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + pi_result result; + + if (event) { + retImplEv = std::unique_ptr<_pi_event>( + _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, dst_queue)); + result = retImplEv->start(); + } + + auto stream = dst_queue->get(); + auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; + auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset; + + auto dst_context = dst_queue->get_context()->get(); + auto src_context = src_queue->get_context()->get(); + + cuCtxEnablePeerAccess(src_context, 0); + + result = PI_CHECK_ERROR( + cuMemcpyPeerAsync(dst, dst_context, src, src_context, size, stream)); + + if (event) { + result = retImplEv->record(); + *event = retImplEv.release(); + } + + return result; + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } +} + pi_result cuda_piEnqueueMemBufferCopyRect( pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, @@ -3887,6 +4004,63 @@ pi_result cuda_piEnqueueMemBufferCopyRect( return retErr; } +pi_result cuda_piextEnqueueMemBufferCopyRectPeer( + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, + pi_mem dst_buffer, pi_buff_rect_offset src_origin, + pi_buff_rect_offset dst_origin, pi_buff_rect_region region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + + assert(src_buffer != nullptr); + assert(dst_buffer != nullptr); + + if (!dst_queue) { + return PI_INVALID_QUEUE; + } + + pi_result retErr = PI_SUCCESS; + + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + CUstream cuStream = dst_queue->get(); + CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); + CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); + + auto dstContext = dst_queue->get_context()->get(); + auto srcContext = src_queue->get_context()->get(); + + cuCtxEnablePeerAccess(srcContext, 0); + + try { + ScopedContext active(dst_queue->get_context()); + if (event_wait_list) { + retErr = cuda_piEnqueueEventsWait(src_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, dst_queue)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRectPeer( + cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, + src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, + dst_origin, dst_row_pitch, dst_slice_pitch, dstContext, srcContext); + + if (event) { + retImplEv->record(); + *event = retImplEv.release(); + } + + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, @@ -4076,6 +4250,54 @@ static pi_result commonEnqueueMemImageNDCopy( return PI_INVALID_VALUE; } +/// Similar to commonEnqueueMemImageNDCopy for Peer to Peer copies. +static pi_result commonEnqueueMemImageNDCopyPeer( + CUstream cu_stream, pi_mem_type img_type, const size_t *region, + const void *src_ptr, const CUmemorytype_enum src_type, + const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type, + const size_t *dst_offset, CUcontext dst_context, CUcontext src_context) { + assert(region != nullptr); + + assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST); + assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST); + + CUDA_MEMCPY3D_PEER cpyDesc; + memset(&cpyDesc, 0, sizeof(cpyDesc)); + cpyDesc.srcMemoryType = src_type; + if (src_type == CU_MEMORYTYPE_ARRAY) { + cpyDesc.srcArray = *static_cast(src_ptr); + cpyDesc.srcXInBytes = src_offset[0]; + cpyDesc.srcY = src_offset[1]; + cpyDesc.srcZ = src_offset[2]; + } else { + cpyDesc.srcDevice = src_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(src_ptr) + : 0; + cpyDesc.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; + } + cpyDesc.dstMemoryType = dst_type; + if (dst_type == CU_MEMORYTYPE_ARRAY) { + cpyDesc.dstArray = *static_cast(dst_ptr); + cpyDesc.dstXInBytes = dst_offset[0]; + cpyDesc.dstY = dst_offset[1]; + cpyDesc.dstZ = dst_offset[2]; + } else { + cpyDesc.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(dst_ptr) + : 0; + cpyDesc.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr; + } + cpyDesc.WidthInBytes = region[0]; + cpyDesc.Height = region[1]; + cpyDesc.Depth = region[2]; + cpyDesc.dstContext = dst_context; + cpyDesc.srcContext = src_context; + + return PI_CHECK_ERROR(cuMemcpy3DPeerAsync(&cpyDesc, cu_stream)); + + return PI_INVALID_VALUE; +} + pi_result cuda_piEnqueueMemImageRead( pi_queue command_queue, pi_mem image, pi_bool blocking_read, const size_t *origin, const size_t *region, size_t row_pitch, @@ -4290,6 +4512,87 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, return retErr; } +pi_result cuda_piextEnqueueMemImageCopyPeer( + pi_queue src_queue, pi_mem src_image, pi_queue dst_queue, pi_mem dst_image, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event) { + + assert(src_image->mem_type_ == _pi_mem::mem_type::surface); + assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); + assert(src_image->mem_.surface_mem_.get_image_type() == + dst_image->mem_.surface_mem_.get_image_type()); + + if (!dst_queue) { + return PI_INVALID_QUEUE; + } + + pi_result retErr = PI_SUCCESS; + CUstream cuStream = dst_queue->get(); + + try { + ScopedContext active(dst_queue->get_context()); + + if (event_wait_list) { + cuda_piEnqueueEventsWait(src_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + CUarray srcArray = src_image->mem_.surface_mem_.get_array(); + CUarray dstArray = dst_image->mem_.surface_mem_.get_array(); + + CUDA_ARRAY_DESCRIPTOR srcArrayDesc; + retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); + CUDA_ARRAY_DESCRIPTOR dstArrayDesc; + retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); + + assert(srcArrayDesc.Format == dstArrayDesc.Format); + assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels); + + int elementByteSize = imageElementByteSize(srcArrayDesc); + + size_t dstByteOffsetX = + dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels; + size_t srcByteOffsetX = + src_origin[0] * elementByteSize * dstArrayDesc.NumChannels; + size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; + + pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); + + auto dstContext = dst_queue->get_context()->get(); + auto srcContext = src_queue->get_context()->get(); + + cuCtxEnablePeerAccess(srcContext, 0); + + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; + size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; + + retErr = commonEnqueueMemImageNDCopyPeer( + cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, + srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext, + srcContext); + + if (retErr != PI_SUCCESS) { + return retErr; + } + + if (event) { + auto new_event = + _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, dst_queue); + new_event->record(); + *event = new_event; + } + + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + + return retErr; +} + /// \TODO Not implemented in CUDA, requires untie from OpenCL pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *, const size_t *, const size_t *, pi_uint32, @@ -4830,6 +5133,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piMemBufferPartition, cuda_piMemBufferPartition) _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) + _PI_CL(piextEnqueueMemBufferCopyPeer, cuda_piextEnqueueMemBufferCopyPeer) + _PI_CL(piextEnqueueMemBufferCopyRectPeer, cuda_piextEnqueueMemBufferCopyRectPeer) + _PI_CL(piextEnqueueMemImageCopyPeer, cuda_piextEnqueueMemImageCopyPeer) // Program _PI_CL(piProgramCreate, cuda_piProgramCreate) _PI_CL(piclProgramCreateWithSource, cuda_piclProgramCreateWithSource) diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 2b29c46382738..f50a825fff68d 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -1010,6 +1010,13 @@ pi_result piEnqueueMemBufferCopy(pi_queue, pi_mem, pi_mem, size_t, size_t, return PI_SUCCESS; } +pi_result piextEnqueueMemBufferCopyPeer(pi_queue, pi_mem, pi_queue, pi_mem, + size_t, size_t, size_t, pi_uint32, + const pi_event *, pi_event *) { + DIE_NO_IMPLEMENTATION; + return PI_SUCCESS; +} + pi_result piEnqueueMemBufferCopyRect(pi_queue, pi_mem, pi_mem, pi_buff_rect_offset, pi_buff_rect_offset, pi_buff_rect_region, size_t, size_t, @@ -1019,6 +1026,16 @@ pi_result piEnqueueMemBufferCopyRect(pi_queue, pi_mem, pi_mem, return PI_SUCCESS; } +pi_result piextEnqueueMemBufferCopyRectPeer(pi_queue, pi_mem, pi_queue, pi_mem, + pi_buff_rect_offset, + pi_buff_rect_offset, + pi_buff_rect_region, size_t, size_t, + size_t, size_t, pi_uint32, + const pi_event *, pi_event *) { + DIE_NO_IMPLEMENTATION; + return PI_SUCCESS; +} + pi_result piEnqueueMemBufferFill(pi_queue, pi_mem, const void *, size_t, size_t, size_t, pi_uint32, const pi_event *, pi_event *) { @@ -1080,6 +1097,14 @@ pi_result piEnqueueMemImageCopy(pi_queue, pi_mem, pi_mem, pi_image_offset, return PI_SUCCESS; } +pi_result piextEnqueueMemImageCopyPeer(pi_queue, pi_mem, pi_queue, pi_mem, + pi_image_offset, pi_image_offset, + pi_image_region, pi_uint32, + const pi_event *, pi_event *) { + DIE_NO_IMPLEMENTATION; + return PI_SUCCESS; +} + pi_result piEnqueueMemImageFill(pi_queue, pi_mem, const void *, const size_t *, const size_t *, pi_uint32, const pi_event *, pi_event *) { diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 85e6416092501..64a39f97b8b2f 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -5180,6 +5180,17 @@ pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } +pi_result piextEnqueueMemBufferCopyPeer(pi_queue src_queue, pi_mem src_buffer, + pi_queue dst_queue, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, + size_t size, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + die("piextEnqueueMemBufferCopyPeer: not implemented"); + return {}; +} + pi_result piEnqueueMemBufferCopyRect( pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer, pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, @@ -5199,6 +5210,16 @@ pi_result piEnqueueMemBufferCopyRect( NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } +pi_result piextEnqueueMemBufferCopyRectPeer( + pi_queue Queue, pi_mem SrcBuffer, pi_queue DstQueue, pi_mem DstBuffer, + pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, + pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch, + size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + die("piextEnqueueMemBufferCopyRectPeer: not implemented"); + return {}; +} + } // extern "C" // @@ -5799,6 +5820,15 @@ piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } +pi_result piextEnqueueMemImageCopyPeer( + pi_queue Queue, pi_mem SrcImage, pi_queue dst_queue, pi_mem DstImage, + pi_image_offset SrcOrigin, pi_image_offset DstOrigin, + pi_image_region Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + die("piextEnqueueMemImageCopyPeer: not implemented"); + return {}; +} + pi_result piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, const void *FillColor, const size_t *Origin, const size_t *Region, diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index a5fc03f2b0cc2..6350dd1f2cd70 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -3664,6 +3664,15 @@ pi_result rocm_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, } } +pi_result rocm_piextEnqueueMemBufferCopyPeer( + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, + pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event) { + die("rocm_piextEnqueueMemBufferCopyPeer: not implemented"); + return {}; +} + pi_result rocm_piEnqueueMemBufferCopyRect( pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, @@ -3710,6 +3719,17 @@ pi_result rocm_piEnqueueMemBufferCopyRect( return retErr; } +pi_result rocm_piextEnqueueMemBufferCopyRectPeer( + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, + pi_mem dst_buffer, pi_buff_rect_offset src_origin, + pi_buff_rect_offset dst_origin, pi_buff_rect_region region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + die("rocm_piextEnqueueMemBufferCopyRectPeer not implemented"); + return {}; +} + pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, @@ -4118,6 +4138,15 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, return retErr; } +pi_result rocm_piextEnqueueMemImageCopyPeer( + pi_queue Queue, pi_mem SrcImage, pi_queue dst_queue, pi_mem DstImage, + pi_image_offset SrcOrigin, pi_image_offset DstOrigin, + pi_image_region Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + die("rocm_piextEnqueueMemImageCopyPeer: not implemented"); + return {}; +} + /// \TODO Not implemented in HIP, requires untie from OpenCL pi_result rocm_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, @@ -4683,11 +4712,14 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueMemBufferWrite, rocm_piEnqueueMemBufferWrite) _PI_CL(piEnqueueMemBufferWriteRect, rocm_piEnqueueMemBufferWriteRect) _PI_CL(piEnqueueMemBufferCopy, rocm_piEnqueueMemBufferCopy) + _PI_CL(piextEnqueueMemBufferCopyPeer, rocm_piextEnqueueMemBufferCopyPeer) _PI_CL(piEnqueueMemBufferCopyRect, rocm_piEnqueueMemBufferCopyRect) + _PI_CL(piextEnqueueMemBufferCopyRectPeer, rocm_piextEnqueueMemBufferCopyRectPeer) _PI_CL(piEnqueueMemBufferFill, rocm_piEnqueueMemBufferFill) _PI_CL(piEnqueueMemImageRead, rocm_piEnqueueMemImageRead) _PI_CL(piEnqueueMemImageWrite, rocm_piEnqueueMemImageWrite) _PI_CL(piEnqueueMemImageCopy, rocm_piEnqueueMemImageCopy) + _PI_CL(piextEnqueueMemImageCopyPeer, rocm_piextEnqueueMemImageCopyPeer) _PI_CL(piEnqueueMemImageFill, rocm_piEnqueueMemImageFill) _PI_CL(piEnqueueMemBufferMap, rocm_piEnqueueMemBufferMap) _PI_CL(piEnqueueMemUnmap, rocm_piEnqueueMemUnmap) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index d8d4a05c4b75c..24fdaebe88c6b 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -421,15 +421,18 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, - unsigned int SrcElemSize, RT::PiMem DstMem, QueueImplPtr, + unsigned int SrcElemSize, RT::PiMem DstMem, QueueImplPtr DstQueue, unsigned int DimDst, sycl::range<3> DstSize, sycl::range<3>, sycl::id<3> DstOffset, unsigned int DstElemSize, std::vector DepEvents, RT::PiEvent &OutEvent) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); const RT::PiQueue Queue = SrcQueue->getHandleRef(); + const RT::PiQueue QueueDst = DstQueue->getHandleRef(); const detail::plugin &Plugin = SrcQueue->getPlugin(); + auto sameCtxt = SrcQueue->get_context() == DstQueue->get_context(); + detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType(); TermPositions SrcPos, DstPos; prepTermPositions(SrcPos, DimSrc, MemType); @@ -443,10 +446,17 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, if (MemType == detail::SYCLMemObjI::MemObjType::Buffer) { if (1 == DimDst && 1 == DimSrc) { - Plugin.call( - Queue, SrcMem, DstMem, SrcXOffBytes, DstXOffBytes, - SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(), - &OutEvent); + if (sameCtxt) { + Plugin.call( + Queue, SrcMem, DstMem, SrcXOffBytes, DstXOffBytes, + SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(), + &OutEvent); + } else { + Plugin.call( + Queue, SrcMem, QueueDst, DstMem, SrcXOffBytes, DstXOffBytes, + SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(), + &OutEvent); + } } else { // passing 0 for pitches not allowed. Because clEnqueueCopyBufferRect will // calculate both src and dest pitch using region[0], which is not correct @@ -468,10 +478,17 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, SrcAccessRange[SrcPos.YTerm], SrcAccessRange[SrcPos.ZTerm]}; - Plugin.call( - Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch, - SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size(), - DepEvents.data(), &OutEvent); + if (sameCtxt) { + Plugin.call( + Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size(), + DepEvents.data(), &OutEvent); + } else { + Plugin.call( + Queue, SrcMem, QueueDst, DstMem, &SrcOrigin, &DstOrigin, &Region, + SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, + DepEvents.size(), DepEvents.data(), &OutEvent); + } } } else { pi_image_offset_struct SrcOrigin{SrcOffset[SrcPos.XTerm], @@ -484,9 +501,15 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, SrcAccessRange[SrcPos.YTerm], SrcAccessRange[SrcPos.ZTerm]}; - Plugin.call( - Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, - DepEvents.size(), DepEvents.data(), &OutEvent); + if (sameCtxt) { + Plugin.call( + Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, + DepEvents.size(), DepEvents.data(), &OutEvent); + } else { + Plugin.call( + Queue, SrcMem, QueueDst, DstMem, &SrcOrigin, &DstOrigin, &Region, + DepEvents.size(), DepEvents.data(), &OutEvent); + } } } diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 7db4a3f073704..3b09906dc3756 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -201,6 +201,8 @@ std::string platformInfoToString(pi_platform_info info) { return "PI_PLATFORM_INFO_VENDOR"; case PI_PLATFORM_INFO_EXTENSIONS: return "PI_PLATFORM_INFO_EXTENSIONS"; + case PI_PLATFORM_INFO_P2P: + return "PI_PLATFORM_INFO_P2P"; } die("Unknown pi_platform_info value passed to " "cl::sycl::detail::pi::platformInfoToString"); diff --git a/sycl/source/detail/platform_info.hpp b/sycl/source/detail/platform_info.hpp index ff385c3bb4250..1f482bd1c9475 100644 --- a/sycl/source/detail/platform_info.hpp +++ b/sycl/source/detail/platform_info.hpp @@ -49,11 +49,26 @@ struct get_platform_info, info::platform::extensions> { } }; +template <> struct get_platform_info { + static bool get(RT::PiPlatform plt, const plugin &Plugin) { + + std::string vendor_name = + get_platform_info::get(plt, Plugin); + + bool result = (vendor_name == "NVIDIA CUDA BACKEND") ? true : false; + return result; + } +}; + // Host platform information methods template inline typename info::param_traits::return_type get_platform_info_host() = delete; +template <> inline bool get_platform_info_host() { + return false; +} + template <> inline std::string get_platform_info_host() { return "FULL PROFILE"; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 7c206440c620d..9daad9e25d67c 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -509,7 +509,12 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext != WorkerContext && !WorkerContext->is_host()) { + if (DepEventContext != WorkerContext && !WorkerContext->is_host() && + !(WorkerQueue->get_device().get_backend() == + DepEventContext->getDevices()[0].get_platform().get_backend() && + WorkerQueue->get_device() + .get_platform() + .get_info())) { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep); } else diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index c59aa11046e1a..1bf752bb6ff53 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -942,7 +942,14 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (!Queue->is_host() && !Record->MCurContext->is_host()) + } else if (!Queue->is_host() && !Record->MCurContext->is_host() && + !(Queue->get_device().get_platform().get_backend() == + Record->MCurContext->getDevices()[0] + .get_platform() + .get_backend() && + Queue->get_device() + .get_platform() + .get_info())) NeedMemMoveToHost = true; if (NeedMemMoveToHost) From c771093a2970a25054c0a97cc547496859be3001 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 13 Aug 2021 12:40:59 +0100 Subject: [PATCH 02/20] Switched to using vendor name in P2P info query. --- sycl/source/detail/platform_info.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/platform_info.hpp b/sycl/source/detail/platform_info.hpp index 1f482bd1c9475..f69beb23cdab1 100644 --- a/sycl/source/detail/platform_info.hpp +++ b/sycl/source/detail/platform_info.hpp @@ -53,9 +53,9 @@ template <> struct get_platform_info { static bool get(RT::PiPlatform plt, const plugin &Plugin) { std::string vendor_name = - get_platform_info::get(plt, Plugin); - - bool result = (vendor_name == "NVIDIA CUDA BACKEND") ? true : false; + get_platform_info::get(plt, + Plugin); + bool result = (vendor_name == "NVIDIA Corporation") ? true : false; return result; } }; From 6abe9fb8a7bea68209bec61cc8e0832067f67a8f Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 23 Aug 2021 21:57:13 +0100 Subject: [PATCH 03/20] Corrected the scoped context in guessLocalWorkSize to prevent stale contexts. Signed-off-by: JackAKirk --- sycl/plugins/cuda/pi_cuda.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index ed74ff083ffd2..987ff05022b62 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -244,14 +244,21 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. -void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], pi_kernel kernel, - pi_uint32 local_size) { +pi_result guessLocalWorkSize(int *threadsPerBlock, + const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], + pi_kernel kernel, pi_uint32 local_size) { assert(threadsPerBlock != nullptr); assert(global_work_size != nullptr); assert(kernel != nullptr); int recommendedBlockSize, minGrid; + try { + ScopedContext active(kernel->context_); + } catch (pi_result err) { + return err; + } + PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( &minGrid, &recommendedBlockSize, kernel->get(), NULL, local_size, maxThreadsPerBlock[0])); @@ -268,6 +275,7 @@ void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, while (0u != (global_work_size[0] % threadsPerBlock[0])) { --threadsPerBlock[0]; } + return PI_SUCCESS; } } // anonymous namespace @@ -2597,8 +2605,10 @@ pi_result cuda_piEnqueueKernelLaunch( return err; } } else { - guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock, - kernel, local_size); + auto err = guessLocalWorkSize(threadsPerBlock, global_work_size, + maxThreadsPerBlock, kernel); + if (err != PI_SUCCESS) + return err; } } From a3c251e7bcc09961a70c573435906eaf2d2ed651 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Wed, 25 Aug 2021 09:09:51 +0000 Subject: [PATCH 04/20] Added binary device query for P2P memcpy instead of platform query. Switched off redundancy check creating conncmd in Command::processDepEvent. Signed-off-by: jack.kirk --- sycl/include/CL/sycl/detail/pi.def | 1 + sycl/include/CL/sycl/detail/pi.h | 7 ++- sycl/include/CL/sycl/info/info_desc.hpp | 1 - sycl/include/CL/sycl/info/platform_traits.def | 1 - sycl/plugins/cuda/pi_cuda.cpp | 13 +++++- sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 10 +++++ sycl/plugins/level_zero/pi_level_zero.cpp | 10 +++++ sycl/plugins/opencl/pi_opencl.cpp | 9 ++++ sycl/plugins/rocm/pi_rocm.cpp | 11 +++++ sycl/source/detail/pi.cpp | 2 - sycl/source/detail/platform_info.hpp | 15 ------- sycl/source/detail/scheduler/commands.cpp | 14 +----- .../source/detail/scheduler/graph_builder.cpp | 45 ++++++++++--------- 13 files changed, 83 insertions(+), 56 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index 07ea8da8430d4..8522d932bd116 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -33,6 +33,7 @@ _PI_API(piextDeviceSelectBinary) _PI_API(piextGetDeviceFunctionPointer) _PI_API(piextDeviceGetNativeHandle) _PI_API(piextDeviceCreateWithNativeHandle) +_PI_API(piextP2P) // Context _PI_API(piContextCreate) _PI_API(piContextGetInfo) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index f4647eda7eaa2..a3f13294c8094 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -126,8 +126,7 @@ typedef enum { PI_PLATFORM_INFO_NAME = CL_PLATFORM_NAME, PI_PLATFORM_INFO_PROFILE = CL_PLATFORM_PROFILE, PI_PLATFORM_INFO_VENDOR = CL_PLATFORM_VENDOR, - PI_PLATFORM_INFO_VERSION = CL_PLATFORM_VERSION, - PI_PLATFORM_INFO_P2P = 0x40110 + PI_PLATFORM_INFO_VERSION = CL_PLATFORM_VERSION } _pi_platform_info; typedef enum { @@ -1058,6 +1057,10 @@ __SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( const pi_event *event_wait_list, pi_event *event); +/// p2p is set true if PI API's, piextEnqueueMemBufferCopyPeer/piextEnqueueMemBufferCopyRectPeer/piextEnqueueMemImageCopyPeer, for peer to peer memory copy may be called. +/// +__SYCL_EXPORT pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p); + __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); __SYCL_EXPORT pi_result piQueueRelease(pi_queue command_queue); diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp index 4476c4ea4acee..2e7ad37c7547d 100644 --- a/sycl/include/CL/sycl/info/info_desc.hpp +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -30,7 +30,6 @@ enum class platform { name = PI_PLATFORM_INFO_NAME, vendor = PI_PLATFORM_INFO_VENDOR, extensions = PI_PLATFORM_INFO_EXTENSIONS, - P2P = PI_PLATFORM_INFO_P2P, }; // A.2 Context information desctiptors diff --git a/sycl/include/CL/sycl/info/platform_traits.def b/sycl/include/CL/sycl/info/platform_traits.def index 162792af6fe72..3dd2a319020d5 100644 --- a/sycl/include/CL/sycl/info/platform_traits.def +++ b/sycl/include/CL/sycl/info/platform_traits.def @@ -3,4 +3,3 @@ __SYCL_PARAM_TRAITS_SPEC(platform, version, std::string) __SYCL_PARAM_TRAITS_SPEC(platform, name, std::string) __SYCL_PARAM_TRAITS_SPEC(platform, vendor, std::string) __SYCL_PARAM_TRAITS_SPEC(platform, extensions, std::vector) -__SYCL_PARAM_TRAITS_SPEC(platform, P2P, bool) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 987ff05022b62..93bc4d36ad158 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2605,8 +2605,8 @@ pi_result cuda_piEnqueueKernelLaunch( return err; } } else { - auto err = guessLocalWorkSize(threadsPerBlock, global_work_size, - maxThreadsPerBlock, kernel); + auto err = guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock, + kernel, local_size); if (err != PI_SUCCESS) return err; } @@ -5074,6 +5074,14 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } +pi_result cuda_piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) +{ + assert(src_device != nullptr); + assert(dst_device != nullptr); + *p2p = true; + return PI_SUCCESS; +} + // This API is called by Sycl RT to notify the end of the plugin lifetime. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. @@ -5115,6 +5123,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextDeviceGetNativeHandle, cuda_piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, cuda_piextDeviceCreateWithNativeHandle) + _PI_CL(piextP2P, cuda_piextP2P) // Context _PI_CL(piextContextSetExtendedDeleter, cuda_piextContextSetExtendedDeleter) _PI_CL(piContextCreate, cuda_piContextCreate) diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index f50a825fff68d..18b8b58f2661b 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -1260,6 +1260,16 @@ pi_result piTearDown(void *) { return PI_SUCCESS; } +// TODO properly implement this function if pi_esimd_cpu Peer to Peer copies become enabled. +// Currently Peer to Peer copies are unimplemented using the pi_esimd_cpu backend so p2p is always set false. +pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) +{ + assert(src_device != nullptr); + assert(dst_device != nullptr); + *p2p = false; + return PI_SUCCESS; +} + pi_result piPluginInit(pi_plugin *PluginInit) { assert(PluginInit); size_t PluginVersionSize = sizeof(PluginInit->PluginVersion); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 015e9cc32db63..56afa39f3aab9 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -6806,6 +6806,16 @@ pi_result piextPluginGetOpaqueData(void *opaque_data_param, return PI_ERROR_UNKNOWN; } +// TODO properly implement this function when level_zero Peer to Peer copies are enabled. +// Currently Peer to Peer copies are unimplemented using the level_zero backend so p2p is always set false. +pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) +{ + assert(src_device != nullptr); + assert(dst_device != nullptr); + *p2p = false; + return PI_SUCCESS; +} + // SYCL RT calls this api to notify the end of plugin lifetime. // It can include all the jobs to tear down resources before // the plugin is unloaded from memory. diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index fb05ceca94b32..eebd1f7033d59 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1223,6 +1223,15 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, return piextGetNativeHandle(kernel, nativeHandle); } +// Peer to Peer copies are not supported using the OpenCL backend so p2p is always set false. +pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) +{ + assert(src_device != nullptr); + assert(dst_device != nullptr); + *p2p = false; + return PI_SUCCESS; +} + // This API is called by Sycl RT to notify the end of the plugin lifetime. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index 26354a254464e..43ac63eeeda48 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -4599,6 +4599,16 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } +// TODO properly implement this function when rocm Peer to Peer copies are enabled. +// Currently Peer to Peer copies are unimplemented using the rocm backend so p2p is always set false. +pi_result rocm_piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) +{ + assert(src_device != nullptr); + assert(dst_device != nullptr); + *p2p = false; + return PI_SUCCESS; +} + // This API is called by Sycl RT to notify the end of the plugin lifetime. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. @@ -4640,6 +4650,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextDeviceGetNativeHandle, rocm_piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, rocm_piextDeviceCreateWithNativeHandle) + _PI_CL(piextP2P, rocm_piextP2P) // Context _PI_CL(piextContextSetExtendedDeleter, rocm_piextContextSetExtendedDeleter) _PI_CL(piContextCreate, rocm_piContextCreate) diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 3b09906dc3756..7db4a3f073704 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -201,8 +201,6 @@ std::string platformInfoToString(pi_platform_info info) { return "PI_PLATFORM_INFO_VENDOR"; case PI_PLATFORM_INFO_EXTENSIONS: return "PI_PLATFORM_INFO_EXTENSIONS"; - case PI_PLATFORM_INFO_P2P: - return "PI_PLATFORM_INFO_P2P"; } die("Unknown pi_platform_info value passed to " "cl::sycl::detail::pi::platformInfoToString"); diff --git a/sycl/source/detail/platform_info.hpp b/sycl/source/detail/platform_info.hpp index f69beb23cdab1..ff385c3bb4250 100644 --- a/sycl/source/detail/platform_info.hpp +++ b/sycl/source/detail/platform_info.hpp @@ -49,26 +49,11 @@ struct get_platform_info, info::platform::extensions> { } }; -template <> struct get_platform_info { - static bool get(RT::PiPlatform plt, const plugin &Plugin) { - - std::string vendor_name = - get_platform_info::get(plt, - Plugin); - bool result = (vendor_name == "NVIDIA Corporation") ? true : false; - return result; - } -}; - // Host platform information methods template inline typename info::param_traits::return_type get_platform_info_host() = delete; -template <> inline bool get_platform_info_host() { - return false; -} - template <> inline std::string get_platform_info_host() { return "FULL PROFILE"; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 059223141b430..47a3b5b7e3db0 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -486,7 +486,6 @@ void Command::makeTraceEventEpilog() { Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { const QueueImplPtr &WorkerQueue = getWorkerQueue(); - const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); // 1. Async work is not supported for host device. // 2. Some types of commands do not produce PI events after they are enqueued @@ -510,18 +509,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { WorkerQueue->has_property()) return nullptr; - ContextImplPtr DepEventContext = DepEvent->getContextImpl(); - // If contexts don't match we'll connect them using host task - if (DepEventContext != WorkerContext && !WorkerContext->is_host() && - !(WorkerQueue->get_device().get_backend() == - DepEventContext->getDevices()[0].get_platform().get_backend() && - WorkerQueue->get_device() - .get_platform() - .get_info())) { - Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; - ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep); - } else - MPreparedDepsEvents.push_back(std::move(DepEvent)); + MPreparedDepsEvents.push_back(std::move(DepEvent)); return ConnectionCmd; } diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 1bf752bb6ff53..e84e4186892dc 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -52,6 +52,21 @@ static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } +/// Finds the correct AllocaCommand matching the context of Record. +AllocaCommandBase *findAllocaCmd(MemObjRecord *Record){ + auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { + bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), + Record->MCurContext) && + // Looking for a parent buffer alloca command + AllocaCmd->getType() == Command::CommandType::ALLOCA; + return Res; + }; + const auto It = + std::find_if(Record->MAllocaCommands.begin(), + Record->MAllocaCommands.end(), IsSuitableAlloca); + return (Record->MAllocaCommands.end() != It) ? *It : nullptr; +} + /// Checks if the required access mode is allowed under the current one. static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { @@ -328,17 +343,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( // Since no alloca command for the sub buffer requirement was found in the // current context, need to find a parent alloca command for it (it must be // there) - auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext) && - // Looking for a parent buffer alloca command - AllocaCmd->getType() == Command::CommandType::ALLOCA; - return Res; - }; - const auto It = - std::find_if(Record->MAllocaCommands.begin(), - Record->MAllocaCommands.end(), IsSuitableAlloca); - AllocaCmdSrc = (Record->MAllocaCommands.end() != It) ? *It : nullptr; + AllocaCmdSrc = findAllocaCmd(Record); } if (!AllocaCmdSrc) throw runtime_error("Cannot find buffer allocation", PI_INVALID_VALUE); @@ -942,16 +947,16 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (!Queue->is_host() && !Record->MCurContext->is_host() && - !(Queue->get_device().get_platform().get_backend() == - Record->MCurContext->getDevices()[0] + } else if (!Queue->is_host() && !Record->MCurContext->is_host()) + { + bool p2p = false; + Queue->getPlugin().call(Queue->getDeviceImplPtr()->getHandleRef(), + findAllocaCmd(Record)->getQueue()->getDeviceImplPtr()->getHandleRef(), &p2p); + if(!(p2p && Queue->get_device().get_platform().get_backend() == Record->MCurContext->getDevices()[0] .get_platform() - .get_backend() && - Queue->get_device() - .get_platform() - .get_info())) - NeedMemMoveToHost = true; - + .get_backend())) + NeedMemMoveToHost = true; + } if (NeedMemMoveToHost) insertMemoryMove(Record, Req, Scheduler::getInstance().getDefaultHostQueue(), From 3cd69115b949caa1d96fd285469056e462009576 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 25 Aug 2021 11:00:33 +0100 Subject: [PATCH 05/20] Corrected formatting. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.h | 7 ++- sycl/plugins/cuda/pi_cuda.cpp | 7 ++- sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 8 ++-- sycl/plugins/level_zero/pi_level_zero.cpp | 8 ++-- sycl/plugins/opencl/pi_opencl.cpp | 6 +-- sycl/plugins/rocm/pi_rocm.cpp | 8 ++-- .../source/detail/scheduler/graph_builder.cpp | 44 ++++++++++--------- 7 files changed, 47 insertions(+), 41 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index a3f13294c8094..e8d63842cd5a2 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1057,9 +1057,12 @@ __SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( const pi_event *event_wait_list, pi_event *event); -/// p2p is set true if PI API's, piextEnqueueMemBufferCopyPeer/piextEnqueueMemBufferCopyRectPeer/piextEnqueueMemImageCopyPeer, for peer to peer memory copy may be called. +/// p2p is set true if PI API's, +/// piextEnqueueMemBufferCopyPeer/piextEnqueueMemBufferCopyRectPeer/piextEnqueueMemImageCopyPeer, +/// for peer to peer memory copy may be called. /// -__SYCL_EXPORT pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p); +__SYCL_EXPORT pi_result piextP2P(pi_device src_device, pi_device dst_device, + bool *p2p); __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 93bc4d36ad158..c75b20360c3f0 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2605,8 +2605,8 @@ pi_result cuda_piEnqueueKernelLaunch( return err; } } else { - auto err = guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock, - kernel, local_size); + auto err = guessLocalWorkSize(threadsPerBlock, global_work_size, + maxThreadsPerBlock, kernel, local_size); if (err != PI_SUCCESS) return err; } @@ -5074,8 +5074,7 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } -pi_result cuda_piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) -{ +pi_result cuda_piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = true; diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 18b8b58f2661b..8415c1a3d89d5 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -1260,10 +1260,10 @@ pi_result piTearDown(void *) { return PI_SUCCESS; } -// TODO properly implement this function if pi_esimd_cpu Peer to Peer copies become enabled. -// Currently Peer to Peer copies are unimplemented using the pi_esimd_cpu backend so p2p is always set false. -pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) -{ +// TODO properly implement this function if pi_esimd_cpu Peer to Peer copies +// become enabled. Currently Peer to Peer copies are unimplemented using the +// pi_esimd_cpu backend so p2p is always set false. +pi_result piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 56afa39f3aab9..8469ee14c494e 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -6806,10 +6806,10 @@ pi_result piextPluginGetOpaqueData(void *opaque_data_param, return PI_ERROR_UNKNOWN; } -// TODO properly implement this function when level_zero Peer to Peer copies are enabled. -// Currently Peer to Peer copies are unimplemented using the level_zero backend so p2p is always set false. -pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) -{ +// TODO properly implement this function when level_zero Peer to Peer copies are +// enabled. Currently Peer to Peer copies are unimplemented using the level_zero +// backend so p2p is always set false. +pi_result piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index eebd1f7033d59..655a31f781e02 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1223,9 +1223,9 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, return piextGetNativeHandle(kernel, nativeHandle); } -// Peer to Peer copies are not supported using the OpenCL backend so p2p is always set false. -pi_result piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) -{ +// Peer to Peer copies are not supported using the OpenCL backend so p2p is +// always set false. +pi_result piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index 43ac63eeeda48..f233c15f8ed47 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -4599,10 +4599,10 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } -// TODO properly implement this function when rocm Peer to Peer copies are enabled. -// Currently Peer to Peer copies are unimplemented using the rocm backend so p2p is always set false. -pi_result rocm_piextP2P(pi_device src_device, pi_device dst_device, bool* p2p) -{ +// TODO properly implement this function when rocm Peer to Peer copies are +// enabled. Currently Peer to Peer copies are unimplemented using the rocm +// backend so p2p is always set false. +pi_result rocm_piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index e84e4186892dc..f6a54e6f61330 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -53,18 +53,17 @@ static bool IsSuitableSubReq(const Requirement *Req) { } /// Finds the correct AllocaCommand matching the context of Record. -AllocaCommandBase *findAllocaCmd(MemObjRecord *Record){ - auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext) && - // Looking for a parent buffer alloca command - AllocaCmd->getType() == Command::CommandType::ALLOCA; - return Res; - }; - const auto It = - std::find_if(Record->MAllocaCommands.begin(), - Record->MAllocaCommands.end(), IsSuitableAlloca); - return (Record->MAllocaCommands.end() != It) ? *It : nullptr; +AllocaCommandBase *findAllocaCmd(MemObjRecord *Record) { + auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { + bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), + Record->MCurContext) && + // Looking for a parent buffer alloca command + AllocaCmd->getType() == Command::CommandType::ALLOCA; + return Res; + }; + const auto It = std::find_if(Record->MAllocaCommands.begin(), + Record->MAllocaCommands.end(), IsSuitableAlloca); + return (Record->MAllocaCommands.end() != It) ? *It : nullptr; } /// Checks if the required access mode is allowed under the current one. @@ -947,15 +946,20 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (!Queue->is_host() && !Record->MCurContext->is_host()) - { + } else if (!Queue->is_host() && !Record->MCurContext->is_host()) { bool p2p = false; - Queue->getPlugin().call(Queue->getDeviceImplPtr()->getHandleRef(), - findAllocaCmd(Record)->getQueue()->getDeviceImplPtr()->getHandleRef(), &p2p); - if(!(p2p && Queue->get_device().get_platform().get_backend() == Record->MCurContext->getDevices()[0] - .get_platform() - .get_backend())) - NeedMemMoveToHost = true; + Queue->getPlugin().call( + Queue->getDeviceImplPtr()->getHandleRef(), + findAllocaCmd(Record) + ->getQueue() + ->getDeviceImplPtr() + ->getHandleRef(), + &p2p); + if (!(p2p && Queue->get_device().get_platform().get_backend() == + Record->MCurContext->getDevices()[0] + .get_platform() + .get_backend())) + NeedMemMoveToHost = true; } if (NeedMemMoveToHost) insertMemoryMove(Record, Req, From c384fbe092a5d7e8bedfd9cd247b773b1bf1716f Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 25 Aug 2021 11:37:34 +0100 Subject: [PATCH 06/20] Corrected Formatting. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.h | 27 ++++++++++++--------------- sycl/plugins/cuda/pi_cuda.cpp | 11 ++++++----- sycl/plugins/rocm/pi_rocm.cpp | 3 ++- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index e8d63842cd5a2..bd46693d6964b 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1048,14 +1048,11 @@ __SYCL_EXPORT pi_result piQueueGetInfo(pi_queue command_queue, void *param_value, size_t *param_value_size_ret); - __SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, - pi_mem dst_buffer, size_t src_offset, - size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, + pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event); /// p2p is set true if PI API's, /// piextEnqueueMemBufferCopyPeer/piextEnqueueMemBufferCopyRectPeer/piextEnqueueMemImageCopyPeer, @@ -1469,12 +1466,12 @@ __SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( pi_event *event); __SYCL_EXPORT pi_result piextEnqueueMemBufferCopyRectPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, pi_mem dst_buffer, - pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, - pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, - size_t dst_row_pitch, size_t dst_slice_pitch, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event); + pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, + pi_mem dst_buffer, pi_buff_rect_offset src_origin, + pi_buff_rect_offset dst_origin, pi_buff_rect_region region, + size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, + size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); __SYCL_EXPORT pi_result piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, @@ -1502,8 +1499,8 @@ __SYCL_EXPORT pi_result piEnqueueMemImageCopy( const pi_event *event_wait_list, pi_event *event); __SYCL_EXPORT pi_result piextEnqueueMemImageCopyPeer( - pi_queue command_queue, pi_mem src_image, pi_queue dst_queue, pi_mem dst_image, - pi_image_offset src_origin, pi_image_offset dst_origin, + pi_queue command_queue, pi_mem src_image, pi_queue dst_queue, + pi_mem dst_image, pi_image_offset src_origin, pi_image_offset dst_origin, pi_image_region region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c75b20360c3f0..b7bd6ea9b87ff 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3646,10 +3646,10 @@ pi_result cuda_piSamplerRelease(pi_sampler sampler) { } void copyRectAsserts(const pi_buff_rect_region ®ion, - const pi_buff_rect_offset &src_offset, - const pi_buff_rect_offset &dst_offset, - const CUmemorytype_enum &src_type, - const CUmemorytype_enum &dst_type) { + const pi_buff_rect_offset &src_offset, + const pi_buff_rect_offset &dst_offset, + const CUmemorytype_enum &src_type, + const CUmemorytype_enum &dst_type) { assert(region != nullptr); assert(src_offset != nullptr); assert(dst_offset != nullptr); @@ -5152,7 +5152,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) _PI_CL(piextEnqueueMemBufferCopyPeer, cuda_piextEnqueueMemBufferCopyPeer) - _PI_CL(piextEnqueueMemBufferCopyRectPeer, cuda_piextEnqueueMemBufferCopyRectPeer) + _PI_CL(piextEnqueueMemBufferCopyRectPeer, + cuda_piextEnqueueMemBufferCopyRectPeer) _PI_CL(piextEnqueueMemImageCopyPeer, cuda_piextEnqueueMemImageCopyPeer) // Program _PI_CL(piProgramCreate, cuda_piProgramCreate) diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index f233c15f8ed47..a7aed9a9f0d5a 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -4730,7 +4730,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueMemBufferCopy, rocm_piEnqueueMemBufferCopy) _PI_CL(piextEnqueueMemBufferCopyPeer, rocm_piextEnqueueMemBufferCopyPeer) _PI_CL(piEnqueueMemBufferCopyRect, rocm_piEnqueueMemBufferCopyRect) - _PI_CL(piextEnqueueMemBufferCopyRectPeer, rocm_piextEnqueueMemBufferCopyRectPeer) + _PI_CL(piextEnqueueMemBufferCopyRectPeer, + rocm_piextEnqueueMemBufferCopyRectPeer) _PI_CL(piEnqueueMemBufferFill, rocm_piEnqueueMemBufferFill) _PI_CL(piEnqueueMemImageRead, rocm_piEnqueueMemImageRead) _PI_CL(piEnqueueMemImageWrite, rocm_piEnqueueMemImageWrite) From 27c073d00a854b68f733532653ac34e4da9e8e05 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 25 Aug 2021 12:52:56 +0100 Subject: [PATCH 07/20] Placed new PI API's after piTearDown. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.def | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index 8522d932bd116..fc81abf7d8401 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -33,7 +33,6 @@ _PI_API(piextDeviceSelectBinary) _PI_API(piextGetDeviceFunctionPointer) _PI_API(piextDeviceGetNativeHandle) _PI_API(piextDeviceCreateWithNativeHandle) -_PI_API(piextP2P) // Context _PI_API(piContextCreate) _PI_API(piContextGetInfo) @@ -112,14 +111,11 @@ _PI_API(piEnqueueMemBufferReadRect) _PI_API(piEnqueueMemBufferWrite) _PI_API(piEnqueueMemBufferWriteRect) _PI_API(piEnqueueMemBufferCopy) -_PI_API(piextEnqueueMemBufferCopyPeer) _PI_API(piEnqueueMemBufferCopyRect) -_PI_API(piextEnqueueMemBufferCopyRectPeer) _PI_API(piEnqueueMemBufferFill) _PI_API(piEnqueueMemImageRead) _PI_API(piEnqueueMemImageWrite) _PI_API(piEnqueueMemImageCopy) -_PI_API(piextEnqueueMemImageCopyPeer) _PI_API(piEnqueueMemImageFill) _PI_API(piEnqueueMemBufferMap) _PI_API(piEnqueueMemUnmap) @@ -141,4 +137,9 @@ _PI_API(piextPluginGetOpaqueData) _PI_API(piTearDown) +_PI_API(piextEnqueueMemBufferCopyPeer) +_PI_API(piextEnqueueMemBufferCopyRectPeer) +_PI_API(piextEnqueueMemImageCopyPeer) +_PI_API(piextP2P) + #undef _PI_API From 60f276db34b773ab07a27f5d9d04f28bc5cc9dc2 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 25 Aug 2021 15:16:03 +0100 Subject: [PATCH 08/20] Renamed piextP2P as piextDevicesSupportP2P. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.def | 2 +- sycl/include/CL/sycl/detail/pi.h | 2 +- sycl/plugins/cuda/pi_cuda.cpp | 4 ++-- sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 2 +- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- sycl/plugins/opencl/pi_opencl.cpp | 2 +- sycl/plugins/rocm/pi_rocm.cpp | 4 ++-- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index fc81abf7d8401..f7e6644f1eba2 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -140,6 +140,6 @@ _PI_API(piTearDown) _PI_API(piextEnqueueMemBufferCopyPeer) _PI_API(piextEnqueueMemBufferCopyRectPeer) _PI_API(piextEnqueueMemImageCopyPeer) -_PI_API(piextP2P) +_PI_API(piextDevicesSupportP2P) #undef _PI_API diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index bd46693d6964b..cdf54cb246c24 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1058,7 +1058,7 @@ __SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( /// piextEnqueueMemBufferCopyPeer/piextEnqueueMemBufferCopyRectPeer/piextEnqueueMemImageCopyPeer, /// for peer to peer memory copy may be called. /// -__SYCL_EXPORT pi_result piextP2P(pi_device src_device, pi_device dst_device, +__SYCL_EXPORT pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p); __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b7bd6ea9b87ff..22cd6c712f45d 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -5074,7 +5074,7 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } -pi_result cuda_piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result cuda_piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = true; @@ -5122,7 +5122,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextDeviceGetNativeHandle, cuda_piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, cuda_piextDeviceCreateWithNativeHandle) - _PI_CL(piextP2P, cuda_piextP2P) + _PI_CL(piextDevicesSupportP2P, cuda_piextDevicesSupportP2P) // Context _PI_CL(piextContextSetExtendedDeleter, cuda_piextContextSetExtendedDeleter) _PI_CL(piContextCreate, cuda_piContextCreate) diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 8415c1a3d89d5..8e3f5b6872bf1 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -1263,7 +1263,7 @@ pi_result piTearDown(void *) { // TODO properly implement this function if pi_esimd_cpu Peer to Peer copies // become enabled. Currently Peer to Peer copies are unimplemented using the // pi_esimd_cpu backend so p2p is always set false. -pi_result piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 8469ee14c494e..a6cd569c00e36 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -6809,7 +6809,7 @@ pi_result piextPluginGetOpaqueData(void *opaque_data_param, // TODO properly implement this function when level_zero Peer to Peer copies are // enabled. Currently Peer to Peer copies are unimplemented using the level_zero // backend so p2p is always set false. -pi_result piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 655a31f781e02..eb412aedf0fdf 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1225,7 +1225,7 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, // Peer to Peer copies are not supported using the OpenCL backend so p2p is // always set false. -pi_result piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index a7aed9a9f0d5a..108cdb442d196 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -4602,7 +4602,7 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, // TODO properly implement this function when rocm Peer to Peer copies are // enabled. Currently Peer to Peer copies are unimplemented using the rocm // backend so p2p is always set false. -pi_result rocm_piextP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result rocm_piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; @@ -4650,7 +4650,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextDeviceGetNativeHandle, rocm_piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, rocm_piextDeviceCreateWithNativeHandle) - _PI_CL(piextP2P, rocm_piextP2P) + _PI_CL(piextDevicesSupportP2P, rocm_piextDevicesSupportP2P) // Context _PI_CL(piextContextSetExtendedDeleter, rocm_piextContextSetExtendedDeleter) _PI_CL(piContextCreate, rocm_piContextCreate) From 835e5c4c82fb8085e8d594b9ecb4a2ee3953a55d Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 25 Aug 2021 15:55:18 +0100 Subject: [PATCH 09/20] Made check that devices backends match before P2P query. Included header for backend_impl.hpp in graph_builder.cpp so that detail::getImplBackend may be called. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.h | 4 +-- sycl/plugins/cuda/pi_cuda.cpp | 3 ++- sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 3 ++- sycl/plugins/level_zero/pi_level_zero.cpp | 3 ++- sycl/plugins/opencl/pi_opencl.cpp | 3 ++- sycl/plugins/rocm/pi_rocm.cpp | 3 ++- .../source/detail/scheduler/graph_builder.cpp | 27 ++++++++++--------- 7 files changed, 27 insertions(+), 19 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index cdf54cb246c24..c4166a67278c2 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1058,8 +1058,8 @@ __SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( /// piextEnqueueMemBufferCopyPeer/piextEnqueueMemBufferCopyRectPeer/piextEnqueueMemImageCopyPeer, /// for peer to peer memory copy may be called. /// -__SYCL_EXPORT pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, - bool *p2p); +__SYCL_EXPORT pi_result piextDevicesSupportP2P(pi_device src_device, + pi_device dst_device, bool *p2p); __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 22cd6c712f45d..b41bae6985dd9 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -5074,7 +5074,8 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } -pi_result cuda_piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result cuda_piextDevicesSupportP2P(pi_device src_device, + pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = true; diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 8e3f5b6872bf1..c6b046c1eca9d 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -1263,7 +1263,8 @@ pi_result piTearDown(void *) { // TODO properly implement this function if pi_esimd_cpu Peer to Peer copies // become enabled. Currently Peer to Peer copies are unimplemented using the // pi_esimd_cpu backend so p2p is always set false. -pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, + bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index a6cd569c00e36..f52fe9601a732 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -6809,7 +6809,8 @@ pi_result piextPluginGetOpaqueData(void *opaque_data_param, // TODO properly implement this function when level_zero Peer to Peer copies are // enabled. Currently Peer to Peer copies are unimplemented using the level_zero // backend so p2p is always set false. -pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, + bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index eb412aedf0fdf..ed77feee70049 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1225,7 +1225,8 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, // Peer to Peer copies are not supported using the OpenCL backend so p2p is // always set false. -pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, + bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index 108cdb442d196..0f18fc86c296a 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -4602,7 +4602,8 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, // TODO properly implement this function when rocm Peer to Peer copies are // enabled. Currently Peer to Peer copies are unimplemented using the rocm // backend so p2p is always set false. -pi_result rocm_piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, bool *p2p) { +pi_result rocm_piextDevicesSupportP2P(pi_device src_device, + pi_device dst_device, bool *p2p) { assert(src_device != nullptr); assert(dst_device != nullptr); *p2p = false; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index f6a54e6f61330..76e506854d859 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -947,19 +948,21 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, MemMoveTargetQueue = HT.MQueue; } } else if (!Queue->is_host() && !Record->MCurContext->is_host()) { - bool p2p = false; - Queue->getPlugin().call( - Queue->getDeviceImplPtr()->getHandleRef(), - findAllocaCmd(Record) - ->getQueue() - ->getDeviceImplPtr() - ->getHandleRef(), - &p2p); - if (!(p2p && Queue->get_device().get_platform().get_backend() == - Record->MCurContext->getDevices()[0] - .get_platform() - .get_backend())) + if (detail::getImplBackend(Queue) != + detail::getImplBackend(Record->MCurContext)) NeedMemMoveToHost = true; + else { + bool p2p = false; + Queue->getPlugin().call( + Queue->getDeviceImplPtr()->getHandleRef(), + findAllocaCmd(Record) + ->getQueue() + ->getDeviceImplPtr() + ->getHandleRef(), + &p2p); + if (!p2p) + NeedMemMoveToHost = true; + } } if (NeedMemMoveToHost) insertMemoryMove(Record, Req, From 9e3ef859db105c9d2f59ac86439805c18666b113 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 25 Aug 2021 16:00:21 +0100 Subject: [PATCH 10/20] Corrected formating in graph_builder.cpp. Signed-off-by: JackAKirk --- sycl/source/detail/scheduler/graph_builder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 76e506854d859..35248a9dd0d7e 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -10,11 +10,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include From 0d819c0d5d10e522117e35d3a4973c8dc861de19 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 26 Aug 2021 15:15:48 +0100 Subject: [PATCH 11/20] Replaced binary device query with device_info call returning a vector of readable peer devices. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.def | 1 - sycl/include/CL/sycl/detail/pi.h | 10 ++------ sycl/plugins/cuda/pi_cuda.cpp | 18 ++++++------- sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 11 -------- sycl/plugins/level_zero/pi_level_zero.cpp | 11 -------- sycl/plugins/opencl/pi_opencl.cpp | 10 -------- sycl/plugins/rocm/pi_rocm.cpp | 12 --------- .../source/detail/scheduler/graph_builder.cpp | 25 +++++++++++++------ 8 files changed, 28 insertions(+), 70 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index f7e6644f1eba2..2cac6f88c31d6 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -140,6 +140,5 @@ _PI_API(piTearDown) _PI_API(piextEnqueueMemBufferCopyPeer) _PI_API(piextEnqueueMemBufferCopyRectPeer) _PI_API(piextEnqueueMemImageCopyPeer) -_PI_API(piextDevicesSupportP2P) #undef _PI_API diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index c4166a67278c2..72a90cd40e6b7 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -295,7 +295,8 @@ typedef enum { PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026, PI_DEVICE_INFO_IMAGE_SRGB = 0x10027, PI_DEVICE_INFO_ATOMIC_64 = 0x10110, - PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111 + PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111, + PI_DEVICE_INFO_P2P_READ_DEVICES = 0x10112 } _pi_device_info; typedef enum { @@ -1054,13 +1055,6 @@ __SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); -/// p2p is set true if PI API's, -/// piextEnqueueMemBufferCopyPeer/piextEnqueueMemBufferCopyRectPeer/piextEnqueueMemImageCopyPeer, -/// for peer to peer memory copy may be called. -/// -__SYCL_EXPORT pi_result piextDevicesSupportP2P(pi_device src_device, - pi_device dst_device, bool *p2p); - __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); __SYCL_EXPORT pi_result piQueueRelease(pi_queue command_queue); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b41bae6985dd9..0a8ea6287d0c0 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1627,6 +1627,15 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, } return getInfo(param_value_size, param_value, param_value_size_ret, value); } + case PI_DEVICE_INFO_P2P_READ_DEVICES: { + + std::vector devs; + + for (const auto &dev : device->get_platform()->devices_) { + devs.emplace_back(dev.get()); + } + return getInfo(param_value_size, param_value, param_value_size_ret, devs); + } // TODO: Investigate if this information is available on CUDA. case PI_DEVICE_INFO_PCI_ADDRESS: @@ -5074,14 +5083,6 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } -pi_result cuda_piextDevicesSupportP2P(pi_device src_device, - pi_device dst_device, bool *p2p) { - assert(src_device != nullptr); - assert(dst_device != nullptr); - *p2p = true; - return PI_SUCCESS; -} - // This API is called by Sycl RT to notify the end of the plugin lifetime. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. @@ -5123,7 +5124,6 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextDeviceGetNativeHandle, cuda_piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, cuda_piextDeviceCreateWithNativeHandle) - _PI_CL(piextDevicesSupportP2P, cuda_piextDevicesSupportP2P) // Context _PI_CL(piextContextSetExtendedDeleter, cuda_piextContextSetExtendedDeleter) _PI_CL(piContextCreate, cuda_piContextCreate) diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index c6b046c1eca9d..f50a825fff68d 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -1260,17 +1260,6 @@ pi_result piTearDown(void *) { return PI_SUCCESS; } -// TODO properly implement this function if pi_esimd_cpu Peer to Peer copies -// become enabled. Currently Peer to Peer copies are unimplemented using the -// pi_esimd_cpu backend so p2p is always set false. -pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, - bool *p2p) { - assert(src_device != nullptr); - assert(dst_device != nullptr); - *p2p = false; - return PI_SUCCESS; -} - pi_result piPluginInit(pi_plugin *PluginInit) { assert(PluginInit); size_t PluginVersionSize = sizeof(PluginInit->PluginVersion); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 37b7cbe3b1b7c..e37129e4583f1 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -6824,17 +6824,6 @@ pi_result piextPluginGetOpaqueData(void *opaque_data_param, return PI_ERROR_UNKNOWN; } -// TODO properly implement this function when level_zero Peer to Peer copies are -// enabled. Currently Peer to Peer copies are unimplemented using the level_zero -// backend so p2p is always set false. -pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, - bool *p2p) { - assert(src_device != nullptr); - assert(dst_device != nullptr); - *p2p = false; - return PI_SUCCESS; -} - // SYCL RT calls this api to notify the end of plugin lifetime. // It can include all the jobs to tear down resources before // the plugin is unloaded from memory. diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index ed77feee70049..fb05ceca94b32 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1223,16 +1223,6 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, return piextGetNativeHandle(kernel, nativeHandle); } -// Peer to Peer copies are not supported using the OpenCL backend so p2p is -// always set false. -pi_result piextDevicesSupportP2P(pi_device src_device, pi_device dst_device, - bool *p2p) { - assert(src_device != nullptr); - assert(dst_device != nullptr); - *p2p = false; - return PI_SUCCESS; -} - // This API is called by Sycl RT to notify the end of the plugin lifetime. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index 0f18fc86c296a..a43db720f18b6 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -4599,17 +4599,6 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } -// TODO properly implement this function when rocm Peer to Peer copies are -// enabled. Currently Peer to Peer copies are unimplemented using the rocm -// backend so p2p is always set false. -pi_result rocm_piextDevicesSupportP2P(pi_device src_device, - pi_device dst_device, bool *p2p) { - assert(src_device != nullptr); - assert(dst_device != nullptr); - *p2p = false; - return PI_SUCCESS; -} - // This API is called by Sycl RT to notify the end of the plugin lifetime. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. @@ -4651,7 +4640,6 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextDeviceGetNativeHandle, rocm_piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, rocm_piextDeviceCreateWithNativeHandle) - _PI_CL(piextDevicesSupportP2P, rocm_piextDevicesSupportP2P) // Context _PI_CL(piextContextSetExtendedDeleter, rocm_piextContextSetExtendedDeleter) _PI_CL(piContextCreate, rocm_piContextCreate) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 35248a9dd0d7e..422e26a37634c 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -952,15 +952,24 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, detail::getImplBackend(Record->MCurContext)) NeedMemMoveToHost = true; else { - bool p2p = false; - Queue->getPlugin().call( + std::vector devs; + + Queue->getPlugin().call_nocheck( Queue->getDeviceImplPtr()->getHandleRef(), - findAllocaCmd(Record) - ->getQueue() - ->getDeviceImplPtr() - ->getHandleRef(), - &p2p); - if (!p2p) + PI_DEVICE_INFO_P2P_READ_DEVICES, sizeof(devs), &devs, nullptr); + + bool can_read_peer = false; + const auto &src_dev = findAllocaCmd(Record) + ->getQueue() + ->getDeviceImplPtr() + ->getHandleRef(); + for (const auto &dev : devs) { + if (dev == src_dev) { + can_read_peer = true; + break; + } + } + if (!can_read_peer) NeedMemMoveToHost = true; } } From 43f970c4de6b79ba914266b42a9b92b0b3705e33 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 Aug 2021 21:37:37 +0100 Subject: [PATCH 12/20] Removed piext Peer functions, replaced them with existing PI copy calls. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.def | 4 - sycl/include/CL/sycl/detail/pi.h | 19 -- sycl/plugins/cuda/pi_cuda.cpp | 236 ++++------------------ sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 25 --- sycl/plugins/level_zero/pi_level_zero.cpp | 30 --- sycl/plugins/rocm/pi_rocm.cpp | 33 --- sycl/source/detail/memory_manager.cpp | 26 +-- 7 files changed, 39 insertions(+), 334 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index 2cac6f88c31d6..730b4afa50c0c 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -137,8 +137,4 @@ _PI_API(piextPluginGetOpaqueData) _PI_API(piTearDown) -_PI_API(piextEnqueueMemBufferCopyPeer) -_PI_API(piextEnqueueMemBufferCopyRectPeer) -_PI_API(piextEnqueueMemImageCopyPeer) - #undef _PI_API diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 72a90cd40e6b7..b89986404b139 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1049,11 +1049,6 @@ __SYCL_EXPORT pi_result piQueueGetInfo(pi_queue command_queue, void *param_value, size_t *param_value_size_ret); -__SYCL_EXPORT pi_result piextEnqueueMemBufferCopyPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, - pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event); __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); @@ -1459,14 +1454,6 @@ __SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); -__SYCL_EXPORT pi_result piextEnqueueMemBufferCopyRectPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, - pi_mem dst_buffer, pi_buff_rect_offset src_origin, - pi_buff_rect_offset dst_origin, pi_buff_rect_region region, - size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, - size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event); - __SYCL_EXPORT pi_result piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, @@ -1492,12 +1479,6 @@ __SYCL_EXPORT pi_result piEnqueueMemImageCopy( pi_image_region region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); -__SYCL_EXPORT pi_result piextEnqueueMemImageCopyPeer( - pi_queue command_queue, pi_mem src_image, pi_queue dst_queue, - pi_mem dst_image, pi_image_offset src_origin, pi_image_offset dst_origin, - pi_image_region region, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event); - __SYCL_EXPORT pi_result piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, const size_t *origin, diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 0a8ea6287d0c0..b5617a024e682 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3907,63 +3907,18 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset; - result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); - - if (event) { - result = retImplEv->record(); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -pi_result cuda_piextEnqueueMemBufferCopyPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, - pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - - assert(src_buffer != nullptr); - assert(dst_buffer != nullptr); - - if (!dst_queue) { - return PI_INVALID_QUEUE; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(dst_queue->get_context()); - if (event_wait_list) { - cuda_piEnqueueEventsWait(src_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } + if (src_buffer->context_ == dst_buffer->context_) { + result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); + } else { + auto dst_context = dst_buffer->context_->get(); + auto src_context = src_buffer->context_->get(); - pi_result result; + cuCtxEnablePeerAccess(src_context, 0); - if (event) { - retImplEv = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, dst_queue)); - result = retImplEv->start(); + result = PI_CHECK_ERROR( + cuMemcpyPeerAsync(dst, dst_context, src, src_context, size, stream)); } - auto stream = dst_queue->get(); - auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; - auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset; - - auto dst_context = dst_queue->get_context()->get(); - auto src_context = src_queue->get_context()->get(); - - cuCtxEnablePeerAccess(src_context, 0); - - result = PI_CHECK_ERROR( - cuMemcpyPeerAsync(dst, dst_context, src, src_context, size, stream)); - if (event) { result = retImplEv->record(); *event = retImplEv.release(); @@ -4006,69 +3961,23 @@ pi_result cuda_piEnqueueMemBufferCopyRect( PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue)); retImplEv->start(); } + if (src_buffer->context_ == dst_buffer->context_) { + retErr = commonEnqueueMemBufferCopyRect( + cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, + src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, + dst_origin, dst_row_pitch, dst_slice_pitch); + } else { + auto dstContext = dst_buffer->context_->get(); + auto srcContext = src_buffer->context_->get(); - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, - src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, - dst_origin, dst_row_pitch, dst_slice_pitch); - - if (event) { - retImplEv->record(); - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piextEnqueueMemBufferCopyRectPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, - pi_mem dst_buffer, pi_buff_rect_offset src_origin, - pi_buff_rect_offset dst_origin, pi_buff_rect_region region, - size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, - size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(src_buffer != nullptr); - assert(dst_buffer != nullptr); - - if (!dst_queue) { - return PI_INVALID_QUEUE; - } - - pi_result retErr = PI_SUCCESS; - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - CUstream cuStream = dst_queue->get(); - CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); - CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); - - auto dstContext = dst_queue->get_context()->get(); - auto srcContext = src_queue->get_context()->get(); - - cuCtxEnablePeerAccess(srcContext, 0); - - try { - ScopedContext active(dst_queue->get_context()); - if (event_wait_list) { - retErr = cuda_piEnqueueEventsWait(src_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } + cuCtxEnablePeerAccess(srcContext, 0); - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, dst_queue)); - retImplEv->start(); + retErr = commonEnqueueMemBufferCopyRectPeer( + cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, + src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, + dst_origin, dst_row_pitch, dst_slice_pitch, dstContext, srcContext); } - retErr = commonEnqueueMemBufferCopyRectPeer( - cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, - src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, - dst_origin, dst_row_pitch, dst_slice_pitch, dstContext, srcContext); - if (event) { retImplEv->record(); *event = retImplEv.release(); @@ -4507,9 +4416,21 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); + if (src_image->context_ == dst_image->context_) { + retErr = commonEnqueueMemImageNDCopy( + cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, + srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); + } else { + auto dstContext = dst_image->context_->get(); + auto srcContext = src_image->context_->get(); + + cuCtxEnablePeerAccess(srcContext, 0); + + retErr = commonEnqueueMemImageNDCopyPeer( + cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, + srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext, + srcContext); + } if (retErr != PI_SUCCESS) { return retErr; @@ -4531,87 +4452,6 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, return retErr; } -pi_result cuda_piextEnqueueMemImageCopyPeer( - pi_queue src_queue, pi_mem src_image, pi_queue dst_queue, pi_mem dst_image, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - - assert(src_image->mem_type_ == _pi_mem::mem_type::surface); - assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); - assert(src_image->mem_.surface_mem_.get_image_type() == - dst_image->mem_.surface_mem_.get_image_type()); - - if (!dst_queue) { - return PI_INVALID_QUEUE; - } - - pi_result retErr = PI_SUCCESS; - CUstream cuStream = dst_queue->get(); - - try { - ScopedContext active(dst_queue->get_context()); - - if (event_wait_list) { - cuda_piEnqueueEventsWait(src_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } - - CUarray srcArray = src_image->mem_.surface_mem_.get_array(); - CUarray dstArray = dst_image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR srcArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); - CUDA_ARRAY_DESCRIPTOR dstArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); - - assert(srcArrayDesc.Format == dstArrayDesc.Format); - assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels); - - int elementByteSize = imageElementByteSize(srcArrayDesc); - - size_t dstByteOffsetX = - dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels; - size_t srcByteOffsetX = - src_origin[0] * elementByteSize * dstArrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); - - auto dstContext = dst_queue->get_context()->get(); - auto srcContext = src_queue->get_context()->get(); - - cuCtxEnablePeerAccess(srcContext, 0); - - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; - size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - - retErr = commonEnqueueMemImageNDCopyPeer( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext, - srcContext); - - if (retErr != PI_SUCCESS) { - return retErr; - } - - if (event) { - auto new_event = - _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, dst_queue); - new_event->record(); - *event = new_event; - } - - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - /// \TODO Not implemented in CUDA, requires untie from OpenCL pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *, const size_t *, const size_t *, pi_uint32, @@ -5152,10 +4992,6 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piMemBufferPartition, cuda_piMemBufferPartition) _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) - _PI_CL(piextEnqueueMemBufferCopyPeer, cuda_piextEnqueueMemBufferCopyPeer) - _PI_CL(piextEnqueueMemBufferCopyRectPeer, - cuda_piextEnqueueMemBufferCopyRectPeer) - _PI_CL(piextEnqueueMemImageCopyPeer, cuda_piextEnqueueMemImageCopyPeer) // Program _PI_CL(piProgramCreate, cuda_piProgramCreate) _PI_CL(piclProgramCreateWithSource, cuda_piclProgramCreateWithSource) diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index f50a825fff68d..2b29c46382738 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -1010,13 +1010,6 @@ pi_result piEnqueueMemBufferCopy(pi_queue, pi_mem, pi_mem, size_t, size_t, return PI_SUCCESS; } -pi_result piextEnqueueMemBufferCopyPeer(pi_queue, pi_mem, pi_queue, pi_mem, - size_t, size_t, size_t, pi_uint32, - const pi_event *, pi_event *) { - DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; -} - pi_result piEnqueueMemBufferCopyRect(pi_queue, pi_mem, pi_mem, pi_buff_rect_offset, pi_buff_rect_offset, pi_buff_rect_region, size_t, size_t, @@ -1026,16 +1019,6 @@ pi_result piEnqueueMemBufferCopyRect(pi_queue, pi_mem, pi_mem, return PI_SUCCESS; } -pi_result piextEnqueueMemBufferCopyRectPeer(pi_queue, pi_mem, pi_queue, pi_mem, - pi_buff_rect_offset, - pi_buff_rect_offset, - pi_buff_rect_region, size_t, size_t, - size_t, size_t, pi_uint32, - const pi_event *, pi_event *) { - DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; -} - pi_result piEnqueueMemBufferFill(pi_queue, pi_mem, const void *, size_t, size_t, size_t, pi_uint32, const pi_event *, pi_event *) { @@ -1097,14 +1080,6 @@ pi_result piEnqueueMemImageCopy(pi_queue, pi_mem, pi_mem, pi_image_offset, return PI_SUCCESS; } -pi_result piextEnqueueMemImageCopyPeer(pi_queue, pi_mem, pi_queue, pi_mem, - pi_image_offset, pi_image_offset, - pi_image_region, pi_uint32, - const pi_event *, pi_event *) { - DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; -} - pi_result piEnqueueMemImageFill(pi_queue, pi_mem, const void *, const size_t *, const size_t *, pi_uint32, const pi_event *, pi_event *) { diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index e37129e4583f1..7e7d97bdad531 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -5270,17 +5270,6 @@ pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } -pi_result piextEnqueueMemBufferCopyPeer(pi_queue src_queue, pi_mem src_buffer, - pi_queue dst_queue, pi_mem dst_buffer, - size_t src_offset, size_t dst_offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - die("piextEnqueueMemBufferCopyPeer: not implemented"); - return {}; -} - pi_result piEnqueueMemBufferCopyRect( pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer, pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, @@ -5300,16 +5289,6 @@ pi_result piEnqueueMemBufferCopyRect( NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } -pi_result piextEnqueueMemBufferCopyRectPeer( - pi_queue Queue, pi_mem SrcBuffer, pi_queue DstQueue, pi_mem DstBuffer, - pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, - pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch, - size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { - die("piextEnqueueMemBufferCopyRectPeer: not implemented"); - return {}; -} - } // extern "C" // @@ -5907,15 +5886,6 @@ piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } -pi_result piextEnqueueMemImageCopyPeer( - pi_queue Queue, pi_mem SrcImage, pi_queue dst_queue, pi_mem DstImage, - pi_image_offset SrcOrigin, pi_image_offset DstOrigin, - pi_image_region Region, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { - die("piextEnqueueMemImageCopyPeer: not implemented"); - return {}; -} - pi_result piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, const void *FillColor, const size_t *Origin, const size_t *Region, diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index a43db720f18b6..aa4bbe5270efd 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -3669,15 +3669,6 @@ pi_result rocm_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, } } -pi_result rocm_piextEnqueueMemBufferCopyPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, - pi_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - die("rocm_piextEnqueueMemBufferCopyPeer: not implemented"); - return {}; -} - pi_result rocm_piEnqueueMemBufferCopyRect( pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, @@ -3724,17 +3715,6 @@ pi_result rocm_piEnqueueMemBufferCopyRect( return retErr; } -pi_result rocm_piextEnqueueMemBufferCopyRectPeer( - pi_queue src_queue, pi_mem src_buffer, pi_queue dst_queue, - pi_mem dst_buffer, pi_buff_rect_offset src_origin, - pi_buff_rect_offset dst_origin, pi_buff_rect_region region, - size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, - size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - die("rocm_piextEnqueueMemBufferCopyRectPeer not implemented"); - return {}; -} - pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, @@ -4143,15 +4123,6 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, return retErr; } -pi_result rocm_piextEnqueueMemImageCopyPeer( - pi_queue Queue, pi_mem SrcImage, pi_queue dst_queue, pi_mem DstImage, - pi_image_offset SrcOrigin, pi_image_offset DstOrigin, - pi_image_region Region, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { - die("rocm_piextEnqueueMemImageCopyPeer: not implemented"); - return {}; -} - /// \TODO Not implemented in HIP, requires untie from OpenCL pi_result rocm_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const void *fill_color, @@ -4717,15 +4688,11 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueMemBufferWrite, rocm_piEnqueueMemBufferWrite) _PI_CL(piEnqueueMemBufferWriteRect, rocm_piEnqueueMemBufferWriteRect) _PI_CL(piEnqueueMemBufferCopy, rocm_piEnqueueMemBufferCopy) - _PI_CL(piextEnqueueMemBufferCopyPeer, rocm_piextEnqueueMemBufferCopyPeer) _PI_CL(piEnqueueMemBufferCopyRect, rocm_piEnqueueMemBufferCopyRect) - _PI_CL(piextEnqueueMemBufferCopyRectPeer, - rocm_piextEnqueueMemBufferCopyRectPeer) _PI_CL(piEnqueueMemBufferFill, rocm_piEnqueueMemBufferFill) _PI_CL(piEnqueueMemImageRead, rocm_piEnqueueMemImageRead) _PI_CL(piEnqueueMemImageWrite, rocm_piEnqueueMemImageWrite) _PI_CL(piEnqueueMemImageCopy, rocm_piEnqueueMemImageCopy) - _PI_CL(piextEnqueueMemImageCopyPeer, rocm_piextEnqueueMemImageCopyPeer) _PI_CL(piEnqueueMemImageFill, rocm_piEnqueueMemImageFill) _PI_CL(piEnqueueMemBufferMap, rocm_piEnqueueMemBufferMap) _PI_CL(piEnqueueMemUnmap, rocm_piEnqueueMemUnmap) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 24fdaebe88c6b..8203fefce9de4 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -427,11 +427,11 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, std::vector DepEvents, RT::PiEvent &OutEvent) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); - const RT::PiQueue Queue = SrcQueue->getHandleRef(); - const RT::PiQueue QueueDst = DstQueue->getHandleRef(); const detail::plugin &Plugin = SrcQueue->getPlugin(); - auto sameCtxt = SrcQueue->get_context() == DstQueue->get_context(); + const RT::PiQueue Queue = SrcQueue->get_context() == DstQueue->get_context() + ? SrcQueue->getHandleRef() + : DstQueue->getHandleRef(); detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType(); TermPositions SrcPos, DstPos; @@ -446,17 +446,10 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, if (MemType == detail::SYCLMemObjI::MemObjType::Buffer) { if (1 == DimDst && 1 == DimSrc) { - if (sameCtxt) { Plugin.call( Queue, SrcMem, DstMem, SrcXOffBytes, DstXOffBytes, SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(), &OutEvent); - } else { - Plugin.call( - Queue, SrcMem, QueueDst, DstMem, SrcXOffBytes, DstXOffBytes, - SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(), - &OutEvent); - } } else { // passing 0 for pitches not allowed. Because clEnqueueCopyBufferRect will // calculate both src and dest pitch using region[0], which is not correct @@ -478,17 +471,10 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, SrcAccessRange[SrcPos.YTerm], SrcAccessRange[SrcPos.ZTerm]}; - if (sameCtxt) { Plugin.call( Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size(), DepEvents.data(), &OutEvent); - } else { - Plugin.call( - Queue, SrcMem, QueueDst, DstMem, &SrcOrigin, &DstOrigin, &Region, - SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, - DepEvents.size(), DepEvents.data(), &OutEvent); - } } } else { pi_image_offset_struct SrcOrigin{SrcOffset[SrcPos.XTerm], @@ -501,15 +487,9 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, SrcAccessRange[SrcPos.YTerm], SrcAccessRange[SrcPos.ZTerm]}; - if (sameCtxt) { Plugin.call( Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, DepEvents.size(), DepEvents.data(), &OutEvent); - } else { - Plugin.call( - Queue, SrcMem, QueueDst, DstMem, &SrcOrigin, &DstOrigin, &Region, - DepEvents.size(), DepEvents.data(), &OutEvent); - } } } From 9b9a21fe4db928afd9dadcb54263b1619bdad485 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 31 Aug 2021 21:43:32 +0100 Subject: [PATCH 13/20] Fixed formatting issues following previous commit. Signed-off-by: JackAKirk --- sycl/include/CL/sycl/detail/pi.h | 1 - sycl/source/detail/memory_manager.cpp | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index b89986404b139..5207800372d69 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1049,7 +1049,6 @@ __SYCL_EXPORT pi_result piQueueGetInfo(pi_queue command_queue, void *param_value, size_t *param_value_size_ret); - __SYCL_EXPORT pi_result piQueueRetain(pi_queue command_queue); __SYCL_EXPORT pi_result piQueueRelease(pi_queue command_queue); diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 8203fefce9de4..777897b235264 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -446,10 +446,10 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, if (MemType == detail::SYCLMemObjI::MemObjType::Buffer) { if (1 == DimDst && 1 == DimSrc) { - Plugin.call( - Queue, SrcMem, DstMem, SrcXOffBytes, DstXOffBytes, - SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(), - &OutEvent); + Plugin.call( + Queue, SrcMem, DstMem, SrcXOffBytes, DstXOffBytes, + SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(), + &OutEvent); } else { // passing 0 for pitches not allowed. Because clEnqueueCopyBufferRect will // calculate both src and dest pitch using region[0], which is not correct @@ -471,10 +471,10 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, SrcAccessRange[SrcPos.YTerm], SrcAccessRange[SrcPos.ZTerm]}; - Plugin.call( - Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch, - SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size(), - DepEvents.data(), &OutEvent); + Plugin.call( + Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size(), + DepEvents.data(), &OutEvent); } } else { pi_image_offset_struct SrcOrigin{SrcOffset[SrcPos.XTerm], @@ -487,9 +487,9 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, SrcAccessRange[SrcPos.YTerm], SrcAccessRange[SrcPos.ZTerm]}; - Plugin.call( - Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, - DepEvents.size(), DepEvents.data(), &OutEvent); + Plugin.call( + Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, + DepEvents.size(), DepEvents.data(), &OutEvent); } } From 52cae9faa43b2141d6bd5187d7b3e52cafba5a02 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 1 Sep 2021 10:39:14 +0100 Subject: [PATCH 14/20] P2P copies made for 1D image arrays again. Signed-off-by: JackAKirk --- sycl/plugins/cuda/pi_cuda.cpp | 45 +++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b5617a024e682..93c9749211031 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -4408,33 +4408,38 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, - srcByteOffsetX, bytesToCopy)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; - size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; + if (src_image->context_ == dst_image->context_) { + if (imgType == PI_MEM_TYPE_IMAGE1D) { + retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, + srcByteOffsetX, bytesToCopy)); + } else { + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; + size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - if (src_image->context_ == dst_image->context_) { retErr = commonEnqueueMemImageNDCopy( cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); - } else { - auto dstContext = dst_image->context_->get(); - auto srcContext = src_image->context_->get(); - - cuCtxEnablePeerAccess(srcContext, 0); - retErr = commonEnqueueMemImageNDCopyPeer( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext, - srcContext); + if (retErr != PI_SUCCESS) { + return retErr; + } } + } else { - if (retErr != PI_SUCCESS) { - return retErr; - } + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; + size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; + + auto dstContext = dst_image->context_->get(); + auto srcContext = src_image->context_->get(); + + cuCtxEnablePeerAccess(srcContext, 0); + + retErr = commonEnqueueMemImageNDCopyPeer( + cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, + srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext, + srcContext); } if (event) { From 24b240a54669b86f127c841d9d0c356aa02472ef Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 1 Sep 2021 14:17:01 +0100 Subject: [PATCH 15/20] Return retError from call to commonEnqueueMemImageNDCopyPeer. Signed-off-by: JackAKirk --- sycl/plugins/cuda/pi_cuda.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 93c9749211031..7a3b0dea9f862 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -4426,7 +4426,6 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, } } } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; @@ -4440,6 +4439,10 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext, srcContext); + + if (retErr != PI_SUCCESS) { + return retErr; + } } if (event) { From 99e58f1a4e5915e549925bfb180f5b850c096827 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 2 Sep 2021 13:59:42 +0100 Subject: [PATCH 16/20] Removed all changes to memory_manager.cpp: the srcQueue can be as the command_queue for peer to peer copy. Signed-off-by: JackAKirk --- sycl/plugins/cuda/pi_cuda.cpp | 6 +++--- sycl/source/detail/memory_manager.cpp | 7 ++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 7a3b0dea9f862..669a1cdb8adf2 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3913,7 +3913,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, auto dst_context = dst_buffer->context_->get(); auto src_context = src_buffer->context_->get(); - cuCtxEnablePeerAccess(src_context, 0); + cuCtxEnablePeerAccess(dst_context, 0); result = PI_CHECK_ERROR( cuMemcpyPeerAsync(dst, dst_context, src, src_context, size, stream)); @@ -3970,7 +3970,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect( auto dstContext = dst_buffer->context_->get(); auto srcContext = src_buffer->context_->get(); - cuCtxEnablePeerAccess(srcContext, 0); + cuCtxEnablePeerAccess(dstContext, 0); retErr = commonEnqueueMemBufferCopyRectPeer( cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, @@ -4433,7 +4433,7 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, auto dstContext = dst_image->context_->get(); auto srcContext = src_image->context_->get(); - cuCtxEnablePeerAccess(srcContext, 0); + cuCtxEnablePeerAccess(dstContext, 0); retErr = commonEnqueueMemImageNDCopyPeer( cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 777897b235264..ffe0b6988e231 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -421,17 +421,14 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, unsigned int DimSrc, sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, - unsigned int SrcElemSize, RT::PiMem DstMem, QueueImplPtr DstQueue, + unsigned int SrcElemSize, RT::PiMem DstMem, QueueImplPtr, unsigned int DimDst, sycl::range<3> DstSize, sycl::range<3>, sycl::id<3> DstOffset, unsigned int DstElemSize, std::vector DepEvents, RT::PiEvent &OutEvent) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); const detail::plugin &Plugin = SrcQueue->getPlugin(); - - const RT::PiQueue Queue = SrcQueue->get_context() == DstQueue->get_context() - ? SrcQueue->getHandleRef() - : DstQueue->getHandleRef(); + const RT::PiQueue Queue = SrcQueue->getHandleRef(); detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType(); TermPositions SrcPos, DstPos; From fd159102b85de69e0f05ffb98f40884174f779f8 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 2 Sep 2021 14:04:30 +0100 Subject: [PATCH 17/20] Superficial change to make the memory_manager.cpp diff empty. Signed-off-by: JackAKirk --- sycl/source/detail/memory_manager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index ffe0b6988e231..d8d4a05c4b75c 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -427,8 +427,8 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, std::vector DepEvents, RT::PiEvent &OutEvent) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); - const detail::plugin &Plugin = SrcQueue->getPlugin(); const RT::PiQueue Queue = SrcQueue->getHandleRef(); + const detail::plugin &Plugin = SrcQueue->getPlugin(); detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType(); TermPositions SrcPos, DstPos; From ccba4461e81314a6f9068ee496f7d9df382fa090 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 21 Sep 2021 11:07:22 +0100 Subject: [PATCH 18/20] Applied stylistic/general improvements. Signed-off-by: JackAKirk --- .../source/detail/scheduler/graph_builder.cpp | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 422e26a37634c..fbc0190759c54 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -952,25 +952,19 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, detail::getImplBackend(Record->MCurContext)) NeedMemMoveToHost = true; else { - std::vector devs; + std::vector Devs; Queue->getPlugin().call_nocheck( Queue->getDeviceImplPtr()->getHandleRef(), - PI_DEVICE_INFO_P2P_READ_DEVICES, sizeof(devs), &devs, nullptr); - - bool can_read_peer = false; - const auto &src_dev = findAllocaCmd(Record) - ->getQueue() - ->getDeviceImplPtr() - ->getHandleRef(); - for (const auto &dev : devs) { - if (dev == src_dev) { - can_read_peer = true; - break; - } - } - if (!can_read_peer) - NeedMemMoveToHost = true; + PI_DEVICE_INFO_P2P_READ_DEVICES, sizeof(Devs), &Devs, nullptr); + + _pi_device *SrcDev = findAllocaCmd(Record) + ->getQueue() + ->getDeviceImplPtr() + ->getHandleRef(); + + NeedMemMoveToHost = + std::find(Devs.begin(), Devs.end(), SrcDev) != Devs.end(); } } if (NeedMemMoveToHost) From 70578495bf3ccb48ad04daed37615484d595da00 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 23 Sep 2021 14:52:02 +0100 Subject: [PATCH 19/20] Reverted change to guessLocalWorkSize: unnecessary since #4606. Signed-off-by: JackAKirk --- sycl/plugins/cuda/pi_cuda.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 6df9dc3060b25..2f1a3bae6fab4 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -245,21 +245,14 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. -pi_result guessLocalWorkSize(int *threadsPerBlock, - const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], - pi_kernel kernel, pi_uint32 local_size) { +void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], pi_kernel kernel, + pi_uint32 local_size) { assert(threadsPerBlock != nullptr); assert(global_work_size != nullptr); assert(kernel != nullptr); int recommendedBlockSize, minGrid; - try { - ScopedContext active(kernel->context_); - } catch (pi_result err) { - return err; - } - PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( &minGrid, &recommendedBlockSize, kernel->get(), NULL, local_size, maxThreadsPerBlock[0])); @@ -276,7 +269,6 @@ pi_result guessLocalWorkSize(int *threadsPerBlock, while (0u != (global_work_size[0] % threadsPerBlock[0])) { --threadsPerBlock[0]; } - return PI_SUCCESS; } } // anonymous namespace From b64484d23bf0182bb965b47a1b1339b7224cb823 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Fri, 1 Oct 2021 13:18:12 +0100 Subject: [PATCH 20/20] Implemented PI_DEVICE_INFO_P2P_READ_DEVICES piDeviceGetInfo case in other backends. Signed-off-by: jack.kirk --- sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp | 3 +++ sycl/plugins/hip/pi_hip.cpp | 4 ++++ sycl/plugins/level_zero/pi_level_zero.cpp | 3 +++ sycl/source/detail/scheduler/graph_builder.cpp | 2 +- 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 3a24d1be3906d..1a119abf832a6 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -513,6 +513,9 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, // cl_khr_fp64, cl_khr_int64_base_atomics, // cl_khr_int64_extended_atomics return ReturnValue(""); + // P2P is currently unsupported in level zero + case PI_DEVICE_INFO_P2P_READ_DEVICES: + return ReturnValue(std::vector{}); #define UNSUPPORTED_INFO(info) \ case info: \ diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index c959b789486c0..4974c05a8b474 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1563,6 +1563,10 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name, } return getInfo(param_value_size, param_value, param_value_size_ret, value); } + // P2P is currently unsupported in level zero + case PI_DEVICE_INFO_P2P_READ_DEVICES: + return getInfo(param_value_size, param_value, param_value_size_ret, + std::vector{}); // TODO: Implement. case PI_DEVICE_INFO_ATOMIC_64: diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 6a013df6c8cc6..3a07e25a111f7 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2448,6 +2448,9 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: // currently not supported in level zero runtime return PI_INVALID_VALUE; + // P2P is currently unsupported in level zero + case PI_DEVICE_INFO_P2P_READ_DEVICES: + return ReturnValue(std::vector{}); default: zePrint("Unsupported ParamName in piGetDeviceInfo\n"); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index d6127fdb8056d..e7a774ee1102c 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -963,7 +963,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, ->getHandleRef(); NeedMemMoveToHost = - std::find(Devs.begin(), Devs.end(), SrcDev) != Devs.end(); + std::find(Devs.begin(), Devs.end(), SrcDev) == Devs.end(); } } if (NeedMemMoveToHost)