[CUDA] P2P buffer/image memory copy #4401
@@ -1637,6 +1637,15 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     }
     return getInfo(param_value_size, param_value, param_value_size_ret, value);
   }
+  case PI_DEVICE_INFO_P2P_READ_DEVICES: {
+    std::vector<pi_device> devs;
+    for (const auto &dev : device->get_platform()->devices_) {
+      devs.emplace_back(dev.get());
+    }
+    return getInfo(param_value_size, param_value, param_value_size_ret, devs);
+  }
   // TODO: Investigate if this information is available on CUDA.
   case PI_DEVICE_INFO_PCI_ADDRESS:

Review comment: After #4571 this will only report itself as supporting P2P, as each device has its own platform, right? If the intention is still to report all devices, since they can do explicit P2P memory copies between them even though some go through the host, I wonder if a more descriptive name would be appropriate.

Reply: Yes. After #4571 this PR no longer acts as a short-term fix for enabling direct P2P copies via the cross-context route while the CUDA PI is adapted to allow multiple CUDA contexts per SYCL context, which would enable the preferred intra-context P2P route. #4571 can only be reverted once it becomes possible to do the P2P copy between devices sharing a SYCL context. The PR still has value in the cross-context optimization you described below: if for whatever reason a user decides to split backend devices/contexts into separate SYCL contexts, then we can also optimize that case. Until the change introduced in #4571 is reverted, this PR will simply do nothing.

Reply: Yes, I can try to pick a better name. PI_DEVICE_INFO_P2P_READ_DEVICES returns a set of devices that is a superset of the set of devices capable of true P2P, since it can include cases that revert to the device-to-host-to-device route. It is also a query that is not exposed to the user, so I think a slightly longer, uglier, but more descriptive name would be OK, particularly since we don't want to confuse it with future queries: PI_DEVICE_INFO_P2P_READ_DEVICES is probably more appropriate to reserve for a hypothetical future user-facing query that tells the user whether a device can access a peer's memory directly.
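The distinction drawn in the reply above — devices a peer copy can target versus devices that support direct P2P access — maps onto the driver API's cuDeviceCanAccessPeer query. A minimal sketch of how a hypothetical helper (not part of this PR; the name supportsDirectP2P is illustrative) could filter the superset down to true P2P pairs:

    #include <cuda.h>

    // Hypothetical helper: true only when `src` can read `dst`'s memory
    // directly over P2P (e.g. NVLink or PCIe peer mappings). When this
    // returns false, a peer copy between the two devices still works,
    // but is staged through host memory.
    static bool supportsDirectP2P(CUdevice src, CUdevice dst) {
      int canAccess = 0;
      if (cuDeviceCanAccessPeer(&canAccess, src, dst) != CUDA_SUCCESS)
        return false;
      return canAccess != 0;
    }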
@@ -3648,6 +3657,19 @@ pi_result cuda_piSamplerRelease(pi_sampler sampler) {
   return PI_SUCCESS;
 }
 
+void copyRectAsserts(const pi_buff_rect_region &region,
+                     const pi_buff_rect_offset &src_offset,
+                     const pi_buff_rect_offset &dst_offset,
+                     const CUmemorytype_enum &src_type,
+                     const CUmemorytype_enum &dst_type) {
+  assert(region != nullptr);
+  assert(src_offset != nullptr);
+  assert(dst_offset != nullptr);
+
+  assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
+  assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
+}
+
 /// General 3D memory copy operation.
 /// This function requires the corresponding CUDA context to be at the top of
 /// the context stack
@@ -3660,12 +3682,7 @@ static pi_result commonEnqueueMemBufferCopyRect(
     const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset,
     size_t dst_row_pitch, size_t dst_slice_pitch) {
 
-  assert(region != nullptr);
-  assert(src_offset != nullptr);
-  assert(dst_offset != nullptr);
-
-  assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
-  assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
+  copyRectAsserts(region, src_offset, dst_offset, src_type, dst_type);
 
   src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes
                                    : src_row_pitch;
@@ -3711,6 +3728,60 @@ static pi_result commonEnqueueMemBufferCopyRect(
   return PI_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream));
 }
 
+/// General 3D memory Peer copy operation.
+/// Similar to commonEnqueueMemBufferCopyRect with the addition that two
+/// contexts must be specified.
+static pi_result commonEnqueueMemBufferCopyRectPeer(
+    CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr,
+    const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset,
+    size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr,
+    const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset,
+    size_t dst_row_pitch, size_t dst_slice_pitch, CUcontext dst_context,
+    CUcontext src_context) {
+
+  copyRectAsserts(region, src_offset, dst_offset, src_type, dst_type);
+
+  src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch;
+  src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch)
+                                       : src_slice_pitch;
+  dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch;
+  dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch)
+                                       : dst_slice_pitch;
+
+  CUDA_MEMCPY3D_PEER params = {};
+
+  params.WidthInBytes = region->width_bytes;
+  params.Height = region->height_scalar;
+  params.Depth = region->depth_scalar;
+
+  params.srcMemoryType = src_type;
+  params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
+                         ? *static_cast<const CUdeviceptr *>(src_ptr)
+                         : 0;
+  params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr;
+  params.srcXInBytes = src_offset->x_bytes;
+  params.srcY = src_offset->y_scalar;
+  params.srcZ = src_offset->z_scalar;
+  params.srcPitch = src_row_pitch;
+  params.srcHeight = src_slice_pitch / src_row_pitch;
+
+  params.dstMemoryType = dst_type;
+  params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
+                         ? *static_cast<CUdeviceptr *>(dst_ptr)
+                         : 0;
+  params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr;
+  params.dstXInBytes = dst_offset->x_bytes;
+  params.dstY = dst_offset->y_scalar;
+  params.dstZ = dst_offset->z_scalar;
+  params.dstPitch = dst_row_pitch;
+  params.dstHeight = dst_slice_pitch / dst_row_pitch;
+
+  params.dstContext = dst_context;
+  params.srcContext = src_context;
+
+  return PI_CHECK_ERROR(cuMemcpy3DPeerAsync(&params, cu_stream));
+}
+
 pi_result cuda_piEnqueueMemBufferReadRect(
     pi_queue command_queue, pi_mem buffer, pi_bool blocking_read,
     pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset,
@@ -3845,7 +3916,17 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
     auto src = src_buffer->mem_.buffer_mem_.get() + src_offset;
     auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset;
 
-    result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
+    if (src_buffer->context_ == dst_buffer->context_) {
+      result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));
+    } else {
+      auto dst_context = dst_buffer->context_->get();
+      auto src_context = src_buffer->context_->get();
+
+      cuCtxEnablePeerAccess(dst_context, 0);
+
+      result = PI_CHECK_ERROR(
+          cuMemcpyPeerAsync(dst, dst_context, src, src_context, size, stream));
+    }
 
     if (event) {
       result = retImplEv->record();
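A note on cuCtxEnablePeerAccess as used above: it enables access from the current context to the peer context passed as its first argument, and every call after the first fails with CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, which is harmless here and presumably why the result is unchecked. A more defensive wrapper might look like this sketch (not part of the PR):

    #include <cuda.h>

    // Enable access from the current context to `peer`, treating the
    // "already enabled" status as success. Real failures (e.g. the two
    // devices cannot be peers) are propagated to the caller.
    static CUresult enablePeerAccessOnce(CUcontext peer) {
      CUresult err = cuCtxEnablePeerAccess(peer, 0 /* flags must be 0 */);
      return err == CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ? CUDA_SUCCESS
                                                           : err;
    }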
@@ -3889,11 +3970,22 @@ pi_result cuda_piEnqueueMemBufferCopyRect(
           PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue));
       retImplEv->start();
     }
-    retErr = commonEnqueueMemBufferCopyRect(
-        cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
-        src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
-        dst_origin, dst_row_pitch, dst_slice_pitch);
+    if (src_buffer->context_ == dst_buffer->context_) {
+      retErr = commonEnqueueMemBufferCopyRect(
+          cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
+          src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
+          dst_origin, dst_row_pitch, dst_slice_pitch);
+    } else {
+      auto dstContext = dst_buffer->context_->get();
+      auto srcContext = src_buffer->context_->get();
+
+      cuCtxEnablePeerAccess(dstContext, 0);
+
+      retErr = commonEnqueueMemBufferCopyRectPeer(
+          cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin,
+          src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE,
+          dst_origin, dst_row_pitch, dst_slice_pitch, dstContext, srcContext);
+    }
 
     if (event) {
       retImplEv->record();
@@ -4095,6 +4187,54 @@ static pi_result commonEnqueueMemImageNDCopy(
   return PI_INVALID_VALUE;
 }
 
+/// Similar to commonEnqueueMemImageNDCopy for Peer to Peer copies.
+static pi_result commonEnqueueMemImageNDCopyPeer(
+    CUstream cu_stream, pi_mem_type img_type, const size_t *region,
+    const void *src_ptr, const CUmemorytype_enum src_type,
+    const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type,
+    const size_t *dst_offset, CUcontext dst_context, CUcontext src_context) {
+  assert(region != nullptr);
+
+  assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST);
+  assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST);
+
+  CUDA_MEMCPY3D_PEER cpyDesc;
+  memset(&cpyDesc, 0, sizeof(cpyDesc));
+  cpyDesc.srcMemoryType = src_type;
+  if (src_type == CU_MEMORYTYPE_ARRAY) {
+    cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+    cpyDesc.srcXInBytes = src_offset[0];
+    cpyDesc.srcY = src_offset[1];
+    cpyDesc.srcZ = src_offset[2];
+  } else {
+    cpyDesc.srcDevice = src_type == CU_MEMORYTYPE_DEVICE
+                            ? *static_cast<const CUdeviceptr *>(src_ptr)
+                            : 0;
+    cpyDesc.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr;
+  }
+  cpyDesc.dstMemoryType = dst_type;
+  if (dst_type == CU_MEMORYTYPE_ARRAY) {
+    cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+    cpyDesc.dstXInBytes = dst_offset[0];
+    cpyDesc.dstY = dst_offset[1];
+    cpyDesc.dstZ = dst_offset[2];
+  } else {
+    cpyDesc.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE
+                            ? *static_cast<CUdeviceptr *>(dst_ptr)
+                            : 0;
+    cpyDesc.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr;
+  }
+  cpyDesc.WidthInBytes = region[0];
+  cpyDesc.Height = region[1];
+  cpyDesc.Depth = region[2];
+  cpyDesc.dstContext = dst_context;
+  cpyDesc.srcContext = src_context;
+
+  return PI_CHECK_ERROR(cuMemcpy3DPeerAsync(&cpyDesc, cu_stream));
+}
+
 pi_result cuda_piEnqueueMemImageRead(
     pi_queue command_queue, pi_mem image, pi_bool blocking_read,
     const size_t *origin, const size_t *region, size_t row_pitch,
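One design note on commonEnqueueMemImageNDCopyPeer: the intra-context commonEnqueueMemImageNDCopy dispatches to different driver calls by image dimensionality, but the driver API has no dedicated 2D peer-copy entry point, so the peer version routes every shape through cuMemcpy3DPeerAsync, collapsing lower dimensions. A sketch of the 2D array case under that reading (names illustrative, not from this PR):

    #include <cuda.h>
    #include <cstring>

    // Peer copy of a width x height region between CUarrays owned by two
    // different contexts. Depth = 1 expresses the 2D copy through the 3D
    // peer descriptor; pitch fields are ignored for array endpoints.
    static CUresult peerCopyArray2D(CUarray dst, CUcontext dst_ctx,
                                    CUarray src, CUcontext src_ctx,
                                    size_t width_bytes, size_t height,
                                    CUstream stream) {
      CUDA_MEMCPY3D_PEER d;
      std::memset(&d, 0, sizeof(d));
      d.srcMemoryType = CU_MEMORYTYPE_ARRAY;
      d.srcArray = src;
      d.srcContext = src_ctx;
      d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
      d.dstArray = dst;
      d.dstContext = dst_ctx;
      d.WidthInBytes = width_bytes;
      d.Height = height;
      d.Depth = 1;
      return cuMemcpy3DPeerAsync(&d, stream);
    }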
@@ -4277,17 +4417,37 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
     size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0];
 
     pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
-    if (imgType == PI_MEM_TYPE_IMAGE1D) {
-      retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
-                                           srcByteOffsetX, bytesToCopy));
+    if (src_image->context_ == dst_image->context_) {
+      if (imgType == PI_MEM_TYPE_IMAGE1D) {
+        retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX,
+                                             srcArray, srcByteOffsetX,
+                                             bytesToCopy));
+      } else {
+        size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
+        size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
+        size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
+
+        retErr = commonEnqueueMemImageNDCopy(
+            cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
+            srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
+
+        if (retErr != PI_SUCCESS) {
+          return retErr;
+        }
+      }
     } else {
       size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
       size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
       size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
 
-      retErr = commonEnqueueMemImageNDCopy(
+      auto dstContext = dst_image->context_->get();
+      auto srcContext = src_image->context_->get();
+
+      cuCtxEnablePeerAccess(dstContext, 0);
+
+      retErr = commonEnqueueMemImageNDCopyPeer(
           cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
-          srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
+          srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset, dstContext,
+          srcContext);
 
       if (retErr != PI_SUCCESS) {
         return retErr;
@@ -513,6 +513,9 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
     // cl_khr_fp64, cl_khr_int64_base_atomics,
     // cl_khr_int64_extended_atomics
     return ReturnValue("");
+  // P2P is currently unsupported in level zero
+  case PI_DEVICE_INFO_P2P_READ_DEVICES:
+    return ReturnValue(std::vector<pi_device>{});
 
 #define UNSUPPORTED_INFO(info) \
   case info: \

Review comment on lines +516 to +517 (with a suggested change): Makes the association clearer and more consistent with other cases. Similar applies to sycl/plugins/hip/pi_hip.cpp and sycl/plugins/level_zero/pi_level_zero.cpp.
@@ -517,7 +517,6 @@ void Command::makeTraceEventEpilog() {
 
 Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
   const QueueImplPtr &WorkerQueue = getWorkerQueue();
-  const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
 
   // 1. Async work is not supported for host device.
   // 2. Some types of commands do not produce PI events after they are enqueued
 
@@ -543,13 +542,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
       getType() != CommandType::HOST_TASK)
     return nullptr;
 
-  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
-  // If contexts don't match we'll connect them using host task
-  if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
-    Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
-    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep);
-  } else
-    MPreparedDepsEvents.push_back(std::move(DepEvent));
+  MPreparedDepsEvents.push_back(std::move(DepEvent));
 
   return ConnectionCmd;
 }

Review comment: Sorry, it didn't click in my head the last time I reviewed this.

Reply: I wanted to check with you whether ConnectionCmd here has any use cases that I am not aware of before making further changes in this WIP PR. Making ConnectionCmd always nullptr doesn't break any tests using the CUDA backend. If ConnectionCmd has no purpose, then I can change the function to return void, but this requires quite a few cascading changes to other functions that would then also return void.

Reply: I've investigated what happens when I also make … I do not know whether the test coverage is simply insufficient (presumably there should be cases checking that …). Currently the optimizations do not do anything anyway due to #4571, but #4571 is a temporary fix and is expected to be reverted eventually. It would still be useful to merge this PR, so that once #4571 is reverted the optimisation will be immediately switched on.
Review comment: Could you please add a description for this here?