intel
diff --git a/‎scripts/core/CUDA.rst
Lines changed: 34 additions & 0 deletions b/‎scripts/core/CUDA.rst
Lines changed: 34 additions & 0 deletions
diff --git a/‎scripts/core/HIP.rst
Lines changed: 41 additions & 0 deletions b/‎scripts/core/HIP.rst
Lines changed: 41 additions & 0 deletions
diff --git a/‎source/adapters/cuda/command_buffer.cpp
Lines changed: 0 additions & 3 deletions b/‎source/adapters/cuda/command_buffer.cpp
Lines changed: 0 additions & 3 deletions
diff --git a/‎source/adapters/cuda/enqueue.cpp
Lines changed: 0 additions & 6 deletions b/‎source/adapters/cuda/enqueue.cpp
Lines changed: 0 additions & 6 deletions
diff --git a/‎source/adapters/cuda/kernel.hpp
Lines changed: 78 additions & 18 deletions b/‎source/adapters/cuda/kernel.hpp
Lines changed: 78 additions & 18 deletions
diff --git a/‎source/adapters/hip/command_buffer.cpp
Lines changed: 0 additions & 3 deletions b/‎source/adapters/hip/command_buffer.cpp
Lines changed: 0 additions & 3 deletions
diff --git a/‎source/adapters/hip/enqueue.cpp
Lines changed: 0 additions & 2 deletions b/‎source/adapters/hip/enqueue.cpp
Lines changed: 0 additions & 2 deletions
@@ -148,6 +148,39 @@ take the extra global offset argument. Use of the global offset is not
 recommended for non SYCL compiler toolchains. This parameter can be ignored if
 the user does not wish to use the global offset.
 
+Local Memory Arguments
+----------------------
+
+In UR, local memory is a region of memory shared by all the work-items in
+a work-group. A kernel function signature can include local memory address
+space pointer arguments, which the user sets by calling
+``urKernelSetArgLocal`` with the number of bytes of local memory to allocate
+and make available through the pointer argument.
+
+The CUDA adapter implements local memory in a kernel as a single ``__shared__``
+memory allocation, and each individual local memory argument is a ``u32`` byte
+offset kernel parameter which is combined inside the kernel with the
+``__shared__`` memory allocation. Therefore, for ``N`` local arguments that need
+to be set on a kernel with ``urKernelSetArgLocal``, the total aligned size
+across the ``N`` calls to ``urKernelSetArgLocal`` is calculated for the
+``__shared__`` memory allocation by the CUDA adapter and passed as the
+``sharedMemBytes`` argument to ``cuLaunchKernel`` (or variants like
+``cuLaunchCooperativeKernel`` or ``cuGraphAddKernelNode``).
+
+For each kernel ``u32`` local memory offset parameter, an aligned offset into
+the single ``__shared__`` memory allocation is calculated and passed at runtime
+by the adapter via ``kernelParams`` when launching the kernel (or adding the
+kernel as a graph node). When a user calls ``urKernelSetArgLocal`` with an
+argument index that has already been set on the kernel, the adapter
+recalculates the size of the ``__shared__`` memory allocation and the offset
+for that index, as well as the offsets of any local memory arguments at
+following indices.
+
+.. warning::
+
+  The CUDA UR adapter implementation of local memory assumes that the kernel
+  has been created by DPC++, which instruments the device code so that local
+  memory arguments are offsets rather than pointers.
+
 Other Notes
 ===========
 
@@ -164,4 +197,5 @@ Contributors
 ------------
 
 * Hugh Delaney `[email protected] <[email protected]>`_
+* Ewan Crawford `[email protected] <[email protected]>`_
 
@@ -91,6 +91,46 @@ take the extra global offset argument. Use of the global offset is not
 recommended for non SYCL compiler toolchains. This parameter can be ignored if
 the user does not wish to use the global offset.
 
+Local Memory Arguments
+----------------------
+
+In UR, local memory is a region of memory shared by all the work-items in
+a work-group. A kernel function signature can include local memory address
+space pointer arguments, which the user sets by calling
+``urKernelSetArgLocal`` with the number of bytes of local memory to allocate
+and make available through the pointer argument.
+
+The HIP adapter implements local memory in a kernel as a single ``__shared__``
+memory allocation, and each individual local memory argument is a ``u32`` byte
+offset kernel parameter which is combined inside the kernel with the
+``__shared__`` memory allocation. Therefore, for ``N`` local arguments that need
+to be set on a kernel with ``urKernelSetArgLocal``, the total aligned size
+across the ``N`` calls to ``urKernelSetArgLocal`` is calculated for the
+``__shared__`` memory allocation by the HIP adapter and passed as the
+``sharedMemBytes`` argument to ``hipModuleLaunchKernel`` or
+``hipGraphAddKernelNode``.
+
+For each kernel ``u32`` local memory offset parameter, an aligned offset into
+the single ``__shared__`` memory allocation is calculated and passed at runtime
+by the adapter via ``kernelParams`` when launching the kernel (or adding the
+kernel as a graph node). When a user calls ``urKernelSetArgLocal`` with an
+argument index that has already been set on the kernel, the adapter
+recalculates the size of the ``__shared__`` memory allocation and the offset
+for that index, as well as the offsets of any local memory arguments at
+following indices.
+
+.. warning::
+
+  The HIP UR adapter implementation of local memory assumes that the kernel
+  has been created by DPC++, which instruments the device code so that local
+  memory arguments are offsets rather than pointers.
+
+
+HIP kernels that are generated for DPC++ kernels with SYCL local accessors
+contain extra value arguments in addition to the local memory argument for the
+local accessor. For each ``urKernelSetArgLocal`` argument, a user needs
+to make three calls to ``urKernelSetArgValue``, one for each of the next three
+consecutive argument indices. These three values represent a 3-dimensional
+offset into the local accessor.
+
 Other Notes
 ===========
 
@@ -100,4 +140,5 @@ Contributors
 ------------
 
 * Hugh Delaney `[email protected] <[email protected]>`_
+* Ewan Crawford `[email protected] <[email protected]>`_
 
@@ -522,9 +522,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
                                         DepsList.data(), DepsList.size(),
                                         &NodeParams));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     // Add signal node if external return event is used.
     CUgraphNode SignalNode = nullptr;
     if (phEvent) {
 
@@ -493,9 +493,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
         ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
         CuStream, const_cast<void **>(ArgIndices.data()), nullptr));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
@@ -673,9 +670,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
                                     const_cast<void **>(ArgIndices.data()),
                                     nullptr));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
 
@@ -61,10 +61,22 @@ struct ur_kernel_handle_t_ {
     using args_t = std::array<char, MaxParamBytes>;
     using args_size_t = std::vector<size_t>;
     using args_index_t = std::vector<void *>;
+    /// Storage shared by all args which is mem copied into when adding a new
+    /// argument.
     args_t Storage;
+    /// Aligned size of each parameter, including padding.
     args_size_t ParamSizes;
+    /// Pointer into the \p Storage allocation for each parameter.
     args_index_t Indices;
-    args_size_t OffsetPerIndex;
+    /// Aligned size in bytes for each local memory parameter after padding has
+    /// been added. Zero if the argument at the index isn't a local memory
+    /// argument.
+    args_size_t AlignedLocalMemSize;
+    /// Original size in bytes for each local memory parameter, prior to being
+    /// padded to appropriate alignment. Zero if the argument at the index
+    /// isn't a local memory argument.
+    args_size_t OriginalLocalMemSize;
+
     // A struct to keep track of memargs so that we can do dependency analysis
     // at urEnqueueKernelLaunch
     struct mem_obj_arg {
@@ -93,7 +105,8 @@ struct ur_kernel_handle_t_ {
         Indices.resize(Index + 2, Indices.back());
         // Ensure enough space for the new argument
         ParamSizes.resize(Index + 1);
-        OffsetPerIndex.resize(Index + 1);
+        AlignedLocalMemSize.resize(Index + 1);
+        OriginalLocalMemSize.resize(Index + 1);
       }
       ParamSizes[Index] = Size;
       // calculate the insertion point on the array
@@ -102,28 +115,81 @@ struct ur_kernel_handle_t_ {
       // Update the stored value for the argument
       std::memcpy(&Storage[InsertPos], Arg, Size);
       Indices[Index] = &Storage[InsertPos];
-      OffsetPerIndex[Index] = LocalSize;
+      AlignedLocalMemSize[Index] = LocalSize;
     }
 
-    void addLocalArg(size_t Index, size_t Size) {
-      size_t LocalOffset = this->getLocalSize();
+    /// Returns the padded size and offset of a local memory argument.
+    /// Local memory arguments need to be padded if the alignment for the size
+    /// doesn't match the current offset into the kernel local data.
+    /// @param Index Kernel arg index.
+    /// @param Size User passed size of local parameter.
+    /// @return Pair of (Aligned size, Aligned offset into local data).
+    std::pair<size_t, size_t> calcAlignedLocalArgument(size_t Index,
+                                                       size_t Size) {
+      // Store the unpadded size of the local argument
+      if (Index + 2 > Indices.size()) {
+        AlignedLocalMemSize.resize(Index + 1);
+        OriginalLocalMemSize.resize(Index + 1);
+      }
+      OriginalLocalMemSize[Index] = Size;
+
+      // Calculate the current starting offset into local data
+      const size_t LocalOffset = std::accumulate(
+          std::begin(AlignedLocalMemSize),
+          std::next(std::begin(AlignedLocalMemSize), Index), size_t{0});
 
-      // maximum required alignment is the size of the largest vector type
+      // Maximum required alignment is the size of the largest vector type
       const size_t MaxAlignment = sizeof(double) * 16;
 
-      // for arguments smaller than the maximum alignment simply align to the
+      // For arguments smaller than the maximum alignment simply align to the
       // size of the argument
       const size_t Alignment = std::min(MaxAlignment, Size);
 
-      // align the argument
+      // Align the argument
       size_t AlignedLocalOffset = LocalOffset;
-      size_t Pad = LocalOffset % Alignment;
+      const size_t Pad = LocalOffset % Alignment;
       if (Pad != 0) {
         AlignedLocalOffset += Alignment - Pad;
       }
 
+      const size_t AlignedLocalSize = Size + (AlignedLocalOffset - LocalOffset);
+      return std::make_pair(AlignedLocalSize, AlignedLocalOffset);
+    }
+
+    void addLocalArg(size_t Index, size_t Size) {
+      // Get the aligned argument size and offset into local data
+      auto [AlignedLocalSize, AlignedLocalOffset] =
+          calcAlignedLocalArgument(Index, Size);
+
+      // Store argument details
       addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
-             Size + (AlignedLocalOffset - LocalOffset));
+             AlignedLocalSize);
+
+      // For every existing local argument which follows at later argument
+      // indices, update the offset and pointer into the kernel local memory.
+      // Required as padding will need to be recalculated.
+      const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
+      for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
+        const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex];
+        if (OriginalLocalSize == 0) {
+          // Skip if successor argument isn't a local memory arg
+          continue;
+        }
+
+        // Recalculate alignment
+        auto [SuccAlignedLocalSize, SuccAlignedLocalOffset] =
+            calcAlignedLocalArgument(SuccIndex, OriginalLocalSize);
+
+        // Store new local memory size
+        AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize;
+
+        // Store new offset into local data
+        const size_t InsertPos =
+            std::accumulate(std::begin(ParamSizes),
+                            std::begin(ParamSizes) + SuccIndex, size_t{0});
+        std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset,
+                    sizeof(size_t));
+      }
     }
 
     void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
@@ -145,15 +211,11 @@ struct ur_kernel_handle_t_ {
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
     }
 
-    void clearLocalSize() {
-      std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
-    }
-
     const args_index_t &getIndices() const noexcept { return Indices; }
 
     uint32_t getLocalSize() const {
-      return std::accumulate(std::begin(OffsetPerIndex),
-                             std::end(OffsetPerIndex), 0);
+      return std::accumulate(std::begin(AlignedLocalMemSize),
+                             std::end(AlignedLocalMemSize), 0);
     }
   } Args;
 
@@ -240,7 +302,5 @@ struct ur_kernel_handle_t_ {
 
   uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
 
-  void clearLocalSize() { Args.clearLocalSize(); }
-
   size_t getRegsPerThread() const noexcept { return RegsPerThread; };
 };
@@ -396,9 +396,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
                                          DepsList.data(), DepsList.size(),
                                          &NodeParams));
 
-    if (LocalSize != 0)
-      hKernel->clearLocalSize();
-
     // Get sync point and register the node with it.
     auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode);
     if (pSyncPoint) {
 
@@ -324,8 +324,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
         ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2],
         hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr));
 
-    hKernel->clearLocalSize();
-
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();