Skip to content

Commit c1e2957

Browse files
authored
[CUDA][LIBCLC] Implement RC11 seq_cst for PTX6.0 (#12516)
Implement `seq_cst` RC11/PTX 6.0 memory consistency for the CUDA backend. See https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 and https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#memory-consistency-model for full details. Requires sm_70 or above. With this PR there is now a complete mapping between the SYCL memory consistency model capabilities and the official CUDA model, fully exploiting CUDA capabilities where possible on supported architectures. This makes the SYCL-CTS atomic_ref tests fully pass for sm_70 on the CUDA backend. Fixes #11208. Depends on #12907. --------- Signed-off-by: JackAKirk <[email protected]>
1 parent 17ef793 commit c1e2957

File tree

6 files changed

+53
-7
lines changed

6 files changed

+53
-7
lines changed

libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,13 @@ Memory order is stored in the lowest 5 bits */
7171
ADDR_SPACE, ADDR_SPACE_NV) \
7272
} \
7373
break; \
74+
case SequentiallyConsistent: \
75+
if (__clc_nvvm_reflect_arch() >= 700) { \
76+
__CLC_NVVM_FENCE_SC_SM70() \
77+
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
78+
ADDR_SPACE_NV, _acq_rel) \
79+
break; \
80+
} \
7481
} \
7582
__builtin_trap(); \
7683
__builtin_unreachable(); \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <atomic_helpers.h>
910
#include <spirv/spirv.h>
1011
#include <spirv/spirv_types.h>
1112

@@ -120,6 +121,13 @@ Memory order is stored in the lowest 5 bits */ \
120121
TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, ADDR_SPACE, ADDR_SPACE_NV) \
121122
} \
122123
break; \
124+
case SequentiallyConsistent: \
125+
if (__clc_nvvm_reflect_arch() >= 700) { \
126+
__CLC_NVVM_FENCE_SC_SM70() \
127+
__CLC_NVVM_ATOMIC_CAS_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \
128+
ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \
129+
break; \
130+
} \
123131
} \
124132
__builtin_trap(); \
125133
__builtin_unreachable(); \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,15 @@ _CLC_OVERLOAD _CLC_DECL void __spirv_MemoryBarrier(unsigned int, unsigned int);
7272
} \
7373
}
7474

75+
#define __CLC_NVVM_FENCE_SC_SM70() \
76+
if (scope == CrossDevice) { \
77+
__asm__ __volatile__("fence.sc.sys;"); \
78+
} else if (scope == Device) { \
79+
__asm__ __volatile__("fence.sc.gpu;"); \
80+
} else { \
81+
__asm__ __volatile__("fence.sc.cta;"); \
82+
}
83+
7584
#define __CLC_NVVM_ATOMIC_IMPL( \
7685
TYPE, TYPE_MANGLED, TYPE_NV, TYPE_MANGLED_NV, OP, NAME_MANGLED, \
7786
ADDR_SPACE, POINTER_AND_ADDR_SPACE_MANGLED, ADDR_SPACE_NV, SUBSTITUTION) \
@@ -117,6 +126,13 @@ Memory order is stored in the lowest 5 bits */ \
117126
OP, ADDR_SPACE, ADDR_SPACE_NV) \
118127
} \
119128
break; \
129+
case SequentiallyConsistent: \
130+
if (__clc_nvvm_reflect_arch() >= 700) { \
131+
__CLC_NVVM_FENCE_SC_SM70() \
132+
__CLC_NVVM_ATOMIC_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \
133+
ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \
134+
break; \
135+
} \
120136
} \
121137
__builtin_trap(); \
122138
__builtin_unreachable(); \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <atomic_helpers.h>
910
#include <spirv/spirv.h>
1011
#include <spirv/spirv_types.h>
1112

@@ -53,6 +54,12 @@ Memory order is stored in the lowest 5 bits */ \
5354
case Acquire: \
5455
__CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
5556
ADDR_SPACE, ADDR_SPACE_NV, _acquire) \
57+
break; \
58+
case SequentiallyConsistent: \
59+
__CLC_NVVM_FENCE_SC_SM70() \
60+
__CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
61+
ADDR_SPACE, ADDR_SPACE_NV, _acquire) \
62+
break; \
5663
} \
5764
} else { \
5865
TYPE_NV res = __nvvm_volatile_ld##ADDR_SPACE_NV##TYPE_MANGLED_NV( \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <atomic_helpers.h>
910
#include <spirv/spirv.h>
1011
#include <spirv/spirv_types.h>
1112

@@ -54,6 +55,13 @@ Memory order is stored in the lowest 5 bits */ \
5455
__CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
5556
ADDR_SPACE, ADDR_SPACE_NV, \
5657
_release) \
58+
break; \
59+
case SequentiallyConsistent: \
60+
__CLC_NVVM_FENCE_SC_SM70() \
61+
__CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
62+
ADDR_SPACE, ADDR_SPACE_NV, \
63+
_release) \
64+
break; \
5765
} \
5866
} else { \
5967
switch (order) { \

sycl/plugins/unified_runtime/CMakeLists.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,14 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
8181
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
8282
endfunction()
8383

84-
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
85-
# commit 6513abc404979fa109d64500bf899e632d511291
86-
# Merge: 09be0881 6d586094
84+
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
85+
# commit 29ee45c4451a682f744146cc9dbeb2617ecdd6b3
86+
# Merge: db4b0c14 4f5d005a
8787
# Author: Kenneth Benzie (Benie) <[email protected]>
88-
# Date: Thu Mar 14 22:38:53 2024 +0000
89-
# Merge pull request #1410 from kbenzie/benie/cmake-external-adapter-source-dirs
90-
# [CMake] Support external adapter source dirs
91-
set(UNIFIED_RUNTIME_TAG 6513abc404979fa109d64500bf899e632d511291)
88+
# Date: Mon Mar 18 12:14:26 2024 +0000
89+
# Merge pull request #1291 from JackAKirk/cuda-seq-cst-b
90+
# [CUDA] Report that devices with cc >= sm_70 support seq_cst
91+
set(UNIFIED_RUNTIME_TAG 29ee45c4451a682f744146cc9dbeb2617ecdd6b3)
9292

9393
if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
9494
set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

0 commit comments

Comments
 (0)