Skip to content

Commit fa53fea

Browse files
JackAKirk authored and kbenzie committed
[CUDA][LIBCLC] Implement RC11 seq_cst for PTX6.0 (intel#12516)
Implement `seq_cst` RC11/PTX 6.0 memory consistency for the CUDA backend. See https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 and https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#memory-consistency-model for full details. Requires sm_70 or above. With this PR there is now a complete mapping between the SYCL memory consistency model capabilities and the official CUDA model, fully exploiting CUDA capabilities where possible on supported architectures. This makes the SYCL-CTS atomic_ref tests fully pass for sm_70 on the CUDA backend. Fixes intel#11208. Depends on intel#12907. --------- Signed-off-by: JackAKirk <[email protected]>
1 parent 22e9785 commit fa53fea

File tree

6 files changed

+52
-6
lines changed

6 files changed

+52
-6
lines changed

libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,13 @@ Memory order is stored in the lowest 5 bits */
7171
ADDR_SPACE, ADDR_SPACE_NV) \
7272
} \
7373
break; \
74+
case SequentiallyConsistent: \
75+
if (__clc_nvvm_reflect_arch() >= 700) { \
76+
__CLC_NVVM_FENCE_SC_SM70() \
77+
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
78+
ADDR_SPACE_NV, _acq_rel) \
79+
break; \
80+
} \
7481
} \
7582
__builtin_trap(); \
7683
__builtin_unreachable(); \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <atomic_helpers.h>
910
#include <spirv/spirv.h>
1011
#include <spirv/spirv_types.h>
1112

@@ -120,6 +121,13 @@ Memory order is stored in the lowest 5 bits */ \
120121
TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, ADDR_SPACE, ADDR_SPACE_NV) \
121122
} \
122123
break; \
124+
case SequentiallyConsistent: \
125+
if (__clc_nvvm_reflect_arch() >= 700) { \
126+
__CLC_NVVM_FENCE_SC_SM70() \
127+
__CLC_NVVM_ATOMIC_CAS_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \
128+
ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \
129+
break; \
130+
} \
123131
} \
124132
__builtin_trap(); \
125133
__builtin_unreachable(); \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,15 @@ _CLC_OVERLOAD _CLC_DECL void __spirv_MemoryBarrier(unsigned int, unsigned int);
7272
} \
7373
}
7474

75+
// Emits a sequentially consistent PTX fence (`fence.sc`) via inline assembly.
// The macro takes no arguments: it reads a variable named `scope` that must be
// visible at the expansion site and selects the fence scope from it:
//   scope == CrossDevice -> fence.sc.sys
//   scope == Device      -> fence.sc.gpu
//   any other value      -> fence.sc.cta
// It expands to a complete if/else chain, so call sites need no trailing
// semicolon. The macro itself performs no architecture check; the `_SM70`
// suffix suggests callers are expected to guard it for sm_70 or newer.
#define __CLC_NVVM_FENCE_SC_SM70() \
76+
if (scope == CrossDevice) { \
77+
__asm__ __volatile__("fence.sc.sys;"); \
78+
} else if (scope == Device) { \
79+
__asm__ __volatile__("fence.sc.gpu;"); \
80+
} else { \
81+
__asm__ __volatile__("fence.sc.cta;"); \
82+
}
83+
7584
#define __CLC_NVVM_ATOMIC_IMPL( \
7685
TYPE, TYPE_MANGLED, TYPE_NV, TYPE_MANGLED_NV, OP, NAME_MANGLED, \
7786
ADDR_SPACE, POINTER_AND_ADDR_SPACE_MANGLED, ADDR_SPACE_NV, SUBSTITUTION) \
@@ -117,6 +126,13 @@ Memory order is stored in the lowest 5 bits */ \
117126
OP, ADDR_SPACE, ADDR_SPACE_NV) \
118127
} \
119128
break; \
129+
case SequentiallyConsistent: \
130+
if (__clc_nvvm_reflect_arch() >= 700) { \
131+
__CLC_NVVM_FENCE_SC_SM70() \
132+
__CLC_NVVM_ATOMIC_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \
133+
ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \
134+
break; \
135+
} \
120136
} \
121137
__builtin_trap(); \
122138
__builtin_unreachable(); \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <atomic_helpers.h>
910
#include <spirv/spirv.h>
1011
#include <spirv/spirv_types.h>
1112

@@ -53,6 +54,12 @@ Memory order is stored in the lowest 5 bits */ \
5354
case Acquire: \
5455
__CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
5556
ADDR_SPACE, ADDR_SPACE_NV, _acquire) \
57+
break; \
58+
case SequentiallyConsistent: \
59+
__CLC_NVVM_FENCE_SC_SM70() \
60+
__CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
61+
ADDR_SPACE, ADDR_SPACE_NV, _acquire) \
62+
break; \
5663
} \
5764
} else { \
5865
TYPE_NV res = __nvvm_volatile_ld##ADDR_SPACE_NV##TYPE_MANGLED_NV( \

libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <atomic_helpers.h>
910
#include <spirv/spirv.h>
1011
#include <spirv/spirv_types.h>
1112

@@ -54,6 +55,13 @@ Memory order is stored in the lowest 5 bits */ \
5455
__CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
5556
ADDR_SPACE, ADDR_SPACE_NV, \
5657
_release) \
58+
break; \
59+
case SequentiallyConsistent: \
60+
__CLC_NVVM_FENCE_SC_SM70() \
61+
__CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
62+
ADDR_SPACE, ADDR_SPACE_NV, \
63+
_release) \
64+
break; \
5765
} \
5866
} else { \
5967
switch (order) { \

sycl/plugins/unified_runtime/CMakeLists.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
5757
include(FetchContent)
5858

5959
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
60-
# commit 09be0881b727fadb1c04b38c00d2562d7dc6875f
61-
# Merge: bb589ca8 e9f855d4
60+
# commit 29ee45c4451a682f744146cc9dbeb2617ecdd6b3
61+
# Merge: db4b0c14 4f5d005a
6262
# Author: Kenneth Benzie (Benie) <[email protected]>
63-
# Date: Thu Mar 14 22:10:28 2024 +0000
64-
# Merge pull request #1429 from nrspruit/l0_p2p_device_query
65-
# [L0] Support for urUsmP2PPeerAccessGetInfoExp to query p2p access info
66-
set(UNIFIED_RUNTIME_TAG 09be0881b727fadb1c04b38c00d2562d7dc6875f)
63+
# Date: Mon Mar 18 12:14:26 2024 +0000
64+
# Merge pull request #1291 from JackAKirk/cuda-seq-cst-b
65+
# [CUDA] Report that devices with cc >= sm_70 support seq_cst
66+
set(UNIFIED_RUNTIME_TAG 29ee45c4451a682f744146cc9dbeb2617ecdd6b3)
6767

6868
if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
6969
set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

0 commit comments

Comments
 (0)