Skip to content

[CIR][CUDA] Lowering device and shared variables #1438

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,24 @@ def CUDAKernelNameAttr : CIR_Attr<"CUDAKernelName",
let assemblyFormat = "`<` $kernel_name `>`";
}

def CUDAShadowNameAttr : CIR_Attr<"CUDAShadowName",
"cu.shadow_name"> {
let summary = "Device-side global variable name for this shadow.";
let description =
[{
This attribute is attached to global variable definitions and records the
mangled name of the global variable used on the device.

In CUDA, __device__, __constant__ and __shared__ variables, as well as
surface and texture variables, will generate a shadow symbol on host.
We must preserve the correspodence in order to generate registration
functions.
}];

let parameters = (ins "std::string":$device_side_name);
let assemblyFormat = "`<` $device_side_name `>`";
}

def CUDABinaryHandleAttr : CIR_Attr<"CUDABinaryHandle",
"cu.binary_handle"> {
let summary = "Fat binary handle for device code.";
Expand All @@ -52,4 +70,17 @@ def CUDABinaryHandleAttr : CIR_Attr<"CUDABinaryHandle",
let assemblyFormat = "`<` $name `>`";
}

def CUDAExternallyInitializedAttr : CIR_Attr<"CUDAExternallyInitialized",
"cu.externally_initialized"> {
let summary = "The marked variable is externally initialized.";
let description =
[{
CUDA __device__ and __constant__ variables, along with surface and
textures, might be initialized by host, hence "externally initialized".
Therefore they must be emitted even if they are not referenced.

The attribute corresponds to the attribute on LLVM with the same name.
}];
}

#endif // MLIR_CIR_DIALECT_CIR_CUDA_ATTRS
20 changes: 20 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,3 +283,23 @@ mlir::Operation *CIRGenCUDARuntime::getKernelHandle(cir::FuncOp fn,

return globalOp;
}

void CIRGenCUDARuntime::internalizeDeviceSideVar(
const VarDecl *d, cir::GlobalLinkageKind &linkage) {
if (cgm.getLangOpts().GPURelocatableDeviceCode)
llvm_unreachable("NYI");

// __shared__ variables are odd. Shadows do get created, but
// they are not registered with the CUDA runtime, so they
// can't really be used to access their device-side
// counterparts. It's not clear yet whether it's nvcc's bug or
// a feature, but we've got to do the same for compatibility.
if (d->hasAttr<CUDADeviceAttr>() || d->hasAttr<CUDAConstantAttr>() ||
d->hasAttr<CUDASharedAttr>()) {
linkage = cir::GlobalLinkageKind::InternalLinkage;
}

if (d->getType()->isCUDADeviceBuiltinSurfaceType() ||
d->getType()->isCUDADeviceBuiltinTextureType())
llvm_unreachable("NYI");
}
2 changes: 2 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ class CIRGenCUDARuntime {
const CUDAKernelCallExpr *expr,
ReturnValueSlot retValue);
virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD);
virtual void internalizeDeviceSideVar(const VarDecl *d,
cir::GlobalLinkageKind &linkage);
};

} // namespace clang::CIRGen
Expand Down
27 changes: 16 additions & 11 deletions clang/lib/CIR/CodeGen/CIRGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -569,13 +569,13 @@ bool CIRGenModule::shouldEmitCUDAGlobalVar(const VarDecl *global) const {
// their device-side incarnations.

if (global->hasAttr<CUDAConstantAttr>() ||
global->hasAttr<CUDASharedAttr>() ||
global->getType()->isCUDADeviceBuiltinSurfaceType() ||
global->getType()->isCUDADeviceBuiltinTextureType()) {
llvm_unreachable("NYI");
}

return !langOpts.CUDAIsDevice || global->hasAttr<CUDADeviceAttr>();
return !langOpts.CUDAIsDevice || global->hasAttr<CUDADeviceAttr>() ||
global->hasAttr<CUDASharedAttr>();
}

void CIRGenModule::emitGlobal(GlobalDecl gd) {
Expand All @@ -598,8 +598,10 @@ void CIRGenModule::emitGlobal(GlobalDecl gd) {
assert(!global->hasAttr<CPUDispatchAttr>() && "NYI");

if (langOpts.CUDA || langOpts.HIP) {
// clang uses the same flag when building HIP code
if (langOpts.CUDAIsDevice) {
if (const auto *vd = dyn_cast<VarDecl>(global)) {
if (!shouldEmitCUDAGlobalVar(vd))
return;
} else if (langOpts.CUDAIsDevice) {
// This will implicitly mark templates and their
// specializations as __host__ __device__.
if (langOpts.OffloadImplicitHostDeviceTemplates)
Expand All @@ -621,11 +623,6 @@ void CIRGenModule::emitGlobal(GlobalDecl gd) {
return;
}
}

if (const auto *vd = dyn_cast<VarDecl>(global)) {
if (!shouldEmitCUDAGlobalVar(vd))
return;
}
}

if (langOpts.OpenMP) {
Expand Down Expand Up @@ -1394,7 +1391,7 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d,
d->getType()->isCUDADeviceBuiltinTextureType());
if (getLangOpts().CUDA &&
(isCudaSharedVar || isCudaShadowVar || isCudaDeviceShadowVar))
assert(0 && "not implemented");
init = UndefAttr::get(&getMLIRContext(), convertType(d->getType()));
else if (d->hasAttr<LoaderUninitializedAttr>())
assert(0 && "not implemented");
else if (!initExpr) {
Expand Down Expand Up @@ -1490,11 +1487,19 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d,
cir::GlobalLinkageKind linkage =
getCIRLinkageVarDefinition(d, /*IsConstant=*/false);

// TODO(cir):
// CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
// the device. [...]"
// CUDA B.2.2 "The __constant__ qualifier, optionally used together with
// __device__, declares a variable that: [...]
if (langOpts.CUDA && langOpts.CUDAIsDevice) {
// __shared__ variables is not marked as externally initialized,
// because they must not be initialized.
if (linkage != cir::GlobalLinkageKind::InternalLinkage &&
(d->hasAttr<CUDADeviceAttr>())) {
gv->setAttr(CUDAExternallyInitializedAttr::getMnemonic(),
CUDAExternallyInitializedAttr::get(&getMLIRContext()));
}
}

// Set initializer and finalize emission
CIRGenModule::setInitializer(gv, init);
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2397,6 +2397,12 @@ mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite(

attributes.push_back(rewriter.getNamedAttr("visibility_", visibility));

if (auto extInit =
op->getAttr(CUDAExternallyInitializedAttr::getMnemonic())) {
attributes.push_back(rewriter.getNamedAttr("externally_initialized",
rewriter.getUnitAttr()));
}

if (init.has_value()) {
if (mlir::isa<cir::FPAttr, cir::IntAttr, cir::BoolAttr>(init.value())) {
// If a directly equivalent attribute is available, use it.
Expand Down
10 changes: 9 additions & 1 deletion clang/test/CIR/CodeGen/CUDA/global-vars.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,15 @@
// RUN: %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s

// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
// RUN: %s -o %t.cir
// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.cir %s

__device__ int a;
// CIR-DEVICE: cir.global external addrspace(offload_global) @a = #cir.int<0>
// LLVM-DEVICE: @a = addrspace(1) externally_initialized global i32 0, align 4

// CIR-DEVICE: cir.global external addrspace(offload_global) @a = #cir.int<0> : !s32i {alignment = 4 : i64} loc(#loc3)
__shared__ int shared;
// CIR-DEVICE: cir.global external addrspace(offload_local) @shared = #cir.undef
// LLVM-DEVICE: @shared = addrspace(3) global i32 undef, align 4
Loading