Skip to content

Commit 52cc487

Browse files
committed
[cuda] Include GPU binary into host object file and generate init/deinit code.
- Added the -fcuda-include-gpubinary option to incorporate the results of device-side compilation into the host-side object file.
- Generate code to register GPU binaries and their associated kernels with the CUDA runtime and to clean up on exit.
- Added a test case for init/deinit code generation.

Differential Revision: http://reviews.llvm.org/D9507
llvm-svn: 236765
1 parent f52123b commit 52cc487

File tree

8 files changed

+277
-19
lines changed

8 files changed

+277
-19
lines changed

clang/include/clang/Driver/CC1Options.td

+2
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,8 @@ def fcuda_allow_host_calls_from_host_device : Flag<["-"],
631631
def fcuda_disable_target_call_checks : Flag<["-"],
632632
"fcuda-disable-target-call-checks">,
633633
HelpText<"Disable all cross-target (host, device, etc.) call checks in CUDA">;
634+
def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
635+
HelpText<"Incorporate CUDA device-side binary into host object file.">;
634636

635637
} // let Flags = [CC1Option]
636638

clang/include/clang/Frontend/CodeGenOptions.h

+5
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,11 @@ class CodeGenOptions : public CodeGenOptionsBase {
163163
/// Name of the profile file to use as input for -fprofile-instr-use
164164
std::string InstrProfileInput;
165165

166+
/// A list of file names passed with -fcuda-include-gpubinary options to
167+
/// forward to CUDA runtime back-end for incorporating them into host-side
168+
/// object file.
169+
std::vector<std::string> CudaGpuBinaryFileNames;
170+
166171
/// Regular expression to select optimizations for which we should enable
167172
/// optimization remarks. Transformation passes whose name matches this
168173
/// expression (and support this feature), will emit a diagnostic

clang/lib/CodeGen/CGCUDANV.cpp

+205-13
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include "llvm/IR/CallSite.h"
2121
#include "llvm/IR/Constants.h"
2222
#include "llvm/IR/DerivedTypes.h"
23-
#include <vector>
2423

2524
using namespace clang;
2625
using namespace CodeGen;
@@ -30,29 +29,66 @@ namespace {
3029
class CGNVCUDARuntime : public CGCUDARuntime {
3130

3231
private:
33-
llvm::Type *IntTy, *SizeTy;
34-
llvm::PointerType *CharPtrTy, *VoidPtrTy;
32+
llvm::Type *IntTy, *SizeTy, *VoidTy;
33+
llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
34+
35+
/// Convenience reference to LLVM Context
36+
llvm::LLVMContext &Context;
37+
/// Convenience reference to the current module
38+
llvm::Module &TheModule;
39+
/// Keeps track of kernel launch stubs emitted in this module
40+
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
41+
/// Keeps track of variables containing handles of GPU binaries. Populated by
42+
/// makeModuleCtorFunction() and used to create corresponding cleanup calls in
43+
/// makeModuleDtorFunction()
44+
llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
3545

3646
llvm::Constant *getSetupArgumentFn() const;
3747
llvm::Constant *getLaunchFn() const;
3848

49+
/// Creates a function to register all kernel stubs generated in this module.
50+
llvm::Function *makeRegisterKernelsFn();
51+
52+
/// Helper function that generates a constant string and returns a pointer to
53+
/// the start of the string. The result of this function can be used anywhere
54+
/// where the C code specifies const char*.
55+
llvm::Constant *makeConstantString(const std::string &Str,
56+
const std::string &Name = "",
57+
unsigned Alignment = 0) {
58+
llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
59+
llvm::ConstantInt::get(SizeTy, 0)};
60+
auto *ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
61+
return llvm::ConstantExpr::getGetElementPtr(ConstStr->getValueType(),
62+
ConstStr, Zeros);
63+
}
64+
65+
void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
66+
3967
public:
4068
CGNVCUDARuntime(CodeGenModule &CGM);
4169

42-
void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
70+
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
71+
/// Creates module constructor function
72+
llvm::Function *makeModuleCtorFunction() override;
73+
/// Creates module destructor function
74+
llvm::Function *makeModuleDtorFunction() override;
4375
};
4476

4577
}
4678

47-
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM) {
79+
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
80+
: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
81+
TheModule(CGM.getModule()) {
4882
CodeGen::CodeGenTypes &Types = CGM.getTypes();
4983
ASTContext &Ctx = CGM.getContext();
5084

5185
IntTy = Types.ConvertType(Ctx.IntTy);
5286
SizeTy = Types.ConvertType(Ctx.getSizeType());
87+
VoidTy = llvm::Type::getVoidTy(Context);
5388

5489
CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
5590
VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
91+
VoidPtrPtrTy = VoidPtrTy->getPointerTo();
5692
}
5793

5894
llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -68,14 +104,17 @@ llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
68104

69105
llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
70106
// cudaError_t cudaLaunch(char *)
71-
std::vector<llvm::Type*> Params;
72-
Params.push_back(CharPtrTy);
73-
return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
74-
Params, false),
75-
"cudaLaunch");
107+
return CGM.CreateRuntimeFunction(
108+
llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
109+
}
110+
111+
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
112+
FunctionArgList &Args) {
113+
EmittedKernels.push_back(CGF.CurFn);
114+
emitDeviceStubBody(CGF, Args);
76115
}
77116

78-
void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
117+
void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
79118
FunctionArgList &Args) {
80119
// Build the argument value list and the argument stack struct type.
81120
SmallVector<llvm::Value *, 16> ArgValues;
@@ -87,8 +126,7 @@ void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
87126
assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
88127
ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
89128
}
90-
llvm::StructType *ArgStackTy = llvm::StructType::get(
91-
CGF.getLLVMContext(), ArgTypes);
129+
llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes);
92130

93131
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
94132

@@ -120,6 +158,160 @@ void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
120158
CGF.EmitBlock(EndBlock);
121159
}
122160

161+
/// Creates internal function to register all kernel stubs generated in this
162+
/// module with the CUDA runtime.
163+
/// \code
164+
/// void __cuda_register_kernels(void** GpuBinaryHandle) {
165+
/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
166+
/// ...
167+
/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
168+
/// }
169+
/// \endcode
170+
llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
171+
llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
172+
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
173+
llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
174+
llvm::BasicBlock *EntryBB =
175+
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
176+
CGBuilderTy Builder(Context);
177+
Builder.SetInsertPoint(EntryBB);
178+
179+
// void __cudaRegisterFunction(void **, const char *, char *, const char *,
180+
// int, uint3*, uint3*, dim3*, dim3*, int*)
181+
std::vector<llvm::Type *> RegisterFuncParams = {
182+
VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
183+
VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
184+
llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
185+
llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
186+
"__cudaRegisterFunction");
187+
188+
// Extract the GpuBinaryHandle passed as the first argument to
189+
// __cuda_register_kernels() and generate __cudaRegisterFunction() call for
190+
// each emitted kernel.
191+
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
192+
for (llvm::Function *Kernel : EmittedKernels) {
193+
llvm::Constant *KernelName = makeConstantString(Kernel->getName());
194+
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
195+
llvm::Value *args[] = {
196+
&GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
197+
KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
198+
NullPtr, NullPtr, NullPtr,
199+
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
200+
Builder.CreateCall(RegisterFunc, args);
201+
}
202+
203+
Builder.CreateRetVoid();
204+
return RegisterKernelsFunc;
205+
}
206+
207+
/// Creates a global constructor function for the module:
208+
/// \code
209+
/// void __cuda_module_ctor(void*) {
210+
/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
211+
/// __cuda_register_kernels(Handle0);
212+
/// ...
213+
/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
214+
/// __cuda_register_kernels(HandleN);
215+
/// }
216+
/// \endcode
217+
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
218+
// void __cuda_register_kernels(void** GpuBinaryHandle);
219+
llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
220+
// void ** __cudaRegisterFatBinary(void *);
221+
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
222+
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
223+
"__cudaRegisterFatBinary");
224+
// struct { int magic, int version, void * gpu_binary, void * dont_care };
225+
llvm::StructType *FatbinWrapperTy =
226+
llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr);
227+
228+
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
229+
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
230+
llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
231+
llvm::BasicBlock *CtorEntryBB =
232+
llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
233+
CGBuilderTy CtorBuilder(Context);
234+
235+
CtorBuilder.SetInsertPoint(CtorEntryBB);
236+
237+
// For each GPU binary, register it with the CUDA runtime and store returned
238+
// handle in a global variable and save the handle in GpuBinaryHandles vector
239+
// to be cleaned up in destructor on exit. Then associate all known kernels
240+
// with the GPU binary handle so CUDA runtime can figure out what to call on
241+
// the GPU side.
242+
for (const std::string &GpuBinaryFileName :
243+
CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
244+
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
245+
llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
246+
if (std::error_code EC = GpuBinaryOrErr.getError()) {
247+
CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
248+
<< EC.message();
249+
continue;
250+
}
251+
252+
// Create initialized wrapper structure that points to the loaded GPU binary
253+
llvm::Constant *Values[] = {
254+
llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic.
255+
llvm::ConstantInt::get(IntTy, 1), // Fatbin version.
256+
makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data.
257+
llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1.
258+
llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
259+
TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
260+
llvm::ConstantStruct::get(FatbinWrapperTy, Values),
261+
"__cuda_fatbin_wrapper");
262+
263+
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
264+
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
265+
RegisterFatbinFunc,
266+
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
267+
llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
268+
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
269+
llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
270+
CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryHandle, false);
271+
272+
// Call __cuda_register_kernels(GpuBinaryHandle);
273+
CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
274+
275+
// Save GpuBinaryHandle so we can unregister it in destructor.
276+
GpuBinaryHandles.push_back(GpuBinaryHandle);
277+
}
278+
279+
CtorBuilder.CreateRetVoid();
280+
return ModuleCtorFunc;
281+
}
282+
283+
/// Creates a global destructor function that unregisters all GPU code blobs
284+
/// registered by constructor.
285+
/// \code
286+
/// void __cuda_module_dtor(void*) {
287+
/// __cudaUnregisterFatBinary(Handle0);
288+
/// ...
289+
/// __cudaUnregisterFatBinary(HandleN);
290+
/// }
291+
/// \endcode
292+
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
293+
// void __cudaUnregisterFatBinary(void ** handle);
294+
llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
295+
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
296+
"__cudaUnregisterFatBinary");
297+
298+
llvm::Function *ModuleDtorFunc = llvm::Function::Create(
299+
llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
300+
llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
301+
llvm::BasicBlock *DtorEntryBB =
302+
llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
303+
CGBuilderTy DtorBuilder(Context);
304+
DtorBuilder.SetInsertPoint(DtorEntryBB);
305+
306+
for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
307+
DtorBuilder.CreateCall(UnregisterFatbinFunc,
308+
DtorBuilder.CreateLoad(GpuBinaryHandle, false));
309+
}
310+
311+
DtorBuilder.CreateRetVoid();
312+
return ModuleDtorFunc;
313+
}
314+
123315
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
124316
return new CGNVCUDARuntime(CGM);
125317
}

clang/lib/CodeGen/CGCUDARuntime.h

+14-3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
#ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
1717
#define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
1818

19+
namespace llvm {
20+
class Function;
21+
}
22+
1923
namespace clang {
2024

2125
class CUDAKernelCallExpr;
@@ -39,10 +43,17 @@ class CGCUDARuntime {
3943
virtual RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
4044
const CUDAKernelCallExpr *E,
4145
ReturnValueSlot ReturnValue);
42-
43-
virtual void EmitDeviceStubBody(CodeGenFunction &CGF,
44-
FunctionArgList &Args) = 0;
4546

47+
/// Emits a kernel launch stub.
48+
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
49+
50+
/// Constructs and returns a module initialization function or nullptr if it's
51+
/// not needed. Must be called after all kernels have been emitted.
52+
virtual llvm::Function *makeModuleCtorFunction() = 0;
53+
54+
/// Returns a module cleanup function or nullptr if it's not needed.
55+
/// Must be called after makeModuleCtorFunction().
56+
virtual llvm::Function *makeModuleDtorFunction() = 0;
4657
};
4758

4859
/// Creates an instance of a CUDA runtime class.

clang/lib/CodeGen/CodeGenFunction.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -878,7 +878,7 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
878878
else if (getLangOpts().CUDA &&
879879
!getLangOpts().CUDAIsDevice &&
880880
FD->hasAttr<CUDAGlobalAttr>())
881-
CGM.getCUDARuntime().EmitDeviceStubBody(*this, Args);
881+
CGM.getCUDARuntime().emitDeviceStub(*this, Args);
882882
else if (isa<CXXConversionDecl>(FD) &&
883883
cast<CXXConversionDecl>(FD)->isLambdaToBlockPointerConversion()) {
884884
// The lambda conversion to block pointer is special; the semantics can't be

clang/lib/CodeGen/CodeGenModule.cpp

+7-1
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,13 @@ void CodeGenModule::Release() {
350350
if (ObjCRuntime)
351351
if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction())
352352
AddGlobalCtor(ObjCInitFunction);
353+
if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice &&
354+
CUDARuntime) {
355+
if (llvm::Function *CudaCtorFunction = CUDARuntime->makeModuleCtorFunction())
356+
AddGlobalCtor(CudaCtorFunction);
357+
if (llvm::Function *CudaDtorFunction = CUDARuntime->makeModuleDtorFunction())
358+
AddGlobalDtor(CudaDtorFunction);
359+
}
353360
if (PGOReader && PGOStats.hasDiagnostics())
354361
PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName);
355362
EmitCtorList(GlobalCtors, "llvm.global_ctors");
@@ -3678,4 +3685,3 @@ void CodeGenModule::EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
36783685
CXXGlobalInits.push_back(InitFunction);
36793686
}
36803687
}
3681-

clang/lib/Frontend/CompilerInvocation.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,9 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
651651
Args.getAllArgValues(OPT_fsanitize_recover_EQ), Diags,
652652
Opts.SanitizeRecover);
653653

654+
Opts.CudaGpuBinaryFileNames =
655+
Args.getAllArgValues(OPT_fcuda_include_gpubinary);
656+
654657
return Success;
655658
}
656659

0 commit comments

Comments
 (0)