20
20
#include " llvm/IR/CallSite.h"
21
21
#include " llvm/IR/Constants.h"
22
22
#include " llvm/IR/DerivedTypes.h"
23
- #include < vector>
24
23
25
24
using namespace clang ;
26
25
using namespace CodeGen ;
@@ -30,29 +29,66 @@ namespace {
30
29
class CGNVCUDARuntime : public CGCUDARuntime {
31
30
32
31
private:
33
- llvm::Type *IntTy, *SizeTy;
34
- llvm::PointerType *CharPtrTy, *VoidPtrTy;
32
+ llvm::Type *IntTy, *SizeTy, *VoidTy;
33
+ llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
34
+
35
+ // / Convenience reference to LLVM Context
36
+ llvm::LLVMContext &Context;
37
+ // / Convenience reference to the current module
38
+ llvm::Module &TheModule;
39
+ // / Keeps track of kernel launch stubs emitted in this module
40
+ llvm::SmallVector<llvm::Function *, 16 > EmittedKernels;
41
+ // / Keeps track of variables containing handles of GPU binaries. Populated by
42
+ // / ModuleCtorFunction() and used to create corresponding cleanup calls in
43
+ // / ModuleDtorFunction()
44
+ llvm::SmallVector<llvm::GlobalVariable *, 16 > GpuBinaryHandles;
35
45
36
46
llvm::Constant *getSetupArgumentFn () const ;
37
47
llvm::Constant *getLaunchFn () const ;
38
48
49
+ // / Creates a function to register all kernel stubs generated in this module.
50
+ llvm::Function *makeRegisterKernelsFn ();
51
+
52
+ // / Helper function that generates a constant string and returns a pointer to
53
+ // / the start of the string. The result of this function can be used anywhere
54
+ // / where the C code specifies const char*.
55
+ llvm::Constant *makeConstantString (const std::string &Str,
56
+ const std::string &Name = " " ,
57
+ unsigned Alignment = 0 ) {
58
+ llvm::Constant *Zeros[] = {llvm::ConstantInt::get (SizeTy, 0 ),
59
+ llvm::ConstantInt::get (SizeTy, 0 )};
60
+ auto *ConstStr = CGM.GetAddrOfConstantCString (Str, Name.c_str ());
61
+ return llvm::ConstantExpr::getGetElementPtr (ConstStr->getValueType (),
62
+ ConstStr, Zeros);
63
+ }
64
+
65
+ void emitDeviceStubBody (CodeGenFunction &CGF, FunctionArgList &Args);
66
+
39
67
public:
40
68
CGNVCUDARuntime (CodeGenModule &CGM);
41
69
42
- void EmitDeviceStubBody (CodeGenFunction &CGF, FunctionArgList &Args) override ;
70
+ void emitDeviceStub (CodeGenFunction &CGF, FunctionArgList &Args) override ;
71
+ // / Creates module constructor function
72
+ llvm::Function *makeModuleCtorFunction () override ;
73
+ // / Creates module destructor function
74
+ llvm::Function *makeModuleDtorFunction () override ;
43
75
};
44
76
45
77
}
46
78
47
- CGNVCUDARuntime::CGNVCUDARuntime (CodeGenModule &CGM) : CGCUDARuntime(CGM) {
79
+ CGNVCUDARuntime::CGNVCUDARuntime (CodeGenModule &CGM)
80
+ : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
81
+ TheModule(CGM.getModule()) {
48
82
CodeGen::CodeGenTypes &Types = CGM.getTypes ();
49
83
ASTContext &Ctx = CGM.getContext ();
50
84
51
85
IntTy = Types.ConvertType (Ctx.IntTy );
52
86
SizeTy = Types.ConvertType (Ctx.getSizeType ());
87
+ VoidTy = llvm::Type::getVoidTy (Context);
53
88
54
89
CharPtrTy = llvm::PointerType::getUnqual (Types.ConvertType (Ctx.CharTy ));
55
90
VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType (Ctx.VoidPtrTy ));
91
+ VoidPtrPtrTy = VoidPtrTy->getPointerTo ();
56
92
}
57
93
58
94
llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn () const {
@@ -68,14 +104,17 @@ llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
68
104
69
105
/// Returns the declaration of the runtime function used to launch a kernel,
/// typed here as taking a char* and returning int.
llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
  // cudaError_t cudaLaunch(char *)
  // The symbol name must match the runtime entry point exactly; a stray
  // leading space in the literal would declare a different function.
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
}
110
+
111
+ void CGNVCUDARuntime::emitDeviceStub (CodeGenFunction &CGF,
112
+ FunctionArgList &Args) {
113
+ EmittedKernels.push_back (CGF.CurFn );
114
+ emitDeviceStubBody (CGF, Args);
76
115
}
77
116
78
- void CGNVCUDARuntime::EmitDeviceStubBody (CodeGenFunction &CGF,
117
+ void CGNVCUDARuntime::emitDeviceStubBody (CodeGenFunction &CGF,
79
118
FunctionArgList &Args) {
80
119
// Build the argument value list and the argument stack struct type.
81
120
SmallVector<llvm::Value *, 16 > ArgValues;
@@ -87,8 +126,7 @@ void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
87
126
assert (isa<llvm::PointerType>(V->getType ()) && " Arg type not PointerType" );
88
127
ArgTypes.push_back (cast<llvm::PointerType>(V->getType ())->getElementType ());
89
128
}
90
- llvm::StructType *ArgStackTy = llvm::StructType::get (
91
- CGF.getLLVMContext (), ArgTypes);
129
+ llvm::StructType *ArgStackTy = llvm::StructType::get (Context, ArgTypes);
92
130
93
131
llvm::BasicBlock *EndBlock = CGF.createBasicBlock (" setup.end" );
94
132
@@ -120,6 +158,160 @@ void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
120
158
CGF.EmitBlock (EndBlock);
121
159
}
122
160
161
+ // / Creates internal function to register all kernel stubs generated in this
162
+ // / module with the CUDA runtime.
163
+ // / \code
164
+ // / void __cuda_register_kernels(void** GpuBinaryHandle) {
165
+ // / __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
166
+ // / ...
167
+ // / __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
168
+ // / }
169
+ // / \endcode
170
+ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn () {
171
+ llvm::Function *RegisterKernelsFunc = llvm::Function::Create (
172
+ llvm::FunctionType::get (VoidTy, VoidPtrPtrTy, false ),
173
+ llvm::GlobalValue::InternalLinkage, " __cuda_register_kernels" , &TheModule);
174
+ llvm::BasicBlock *EntryBB =
175
+ llvm::BasicBlock::Create (Context, " entry" , RegisterKernelsFunc);
176
+ CGBuilderTy Builder (Context);
177
+ Builder.SetInsertPoint (EntryBB);
178
+
179
+ // void __cudaRegisterFunction(void **, const char *, char *, const char *,
180
+ // int, uint3*, uint3*, dim3*, dim3*, int*)
181
+ std::vector<llvm::Type *> RegisterFuncParams = {
182
+ VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
183
+ VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo ()};
184
+ llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction (
185
+ llvm::FunctionType::get (IntTy, RegisterFuncParams, false ),
186
+ " __cudaRegisterFunction" );
187
+
188
+ // Extract GpuBinaryHandle passed as the first argument passed to
189
+ // __cuda_register_kernels() and generate __cudaRegisterFunction() call for
190
+ // each emitted kernel.
191
+ llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin ();
192
+ for (llvm::Function *Kernel : EmittedKernels) {
193
+ llvm::Constant *KernelName = makeConstantString (Kernel->getName ());
194
+ llvm::Constant *NullPtr = llvm::ConstantPointerNull::get (VoidPtrTy);
195
+ llvm::Value *args[] = {
196
+ &GpuBinaryHandlePtr, Builder.CreateBitCast (Kernel, VoidPtrTy),
197
+ KernelName, KernelName, llvm::ConstantInt::get (IntTy, -1 ), NullPtr,
198
+ NullPtr, NullPtr, NullPtr,
199
+ llvm::ConstantPointerNull::get (IntTy->getPointerTo ())};
200
+ Builder.CreateCall (RegisterFunc, args);
201
+ }
202
+
203
+ Builder.CreateRetVoid ();
204
+ return RegisterKernelsFunc;
205
+ }
206
+
207
+ // / Creates a global constructor function for the module:
208
+ // / \code
209
+ // / void __cuda_module_ctor(void*) {
210
+ // / Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
211
+ // / __cuda_register_kernels(Handle0);
212
+ // / ...
213
+ // / HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
214
+ // / __cuda_register_kernels(HandleN);
215
+ // / }
216
+ // / \endcode
217
+ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction () {
218
+ // void __cuda_register_kernels(void* handle);
219
+ llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn ();
220
+ // void ** __cudaRegisterFatBinary(void *);
221
+ llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction (
222
+ llvm::FunctionType::get (VoidPtrPtrTy, VoidPtrTy, false ),
223
+ " __cudaRegisterFatBinary" );
224
+ // struct { int magic, int version, void * gpu_binary, void * dont_care };
225
+ llvm::StructType *FatbinWrapperTy =
226
+ llvm::StructType::get (IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr );
227
+
228
+ llvm::Function *ModuleCtorFunc = llvm::Function::Create (
229
+ llvm::FunctionType::get (VoidTy, VoidPtrTy, false ),
230
+ llvm::GlobalValue::InternalLinkage, " __cuda_module_ctor" , &TheModule);
231
+ llvm::BasicBlock *CtorEntryBB =
232
+ llvm::BasicBlock::Create (Context, " entry" , ModuleCtorFunc);
233
+ CGBuilderTy CtorBuilder (Context);
234
+
235
+ CtorBuilder.SetInsertPoint (CtorEntryBB);
236
+
237
+ // For each GPU binary, register it with the CUDA runtime and store returned
238
+ // handle in a global variable and save the handle in GpuBinaryHandles vector
239
+ // to be cleaned up in destructor on exit. Then associate all known kernels
240
+ // with the GPU binary handle so CUDA runtime can figure out what to call on
241
+ // the GPU side.
242
+ for (const std::string &GpuBinaryFileName :
243
+ CGM.getCodeGenOpts ().CudaGpuBinaryFileNames ) {
244
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
245
+ llvm::MemoryBuffer::getFileOrSTDIN (GpuBinaryFileName);
246
+ if (std::error_code EC = GpuBinaryOrErr.getError ()) {
247
+ CGM.getDiags ().Report (diag::err_cannot_open_file) << GpuBinaryFileName
248
+ << EC.message ();
249
+ continue ;
250
+ }
251
+
252
+ // Create initialized wrapper structure that points to the loaded GPU binary
253
+ llvm::Constant *Values[] = {
254
+ llvm::ConstantInt::get (IntTy, 0x466243b1 ), // Fatbin wrapper magic.
255
+ llvm::ConstantInt::get (IntTy, 1 ), // Fatbin version.
256
+ makeConstantString (GpuBinaryOrErr.get ()->getBuffer (), " " , 16 ), // Data.
257
+ llvm::ConstantPointerNull::get (VoidPtrTy)}; // Unused in fatbin v1.
258
+ llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable (
259
+ TheModule, FatbinWrapperTy, true , llvm::GlobalValue::InternalLinkage,
260
+ llvm::ConstantStruct::get (FatbinWrapperTy, Values),
261
+ " __cuda_fatbin_wrapper" );
262
+
263
+ // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
264
+ llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall (
265
+ RegisterFatbinFunc,
266
+ CtorBuilder.CreateBitCast (FatbinWrapper, VoidPtrTy));
267
+ llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable (
268
+ TheModule, VoidPtrPtrTy, false , llvm::GlobalValue::InternalLinkage,
269
+ llvm::ConstantPointerNull::get (VoidPtrPtrTy), " __cuda_gpubin_handle" );
270
+ CtorBuilder.CreateStore (RegisterFatbinCall, GpuBinaryHandle, false );
271
+
272
+ // Call __cuda_register_kernels(GpuBinaryHandle);
273
+ CtorBuilder.CreateCall (RegisterKernelsFunc, RegisterFatbinCall);
274
+
275
+ // Save GpuBinaryHandle so we can unregister it in destructor.
276
+ GpuBinaryHandles.push_back (GpuBinaryHandle);
277
+ }
278
+
279
+ CtorBuilder.CreateRetVoid ();
280
+ return ModuleCtorFunc;
281
+ }
282
+
283
+ // / Creates a global destructor function that unregisters all GPU code blobs
284
+ // / registered by constructor.
285
+ // / \code
286
+ // / void __cuda_module_dtor(void*) {
287
+ // / __cudaUnregisterFatBinary(Handle0);
288
+ // / ...
289
+ // / __cudaUnregisterFatBinary(HandleN);
290
+ // / }
291
+ // / \endcode
292
+ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction () {
293
+ // void __cudaUnregisterFatBinary(void ** handle);
294
+ llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction (
295
+ llvm::FunctionType::get (VoidTy, VoidPtrPtrTy, false ),
296
+ " __cudaUnregisterFatBinary" );
297
+
298
+ llvm::Function *ModuleDtorFunc = llvm::Function::Create (
299
+ llvm::FunctionType::get (VoidTy, VoidPtrTy, false ),
300
+ llvm::GlobalValue::InternalLinkage, " __cuda_module_dtor" , &TheModule);
301
+ llvm::BasicBlock *DtorEntryBB =
302
+ llvm::BasicBlock::Create (Context, " entry" , ModuleDtorFunc);
303
+ CGBuilderTy DtorBuilder (Context);
304
+ DtorBuilder.SetInsertPoint (DtorEntryBB);
305
+
306
+ for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
307
+ DtorBuilder.CreateCall (UnregisterFatbinFunc,
308
+ DtorBuilder.CreateLoad (GpuBinaryHandle, false ));
309
+ }
310
+
311
+ DtorBuilder.CreateRetVoid ();
312
+ return ModuleDtorFunc;
313
+ }
314
+
123
315
/// Factory for the NVIDIA CUDA runtime code generator. The returned object is
/// heap-allocated with new; this function does not release it.
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  CGCUDARuntime *Runtime = new CGNVCUDARuntime(CGM);
  return Runtime;
}
0 commit comments