Skip to content

Commit a3bd87b

Browse files
committed
[AMDGPU] Call the FINI_ARRAY destructors in the correct order (llvm#71815)
Summary: The AMDGPU backend uses the linker-provided INIT_ARRAY and FINI_ARRAY sections to call all the global constructors in a single kernel. Previously this mistakenly used the same iteration logic for both arrays. The destructors stored in FINI_ARRAY are stored in the same order as the ones in the INIT_ARRAY section so we need to traverse it in reverse order. Relanding after the revert in fe7b5e2 using the IR builder interface instead of ConstantExpr.
1 parent dbd00c3 commit a3bd87b

File tree

4 files changed

+63
-23
lines changed

4 files changed

+63
-23
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp

+35-4
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,22 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
5353
//
5454
// extern "C" void * __init_array_start[];
5555
// extern "C" void * __init_array_end[];
56+
// extern "C" void * __fini_array_start[];
57+
// extern "C" void * __fini_array_end[];
5658
//
5759
// using InitCallback = void();
60+
// using FiniCallback = void(void);
5861
//
5962
// void call_init_array_callbacks() {
6063
// for (auto start = __init_array_start; start != __init_array_end; ++start)
6164
// reinterpret_cast<InitCallback *>(*start)();
6265
// }
66+
//
67+
// void call_fini_array_callbacks() {
68+
// size_t fini_array_size = __fini_array_end - __fini_array_start;
69+
// for (size_t i = fini_array_size; i > 0; --i)
70+
// reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
71+
// }
6372
static void createInitOrFiniCalls(Function &F, bool IsCtor) {
6473
Module &M = *F.getParent();
6574
LLVMContext &C = M.getContext();
@@ -96,15 +105,37 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) {
96105
// for now we just call them with no arguments.
97106
auto *CallBackTy = FunctionType::get(IRB.getVoidTy(), {});
98107

99-
IRB.CreateCondBr(IRB.CreateICmpNE(Begin, End), LoopBB, ExitBB);
108+
Value *Start = Begin;
109+
Value *Stop = End;
110+
// The destructor array must be called in reverse order. Get a constant
111+
// expression to the end of the array and iterate backwards instead.
112+
if (!IsCtor) {
113+
Type *Int64Ty = IntegerType::getInt64Ty(C);
114+
auto *EndPtr = IRB.CreatePtrToInt(End, Int64Ty);
115+
auto *BeginPtr = IRB.CreatePtrToInt(Begin, Int64Ty);
116+
auto *ByteSize = IRB.CreateSub(EndPtr, BeginPtr);
117+
auto *Size = IRB.CreateAShr(ByteSize, ConstantInt::get(Int64Ty, 3));
118+
auto *Offset = IRB.CreateSub(Size, ConstantInt::get(Int64Ty, 1));
119+
Start = IRB.CreateInBoundsGEP(
120+
ArrayType::get(IRB.getPtrTy(), 0), Begin,
121+
ArrayRef<Value *>({ConstantInt::get(Int64Ty, 0), Offset}));
122+
Stop = Begin;
123+
}
124+
125+
IRB.CreateCondBr(
126+
IRB.CreateCmp(IsCtor ? ICmpInst::ICMP_NE : ICmpInst::ICMP_UGE, Start,
127+
Stop),
128+
LoopBB, ExitBB);
100129
IRB.SetInsertPoint(LoopBB);
101130
auto *CallBackPHI = IRB.CreatePHI(PtrTy, 2, "ptr");
102131
auto *CallBack = IRB.CreateLoad(CallBackTy->getPointerTo(F.getAddressSpace()),
103132
CallBackPHI, "callback");
104133
IRB.CreateCall(CallBackTy, CallBack);
105-
auto *NewCallBack = IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, 1, "next");
106-
auto *EndCmp = IRB.CreateICmpEQ(NewCallBack, End, "end");
107-
CallBackPHI->addIncoming(Begin, &F.getEntryBlock());
134+
auto *NewCallBack =
135+
IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, IsCtor ? 1 : -1, "next");
136+
auto *EndCmp = IRB.CreateCmp(IsCtor ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_ULT,
137+
NewCallBack, Stop, "end");
138+
CallBackPHI->addIncoming(Start, &F.getEntryBlock());
108139
CallBackPHI->addIncoming(NewCallBack, LoopBB);
109140
IRB.CreateCondBr(EndCmp, ExitBB, LoopBB);
110141
IRB.SetInsertPoint(ExitBB);

llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll

+8-6
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ define void @bar() addrspace(1) {
2525
ret void
2626
}
2727

28-
29-
3028
;.
3129
; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null }, { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null }]
3230
; CHECK: @[[LLVM_GLOBAL_DTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null }]
@@ -65,13 +63,17 @@ define void @bar() addrspace(1) {
6563
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
6664
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
6765
; CHECK-NEXT: entry:
68-
; CHECK-NEXT: br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
66+
; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3
67+
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1
68+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]]
69+
; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start
70+
; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
6971
; CHECK: while.entry:
70-
; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
72+
; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[TMP2]], [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
7173
; CHECK-NEXT: [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
7274
; CHECK-NEXT: call void [[CALLBACK]]()
73-
; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
74-
; CHECK-NEXT: [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
75+
; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 -1
76+
; CHECK-NEXT: [[END:%.*]] = icmp ult ptr addrspace(1) [[NEXT]], @__fini_array_start
7577
; CHECK-NEXT: br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
7678
; CHECK: while.end:
7779
; CHECK-NEXT: ret void

llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll

+11-8
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,19 @@
1212
@llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
1313
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
1414

15-
16-
17-
18-
1915
; VISIBILITY: FUNC WEAK PROTECTED {{.*}} amdgcn.device.init
2016
; VISIBILITY: OBJECT WEAK DEFAULT {{.*}} amdgcn.device.init.kd
2117
; VISIBILITY: FUNC WEAK PROTECTED {{.*}} amdgcn.device.fini
2218
; VISIBILITY: OBJECT WEAK DEFAULT {{.*}} amdgcn.device.fini.kd
19+
2320
; SECTION: .init_array.1 INIT_ARRAY {{.*}} {{.*}} 000008 00 WA 0 0 8
2421
; SECTION: .fini_array.1 FINI_ARRAY {{.*}} {{.*}} 000008 00 WA 0 0 8
22+
2523
; DISABLED-NOT: FUNC GLOBAL PROTECTED {{.*}} amdgcn.device.init
2624
; DISABLED-NOT: OBJECT GLOBAL DEFAULT {{.*}} amdgcn.device.init.kd
2725
; DISABLED-NOT: FUNC GLOBAL PROTECTED {{.*}} amdgcn.device.fini
2826
; DISABLED-NOT: OBJECT GLOBAL DEFAULT {{.*}} amdgcn.device.fini.kd
27+
2928
; METADATA: amdhsa.kernels:
3029
; METADATA: .kind: init
3130
; METADATA: .max_flat_workgroup_size: 1
@@ -79,13 +78,17 @@ define internal void @bar() {
7978
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
8079
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
8180
; CHECK-NEXT: entry:
82-
; CHECK-NEXT: br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
81+
; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3
82+
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1
83+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]]
84+
; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start
85+
; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
8386
; CHECK: while.entry:
84-
; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
87+
; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[TMP2]], [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
8588
; CHECK-NEXT: [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
8689
; CHECK-NEXT: call void [[CALLBACK]]()
87-
; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
88-
; CHECK-NEXT: [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
90+
; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 -1
91+
; CHECK-NEXT: [[END:%.*]] = icmp ult ptr addrspace(1) [[NEXT]], @__fini_array_start
8992
; CHECK-NEXT: br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
9093
; CHECK: while.end:
9194
; CHECK-NEXT: ret void

llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll

+9-5
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf -s - 2>&1 | FileCheck %s -check-prefix=CHECK-VIS
44

55

6-
; UTC_ARGS: --disable
76
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }, { i32, ptr, ptr } { i32 1, ptr @foo.5, ptr null }]
87
@llvm.global_dtors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }, { i32, ptr, ptr } { i32 1, ptr @bar.5, ptr null }]
98

9+
; UTC_ARGS: --disable
1010
; CHECK: @__init_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
1111
; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
1212
; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
@@ -70,13 +70,17 @@ define internal void @bar.5() {
7070
; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
7171
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
7272
; CHECK-NEXT: entry:
73-
; CHECK-NEXT: br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
73+
; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 sub (i64 ptrtoint (ptr addrspace(1) @__fini_array_end to i64), i64 ptrtoint (ptr addrspace(1) @__fini_array_start to i64)), 3
74+
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1
75+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x ptr], ptr addrspace(1) @__fini_array_start, i64 0, i64 [[TMP1]]
76+
; CHECK-NEXT: [[TMP3:%.*]] = icmp uge ptr addrspace(1) [[TMP2]], @__fini_array_start
77+
; CHECK-NEXT: br i1 [[TMP3]], label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
7478
; CHECK: while.entry:
75-
; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
79+
; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[TMP2]], [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
7680
; CHECK-NEXT: [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
7781
; CHECK-NEXT: call void [[CALLBACK]]()
78-
; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
79-
; CHECK-NEXT: [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
82+
; CHECK-NEXT: [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 -1
83+
; CHECK-NEXT: [[END:%.*]] = icmp ult ptr addrspace(1) [[NEXT]], @__fini_array_start
8084
; CHECK-NEXT: br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
8185
; CHECK: while.end:
8286
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)