Skip to content

Commit 21d2884

Browse files
committed
AMDGPU: Annotate functions that have stack objects
Relying on any MachineFunction state in the MachineFunctionInfo constructor is hazardous, because the construction time is unclear and determined by the first use. The function may be only partially constructed, which is part of why we have many of these hacky string attributes to track what we need for ABI lowering. For SelectionDAG, all stack objects are created up-front before calling convention lowering, so stack objects are visible at construction time. For GlobalISel, the IR function has not been visited yet and the allocas haven't been added to the MachineFrameInfo yet. This should fix failing to set flat_scratch_init in GlobalISel when needed. This pass really needs to be turned into some kind of analysis, but I haven't found a nice way to use one here.
1 parent 3d0d2fe commit 21d2884

File tree

4 files changed

+71
-17
lines changed

4 files changed

+71
-17
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -279,13 +279,19 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
279279
bool HasApertureRegs = ST.hasApertureRegs();
280280
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
281281

282+
bool HaveStackObjects = false;
282283
bool Changed = false;
283284
bool NeedQueuePtr = false;
284285
bool HaveCall = false;
285286
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
286287

287288
for (BasicBlock &BB : F) {
288289
for (Instruction &I : BB) {
290+
if (isa<AllocaInst>(I)) {
291+
HaveStackObjects = true;
292+
continue;
293+
}
294+
289295
if (auto *CB = dyn_cast<CallBase>(&I)) {
290296
const Function *Callee =
291297
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
@@ -355,6 +361,11 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
355361
Changed = true;
356362
}
357363

364+
if (HaveStackObjects) {
365+
F.addFnAttr("amdgpu-stack-objects");
366+
Changed = true;
367+
}
368+
358369
return Changed;
359370
}
360371

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

+5-17
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
5555

5656
Occupancy = ST.computeOccupancy(MF, getLDSSize());
5757
CallingConv::ID CC = F.getCallingConv();
58-
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
5958

6059
// FIXME: Should have analysis or something rather than attribute to detect
6160
// calls.
62-
const bool HasCalls = FrameInfo.hasCalls() || F.hasFnAttribute("amdgpu-calls");
61+
const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
6362

6463
// Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
6564
// have any calls.
@@ -125,8 +124,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
125124
WorkItemIDZ = true;
126125
}
127126

128-
bool HasStackObjects = FrameInfo.hasStackObjects();
129-
127+
bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
130128
if (isEntryFunction()) {
131129
// X, XY, and XYZ are the only supported combinations, so make sure Y is
132130
// enabled if Z is.
@@ -170,20 +168,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
170168
KernargSegmentPtr = true;
171169

172170
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
173-
auto hasNonSpillStackObjects = [&]() {
174-
// Avoid expensive checking if there's no stack objects.
175-
if (!HasStackObjects)
176-
return false;
177-
for (auto OI = FrameInfo.getObjectIndexBegin(),
178-
OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
179-
if (!FrameInfo.isSpillSlotObjectIndex(OI))
180-
return true;
181-
// All stack objects are spill slots.
182-
return false;
183-
};
184171
// TODO: This could be refined a lot. The attribute is a poor way of
185-
// detecting calls that may require it before argument lowering.
186-
if (HasCalls || hasNonSpillStackObjects())
172+
// detecting calls or stack objects that may require it before argument
173+
// lowering.
174+
if (HasCalls || HasStackObjects)
187175
FlatScratchInit = true;
188176
}
189177

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
2+
3+
; Make sure flat_scratch_init is set
4+
5+
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
6+
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
7+
define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
8+
%alloca = alloca i32, addrspace(5)
9+
%cast = addrspacecast i32 addrspace(5)* %alloca to i32*
10+
store volatile i32 0, i32* %cast
11+
ret void
12+
}
13+
14+
; TODO: Could optimize out in this case
15+
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
16+
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
17+
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
18+
%alloca = alloca i32, addrspace(5)
19+
store volatile i32 0, i32 addrspace(5)* %alloca
20+
ret void
21+
}
22+
23+
; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
24+
; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
25+
define amdgpu_kernel void @kernel_no_calls_no_stack() {
26+
ret void
27+
}

llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll

+28
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
22

3+
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
4+
35
declare i32 @llvm.amdgcn.workgroup.id.x() #0
46
declare i32 @llvm.amdgcn.workgroup.id.y() #0
57
declare i32 @llvm.amdgcn.workgroup.id.z() #0
@@ -250,6 +252,31 @@ define amdgpu_kernel void @use_is_private(i8* %ptr) #1 {
250252
ret void
251253
}
252254

255+
; HSA: define amdgpu_kernel void @use_alloca() #13 {
256+
define amdgpu_kernel void @use_alloca() #1 {
257+
%alloca = alloca i32, addrspace(5)
258+
store i32 0, i32 addrspace(5)* %alloca
259+
ret void
260+
}
261+
262+
; HSA: define amdgpu_kernel void @use_alloca_non_entry_block() #13 {
263+
define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
264+
entry:
265+
br label %bb
266+
267+
bb:
268+
%alloca = alloca i32, addrspace(5)
269+
store i32 0, i32 addrspace(5)* %alloca
270+
ret void
271+
}
272+
273+
; HSA: define void @use_alloca_func() #13 {
274+
define void @use_alloca_func() #1 {
275+
%alloca = alloca i32, addrspace(5)
276+
store i32 0, i32 addrspace(5)* %alloca
277+
ret void
278+
}
279+
253280
attributes #0 = { nounwind readnone speculatable }
254281
attributes #1 = { nounwind }
255282

@@ -266,3 +293,4 @@ attributes #1 = { nounwind }
266293
; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
267294
; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
268295
; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
296+
; HSA: attributes #13 = { nounwind "amdgpu-stack-objects" }

0 commit comments

Comments
 (0)