Skip to content

Commit 16945bc

Browse files
authored
[AMDGPU] Don't send DEALLOC_VGPRs after calls (llvm#77439)
Calls do not have to wait for VsCnt, so after they return there might still be scratch stores in progress. It's important that we don't send the DEALLOC_VGPRS message in that case, since that might release the VGPRs and scratch allocation before those stores are complete.
1 parent 79889fe commit 16945bc

File tree

6 files changed

+35
-48
lines changed

6 files changed

+35
-48
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ class WaitcntBrackets {
292292
VgprVmemTypes[GprNo] = 0;
293293
}
294294

295-
void setNonKernelFunctionInitialState() {
295+
void setStateOnFunctionEntryOrReturn() {
296296
setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
297297
PendingEvents |= WaitEventMaskForInst[VS_CNT];
298298
}
@@ -1487,6 +1487,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
14871487
if (callWaitsOnFunctionReturn(Inst)) {
14881488
// Act as a wait on everything
14891489
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
1490+
ScoreBrackets->setStateOnFunctionEntryOrReturn();
14901491
} else {
14911492
// May need to wait for anything.
14921493
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
@@ -1879,7 +1880,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
18791880

18801881
auto NonKernelInitialState =
18811882
std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1882-
NonKernelInitialState->setNonKernelFunctionInitialState();
1883+
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
18831884
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
18841885

18851886
Modified = true;

llvm/test/CodeGen/AMDGPU/call-argument-types.ll

-2
Original file line numberDiff line numberDiff line change
@@ -4462,8 +4462,6 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
44624462
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
44634463
; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc
44644464
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4465-
; GFX11-NEXT: s_nop 0
4466-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
44674465
; GFX11-NEXT: s_endpgm
44684466
;
44694467
; HSA-LABEL: test_call_external_i32_func_i32_imm:

llvm/test/CodeGen/AMDGPU/calling-conventions.ll

-4
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,6 @@ define amdgpu_kernel void @call_coldcc() #0 {
167167
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
168168
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
169169
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
170-
; GFX11-NEXT: s_nop 0
171-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
172170
; GFX11-NEXT: s_endpgm
173171
%val = call float @coldcc(float 1.0)
174172
store float %val, ptr addrspace(1) undef
@@ -231,8 +229,6 @@ define amdgpu_kernel void @call_fastcc() #0 {
231229
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
232230
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
233231
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
234-
; GFX11-NEXT: s_nop 0
235-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
236232
; GFX11-NEXT: s_endpgm
237233
%val = call float @fastcc(float 1.0)
238234
store float %val, ptr addrspace(1) undef

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll

-24
Original file line numberDiff line numberDiff line change
@@ -626,8 +626,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
626626
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
627627
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
628628
; GFX1164-NEXT: .LBB1_4:
629-
; GFX1164-NEXT: s_nop 0
630-
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
631629
; GFX1164-NEXT: s_endpgm
632630
;
633631
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -675,8 +673,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
675673
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
676674
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
677675
; GFX1132-NEXT: .LBB1_4:
678-
; GFX1132-NEXT: s_nop 0
679-
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
680676
; GFX1132-NEXT: s_endpgm
681677
;
682678
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -988,8 +984,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
988984
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
989985
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
990986
; GFX1164-DPP-NEXT: .LBB1_2:
991-
; GFX1164-DPP-NEXT: s_nop 0
992-
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
993987
; GFX1164-DPP-NEXT: s_endpgm
994988
;
995989
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -1051,8 +1045,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
10511045
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
10521046
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
10531047
; GFX1132-DPP-NEXT: .LBB1_2:
1054-
; GFX1132-DPP-NEXT: s_nop 0
1055-
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10561048
; GFX1132-DPP-NEXT: s_endpgm
10571049
%divValue = call float @div.float.value()
10581050
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -3042,8 +3034,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
30423034
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
30433035
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
30443036
; GFX1164-NEXT: .LBB5_4:
3045-
; GFX1164-NEXT: s_nop 0
3046-
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
30473037
; GFX1164-NEXT: s_endpgm
30483038
;
30493039
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3091,8 +3081,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
30913081
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
30923082
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
30933083
; GFX1132-NEXT: .LBB5_4:
3094-
; GFX1132-NEXT: s_nop 0
3095-
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
30963084
; GFX1132-NEXT: s_endpgm
30973085
;
30983086
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3404,8 +3392,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
34043392
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
34053393
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
34063394
; GFX1164-DPP-NEXT: .LBB5_2:
3407-
; GFX1164-DPP-NEXT: s_nop 0
3408-
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
34093395
; GFX1164-DPP-NEXT: s_endpgm
34103396
;
34113397
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -3467,8 +3453,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
34673453
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
34683454
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
34693455
; GFX1132-DPP-NEXT: .LBB5_2:
3470-
; GFX1132-DPP-NEXT: s_nop 0
3471-
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
34723456
; GFX1132-DPP-NEXT: s_endpgm
34733457
%divValue = call float @div.float.value()
34743458
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
@@ -3770,8 +3754,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
37703754
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
37713755
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
37723756
; GFX1164-NEXT: .LBB6_4:
3773-
; GFX1164-NEXT: s_nop 0
3774-
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
37753757
; GFX1164-NEXT: s_endpgm
37763758
;
37773759
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -3819,8 +3801,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
38193801
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
38203802
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
38213803
; GFX1132-NEXT: .LBB6_4:
3822-
; GFX1132-NEXT: s_nop 0
3823-
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
38243804
; GFX1132-NEXT: s_endpgm
38253805
;
38263806
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -4132,8 +4112,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
41324112
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
41334113
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
41344114
; GFX1164-DPP-NEXT: .LBB6_2:
4135-
; GFX1164-DPP-NEXT: s_nop 0
4136-
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
41374115
; GFX1164-DPP-NEXT: s_endpgm
41384116
;
41394117
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -4195,8 +4173,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
41954173
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
41964174
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
41974175
; GFX1132-DPP-NEXT: .LBB6_2:
4198-
; GFX1132-DPP-NEXT: s_nop 0
4199-
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
42004176
; GFX1132-DPP-NEXT: s_endpgm
42014177
%divValue = call float @div.float.value()
42024178
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic

llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll

-16
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,6 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
300300
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
301301
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
302302
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
303-
; GFX11-NEXT: s_nop 0
304-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
305303
; GFX11-NEXT: s_endpgm
306304
entry:
307305
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -930,8 +928,6 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
930928
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
931929
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
932930
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
933-
; GFX11-NEXT: s_nop 0
934-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
935931
; GFX11-NEXT: s_endpgm
936932
entry:
937933
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1294,8 +1290,6 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
12941290
; GFX11-NEXT: s_waitcnt vmcnt(0)
12951291
; GFX11-NEXT: v_add3_u32 v0, v3, v1, v0
12961292
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
1297-
; GFX11-NEXT: s_nop 0
1298-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
12991293
; GFX11-NEXT: s_endpgm
13001294
entry:
13011295
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1543,8 +1537,6 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
15431537
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
15441538
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
15451539
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35]
1546-
; GFX11-NEXT: s_nop 0
1547-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
15481540
; GFX11-NEXT: s_endpgm
15491541
entry:
15501542
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1753,8 +1745,6 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
17531745
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
17541746
; GFX11-NEXT: v_add3_u32 v0, v2, v0, v3
17551747
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
1756-
; GFX11-NEXT: s_nop 0
1757-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
17581748
; GFX11-NEXT: s_endpgm
17591749
entry:
17601750
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -2017,8 +2007,6 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
20172007
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
20182008
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
20192009
; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37]
2020-
; GFX11-NEXT: s_nop 0
2021-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
20222010
; GFX11-NEXT: s_endpgm
20232011
ptr addrspace(1) %buffer2) {
20242012
entry:
@@ -2349,8 +2337,6 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
23492337
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
23502338
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
23512339
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
2352-
; GFX11-NEXT: s_nop 0
2353-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
23542340
; GFX11-NEXT: s_endpgm
23552341
entry:
23562342
%call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -2553,8 +2539,6 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
25532539
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
25542540
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
25552541
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35]
2556-
; GFX11-NEXT: s_nop 0
2557-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
25582542
; GFX11-NEXT: s_endpgm
25592543
entry:
25602544
%call = tail call i64 @_Z13get_global_idj(i32 0) #2

llvm/test/CodeGen/AMDGPU/release-vgprs.mir

+32
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
define amdgpu_ps void @global_atomic() { ret void }
2323
define amdgpu_ps void @image_atomic() { ret void }
2424
define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
25+
define amdgpu_cs void @with_calls() { ret void }
26+
define fastcc void @with_tail_calls() { ret void }
2527
...
2628

2729
---
@@ -565,3 +567,33 @@ body: |
565567
S_WAITCNT_VSCNT undef $sgpr_null, 0
566568
S_ENDPGM 0
567569
...
570+
571+
---
572+
name: with_calls
573+
frameInfo:
574+
hasCalls: true
575+
body: |
576+
bb.0:
577+
; Make sure we don't send DEALLOC_VGPRS after a call, since there might be
578+
; scratch stores still in progress.
579+
; CHECK-LABEL: name: with_calls
580+
; CHECK-NOT: S_SENDMSG 3
581+
; CHECK: S_ENDPGM 0
582+
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
583+
$sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
584+
S_ENDPGM 0
585+
...
586+
587+
---
588+
name: with_tail_calls
589+
frameInfo:
590+
hasCalls: true
591+
body: |
592+
bb.0:
593+
; Make sure we don't send DEALLOC_VGPRS when there's a tail call, since the
594+
; only valid action after DEALLOC_VGPRS is to terminate the wave.
595+
; CHECK-LABEL: name: with_tail_calls
596+
; CHECK-NOT: S_SENDMSG 3
597+
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
598+
SI_TCRETURN undef renamable $sgpr4_sgpr5, @with_tail_calls, 0, csr_amdgpu
599+
...

0 commit comments

Comments
 (0)