Skip to content

Commit c111dc7

Browse files
jayfoad and rampitec authored
[AMDGPU] Allow potentially negative flat scratch offsets on GFX12 (#78193)
#70634 has disabled use of potentially negative scratch offsets, but we can use it on GFX12. --------- Co-authored-by: Stanislav Mekhanoshin <[email protected]>
1 parent 172dbdf commit c111dc7

9 files changed

+185
-190
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1159,7 +1159,7 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
11591159

11601160
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
11611161
// values.
1162-
if (AMDGPU::isGFX12Plus(*Subtarget))
1162+
if (Subtarget->hasSignedScratchOffsets())
11631163
return true;
11641164

11651165
auto LHS = Addr.getOperand(0);
@@ -1184,6 +1184,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
11841184
if (isNoUnsignedWrap(Addr))
11851185
return true;
11861186

1187+
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1188+
// values.
1189+
if (Subtarget->hasSignedScratchOffsets())
1190+
return true;
1191+
11871192
auto LHS = Addr.getOperand(0);
11881193
auto RHS = Addr.getOperand(1);
11891194
return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
@@ -1192,6 +1197,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
11921197
// Check address value in SGPR/VGPR are legal for flat scratch in the form
11931198
// of: SGPR + VGPR + Imm.
11941199
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1200+
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1201+
// values.
1202+
if (Subtarget->hasSignedScratchOffsets())
1203+
return true;
1204+
11951205
auto Base = Addr.getOperand(0);
11961206
auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
11971207
// If the immediate offset is negative and within certain range, the base

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4557,7 +4557,7 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
45574557

45584558
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
45594559
// values.
4560-
if (AMDGPU::isGFX12Plus(STI))
4560+
if (STI.hasSignedScratchOffsets())
45614561
return true;
45624562

45634563
Register LHS = AddrMI->getOperand(1).getReg();
@@ -4586,6 +4586,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
45864586
if (isNoUnsignedWrap(AddrMI))
45874587
return true;
45884588

4589+
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4590+
// values.
4591+
if (STI.hasSignedScratchOffsets())
4592+
return true;
4593+
45894594
Register LHS = AddrMI->getOperand(1).getReg();
45904595
Register RHS = AddrMI->getOperand(2).getReg();
45914596
return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
@@ -4595,6 +4600,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
45954600
// of: SGPR + VGPR + Imm.
45964601
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
45974602
Register Addr) const {
4603+
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4604+
// values.
4605+
if (STI.hasSignedScratchOffsets())
4606+
return true;
4607+
45984608
MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
45994609
Register Base = AddrMI->getOperand(1).getReg();
46004610
std::optional<DefinitionAndSourceRegister> BaseDef =

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12741274
/// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
12751275
bool hasRrWGMode() const { return getGeneration() >= GFX12; }
12761276

1277+
/// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1278+
/// values.
1279+
// Implemented as a generation check: true when getGeneration() compares
// greater than or equal to GFX12.
bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1280+
12771281
/// \returns SGPR allocation granularity supported by the subtarget.
12781282
unsigned getSGPRAllocGranule() const {
12791283
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 51 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -79,16 +79,17 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
7979
; GFX12-LABEL: store_load_sindex_kernel:
8080
; GFX12: ; %bb.0: ; %bb
8181
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
82+
; GFX12-NEXT: v_mov_b32_e32 v1, 15
8283
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
83-
; GFX12-NEXT: s_and_b32 s1, s0, 15
84+
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
85+
; GFX12-NEXT: s_and_b32 s0, s0, 15
86+
; GFX12-NEXT: v_mov_b32_e32 v0, s1
8487
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
85-
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
8688
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
87-
; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
88-
; GFX12-NEXT: s_add_co_i32 s0, s0, 4
89-
; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
89+
; GFX12-NEXT: v_mov_b32_e32 v2, s0
90+
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
9091
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
91-
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT
92+
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:4 th:TH_LOAD_RT_NT
9293
; GFX12-NEXT: s_waitcnt vmcnt(0)
9394
; GFX12-NEXT: s_endpgm
9495
bb:
@@ -170,8 +171,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
170171
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
171172
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT
172173
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
173-
; GFX12-NEXT: v_add_nc_u32_e32 v1, 4, v1
174-
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
174+
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:128 th:TH_LOAD_RT_NT
175175
; GFX12-NEXT: s_waitcnt vmcnt(0)
176176
; GFX12-NEXT: s_endpgm
177177
bb:
@@ -248,14 +248,13 @@ define void @store_load_vindex_foo(i32 %idx) {
248248
; GFX12-LABEL: store_load_vindex_foo:
249249
; GFX12: ; %bb.0: ; %bb
250250
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251-
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
252-
; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
253-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
254-
; GFX12-NEXT: v_add_nc_u32_e32 v1, s32, v1
251+
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
255252
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
256-
; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
253+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
254+
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
255+
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT
257256
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
258-
; GFX12-NEXT: scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT
257+
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
259258
; GFX12-NEXT: s_waitcnt vmcnt(0)
260259
; GFX12-NEXT: s_setpc_b64 s[30:31]
261260
bb:
@@ -391,17 +390,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
391390
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
392391
; GFX12: ; %bb.0: ; %bb
393392
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
394-
; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
395-
; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
396-
; GFX12-NEXT: s_and_b32 s1, s0, 15
393+
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
394+
; GFX12-NEXT: s_waitcnt vmcnt(0)
395+
; GFX12-NEXT: v_mov_b32_e32 v1, 15
396+
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
397+
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
398+
; GFX12-NEXT: s_and_b32 s0, s0, 15
399+
; GFX12-NEXT: v_mov_b32_e32 v0, s1
397400
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
398-
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
399401
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
400-
; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
401-
; GFX12-NEXT: s_addk_co_i32 s0, 0x104
402-
; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
402+
; GFX12-NEXT: v_mov_b32_e32 v2, s0
403+
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
403404
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
404-
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT
405+
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:260 th:TH_LOAD_RT_NT
405406
; GFX12-NEXT: s_waitcnt vmcnt(0)
406407
; GFX12-NEXT: s_endpgm
407408
bb:
@@ -490,13 +491,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
490491
; GFX12: ; %bb.0: ; %bb
491492
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
492493
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
494+
; GFX12-NEXT: v_mov_b32_e32 v2, 15
493495
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
494496
; GFX12-NEXT: s_waitcnt vmcnt(0)
495-
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
497+
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
496498
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT
497499
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
498-
; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x104, v1
499-
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
500+
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:384 th:TH_LOAD_RT_NT
500501
; GFX12-NEXT: s_waitcnt vmcnt(0)
501502
; GFX12-NEXT: s_endpgm
502503
bb:
@@ -589,16 +590,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
589590
; GFX12-LABEL: store_load_vindex_small_offset_foo:
590591
; GFX12: ; %bb.0: ; %bb
591592
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592-
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
593-
; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
594-
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100
593+
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
594+
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
595595
; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
596596
; GFX12-NEXT: s_waitcnt vmcnt(0)
597-
; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
598-
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
599-
; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
597+
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
598+
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT
600599
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
601-
; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT
600+
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
602601
; GFX12-NEXT: s_waitcnt vmcnt(0)
603602
; GFX12-NEXT: s_setpc_b64 s[30:31]
604603
bb:
@@ -697,17 +696,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
697696
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
698697
; GFX12: ; %bb.0: ; %bb
699698
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
700-
; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
701-
; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
702-
; GFX12-NEXT: s_and_b32 s1, s0, 15
699+
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
700+
; GFX12-NEXT: s_waitcnt vmcnt(0)
701+
; GFX12-NEXT: v_mov_b32_e32 v1, 15
702+
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
703+
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
704+
; GFX12-NEXT: s_and_b32 s0, s0, 15
705+
; GFX12-NEXT: v_mov_b32_e32 v0, s1
703706
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
704-
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
705707
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
706-
; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
707-
; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
708-
; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
708+
; GFX12-NEXT: v_mov_b32_e32 v2, s0
709+
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
709710
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
710-
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT
711+
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16388 th:TH_LOAD_RT_NT
711712
; GFX12-NEXT: s_waitcnt vmcnt(0)
712713
; GFX12-NEXT: s_endpgm
713714
bb:
@@ -798,13 +799,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
798799
; GFX12: ; %bb.0: ; %bb
799800
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
800801
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
802+
; GFX12-NEXT: v_mov_b32_e32 v2, 15
801803
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
802804
; GFX12-NEXT: s_waitcnt vmcnt(0)
803-
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
805+
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
804806
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT
805807
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
806-
; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1
807-
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
808+
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16512 th:TH_LOAD_RT_NT
808809
; GFX12-NEXT: s_waitcnt vmcnt(0)
809810
; GFX12-NEXT: s_endpgm
810811
bb:
@@ -899,16 +900,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
899900
; GFX12-LABEL: store_load_vindex_large_offset_foo:
900901
; GFX12: ; %bb.0: ; %bb
901902
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
902-
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
903-
; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
904-
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
903+
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
904+
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
905905
; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
906906
; GFX12-NEXT: s_waitcnt vmcnt(0)
907-
; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
908-
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
909-
; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
907+
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
908+
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT
910909
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
911-
; GFX12-NEXT: scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT
910+
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
912911
; GFX12-NEXT: s_waitcnt vmcnt(0)
913912
; GFX12-NEXT: s_setpc_b64 s[30:31]
914913
bb:
@@ -1154,11 +1153,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
11541153
; GFX12-NEXT: v_mov_b32_e32 v1, 15
11551154
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
11561155
; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2
1157-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1158-
; GFX12-NEXT: v_add_nc_u32_e32 v0, 4, v0
1159-
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
1156+
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT
11601157
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
1161-
; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
1158+
; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT
11621159
; GFX12-NEXT: s_waitcnt vmcnt(0)
11631160
; GFX12-NEXT: s_endpgm
11641161
bb:

0 commit comments

Comments
 (0)