Skip to content

Commit 01c1c7a

Browse files
authored
[AMDGPU][CodeGen] Update support (soffset + offset) s_buffer_load's (#68302)
getBaseWithConstantOffset() is used for both scalar and non-scalar buffer loads. The difference between the s_load and load instructions is that an s_load instruction extends the 32-bit offset to 64 bits, so a 32-bit (address + offset) should not cause unsigned 32-bit integer wraparound, because the s_load instruction performs the addition in 64 bits.
1 parent fffbea3 commit 01c1c7a

File tree

4 files changed

+81
-4
lines changed

4 files changed

+81
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ using namespace MIPatternMatch;
1818

1919
std::pair<Register, unsigned>
2020
AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
21-
GISelKnownBits *KnownBits) {
21+
GISelKnownBits *KnownBits, bool CheckNUW) {
2222
MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
2323
if (Def->getOpcode() == TargetOpcode::G_CONSTANT) {
2424
unsigned Offset;
@@ -33,6 +33,12 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
3333

3434
int64_t Offset;
3535
if (Def->getOpcode() == TargetOpcode::G_ADD) {
36+
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
37+
// wraparound, because s_load instructions perform the addition in 64 bits.
38+
if (CheckNUW && !Def->getFlag(MachineInstr::NoUWrap)) {
39+
assert(MRI.getType(Reg).getScalarSizeInBits() == 32);
40+
return std::pair(Reg, 0);
41+
}
3642
// TODO: Handle G_OR used for add case
3743
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset)))
3844
return std::pair(Def->getOperand(1).getReg(), Offset);

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ namespace AMDGPU {
2525
/// Returns base register and constant offset.
2626
std::pair<Register, unsigned>
2727
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
28-
GISelKnownBits *KnownBits = nullptr);
28+
GISelKnownBits *KnownBits = nullptr,
29+
bool CheckNUW = false);
2930

3031
bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
3132
}

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5005,8 +5005,8 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
50055005
// an immediate offset.
50065006
Register SOffset;
50075007
unsigned Offset;
5008-
std::tie(SOffset, Offset) =
5009-
AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KB);
5008+
std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5009+
*MRI, Root.getReg(), KB, /*CheckNUW*/ true);
50105010
if (!SOffset)
50115011
return std::nullopt;
50125012

llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,76 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %ba
110110
ret void
111111
}
112112

113+
; GCN-LABEL: name: test_buffer_load_sgpr_plus_imm_offset_nuw
114+
; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
115+
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
116+
; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2
117+
; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3
118+
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr4
119+
; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
120+
; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 77,
121+
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
122+
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
123+
; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2
124+
; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
125+
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
126+
; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
127+
; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 77,
128+
define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset_nuw(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) #0 {
129+
%off = add nuw i32 %i, 77
130+
%v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0)
131+
store i32 %v, ptr addrspace(1) %out, align 4
132+
ret void
133+
}
134+
135+
; GCN-LABEL: name: test_buffer_load_sgpr_plus_imm_offset_nsw
136+
; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
137+
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
138+
; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2
139+
; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3
140+
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr4
141+
; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
142+
; SDAG-DAG: %[[ADD:.*]]:sreg_32 = nsw S_ADD_I32 %4, killed %11, implicit-def dead $scc
143+
; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[ADD]], 0,
144+
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
145+
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
146+
; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2
147+
; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
148+
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
149+
; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
150+
; GISEL-DAG: %[[ADD:.*]]:sreg_32 = nsw S_ADD_I32 %1, %10, implicit-def dead $scc
151+
; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[ADD]], 0,
152+
define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset_nsw(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) #0 {
153+
%off = add nsw i32 %i, 77
154+
%v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0)
155+
store i32 %v, ptr addrspace(1) %out, align 4
156+
ret void
157+
}
158+
159+
; GCN-LABEL: name: test_buffer_load_sgpr_plus_imm_offset_noflags
160+
; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
161+
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
162+
; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2
163+
; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3
164+
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr4
165+
; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
166+
; SDAG-DAG: %[[ADD:.*]]:sreg_32 = S_ADD_I32 %4, killed %11, implicit-def dead $scc
167+
; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[ADD]], 0,
168+
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
169+
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
170+
; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2
171+
; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
172+
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
173+
; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
174+
; GISEL-DAG: %[[ADD:.*]]:sreg_32 = S_ADD_I32 %1, %10, implicit-def dead $scc
175+
; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[ADD]], 0,
176+
define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset_noflags(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) #0 {
177+
%off = add i32 %i, 77
178+
%v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0)
179+
store i32 %v, ptr addrspace(1) %out, align 4
180+
ret void
181+
}
182+
113183
; GCN-LABEL: name: test_buffer_load_sgpr_or_imm_offset
114184
; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
115185
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1

0 commit comments

Comments
 (0)