Skip to content

Commit 49c8245

Browse files
committed
AMDGPU/GlobalISel: Partially move constant selection to patterns
This still relies on the manual selection code for splitting 64-bit constants and for handling pointer-typed constants. The tablegen patterns did not cover all immediate types, so adding the missing ones also improves the SelectionDAG path as a side effect. This also reduces the differences between the outputs of the two selectors.
1 parent 761372e commit 49c8245

37 files changed

+1129
-1768
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,8 +398,10 @@ def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm">,
398398
def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
399399
GISDNodeXFormEquiv<NegateImm>;
400400

401-
def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
401+
def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastFPImm32">,
402402
GISDNodeXFormEquiv<bitcast_fpimm_to_i32>;
403+
def gi_bitcast_fpimm_to_i64 : GICustomOperandRenderer<"renderBitcastFPImm64">,
404+
GISDNodeXFormEquiv<bitcast_fpimm_to_i64>;
403405

404406
def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
405407
GISDNodeXFormEquiv<IMMPopCount>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2504,10 +2504,19 @@ bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
25042504
}
25052505

25062506
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2507+
if (selectImpl(I, *CoverageInfo))
2508+
return true;
2509+
2510+
// FIXME: Relying on manual selection for 64-bit case, and pointer typed
2511+
// constants.
25072512
MachineBasicBlock *BB = I.getParent();
25082513
MachineOperand &ImmOp = I.getOperand(1);
25092514
Register DstReg = I.getOperand(0).getReg();
2510-
unsigned Size = MRI->getType(DstReg).getSizeInBits();
2515+
LLT Ty = MRI->getType(DstReg);
2516+
unsigned Size = Ty.getSizeInBits();
2517+
assert((Size == 64 || Ty.isPointer()) &&
2518+
"patterns should have selected this");
2519+
25112520
bool IsFP = false;
25122521

25132522
// The AMDGPU backend only supports Imm operands and not CImm or FPImm.
@@ -5606,18 +5615,12 @@ void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
56065615
MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
56075616
}
56085617

5609-
void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5610-
const MachineInstr &MI,
5611-
int OpIdx) const {
5612-
assert(OpIdx == -1);
5613-
5618+
void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
5619+
const MachineInstr &MI,
5620+
int OpIdx) const {
56145621
const MachineOperand &Op = MI.getOperand(1);
5615-
if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5616-
MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5617-
else {
5618-
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5619-
MIB.addImm(Op.getCImm()->getSExtValue());
5620-
}
5622+
assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
5623+
MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
56215624
}
56225625

56235626
void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,17 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
333333
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
334334
int OpIdx) const;
335335

336-
void renderBitcastImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
337-
int OpIdx) const;
336+
void renderBitcastFPImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
337+
int OpIdx) const;
338+
339+
void renderBitcastFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
340+
int OpIdx) const {
341+
renderBitcastFPImm(MIB, MI, OpIdx);
342+
}
343+
void renderBitcastFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
344+
int OpIdx) const {
345+
renderBitcastFPImm(MIB, MI, OpIdx);
346+
}
338347

339348
void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
340349
int OpIdx) const;

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{
828828

829829
class VGPRImm <dag frag> : PatLeaf<frag, [{
830830
return isVGPRImm(N);
831-
}]>;
831+
}]> {
832+
let GISelPredicateCode = [{return true;}];
833+
}
832834

833835
def NegateImm : SDNodeXForm<imm, [{
834836
return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2163,6 +2163,11 @@ def : GCNPat <
21632163
(S_MOV_B32 $ga)
21642164
>;
21652165

2166+
def : GCNPat <
2167+
(VGPRImm<(i16 imm)>:$imm),
2168+
(V_MOV_B32_e32 imm:$imm)
2169+
>;
2170+
21662171
// FIXME: Workaround for ordering issue with peephole optimizer where
21672172
// a register class copy interferes with immediate folding. Should
21682173
// use s_mov_b32, which can be shrunk to s_movk_i32
@@ -2229,20 +2234,15 @@ def : GCNPat <
22292234
(S_MOV_B64 InlineImm64:$imm)
22302235
>;
22312236

2232-
// XXX - Should this use a s_cmp to set SCC?
2233-
22342237
// Set to sign-extended 64-bit value (true = -1, false = 0)
2235-
def : GCNPat <
2236-
(i1 imm:$imm),
2237-
(S_MOV_B64 (i64 (as_i64imm $imm)))
2238-
> {
2238+
// Set to sign-extended 64-bit value (true = -1, false = 0)
2239+
def : GCNPat <(i1 imm:$imm),
2240+
(S_MOV_B64 imm:$imm)> {
22392241
let WaveSizePredicate = isWave64;
22402242
}
22412243

2242-
def : GCNPat <
2243-
(i1 imm:$imm),
2244-
(S_MOV_B32 (i32 (as_i32imm $imm)))
2245-
> {
2244+
def : GCNPat <(i1 imm:$imm),
2245+
(S_MOV_B32 imm:$imm)> {
22462246
let WaveSizePredicate = isWave32;
22472247
}
22482248

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -501,8 +501,8 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
501501
; GFX7-NEXT: s_mov_b32 s7, 0xf000
502502
; GFX7-NEXT: s_mov_b64 s[4:5], 0
503503
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
504-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
505504
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
505+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
506506
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
507507
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
508508
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -710,8 +710,8 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
710710
; GFX7-NEXT: s_mov_b32 s7, 0xf000
711711
; GFX7-NEXT: s_mov_b64 s[4:5], 0
712712
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
713-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
714713
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
714+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
715715
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
716716
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
717717
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -936,7 +936,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
936936
; GFX7-NEXT: s_mov_b64 s[4:5], 0
937937
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
938938
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
939-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
939+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
940940
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
941941
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
942942
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1150,7 +1150,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
11501150
; GFX7-NEXT: s_mov_b64 s[4:5], 0
11511151
; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
11521152
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
1153-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
1153+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
11541154
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
11551155
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
11561156
; GFX7-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -501,8 +501,8 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
501501
; GFX7-NEXT: s_mov_b32 s7, 0xf000
502502
; GFX7-NEXT: s_mov_b64 s[4:5], 0
503503
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
504-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
505504
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
505+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
506506
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
507507
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
508508
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -710,8 +710,8 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
710710
; GFX7-NEXT: s_mov_b32 s7, 0xf000
711711
; GFX7-NEXT: s_mov_b64 s[4:5], 0
712712
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
713-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
714713
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
714+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
715715
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
716716
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
717717
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -936,7 +936,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
936936
; GFX7-NEXT: s_mov_b64 s[4:5], 0
937937
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
938938
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
939-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
939+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
940940
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
941941
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
942942
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1150,7 +1150,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
11501150
; GFX7-NEXT: s_mov_b64 s[4:5], 0
11511151
; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
11521152
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
1153-
; GFX7-NEXT: s_mov_b64 s[8:9], 0
1153+
; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
11541154
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
11551155
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
11561156
; GFX7-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -226,15 +226,16 @@ exit:
226226
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
227227
; GFX10-LABEL: single_lane_execution_attribute:
228228
; GFX10: ; %bb.0: ; %.entry
229+
; GFX10-NEXT: s_mov_b32 s6, 0
229230
; GFX10-NEXT: s_getpc_b64 s[4:5]
230-
; GFX10-NEXT: s_mov_b32 s12, 0
231-
; GFX10-NEXT: s_mov_b32 s13, -1
232-
; GFX10-NEXT: s_mov_b32 s2, s0
233-
; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[12:13]
234-
; GFX10-NEXT: s_mov_b32 s3, s12
231+
; GFX10-NEXT: s_mov_b32 s7, -1
232+
; GFX10-NEXT: s_mov_b32 s2, s1
233+
; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
234+
; GFX10-NEXT: s_mov_b32 s1, 0
235235
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
236-
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
237-
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
236+
; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1]
237+
; GFX10-NEXT: s_mov_b32 s3, -1
238+
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
238239
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
239240
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
240241
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
@@ -248,8 +249,8 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
248249
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
249250
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
250251
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
251-
; GFX10-NEXT: v_mov_b32_e32 v3, s12
252-
; GFX10-NEXT: v_mov_b32_e32 v4, s12
252+
; GFX10-NEXT: v_mov_b32_e32 v3, s1
253+
; GFX10-NEXT: v_mov_b32_e32 v4, s1
253254
; GFX10-NEXT: .LBB4_2: ; %.preheader
254255
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
255256
; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
@@ -261,17 +262,17 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
261262
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
262263
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
263264
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
264-
; GFX10-NEXT: s_mov_b32 s13, 0
265-
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
266-
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
265+
; GFX10-NEXT: s_mov_b32 s3, 0
266+
; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo
267+
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1
267268
; GFX10-NEXT: .LBB4_4: ; %Flow
268-
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s13
269+
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
269270
; GFX10-NEXT: s_cbranch_vccz .LBB4_6
270271
; GFX10-NEXT: ; %bb.5: ; %.19
271272
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
272273
; GFX10-NEXT: v_or_b32_e32 v3, 2, v1
273274
; GFX10-NEXT: .LBB4_6: ; %.22
274-
; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2
275+
; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2
275276
; GFX10-NEXT: buffer_store_dword v3, v0, s[8:11], 0 offen
276277
; GFX10-NEXT: s_endpgm
277278
.entry:

llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,12 +193,12 @@ bb12:
193193
define amdgpu_kernel void @break_loop(i32 %arg) {
194194
; CHECK-LABEL: break_loop:
195195
; CHECK: ; %bb.0: ; %bb
196-
; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0
197-
; CHECK-NEXT: s_mov_b64 s[0:1], 0
196+
; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0
197+
; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3
198198
; CHECK-NEXT: ; implicit-def: $vgpr1
199199
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
200-
; CHECK-NEXT: v_subrev_u32_e32 v0, s2, v0
201-
; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3
200+
; CHECK-NEXT: v_subrev_u32_e32 v0, s0, v0
201+
; CHECK-NEXT: s_mov_b64 s[0:1], 0
202202
; CHECK-NEXT: s_branch .LBB5_3
203203
; CHECK-NEXT: .LBB5_1: ; %bb4
204204
; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1

0 commit comments

Comments
 (0)