Skip to content

Commit 475cc90

Browse files
committed
Squashed commit of the following:
commit fef5786ca1d78288c3f442057bfeb148788417b2
Merge: 77daaf1c4b40 aadaa1cc32b8
Author: Rin Dobrescu <[email protected]>
Date: Tue Nov 7 10:27:03 2023 +0000

    Merge branch 'hoist_dups' of https://github.com/Rin18/llvm-project-fork into hoist_dups

commit 77daaf1c4b4009bc95803e8992291f64c9cf8b33
Author: Rin Dobrescu <[email protected]>
Date: Mon Nov 6 15:52:27 2023 +0000

    Remove unneeded line

commit aa469a15af8aeaca8d474ee7ab5b59ff20aec28c
Author: Rin Dobrescu <[email protected]>
Date: Mon Nov 6 15:08:11 2023 +0000

    Run clang-format on patch

commit 0d43d36e939e702e73b5927b4ae1c713ec0edec9
Author: Rin Dobrescu <[email protected]>
Date: Mon Nov 6 12:20:21 2023 +0000

    [MachineLICM][AArch64] Hoist COPY instructions with other uses in the loop

commit aadaa1cc32b88e682f0b9148683d3f6142e73f4e
Author: Rin Dobrescu <[email protected]>
Date: Mon Nov 6 15:52:27 2023 +0000

    Remove unneeded line

commit 27c7354c1bcc5aff81fed7623ff56d93af4201da
Author: Rin Dobrescu <[email protected]>
Date: Mon Nov 6 15:08:11 2023 +0000

    Run clang-format on patch

commit 34f9b474e3e10f670b6bb0359f2b81ff8275d5a1
Author: Rin Dobrescu <[email protected]>
Date: Mon Nov 6 12:20:21 2023 +0000

    [MachineLICM][AArch64] Hoist COPY instructions with other uses in the loop
1 parent c82cc62 commit 475cc90

25 files changed

+3240
-3545
lines changed

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,6 +1249,15 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
12491249
return false;
12501250
}
12511251

1252+
// If we have a COPY with other uses in the loop, hoist to allow the users to
1253+
// also be hoisted.
1254+
if (MI.isCopy() && IsLoopInvariantInst(MI, CurLoop) &&
1255+
MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual() &&
1256+
MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() &&
1257+
any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
1258+
[&](MachineInstr &UseMI) { return CurLoop->contains(&UseMI); }))
1259+
return true;
1260+
12521261
// High register pressure situation, only hoist if the instruction is going
12531262
// to be remat'ed.
12541263
if (!isTriviallyReMaterializable(MI) &&

llvm/test/CodeGen/AArch64/tbl-loops.ll

Lines changed: 63 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -52,19 +52,19 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
5252
; CHECK-NEXT: b.eq .LBB0_8
5353
; CHECK-NEXT: .LBB0_6: // %for.body.preheader1
5454
; CHECK-NEXT: movi d0, #0000000000000000
55-
; CHECK-NEXT: sub w10, w2, w10
5655
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
56+
; CHECK-NEXT: sub w10, w2, w10
57+
; CHECK-NEXT: fmov s1, w11
5758
; CHECK-NEXT: .LBB0_7: // %for.body
5859
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
59-
; CHECK-NEXT: fmov s2, w11
60-
; CHECK-NEXT: ldr s1, [x8], #4
61-
; CHECK-NEXT: fcmp s1, s2
62-
; CHECK-NEXT: fcsel s2, s2, s1, gt
63-
; CHECK-NEXT: fcmp s1, #0.0
64-
; CHECK-NEXT: fcsel s1, s0, s2, mi
60+
; CHECK-NEXT: ldr s2, [x8], #4
61+
; CHECK-NEXT: fcmp s2, s1
62+
; CHECK-NEXT: fcsel s3, s1, s2, gt
63+
; CHECK-NEXT: fcmp s2, #0.0
64+
; CHECK-NEXT: fcsel s2, s0, s3, mi
6565
; CHECK-NEXT: subs w10, w10, #1
66-
; CHECK-NEXT: fcvtzs w12, s1
67-
; CHECK-NEXT: strb w12, [x9], #1
66+
; CHECK-NEXT: fcvtzs w11, s2
67+
; CHECK-NEXT: strb w11, [x9], #1
6868
; CHECK-NEXT: b.ne .LBB0_7
6969
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
7070
; CHECK-NEXT: ret
@@ -165,25 +165,25 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
165165
; CHECK-NEXT: mov x9, x0
166166
; CHECK-NEXT: .LBB1_5: // %for.body.preheader1
167167
; CHECK-NEXT: movi d0, #0000000000000000
168-
; CHECK-NEXT: sub w10, w2, w10
169168
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
169+
; CHECK-NEXT: sub w10, w2, w10
170+
; CHECK-NEXT: fmov s1, w11
170171
; CHECK-NEXT: .LBB1_6: // %for.body
171172
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
172-
; CHECK-NEXT: ldp s1, s3, [x8], #8
173-
; CHECK-NEXT: fmov s2, w11
174-
; CHECK-NEXT: fcmp s1, s2
175-
; CHECK-NEXT: fcsel s4, s2, s1, gt
176-
; CHECK-NEXT: fcmp s1, #0.0
177-
; CHECK-NEXT: fcsel s1, s0, s4, mi
178-
; CHECK-NEXT: fcmp s3, s2
179-
; CHECK-NEXT: fcsel s2, s2, s3, gt
173+
; CHECK-NEXT: ldp s2, s3, [x8], #8
174+
; CHECK-NEXT: fcmp s2, s1
175+
; CHECK-NEXT: fcsel s4, s1, s2, gt
176+
; CHECK-NEXT: fcmp s2, #0.0
177+
; CHECK-NEXT: fcsel s2, s0, s4, mi
178+
; CHECK-NEXT: fcmp s3, s1
179+
; CHECK-NEXT: fcsel s4, s1, s3, gt
180180
; CHECK-NEXT: fcmp s3, #0.0
181-
; CHECK-NEXT: fcvtzs w12, s1
182-
; CHECK-NEXT: fcsel s2, s0, s2, mi
181+
; CHECK-NEXT: fcvtzs w11, s2
182+
; CHECK-NEXT: fcsel s3, s0, s4, mi
183183
; CHECK-NEXT: subs w10, w10, #1
184-
; CHECK-NEXT: strb w12, [x9]
185-
; CHECK-NEXT: fcvtzs w13, s2
186-
; CHECK-NEXT: strb w13, [x9, #1]
184+
; CHECK-NEXT: strb w11, [x9]
185+
; CHECK-NEXT: fcvtzs w12, s3
186+
; CHECK-NEXT: strb w12, [x9, #1]
187187
; CHECK-NEXT: add x9, x9, #2
188188
; CHECK-NEXT: b.ne .LBB1_6
189189
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -380,33 +380,33 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
380380
; CHECK-NEXT: mov x9, x0
381381
; CHECK-NEXT: .LBB2_7: // %for.body.preheader1
382382
; CHECK-NEXT: movi d0, #0000000000000000
383-
; CHECK-NEXT: sub w10, w2, w10
384383
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
384+
; CHECK-NEXT: sub w10, w2, w10
385+
; CHECK-NEXT: fmov s1, w11
385386
; CHECK-NEXT: .LBB2_8: // %for.body
386387
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
387-
; CHECK-NEXT: ldp s1, s3, [x8]
388-
; CHECK-NEXT: fmov s2, w11
389-
; CHECK-NEXT: fcmp s1, s2
390-
; CHECK-NEXT: fcsel s4, s2, s1, gt
391-
; CHECK-NEXT: fcmp s1, #0.0
392-
; CHECK-NEXT: fcsel s1, s0, s4, mi
393-
; CHECK-NEXT: fcmp s3, s2
394-
; CHECK-NEXT: fcsel s4, s2, s3, gt
388+
; CHECK-NEXT: ldp s2, s3, [x8]
389+
; CHECK-NEXT: fcmp s2, s1
390+
; CHECK-NEXT: fcsel s4, s1, s2, gt
391+
; CHECK-NEXT: fcmp s2, #0.0
392+
; CHECK-NEXT: fcsel s2, s0, s4, mi
393+
; CHECK-NEXT: fcmp s3, s1
394+
; CHECK-NEXT: fcsel s4, s1, s3, gt
395395
; CHECK-NEXT: fcmp s3, #0.0
396396
; CHECK-NEXT: ldr s3, [x8, #8]
397-
; CHECK-NEXT: fcvtzs w12, s1
397+
; CHECK-NEXT: fcvtzs w11, s2
398398
; CHECK-NEXT: add x8, x8, #12
399399
; CHECK-NEXT: fcsel s4, s0, s4, mi
400-
; CHECK-NEXT: fcmp s3, s2
401-
; CHECK-NEXT: strb w12, [x9]
402-
; CHECK-NEXT: fcsel s2, s2, s3, gt
400+
; CHECK-NEXT: fcmp s3, s1
401+
; CHECK-NEXT: strb w11, [x9]
402+
; CHECK-NEXT: fcsel s5, s1, s3, gt
403403
; CHECK-NEXT: fcmp s3, #0.0
404-
; CHECK-NEXT: fcvtzs w13, s4
405-
; CHECK-NEXT: fcsel s2, s0, s2, mi
404+
; CHECK-NEXT: fcvtzs w12, s4
405+
; CHECK-NEXT: fcsel s3, s0, s5, mi
406406
; CHECK-NEXT: subs w10, w10, #1
407-
; CHECK-NEXT: strb w13, [x9, #1]
408-
; CHECK-NEXT: fcvtzs w14, s2
409-
; CHECK-NEXT: strb w14, [x9, #2]
407+
; CHECK-NEXT: strb w12, [x9, #1]
408+
; CHECK-NEXT: fcvtzs w13, s3
409+
; CHECK-NEXT: strb w13, [x9, #2]
410410
; CHECK-NEXT: add x9, x9, #3
411411
; CHECK-NEXT: b.ne .LBB2_8
412412
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -549,39 +549,39 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
549549
; CHECK-NEXT: mov x9, x0
550550
; CHECK-NEXT: .LBB3_5: // %for.body.preheader1
551551
; CHECK-NEXT: movi d0, #0000000000000000
552-
; CHECK-NEXT: sub w10, w2, w10
553552
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
553+
; CHECK-NEXT: sub w10, w2, w10
554+
; CHECK-NEXT: fmov s1, w11
554555
; CHECK-NEXT: .LBB3_6: // %for.body
555556
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
556-
; CHECK-NEXT: ldp s1, s3, [x8]
557-
; CHECK-NEXT: fmov s2, w11
558-
; CHECK-NEXT: fcmp s1, s2
559-
; CHECK-NEXT: fcsel s4, s2, s1, gt
560-
; CHECK-NEXT: fcmp s1, #0.0
561-
; CHECK-NEXT: fcsel s1, s0, s4, mi
562-
; CHECK-NEXT: fcmp s3, s2
563-
; CHECK-NEXT: fcsel s4, s2, s3, gt
557+
; CHECK-NEXT: ldp s2, s3, [x8]
558+
; CHECK-NEXT: fcmp s2, s1
559+
; CHECK-NEXT: fcsel s4, s1, s2, gt
560+
; CHECK-NEXT: fcmp s2, #0.0
561+
; CHECK-NEXT: fcsel s2, s0, s4, mi
562+
; CHECK-NEXT: fcmp s3, s1
563+
; CHECK-NEXT: fcsel s4, s1, s3, gt
564564
; CHECK-NEXT: fcmp s3, #0.0
565565
; CHECK-NEXT: ldp s3, s5, [x8, #8]
566-
; CHECK-NEXT: fcvtzs w12, s1
566+
; CHECK-NEXT: fcvtzs w11, s2
567567
; CHECK-NEXT: add x8, x8, #16
568568
; CHECK-NEXT: fcsel s4, s0, s4, mi
569-
; CHECK-NEXT: fcmp s3, s2
570-
; CHECK-NEXT: strb w12, [x9]
571-
; CHECK-NEXT: fcsel s6, s2, s3, gt
569+
; CHECK-NEXT: fcmp s3, s1
570+
; CHECK-NEXT: strb w11, [x9]
571+
; CHECK-NEXT: fcsel s6, s1, s3, gt
572572
; CHECK-NEXT: fcmp s3, #0.0
573-
; CHECK-NEXT: fcvtzs w13, s4
573+
; CHECK-NEXT: fcvtzs w12, s4
574574
; CHECK-NEXT: fcsel s3, s0, s6, mi
575-
; CHECK-NEXT: fcmp s5, s2
576-
; CHECK-NEXT: strb w13, [x9, #1]
577-
; CHECK-NEXT: fcsel s2, s2, s5, gt
575+
; CHECK-NEXT: fcmp s5, s1
576+
; CHECK-NEXT: strb w12, [x9, #1]
577+
; CHECK-NEXT: fcsel s6, s1, s5, gt
578578
; CHECK-NEXT: fcmp s5, #0.0
579-
; CHECK-NEXT: fcvtzs w14, s3
580-
; CHECK-NEXT: fcsel s2, s0, s2, mi
579+
; CHECK-NEXT: fcvtzs w13, s3
580+
; CHECK-NEXT: fcsel s5, s0, s6, mi
581581
; CHECK-NEXT: subs w10, w10, #1
582-
; CHECK-NEXT: strb w14, [x9, #2]
583-
; CHECK-NEXT: fcvtzs w15, s2
584-
; CHECK-NEXT: strb w15, [x9, #3]
582+
; CHECK-NEXT: strb w13, [x9, #2]
583+
; CHECK-NEXT: fcvtzs w14, s5
584+
; CHECK-NEXT: strb w14, [x9, #3]
585585
; CHECK-NEXT: add x9, x9, #4
586586
; CHECK-NEXT: b.ne .LBB3_6
587587
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 28 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,25 +1447,24 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
14471447
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
14481448
; GFX90A: ; %bb.0: ; %main_body
14491449
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1450-
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
14511450
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1452-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1453-
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1451+
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
1452+
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
1453+
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
14541454
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
14551455
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
14561456
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14571457
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1458-
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
14591458
; GFX90A-NEXT: buffer_wbl2
14601459
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14611460
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14621461
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14631462
; GFX90A-NEXT: buffer_invl2
14641463
; GFX90A-NEXT: buffer_wbinvl1_vol
14651464
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1466-
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1465+
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
14671466
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1468-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1467+
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
14691468
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
14701469
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
14711470
; GFX90A-NEXT: s_endpgm
@@ -1522,15 +1521,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
15221521
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
15231522
; GFX90A: ; %bb.0: ; %main_body
15241523
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1525-
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
15261524
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1527-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1528-
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1525+
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
1526+
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
1527+
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
15291528
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
15301529
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
15311530
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15321531
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1533-
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
15341532
; GFX90A-NEXT: buffer_wbl2
15351533
; GFX90A-NEXT: s_waitcnt vmcnt(0)
15361534
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -1539,9 +1537,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
15391537
; GFX90A-NEXT: buffer_wbinvl1_vol
15401538
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
15411539
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1542-
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1540+
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
15431541
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1544-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1542+
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
15451543
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
15461544
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
15471545
; GFX90A-NEXT: s_endpgm
@@ -1724,23 +1722,22 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
17241722
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
17251723
; GFX90A: ; %bb.0: ; %main_body
17261724
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1727-
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
17281725
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1729-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1730-
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1726+
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
1727+
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
1728+
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
17311729
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
17321730
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
17331731
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17341732
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1735-
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
17361733
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17371734
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
17381735
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17391736
; GFX90A-NEXT: buffer_wbinvl1_vol
17401737
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1741-
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1738+
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
17421739
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1743-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1740+
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
17441741
; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
17451742
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
17461743
; GFX90A-NEXT: s_endpgm
@@ -1957,22 +1954,21 @@ main_body:
19571954
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
19581955
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
19591956
; GFX90A: ; %bb.0: ; %main_body
1960-
; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24
1961-
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
1957+
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
19621958
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1963-
; GFX90A-NEXT: v_mov_b32_e32 v0, s2
1964-
; GFX90A-NEXT: ds_read_b64 v[0:1], v0
1959+
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
1960+
; GFX90A-NEXT: ds_read_b64 v[0:1], v2
1961+
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
19651962
; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start
19661963
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
19671964
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1968-
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
1969-
; GFX90A-NEXT: v_mov_b32_e32 v4, s2
1965+
; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
19701966
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1971-
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
1967+
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
19721968
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1973-
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
1969+
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
19741970
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1975-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
1971+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
19761972
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
19771973
; GFX90A-NEXT: s_cbranch_execnz .LBB67_1
19781974
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1985,17 +1981,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
19851981
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
19861982
; GFX940-NEXT: v_mov_b32_e32 v0, s2
19871983
; GFX940-NEXT: ds_read_b64 v[0:1], v0
1984+
; GFX940-NEXT: v_mov_b32_e32 v2, s2
19881985
; GFX940-NEXT: .LBB67_1: ; %atomicrmw.start
19891986
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
19901987
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1991-
; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
1992-
; GFX940-NEXT: v_mov_b32_e32 v4, s2
1988+
; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
19931989
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1994-
; GFX940-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
1990+
; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
19951991
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1996-
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
1992+
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
19971993
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1998-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
1994+
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
19991995
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
20001996
; GFX940-NEXT: s_cbranch_execnz .LBB67_1
20011997
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end

0 commit comments

Comments
 (0)