Skip to content

Commit e9e9d1b

Browse files
authored
[AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 (#77927)
1 parent 8f7fdd9 commit e9e9d1b

9 files changed: 436 additions and 68 deletions.

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1501,7 +1501,6 @@ def FeatureISAVersion12 : FeatureSet<
15011501
FeaturePseudoScalarTrans,
15021502
FeatureHasRestrictedSOffset,
15031503
FeatureVGPRSingleUseHintInsts,
1504-
FeatureMADIntraFwdBug,
15051504
FeatureScalarDwordx3Loads]>;
15061505

15071506
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
22
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX10 %s
33
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX11 %s
4+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX12 %s
45

56
---
67
name: mad_u64_u32_vvv
@@ -18,6 +19,7 @@ body: |
1819
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
1920
; GFX10-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
2021
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]]
22+
;
2123
; GFX11-LABEL: name: mad_u64_u32_vvv
2224
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
2325
; GFX11-NEXT: {{ $}}
@@ -26,6 +28,15 @@ body: |
2628
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
2729
; GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
2830
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]]
31+
;
32+
; GFX12-LABEL: name: mad_u64_u32_vvv
33+
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
34+
; GFX12-NEXT: {{ $}}
35+
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
36+
; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
37+
; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
38+
; GFX12-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
39+
; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]]
2940
%0:vgpr(s32) = COPY $vgpr0
3041
%1:vgpr(s32) = COPY $vgpr1
3142
%2:vgpr(s32) = COPY $vgpr2
@@ -51,6 +62,7 @@ body: |
5162
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
5263
; GFX10-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
5364
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]]
65+
;
5466
; GFX11-LABEL: name: mad_i64_i32_vvv
5567
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
5668
; GFX11-NEXT: {{ $}}
@@ -59,6 +71,15 @@ body: |
5971
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
6072
; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
6173
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]]
74+
;
75+
; GFX12-LABEL: name: mad_i64_i32_vvv
76+
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
77+
; GFX12-NEXT: {{ $}}
78+
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
79+
; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
80+
; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
81+
; GFX12-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
82+
; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]]
6283
%0:vgpr(s32) = COPY $vgpr0
6384
%1:vgpr(s32) = COPY $vgpr1
6485
%2:vgpr(s32) = COPY $vgpr2

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -520,13 +520,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
520520
; GFX12W64-NEXT: .LBB1_2:
521521
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
522522
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
523-
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
524-
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
525523
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
526524
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
525+
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
526+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
527+
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
527528
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
528-
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
529-
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
530529
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
531530
; GFX12W64-NEXT: s_nop 0
532531
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -553,12 +552,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
553552
; GFX12W32-NEXT: .LBB1_2:
554553
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
555554
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
556-
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
557-
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
558555
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
559-
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
556+
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
557+
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
560558
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
561-
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
559+
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
560+
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
562561
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
563562
; GFX12W32-NEXT: s_nop 0
564563
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 18 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -572,13 +572,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
572572
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
573573
; GFX1264-NEXT: .LBB1_2:
574574
; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
575-
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
576-
; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0
577575
; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
576+
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
578577
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
579578
; GFX1264-NEXT: s_mov_b32 s6, -1
580579
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
581-
; GFX1264-NEXT: v_add_nc_u32_e32 v0, s0, v0
580+
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[0:1]
582581
; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
583582
; GFX1264-NEXT: s_nop 0
584583
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -610,13 +609,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
610609
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
611610
; GFX1232-NEXT: .LBB1_2:
612611
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1
612+
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
613613
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
614-
; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0
615-
; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
616614
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
617615
; GFX1232-NEXT: s_mov_b32 s6, -1
618616
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
619-
; GFX1232-NEXT: v_add_nc_u32_e32 v0, s0, v0
617+
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3]
620618
; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
621619
; GFX1232-NEXT: s_nop 0
622620
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1671,12 +1669,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
16711669
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
16721670
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
16731671
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
1674-
; GFX1264-NEXT: v_mul_lo_u32 v3, s1, v2
16751672
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
16761673
; GFX1264-NEXT: s_mov_b32 s6, -1
1674+
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16771675
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
1678-
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
1679-
; GFX1264-NEXT: v_add_nc_u32_e32 v1, v3, v1
1676+
; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
16801677
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
16811678
; GFX1264-NEXT: s_nop 0
16821679
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1712,12 +1709,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
17121709
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
17131710
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
17141711
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
1715-
; GFX1232-NEXT: v_mul_lo_u32 v3, s1, v2
17161712
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
17171713
; GFX1232-NEXT: s_mov_b32 s6, -1
1714+
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17181715
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
1719-
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
1720-
; GFX1232-NEXT: v_add_nc_u32_e32 v1, v3, v1
1716+
; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
17211717
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
17221718
; GFX1232-NEXT: s_nop 0
17231719
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3608,16 +3604,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
36083604
; GFX1264-NEXT: .LBB10_2:
36093605
; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
36103606
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
3611-
; GFX1264-NEXT: v_mul_lo_u32 v5, s1, v2
36123607
; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
36133608
; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
3614-
; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
36153609
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
36163610
; GFX1264-NEXT: s_mov_b32 s6, -1
3617-
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
3618-
; GFX1264-NEXT: v_add_nc_u32_e32 v1, v4, v5
3611+
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
3612+
; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
3613+
; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
36193614
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3
3620-
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2)
3615+
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3616+
; GFX1264-NEXT: v_mov_b32_e32 v1, v4
36213617
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
36223618
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
36233619
; GFX1264-NEXT: s_nop 0
@@ -3652,16 +3648,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
36523648
; GFX1232-NEXT: .LBB10_2:
36533649
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
36543650
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
3655-
; GFX1232-NEXT: v_mul_lo_u32 v5, s1, v2
36563651
; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
36573652
; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
3658-
; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
36593653
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
36603654
; GFX1232-NEXT: s_mov_b32 s6, -1
3661-
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
3662-
; GFX1232-NEXT: v_add_nc_u32_e32 v1, v4, v5
3655+
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
3656+
; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
3657+
; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
36633658
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
3664-
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2)
3659+
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3660+
; GFX1232-NEXT: v_mov_b32_e32 v1, v4
36653661
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
36663662
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
36673663
; GFX1232-NEXT: s_nop 0

llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -519,13 +519,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
519519
; GFX12W64-NEXT: .LBB1_2:
520520
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
521521
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
522-
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
523-
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
524522
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
525523
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
524+
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
525+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
526+
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
526527
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
527-
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
528-
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
529528
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
530529
; GFX12W64-NEXT: s_nop 0
531530
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -552,12 +551,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
552551
; GFX12W32-NEXT: .LBB1_2:
553552
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
554553
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
555-
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
556-
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
557554
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
558-
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
555+
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
556+
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
559557
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
560-
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
558+
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
559+
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
561560
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
562561
; GFX12W32-NEXT: s_nop 0
563562
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -535,13 +535,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
535535
; GFX12W64-NEXT: .LBB1_2:
536536
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
537537
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
538-
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
539-
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
540538
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
541539
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
540+
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
541+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
542+
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
542543
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
543-
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
544-
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
545544
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
546545
; GFX12W64-NEXT: s_nop 0
547546
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -568,12 +567,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
568567
; GFX12W32-NEXT: .LBB1_2:
569568
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
570569
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
571-
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
572-
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
573570
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
574-
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
571+
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
572+
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
575573
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
576-
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
574+
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
575+
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
577576
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
578577
; GFX12W32-NEXT: s_nop 0
579578
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

0 commit comments

Comments (0)