Skip to content

Commit be36812

Browse files
committed
[TargetLowering] Be more efficient in fp -> bf16 NaN conversions
We can avoid masking completely, as it is OK (and probably preferable) to bring over some of the existing NaN payload.
1 parent d17eade commit be36812

File tree

7 files changed

+3238
-4174
lines changed

7 files changed

+3238
-4174
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10948,12 +10948,10 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const {
1094810948
Op = expandRoundInexactToOdd(F32, Op, dl, DAG);
1094910949
Op = DAG.getNode(ISD::BITCAST, dl, I32, Op);
1095010950

10951-
// Extract the sign bit and exponent.
10952-
SDValue SignBitAndExponentField = DAG.getNode(
10953-
ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32));
10954-
// Set the quiet bit.
10955-
SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField,
10956-
DAG.getConstant(0x400000, dl, I32));
10951+
// Conversions should set NaN's quiet bit. This also prevents NaNs from
10952+
// turning into infinities.
10953+
SDValue NaN =
10954+
DAG.getNode(ISD::OR, dl, I32, Op, DAG.getConstant(0x400000, dl, I32));
1095710955

1095810956
// Factor in the contribution of the low 16 bits.
1095910957
SDValue One = DAG.getConstant(1, dl, I32);

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 3179 additions & 4083 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -790,8 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
790790
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
791791
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
792792
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
793-
; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
794-
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
793+
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
795794
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
796795
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
797796
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -806,9 +805,8 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
806805
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
807806
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
808807
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
809-
; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
810808
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
811-
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
809+
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
812810
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
813811
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
814812
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0

llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1524,9 +1524,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
15241524
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15251525
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
15261526
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1527-
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1527+
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
15281528
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
1529-
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
15301529
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
15311530
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
15321531
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1566,9 +1565,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
15661565
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15671566
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
15681567
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1569-
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1568+
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
15701569
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
1571-
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
15721570
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
15731571
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
15741572
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1608,9 +1606,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16081606
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
16091607
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
16101608
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1611-
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1609+
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
16121610
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
1613-
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
16141611
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
16151612
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
16161613
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1632,7 +1629,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16321629
; GFX10: ; %bb.0:
16331630
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
16341631
; GFX10-NEXT: v_mov_b32_e32 v0, 0
1635-
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
16361632
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
16371633
; GFX10-NEXT: s_and_b32 s0, s2, -4
16381634
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1650,7 +1646,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16501646
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
16511647
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
16521648
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
1653-
; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1649+
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
16541650
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
16551651
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
16561652
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1673,7 +1669,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16731669
; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
16741670
; GFX11: ; %bb.0:
16751671
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1676-
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
16771672
; GFX11-NEXT: v_mov_b32_e32 v0, 0
16781673
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
16791674
; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1694,7 +1689,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16941689
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
16951690
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
16961691
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
1697-
; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1692+
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
16981693
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
16991694
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
17001695
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1744,9 +1739,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
17441739
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
17451740
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
17461741
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1747-
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1742+
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
17481743
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
1749-
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
17501744
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17511745
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17521746
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1786,9 +1780,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
17861780
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
17871781
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
17881782
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1789-
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1783+
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
17901784
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
1791-
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
17921785
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
17931786
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
17941787
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1828,9 +1821,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18281821
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
18291822
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
18301823
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1831-
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1824+
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
18321825
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
1833-
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
18341826
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
18351827
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
18361828
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1854,7 +1846,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18541846
; GFX10: ; %bb.0:
18551847
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
18561848
; GFX10-NEXT: v_mov_b32_e32 v0, 0
1857-
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
18581849
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
18591850
; GFX10-NEXT: s_and_b32 s0, s2, -4
18601851
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1872,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18721863
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
18731864
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
18741865
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
1875-
; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1866+
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
18761867
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
18771868
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
18781869
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1895,7 +1886,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18951886
; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
18961887
; GFX11: ; %bb.0:
18971888
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1898-
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
18991889
; GFX11-NEXT: v_mov_b32_e32 v0, 0
19001890
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
19011891
; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1916,7 +1906,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
19161906
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
19171907
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
19181908
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
1919-
; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1909+
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
19201910
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
19211911
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
19221912
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo

llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -912,10 +912,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
912912
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
913913
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
914914
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
915-
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
916-
; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
915+
; DAGISEL-GFX11-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
917916
; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
918-
; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
917+
; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
919918
; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
920919
; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
921920
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -934,10 +933,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
934933
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
935934
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
936935
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
937-
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
938-
; DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
936+
; DAGISEL-GFX11-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
939937
; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
940-
; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
938+
; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
941939
; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
942940
; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
943941
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -956,10 +954,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
956954
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
957955
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
958956
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
959-
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
960-
; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
957+
; DAGISEL-GFX10-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
961958
; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
962-
; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
959+
; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
963960
; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
964961
; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
965962
; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -978,10 +975,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
978975
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
979976
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
980977
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
981-
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
982-
; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
978+
; DAGISEL-GFX10-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
983979
; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
984-
; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
980+
; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
985981
; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
986982
; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
987983
; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)

llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1413,9 +1413,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
14131413
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
14141414
; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
14151415
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
1416-
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v3
14171416
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
1418-
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
1417+
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
14191418
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14201419
; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
14211420
; VI-NEXT: v_and_b32_e32 v5, v4, v2
@@ -1451,9 +1450,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
14511450
; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
14521451
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
14531452
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
1454-
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
1453+
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
14551454
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6
1456-
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
14571455
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
14581456
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
14591457
; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1560,9 +1558,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
15601558
; VI-NEXT: v_add_f32_e32 v4, 4.0, v4
15611559
; VI-NEXT: v_bfe_u32 v6, v4, 16, 1
15621560
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4
1563-
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v4
15641561
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
1565-
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
1562+
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
15661563
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
15671564
; VI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
15681565
; VI-NEXT: v_and_b32_e32 v5, v3, v2
@@ -1597,9 +1594,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
15971594
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15981595
; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4
15991596
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
1600-
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
1597+
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
16011598
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6
1602-
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
16031599
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
16041600
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
16051601
; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

0 commit comments

Comments
 (0)