@@ -1524,9 +1524,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
1524
1524
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1525
1525
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
1526
1526
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1527
- ; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1527
+ ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
1528
1528
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
1529
- ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
1530
1529
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
1531
1530
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
1532
1531
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1566,9 +1565,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
1566
1565
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1567
1566
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
1568
1567
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1569
- ; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1568
+ ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
1570
1569
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
1571
- ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
1572
1570
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
1573
1571
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
1574
1572
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1608,9 +1606,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
1608
1606
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1609
1607
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
1610
1608
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1611
- ; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1609
+ ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
1612
1610
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
1613
- ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
1614
1611
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
1615
1612
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
1616
1613
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1632,7 +1629,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
1632
1629
; GFX10: ; %bb.0:
1633
1630
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1634
1631
; GFX10-NEXT: v_mov_b32_e32 v0, 0
1635
- ; GFX10-NEXT: s_mov_b32 s5, 0xff800000
1636
1632
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1637
1633
; GFX10-NEXT: s_and_b32 s0, s2, -4
1638
1634
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1650,7 +1646,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
1650
1646
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1651
1647
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
1652
1648
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
1653
- ; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1649
+ ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
1654
1650
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
1655
1651
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
1656
1652
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1673,7 +1669,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
1673
1669
; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
1674
1670
; GFX11: ; %bb.0:
1675
1671
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1676
- ; GFX11-NEXT: s_mov_b32 s5, 0xff800000
1677
1672
; GFX11-NEXT: v_mov_b32_e32 v0, 0
1678
1673
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1679
1674
; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1694,7 +1689,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
1694
1689
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1695
1690
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
1696
1691
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
1697
- ; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1692
+ ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
1698
1693
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
1699
1694
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
1700
1695
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1744,9 +1739,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
1744
1739
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1745
1740
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
1746
1741
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1747
- ; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1742
+ ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
1748
1743
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
1749
- ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
1750
1744
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
1751
1745
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
1752
1746
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1786,9 +1780,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
1786
1780
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1787
1781
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
1788
1782
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1789
- ; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1783
+ ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
1790
1784
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
1791
- ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
1792
1785
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
1793
1786
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
1794
1787
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1828,9 +1821,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
1828
1821
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1829
1822
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
1830
1823
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1831
- ; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
1824
+ ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
1832
1825
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
1833
- ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
1834
1826
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
1835
1827
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
1836
1828
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1854,7 +1846,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
1854
1846
; GFX10: ; %bb.0:
1855
1847
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1856
1848
; GFX10-NEXT: v_mov_b32_e32 v0, 0
1857
- ; GFX10-NEXT: s_mov_b32 s5, 0xff800000
1858
1849
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1859
1850
; GFX10-NEXT: s_and_b32 s0, s2, -4
1860
1851
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1872,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
1872
1863
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1873
1864
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
1874
1865
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
1875
- ; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1866
+ ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
1876
1867
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
1877
1868
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
1878
1869
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1895,7 +1886,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
1895
1886
; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
1896
1887
; GFX11: ; %bb.0:
1897
1888
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1898
- ; GFX11-NEXT: s_mov_b32 s5, 0xff800000
1899
1889
; GFX11-NEXT: v_mov_b32_e32 v0, 0
1900
1890
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1901
1891
; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1916,7 +1906,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
1916
1906
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1917
1907
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
1918
1908
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
1919
- ; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
1909
+ ; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
1920
1910
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
1921
1911
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
1922
1912
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
0 commit comments