@@ -79,16 +79,17 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
79
79
; GFX12-LABEL: store_load_sindex_kernel:
80
80
; GFX12: ; %bb.0: ; %bb
81
81
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
82
+ ; GFX12-NEXT: v_mov_b32_e32 v1, 15
82
83
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
83
- ; GFX12-NEXT: s_and_b32 s1, s0, 15
84
+ ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
85
+ ; GFX12-NEXT: s_and_b32 s0, s0, 15
86
+ ; GFX12-NEXT: v_mov_b32_e32 v0, s1
84
87
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
85
- ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
86
88
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
87
- ; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
88
- ; GFX12-NEXT: s_add_co_i32 s0, s0, 4
89
- ; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
89
+ ; GFX12-NEXT: v_mov_b32_e32 v2, s0
90
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
90
91
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
91
- ; GFX12-NEXT: scratch_load_b32 v0, v1 , off offset:4 th:TH_LOAD_RT_NT
92
+ ; GFX12-NEXT: scratch_load_b32 v0, v2 , off offset:4 th:TH_LOAD_RT_NT
92
93
; GFX12-NEXT: s_waitcnt vmcnt(0)
93
94
; GFX12-NEXT: s_endpgm
94
95
bb:
@@ -170,8 +171,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
170
171
; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
171
172
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT
172
173
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
173
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, 4, v1
174
- ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
174
+ ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:128 th:TH_LOAD_RT_NT
175
175
; GFX12-NEXT: s_waitcnt vmcnt(0)
176
176
; GFX12-NEXT: s_endpgm
177
177
bb:
@@ -248,14 +248,13 @@ define void @store_load_vindex_foo(i32 %idx) {
248
248
; GFX12-LABEL: store_load_vindex_foo:
249
249
; GFX12: ; %bb.0: ; %bb
250
250
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251
- ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
252
- ; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
253
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
254
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, s32, v1
251
+ ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
255
252
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
256
- ; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
253
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
254
+ ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
255
+ ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT
257
256
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
258
- ; GFX12-NEXT: scratch_load_b32 v0, v0 , s32 th:TH_LOAD_RT_NT
257
+ ; GFX12-NEXT: scratch_load_b32 v0, v1 , s32 th:TH_LOAD_RT_NT
259
258
; GFX12-NEXT: s_waitcnt vmcnt(0)
260
259
; GFX12-NEXT: s_setpc_b64 s[30:31]
261
260
bb:
@@ -391,17 +390,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
391
390
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
392
391
; GFX12: ; %bb.0: ; %bb
393
392
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
394
- ; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
395
- ; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
396
- ; GFX12-NEXT: s_and_b32 s1, s0, 15
393
+ ; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
394
+ ; GFX12-NEXT: s_waitcnt vmcnt(0)
395
+ ; GFX12-NEXT: v_mov_b32_e32 v1, 15
396
+ ; GFX12-NEXT: s_waitcnt lgkmcnt(0)
397
+ ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
398
+ ; GFX12-NEXT: s_and_b32 s0, s0, 15
399
+ ; GFX12-NEXT: v_mov_b32_e32 v0, s1
397
400
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
398
- ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
399
401
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
400
- ; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
401
- ; GFX12-NEXT: s_addk_co_i32 s0, 0x104
402
- ; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
402
+ ; GFX12-NEXT: v_mov_b32_e32 v2, s0
403
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
403
404
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
404
- ; GFX12-NEXT: scratch_load_b32 v0, v1 , off offset:260 th:TH_LOAD_RT_NT
405
+ ; GFX12-NEXT: scratch_load_b32 v0, v2 , off offset:260 th:TH_LOAD_RT_NT
405
406
; GFX12-NEXT: s_waitcnt vmcnt(0)
406
407
; GFX12-NEXT: s_endpgm
407
408
bb:
@@ -490,13 +491,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
490
491
; GFX12: ; %bb.0: ; %bb
491
492
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
492
493
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
494
+ ; GFX12-NEXT: v_mov_b32_e32 v2, 15
493
495
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
494
496
; GFX12-NEXT: s_waitcnt vmcnt(0)
495
- ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
497
+ ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
496
498
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT
497
499
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
498
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x104, v1
499
- ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
500
+ ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:384 th:TH_LOAD_RT_NT
500
501
; GFX12-NEXT: s_waitcnt vmcnt(0)
501
502
; GFX12-NEXT: s_endpgm
502
503
bb:
@@ -589,16 +590,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
589
590
; GFX12-LABEL: store_load_vindex_small_offset_foo:
590
591
; GFX12: ; %bb.0: ; %bb
591
592
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592
- ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
593
- ; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
594
- ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x100
593
+ ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
594
+ ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
595
595
; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
596
596
; GFX12-NEXT: s_waitcnt vmcnt(0)
597
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
598
- ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
599
- ; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
597
+ ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
598
+ ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT
600
599
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
601
- ; GFX12-NEXT: scratch_load_b32 v0, v0 , s32 offset:256 th:TH_LOAD_RT_NT
600
+ ; GFX12-NEXT: scratch_load_b32 v0, v1 , s32 offset:256 th:TH_LOAD_RT_NT
602
601
; GFX12-NEXT: s_waitcnt vmcnt(0)
603
602
; GFX12-NEXT: s_setpc_b64 s[30:31]
604
603
bb:
@@ -697,17 +696,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
697
696
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
698
697
; GFX12: ; %bb.0: ; %bb
699
698
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24
700
- ; GFX12-NEXT: scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
701
- ; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
702
- ; GFX12-NEXT: s_and_b32 s1, s0, 15
699
+ ; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
700
+ ; GFX12-NEXT: s_waitcnt vmcnt(0)
701
+ ; GFX12-NEXT: v_mov_b32_e32 v1, 15
702
+ ; GFX12-NEXT: s_waitcnt lgkmcnt(0)
703
+ ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
704
+ ; GFX12-NEXT: s_and_b32 s0, s0, 15
705
+ ; GFX12-NEXT: v_mov_b32_e32 v0, s1
703
706
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
704
- ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
705
707
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
706
- ; GFX12-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
707
- ; GFX12-NEXT: s_addk_co_i32 s0, 0x4004
708
- ; GFX12-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
708
+ ; GFX12-NEXT: v_mov_b32_e32 v2, s0
709
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
709
710
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
710
- ; GFX12-NEXT: scratch_load_b32 v0, v1 , off offset:16388 th:TH_LOAD_RT_NT
711
+ ; GFX12-NEXT: scratch_load_b32 v0, v2 , off offset:16388 th:TH_LOAD_RT_NT
711
712
; GFX12-NEXT: s_waitcnt vmcnt(0)
712
713
; GFX12-NEXT: s_endpgm
713
714
bb:
@@ -798,13 +799,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
798
799
; GFX12: ; %bb.0: ; %bb
799
800
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
800
801
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
802
+ ; GFX12-NEXT: v_mov_b32_e32 v2, 15
801
803
; GFX12-NEXT: scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
802
804
; GFX12-NEXT: s_waitcnt vmcnt(0)
803
- ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
805
+ ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
804
806
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT
805
807
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
806
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1
807
- ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
808
+ ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16512 th:TH_LOAD_RT_NT
808
809
; GFX12-NEXT: s_waitcnt vmcnt(0)
809
810
; GFX12-NEXT: s_endpgm
810
811
bb:
@@ -899,16 +900,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
899
900
; GFX12-LABEL: store_load_vindex_large_offset_foo:
900
901
; GFX12: ; %bb.0: ; %bb
901
902
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
902
- ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
903
- ; GFX12-NEXT: v_and_b32_e32 v0, 15, v0
904
- ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
903
+ ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
904
+ ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
905
905
; GFX12-NEXT: scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
906
906
; GFX12-NEXT: s_waitcnt vmcnt(0)
907
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
908
- ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
909
- ; GFX12-NEXT: scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
907
+ ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
908
+ ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT
910
909
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
911
- ; GFX12-NEXT: scratch_load_b32 v0, v0 , s32 offset:16384 th:TH_LOAD_RT_NT
910
+ ; GFX12-NEXT: scratch_load_b32 v0, v1 , s32 offset:16384 th:TH_LOAD_RT_NT
912
911
; GFX12-NEXT: s_waitcnt vmcnt(0)
913
912
; GFX12-NEXT: s_setpc_b64 s[30:31]
914
913
bb:
@@ -1154,11 +1153,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
1154
1153
; GFX12-NEXT: v_mov_b32_e32 v1, 15
1155
1154
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
1156
1155
; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2
1157
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1158
- ; GFX12-NEXT: v_add_nc_u32_e32 v0, 4, v0
1159
- ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
1156
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT
1160
1157
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
1161
- ; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
1158
+ ; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT
1162
1159
; GFX12-NEXT: s_waitcnt vmcnt(0)
1163
1160
; GFX12-NEXT: s_endpgm
1164
1161
bb:
0 commit comments