
Commit 30d6c39

[AMDGPU] Add merging into S_BUFFER_LOAD_DWORDX8_IMM
Extend SILoadStoreOptimizer to merge into the DWORDX8 variant of S_BUFFER_LOAD. Merging into the DWORDX2 and DWORDX4 variants is already handled.

Differential Revision: https://reviews.llvm.org/D108909
1 parent 2f0750d
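
In effect, the pass can now combine a pair of scalar buffer loads whose widths sum to eight dwords into one 32-byte load. A minimal before/after sketch in MIR, in the style of the new test below (the vreg numbers and trailing subregister copies are illustrative, not literal pass output):

Before:

    %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
    %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))

After:

    %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s256), align 16)
    %1:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
    %2:sgpr_128 = COPY %3.sub4_sub5_sub6_sub7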

2 files changed: +176 −20 lines

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 43 additions & 20 deletions
@@ -303,6 +303,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 2;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
     return 4;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+    return 8;
   case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
   case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
@@ -372,6 +374,7 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return S_BUFFER_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -413,6 +416,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
   }
 }
@@ -463,6 +467,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
@@ -857,6 +862,7 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
     return false;
   case 2:
   case 4:
+  case 8:
     return true;
   }
 }
@@ -1523,45 +1529,62 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     case 4:
       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+    case 8:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
     }
   case MIMG:
-    assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
+    assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+           "No overlaps");
     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
   }
 }
 
 std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
+                                    const CombineInfo &Paired) {
 
-  if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
-    return std::make_pair(0, 0);
+  assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
 
   bool ReverseOrder;
   if (CI.InstClass == MIMG) {
-    assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
-           "No overlaps");
+    assert(
+        (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
+        "No overlaps");
     ReverseOrder = CI.DMask > Paired.DMask;
   } else
     ReverseOrder = CI.Offset > Paired.Offset;
 
-  static const unsigned Idxs[4][4] = {
-      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
-      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
-      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
-      {AMDGPU::sub3, 0, 0, 0},
-  };
   unsigned Idx0;
   unsigned Idx1;
 
-  assert(CI.Width >= 1 && CI.Width <= 3);
-  assert(Paired.Width >= 1 && Paired.Width <= 3);
+  if (CI.Width + Paired.Width > 4) {
+    assert(CI.Width == 4 && Paired.Width == 4);
 
-  if (ReverseOrder) {
-    Idx1 = Idxs[0][Paired.Width - 1];
-    Idx0 = Idxs[Paired.Width][CI.Width - 1];
+    if (ReverseOrder) {
+      Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
+    } else {
+      Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
+    }
   } else {
-    Idx0 = Idxs[0][CI.Width - 1];
-    Idx1 = Idxs[CI.Width][Paired.Width - 1];
+    static const unsigned Idxs[4][4] = {
+        {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+        {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+        {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+        {AMDGPU::sub3, 0, 0, 0},
+    };
+
+    assert(CI.Width >= 1 && CI.Width <= 3);
+    assert(Paired.Width >= 1 && Paired.Width <= 3);
+
+    if (ReverseOrder) {
+      Idx1 = Idxs[0][Paired.Width - 1];
+      Idx0 = Idxs[Paired.Width][CI.Width - 1];
+    } else {
+      Idx0 = Idxs[0][CI.Width - 1];
+      Idx1 = Idxs[CI.Width][Paired.Width - 1];
+    }
   }
 
   return std::make_pair(Idx0, Idx1);
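
Once the combined width exceeds four dwords, the new assert pins both halves to exactly four dwords each, so the subregister pair is always sub0_sub1_sub2_sub3 plus sub4_sub5_sub6_sub7; ReverseOrder only decides which of the two original loads receives which half. A hedged sketch of the reversed case, where the first load in program order has the higher offset (vreg numbers illustrative):

    %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
    %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))

merges to roughly:

    %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s256), align 16)
    %1:sgpr_128 = COPY %3.sub4_sub5_sub6_sub7
    %2:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3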
@@ -2134,7 +2157,7 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
       MachineBasicBlock::iterator NewMI =
           mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
-      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
+      OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
       break;
     }
     case BUFFER_LOAD: {
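
The OptimizeListAgain threshold tracks the new maximum width: a merge whose combined width is still below eight dwords is re-queued, since its result may merge further, while an eight-dword result is terminal for this pass. A hedged sketch of the cascade this enables, assuming four two-dword loads at consecutive offsets (vreg numbers illustrative):

    %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
    %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
    %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
    %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))

These first merge pairwise into two S_BUFFER_LOAD_DWORDX4_IMM at offsets 0 and 16 (each re-queued because 2 + 2 < 8), and a further pass folds those into a single S_BUFFER_LOAD_DWORDX8_IMM at offset 0, which is not re-queued because 4 + 4 < 8 no longer holds. The merge_s_buffer_load_x8_out_of_x2 test below exercises this pattern.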
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# CHECK-LABEL: name: merge_s_buffer_load_x2
+# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
+
+name: merge_s_buffer_load_x2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x4
+# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+name: merge_s_buffer_load_x4
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+    %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4)
+name: merge_s_buffer_load_x8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+    %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+    %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+    %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+    %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32))
+    %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_reordered
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4)
+name: merge_s_buffer_load_x8_reordered
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+    %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32))
+    %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+    %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+    %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+    %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 8)
+name: merge_s_buffer_load_x8_out_of_x2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
+    %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
+    %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
+    %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)
+name: merge_s_buffer_load_x8_out_of_x4
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+    %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_mixed
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)
+name: merge_s_buffer_load_x8_mixed
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+    %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
