Skip to content

Commit 86f9117

Browse files
committed
AMDGPU: Don't report 2-byte alignment as fast
This is apparently worse than 1-byte alignment. This does not attempt to decompose 2-byte aligned wide stores, but will stop trying to produce them. Also fix a bug in the LoadStoreVectorizer, which was decreasing the alignment and vectorizing stack accesses. It was assuming a stack object was an alloca that could have its base alignment changed, which is not true if the pointer is derived from a function argument.
1 parent b2c44de commit 86f9117

File tree

9 files changed

+695
-29
lines changed

9 files changed

+695
-29
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1251,9 +1251,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
12511251
// If we have a uniform constant load, it still requires using a slow
12521252
// buffer instruction if unaligned.
12531253
if (IsFast) {
1254+
// Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1255+
// 2-byte alignment is worse than 1 unless doing a 2-byte access.
12541256
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
12551257
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1256-
(Align % 4 == 0) : true;
1258+
Align >= 4 : Align != 2;
12571259
}
12581260

12591261
return true;

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,8 +1028,10 @@ bool Vectorizer::vectorizeStoreChain(
10281028
unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
10291029
StackAdjustedAlignment,
10301030
DL, S0, nullptr, &DT);
1031-
if (NewAlign != 0)
1031+
if (NewAlign >= Alignment.value())
10321032
Alignment = Align(NewAlign);
1033+
else
1034+
return false;
10331035
}
10341036

10351037
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment.value(), AS)) {
@@ -1168,8 +1170,12 @@ bool Vectorizer::vectorizeLoadChain(
11681170
vectorizeLoadChain(Chains.second, InstructionsProcessed);
11691171
}
11701172

1171-
Alignment = getOrEnforceKnownAlignment(
1172-
L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
1173+
unsigned NewAlign = getOrEnforceKnownAlignment(
1174+
L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
1175+
if (NewAlign >= Alignment)
1176+
Alignment = NewAlign;
1177+
else
1178+
return false;
11731179
}
11741180

11751181
if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,17 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
199199
; GCN-NEXT: s_waitcnt lgkmcnt(0)
200200
; GCN-NEXT: v_mov_b32_e32 v2, s4
201201
; GCN-NEXT: v_mov_b32_e32 v3, s5
202-
; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:4
203-
; GCN-NEXT: global_load_dword v2, v[2:3], off
202+
; GCN-NEXT: global_load_ushort v4, v[2:3], off
204203
; GCN-NEXT: v_mov_b32_e32 v0, s6
205204
; GCN-NEXT: v_mov_b32_e32 v1, s7
206205
; GCN-NEXT: s_waitcnt vmcnt(0)
207-
; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:4
208-
; GCN-NEXT: buffer_store_short_d16_hi v2, off, s[0:3], s9 offset:6
209-
; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:8
206+
; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4
207+
; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2
208+
; GCN-NEXT: s_waitcnt vmcnt(0)
209+
; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:6
210+
; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4
211+
; GCN-NEXT: s_waitcnt vmcnt(0)
212+
; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8
210213
; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4
211214
; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6
212215
; GCN-NEXT: s_waitcnt vmcnt(1)
Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7-ALIGNED %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7-UNALIGNED %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5+
6+
; Should not merge this to a dword load
7+
define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 {
8+
; GFX7-ALIGNED-LABEL: global_load_2xi16_align2:
9+
; GFX7-ALIGNED: ; %bb.0:
10+
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11+
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0
12+
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
13+
; GFX7-ALIGNED-NEXT: flat_load_ushort v0, v[0:1]
14+
; GFX7-ALIGNED-NEXT: flat_load_ushort v1, v[2:3]
15+
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
17+
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
18+
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
19+
;
20+
; GFX7-UNALIGNED-LABEL: global_load_2xi16_align2:
21+
; GFX7-UNALIGNED: ; %bb.0:
22+
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23+
; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0
24+
; GFX7-UNALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
25+
; GFX7-UNALIGNED-NEXT: flat_load_ushort v0, v[0:1]
26+
; GFX7-UNALIGNED-NEXT: flat_load_ushort v1, v[2:3]
27+
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
28+
; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
29+
; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
30+
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
31+
;
32+
; GFX9-LABEL: global_load_2xi16_align2:
33+
; GFX9: ; %bb.0:
34+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35+
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
36+
; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2
37+
; GFX9-NEXT: s_waitcnt vmcnt(0)
38+
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
39+
; GFX9-NEXT: s_setpc_b64 s[30:31]
40+
%gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
41+
%p.0 = load i16, i16 addrspace(1)* %p, align 2
42+
%p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
43+
%zext.0 = zext i16 %p.0 to i32
44+
%zext.1 = zext i16 %p.1 to i32
45+
%shl.1 = shl i32 %zext.1, 16
46+
%or = or i32 %zext.0, %shl.1
47+
ret i32 %or
48+
}
49+
50+
; Should not merge this to a dword store
51+
define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
52+
; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
53+
; GFX7-ALIGNED: ; %bb.0:
54+
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
55+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
56+
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
57+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
58+
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
59+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
60+
; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2
61+
; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0
62+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2
63+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2
64+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3
65+
; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2
66+
; GFX7-ALIGNED-NEXT: s_endpgm
67+
;
68+
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
69+
; GFX7-UNALIGNED: ; %bb.0:
70+
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
71+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 1
72+
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
73+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
74+
; GFX7-UNALIGNED-NEXT: s_add_u32 s2, s0, 2
75+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
76+
; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2
77+
; GFX7-UNALIGNED-NEXT: s_addc_u32 s3, s1, 0
78+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
79+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 2
80+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
81+
; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2
82+
; GFX7-UNALIGNED-NEXT: s_endpgm
83+
;
84+
; GFX9-LABEL: global_store_2xi16_align2:
85+
; GFX9: ; %bb.0:
86+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
87+
; GFX9-NEXT: v_mov_b32_e32 v2, 1
88+
; GFX9-NEXT: v_mov_b32_e32 v3, 2
89+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
90+
; GFX9-NEXT: v_mov_b32_e32 v0, s0
91+
; GFX9-NEXT: v_mov_b32_e32 v1, s1
92+
; GFX9-NEXT: global_store_short v[0:1], v2, off
93+
; GFX9-NEXT: global_store_short v[0:1], v3, off offset:2
94+
; GFX9-NEXT: s_endpgm
95+
%gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
96+
store i16 1, i16 addrspace(1)* %r, align 2
97+
store i16 2, i16 addrspace(1)* %gep.r, align 2
98+
ret void
99+
}
100+
101+
; Should produce align 1 dword when legal
102+
define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
103+
; GFX7-ALIGNED-LABEL: global_load_2xi16_align1:
104+
; GFX7-ALIGNED: ; %bb.0:
105+
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106+
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0
107+
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
108+
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v0
109+
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
110+
; GFX7-ALIGNED-NEXT: flat_load_ubyte v6, v[0:1]
111+
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 3, v0
112+
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
113+
; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3]
114+
; GFX7-ALIGNED-NEXT: flat_load_ubyte v3, v[4:5]
115+
; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1]
116+
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
117+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3
118+
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
119+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0
120+
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v2
121+
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v6
122+
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
123+
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v1, v0
124+
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
125+
;
126+
; GFX7-UNALIGNED-LABEL: global_load_2xi16_align1:
127+
; GFX7-UNALIGNED: ; %bb.0:
128+
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129+
; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1]
130+
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
131+
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
132+
;
133+
; GFX9-LABEL: global_load_2xi16_align1:
134+
; GFX9: ; %bb.0:
135+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136+
; GFX9-NEXT: global_load_dword v0, v[0:1], off
137+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
138+
; GFX9-NEXT: s_mov_b32 s4, 0xffff
139+
; GFX9-NEXT: s_waitcnt vmcnt(0)
140+
; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0
141+
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
142+
; GFX9-NEXT: s_setpc_b64 s[30:31]
143+
%gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
144+
%p.0 = load i16, i16 addrspace(1)* %p, align 1
145+
%p.1 = load i16, i16 addrspace(1)* %gep.p, align 1
146+
%zext.0 = zext i16 %p.0 to i32
147+
%zext.1 = zext i16 %p.1 to i32
148+
%shl.1 = shl i32 %zext.1, 16
149+
%or = or i32 %zext.0, %shl.1
150+
ret i32 %or
151+
}
152+
153+
; Should produce align 1 dword when legal
154+
define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
155+
; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
156+
; GFX7-ALIGNED: ; %bb.0:
157+
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
158+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 1
159+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0
160+
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
161+
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
162+
; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0
163+
; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1
164+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
165+
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
166+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
167+
; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
168+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s4
169+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s5
170+
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v4
171+
; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v5
172+
; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0
173+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
174+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s2
175+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
176+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 2
177+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s3
178+
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v5
179+
; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v4
180+
; GFX7-ALIGNED-NEXT: s_endpgm
181+
;
182+
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
183+
; GFX7-UNALIGNED: ; %bb.0:
184+
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
185+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
186+
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
187+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
188+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
189+
; GFX7-UNALIGNED-NEXT: flat_store_dword v[0:1], v2
190+
; GFX7-UNALIGNED-NEXT: s_endpgm
191+
;
192+
; GFX9-LABEL: global_store_2xi16_align1:
193+
; GFX9: ; %bb.0:
194+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
195+
; GFX9-NEXT: v_mov_b32_e32 v2, 0x20001
196+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
197+
; GFX9-NEXT: v_mov_b32_e32 v0, s0
198+
; GFX9-NEXT: v_mov_b32_e32 v1, s1
199+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
200+
; GFX9-NEXT: s_endpgm
201+
%gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
202+
store i16 1, i16 addrspace(1)* %r, align 1
203+
store i16 2, i16 addrspace(1)* %gep.r, align 1
204+
ret void
205+
}
206+
207+
; Should merge this to a dword load
208+
define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
209+
; GFX7-LABEL: global_load_2xi16_align4:
210+
; GFX7: ; %bb.0:
211+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212+
; GFX7-NEXT: flat_load_dword v0, v[0:1]
213+
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
214+
; GFX7-NEXT: s_setpc_b64 s[30:31]
215+
;
216+
; GFX7-ALIGNED-LABEL: global_load_2xi16_align4:
217+
; GFX7-ALIGNED: ; %bb.0:
218+
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219+
; GFX7-ALIGNED-NEXT: flat_load_dword v0, v[0:1]
220+
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
221+
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
222+
;
223+
; GFX7-UNALIGNED-LABEL: global_load_2xi16_align4:
224+
; GFX7-UNALIGNED: ; %bb.0:
225+
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226+
; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1]
227+
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
228+
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
229+
;
230+
; GFX9-LABEL: global_load_2xi16_align4:
231+
; GFX9: ; %bb.0:
232+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
233+
; GFX9-NEXT: global_load_dword v0, v[0:1], off
234+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
235+
; GFX9-NEXT: s_mov_b32 s4, 0xffff
236+
; GFX9-NEXT: s_waitcnt vmcnt(0)
237+
; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0
238+
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
239+
; GFX9-NEXT: s_setpc_b64 s[30:31]
240+
%gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
241+
%p.0 = load i16, i16 addrspace(1)* %p, align 4
242+
%p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
243+
%zext.0 = zext i16 %p.0 to i32
244+
%zext.1 = zext i16 %p.1 to i32
245+
%shl.1 = shl i32 %zext.1, 16
246+
%or = or i32 %zext.0, %shl.1
247+
ret i32 %or
248+
}
249+
250+
; Should merge this to a dword store
251+
define amdgpu_kernel void @global_store_2xi16_align4(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
252+
; GFX7-LABEL: global_store_2xi16_align4:
253+
; GFX7: ; %bb.0:
254+
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
255+
; GFX7-NEXT: v_mov_b32_e32 v2, 0x20001
256+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
257+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
258+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
259+
; GFX7-NEXT: flat_store_dword v[0:1], v2
260+
; GFX7-NEXT: s_endpgm
261+
;
262+
; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
263+
; GFX7-ALIGNED: ; %bb.0:
264+
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
265+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
266+
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
267+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
268+
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
269+
; GFX7-ALIGNED-NEXT: flat_store_dword v[0:1], v2
270+
; GFX7-ALIGNED-NEXT: s_endpgm
271+
;
272+
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
273+
; GFX7-UNALIGNED: ; %bb.0:
274+
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
275+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
276+
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
277+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
278+
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
279+
; GFX7-UNALIGNED-NEXT: flat_store_dword v[0:1], v2
280+
; GFX7-UNALIGNED-NEXT: s_endpgm
281+
;
282+
; GFX9-LABEL: global_store_2xi16_align4:
283+
; GFX9: ; %bb.0:
284+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
285+
; GFX9-NEXT: v_mov_b32_e32 v2, 0x20001
286+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
287+
; GFX9-NEXT: v_mov_b32_e32 v0, s0
288+
; GFX9-NEXT: v_mov_b32_e32 v1, s1
289+
; GFX9-NEXT: global_store_dword v[0:1], v2, off
290+
; GFX9-NEXT: s_endpgm
291+
%gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
292+
store i16 1, i16 addrspace(1)* %r, align 4
293+
store i16 2, i16 addrspace(1)* %gep.r, align 2
294+
ret void
295+
}
296+
297+
298+
299+
300+
301+
302+
303+
304+
305+
306+
307+
308+
309+
310+
311+
312+
313+
314+
315+
316+
317+
318+
319+
320+
321+
322+
323+
324+
325+
326+
327+
328+

0 commit comments

Comments
 (0)