Skip to content

Commit f2c4211

Browse files
author
Stefan Stipanovic
committed
[AMDGPU][GlobalISel] Introduce isRegisterClassType to check for legal types, instead of checking bit width.
Make v13s32, v14s32, v15s32 and v7s64 illegal for bitcast first.
1 parent 6d24291 commit f2c4211

File tree

4 files changed

+317
-187
lines changed

4 files changed

+317
-187
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 106 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ static bool isRegisterVectorType(LLT Ty) {
239239
EltSize == 128 || EltSize == 256;
240240
}
241241

242+
// TODO: replace all uses of isRegisterType with isRegisterClassType
242243
static bool isRegisterType(LLT Ty) {
243244
if (!isRegisterSize(Ty.getSizeInBits()))
244245
return false;
@@ -258,6 +259,8 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
258259
}
259260

260261
// RegisterType that doesn't have a corresponding RegClass.
262+
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263+
// should be removed.
261264
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
262265
return [=](const LegalityQuery &Query) {
263266
LLT Ty = Query.Types[TypeIdx];
@@ -276,6 +279,95 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
276279
};
277280
}
278281

282+
static const LLT S1 = LLT::scalar(1);
283+
static const LLT S8 = LLT::scalar(8);
284+
static const LLT S16 = LLT::scalar(16);
285+
static const LLT S32 = LLT::scalar(32);
286+
static const LLT S64 = LLT::scalar(64);
287+
static const LLT S96 = LLT::scalar(96);
288+
static const LLT S128 = LLT::scalar(128);
289+
static const LLT S160 = LLT::scalar(160);
290+
static const LLT S224 = LLT::scalar(224);
291+
static const LLT S256 = LLT::scalar(256);
292+
static const LLT S512 = LLT::scalar(512);
293+
static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
294+
295+
static const LLT V2S8 = LLT::fixed_vector(2, 8);
296+
static const LLT V2S16 = LLT::fixed_vector(2, 16);
297+
static const LLT V4S16 = LLT::fixed_vector(4, 16);
298+
static const LLT V6S16 = LLT::fixed_vector(6, 16);
299+
static const LLT V8S16 = LLT::fixed_vector(8, 16);
300+
static const LLT V10S16 = LLT::fixed_vector(10, 16);
301+
static const LLT V12S16 = LLT::fixed_vector(12, 16);
302+
static const LLT V16S16 = LLT::fixed_vector(16, 16);
303+
304+
static const LLT V2S32 = LLT::fixed_vector(2, 32);
305+
static const LLT V3S32 = LLT::fixed_vector(3, 32);
306+
static const LLT V4S32 = LLT::fixed_vector(4, 32);
307+
static const LLT V5S32 = LLT::fixed_vector(5, 32);
308+
static const LLT V6S32 = LLT::fixed_vector(6, 32);
309+
static const LLT V7S32 = LLT::fixed_vector(7, 32);
310+
static const LLT V8S32 = LLT::fixed_vector(8, 32);
311+
static const LLT V9S32 = LLT::fixed_vector(9, 32);
312+
static const LLT V10S32 = LLT::fixed_vector(10, 32);
313+
static const LLT V11S32 = LLT::fixed_vector(11, 32);
314+
static const LLT V12S32 = LLT::fixed_vector(12, 32);
315+
static const LLT V16S32 = LLT::fixed_vector(16, 32);
316+
static const LLT V32S32 = LLT::fixed_vector(32, 32);
317+
318+
static const LLT V2S64 = LLT::fixed_vector(2, 64);
319+
static const LLT V3S64 = LLT::fixed_vector(3, 64);
320+
static const LLT V4S64 = LLT::fixed_vector(4, 64);
321+
static const LLT V5S64 = LLT::fixed_vector(5, 64);
322+
static const LLT V6S64 = LLT::fixed_vector(6, 64);
323+
static const LLT V7S64 = LLT::fixed_vector(7, 64);
324+
static const LLT V8S64 = LLT::fixed_vector(8, 64);
325+
static const LLT V16S64 = LLT::fixed_vector(16, 64);
326+
327+
static const LLT V2S128 = LLT::fixed_vector(2, 128);
328+
static const LLT V4S128 = LLT::fixed_vector(4, 128);
329+
330+
static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
331+
S160, S224, S256, S512};
332+
333+
static std::initializer_list<LLT> AllS16Vectors{
334+
V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
335+
336+
static std::initializer_list<LLT> AllS32Vectors = {
337+
V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
338+
V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
339+
340+
static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
341+
V6S64, V7S64, V8S64, V16S64};
342+
343+
static bool typeInSet(LLT Ty, std::initializer_list<LLT> TypesInit) {
344+
SmallVector<LLT, 4> Types = TypesInit;
345+
return llvm::is_contained(Types, Ty);
346+
}
347+
348+
static LLT GetAddrSpacePtr(unsigned AS, const GCNTargetMachine &TM) {
349+
return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
350+
}
351+
352+
// Checks whether a type is in the list of legal register types.
353+
static bool isRegisterClassType(LLT Ty) {
354+
if (Ty.isVector() && Ty.getElementType().isPointer())
355+
Ty = LLT::fixed_vector(Ty.getNumElements(),
356+
LLT::scalar(Ty.getScalarSizeInBits()));
357+
else if (Ty.isPointer())
358+
Ty = LLT::scalar(Ty.getScalarSizeInBits());
359+
360+
return typeInSet(Ty, AllS32Vectors) || typeInSet(Ty, AllS64Vectors) ||
361+
typeInSet(Ty, AllScalarTypes) || typeInSet(Ty, AllS16Vectors) ||
362+
Ty.isPointer();
363+
}
364+
365+
static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
366+
return [TypeIdx](const LegalityQuery &Query) {
367+
return isRegisterClassType(Query.Types[TypeIdx]);
368+
};
369+
}
370+
279371
// If we have a truncating store or an extending load with a data size larger
280372
// than 32-bits, we need to reduce to a 32-bit type.
281373
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
@@ -574,67 +666,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
574666
: ST(ST_) {
575667
using namespace TargetOpcode;
576668

577-
auto GetAddrSpacePtr = [&TM](unsigned AS) {
578-
return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
579-
};
580-
581-
const LLT S1 = LLT::scalar(1);
582-
const LLT S8 = LLT::scalar(8);
583-
const LLT S16 = LLT::scalar(16);
584-
const LLT S32 = LLT::scalar(32);
585-
const LLT S64 = LLT::scalar(64);
586-
const LLT S128 = LLT::scalar(128);
587-
const LLT S256 = LLT::scalar(256);
588-
const LLT S512 = LLT::scalar(512);
589-
const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
590-
591-
const LLT V2S8 = LLT::fixed_vector(2, 8);
592-
const LLT V2S16 = LLT::fixed_vector(2, 16);
593-
const LLT V4S16 = LLT::fixed_vector(4, 16);
594-
595-
const LLT V2S32 = LLT::fixed_vector(2, 32);
596-
const LLT V3S32 = LLT::fixed_vector(3, 32);
597-
const LLT V4S32 = LLT::fixed_vector(4, 32);
598-
const LLT V5S32 = LLT::fixed_vector(5, 32);
599-
const LLT V6S32 = LLT::fixed_vector(6, 32);
600-
const LLT V7S32 = LLT::fixed_vector(7, 32);
601-
const LLT V8S32 = LLT::fixed_vector(8, 32);
602-
const LLT V9S32 = LLT::fixed_vector(9, 32);
603-
const LLT V10S32 = LLT::fixed_vector(10, 32);
604-
const LLT V11S32 = LLT::fixed_vector(11, 32);
605-
const LLT V12S32 = LLT::fixed_vector(12, 32);
606-
const LLT V13S32 = LLT::fixed_vector(13, 32);
607-
const LLT V14S32 = LLT::fixed_vector(14, 32);
608-
const LLT V15S32 = LLT::fixed_vector(15, 32);
609-
const LLT V16S32 = LLT::fixed_vector(16, 32);
610-
const LLT V32S32 = LLT::fixed_vector(32, 32);
611-
612-
const LLT V2S64 = LLT::fixed_vector(2, 64);
613-
const LLT V3S64 = LLT::fixed_vector(3, 64);
614-
const LLT V4S64 = LLT::fixed_vector(4, 64);
615-
const LLT V5S64 = LLT::fixed_vector(5, 64);
616-
const LLT V6S64 = LLT::fixed_vector(6, 64);
617-
const LLT V7S64 = LLT::fixed_vector(7, 64);
618-
const LLT V8S64 = LLT::fixed_vector(8, 64);
619-
const LLT V16S64 = LLT::fixed_vector(16, 64);
620-
621-
std::initializer_list<LLT> AllS32Vectors =
622-
{V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
623-
V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
624-
std::initializer_list<LLT> AllS64Vectors =
625-
{V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
626-
627-
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
628-
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
629-
const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
630-
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
631-
const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
632-
const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
633-
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
634-
const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
635-
const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
669+
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS, TM);
670+
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS, TM);
671+
const LLT Constant32Ptr =
672+
GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT, TM);
673+
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS, TM);
674+
const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS, TM);
675+
const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS, TM);
676+
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS, TM);
677+
const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER, TM);
678+
const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE, TM);
636679
const LLT BufferStridedPtr =
637-
GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
680+
GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER, TM);
638681

639682
const LLT CodePtr = FlatPtr;
640683

@@ -836,10 +879,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
836879
.scalarize(0);
837880

838881
getActionDefinitionsBuilder(G_BITCAST)
839-
// Don't worry about the size constraint.
840-
.legalIf(all(isRegisterType(0), isRegisterType(1)))
841-
.lower();
842-
882+
// Don't worry about the size constraint.
883+
.legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
884+
.lower();
843885

844886
getActionDefinitionsBuilder(G_CONSTANT)
845887
.legalFor({S1, S32, S64, S16, GlobalPtr,
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
3+
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
4+
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
5+
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
6+
define void @main(<19 x i32> %arg) {
7+
; GCN-LABEL: main:
8+
; GCN: ; %bb.0: ; %bb
9+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10+
; GCN-NEXT: s_mov_b32 s4, 0
11+
; GCN-NEXT: s_mov_b32 s12, s4
12+
; GCN-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
13+
; GCN-NEXT: v_mov_b32_e32 v1, 0
14+
; GCN-NEXT: s_mov_b32 s13, s4
15+
; GCN-NEXT: v_mov_b32_e32 v4, s12
16+
; GCN-NEXT: s_mov_b32 s5, s4
17+
; GCN-NEXT: s_mov_b32 s6, s4
18+
; GCN-NEXT: s_mov_b32 s7, s4
19+
; GCN-NEXT: s_mov_b32 s8, s4
20+
; GCN-NEXT: s_mov_b32 s9, s4
21+
; GCN-NEXT: s_mov_b32 s10, s4
22+
; GCN-NEXT: s_mov_b32 s11, s4
23+
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
24+
; GCN-NEXT: v_mov_b32_e32 v2, v1
25+
; GCN-NEXT: v_mov_b32_e32 v3, v1
26+
; GCN-NEXT: v_mov_b32_e32 v5, s13
27+
; GCN-NEXT: image_store v[0:3], v[4:5], s[4:11] unorm
28+
; GCN-NEXT: s_waitcnt vmcnt(0)
29+
; GCN-NEXT: s_setpc_b64 s[30:31]
30+
;
31+
; GFX10-LABEL: main:
32+
; GFX10: ; %bb.0: ; %bb
33+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34+
; GFX10-NEXT: s_mov_b32 s4, 0
35+
; GFX10-NEXT: v_mov_b32_e32 v1, 0
36+
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
37+
; GFX10-NEXT: s_mov_b32 s10, s4
38+
; GFX10-NEXT: s_mov_b32 s11, s4
39+
; GFX10-NEXT: v_mov_b32_e32 v4, s10
40+
; GFX10-NEXT: v_mov_b32_e32 v2, v1
41+
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
42+
; GFX10-NEXT: v_mov_b32_e32 v3, v1
43+
; GFX10-NEXT: v_mov_b32_e32 v5, s11
44+
; GFX10-NEXT: s_mov_b32 s5, s4
45+
; GFX10-NEXT: s_mov_b32 s6, s4
46+
; GFX10-NEXT: s_mov_b32 s7, s4
47+
; GFX10-NEXT: s_mov_b32 s8, s4
48+
; GFX10-NEXT: s_mov_b32 s9, s4
49+
; GFX10-NEXT: image_store v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
50+
; GFX10-NEXT: s_setpc_b64 s[30:31]
51+
;
52+
; GFX11-LABEL: main:
53+
; GFX11: ; %bb.0: ; %bb
54+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55+
; GFX11-NEXT: s_mov_b32 s0, 0
56+
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
57+
; GFX11-NEXT: s_mov_b32 s6, s0
58+
; GFX11-NEXT: s_mov_b32 s7, s0
59+
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s6
60+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
61+
; GFX11-NEXT: v_mov_b32_e32 v5, s7
62+
; GFX11-NEXT: s_mov_b32 s1, s0
63+
; GFX11-NEXT: v_mov_b32_e32 v2, v1
64+
; GFX11-NEXT: v_mov_b32_e32 v3, v1
65+
; GFX11-NEXT: s_mov_b32 s2, s0
66+
; GFX11-NEXT: s_mov_b32 s3, s0
67+
; GFX11-NEXT: s_mov_b32 s4, s0
68+
; GFX11-NEXT: s_mov_b32 s5, s0
69+
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dim:SQ_RSRC_IMG_2D unorm
70+
; GFX11-NEXT: s_setpc_b64 s[30:31]
71+
bb:
72+
%i = bitcast <19 x i32> %arg to <38 x i16>
73+
%i1 = extractelement <38 x i16> %i, i64 0
74+
%i2 = icmp eq i16 %i1, 0
75+
%i3 = zext i1 %i2 to i32
76+
%i4 = bitcast i32 %i3 to float
77+
%i5 = insertelement <4 x float> zeroinitializer, float %i4, i64 0
78+
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %i5, i32 0, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
79+
ret void
80+
}
81+
declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
82+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
83+
; GFX10PLUS: {{.*}}
84+
; GPRIDX: {{.*}}
85+
; MOVREL: {{.*}}

0 commit comments

Comments
 (0)