Skip to content

Commit ef13308

Browse files
committed
AMDGPU/SDAG: Improve {extract,insert}_subvector lowering for 16-bit vectors
v2:
- simplify the escape to TableGen patterns

Differential Revision: https://reviews.llvm.org/D149841
1 parent 96e09fe commit ef13308

File tree

5 files changed

+62
-38
lines changed

5 files changed

+62
-38
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 25 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1423,32 +1423,42 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
14231423

14241424
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
14251425
SelectionDAG &DAG) const {
1426-
1426+
SDLoc SL(Op);
14271427
SmallVector<SDValue, 8> Args;
14281428
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
14291429
EVT VT = Op.getValueType();
14301430
EVT SrcVT = Op.getOperand(0).getValueType();
14311431

1432-
// For these types, we have some TableGen patterns except if the index is 1
1433-
if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1434-
(SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1435-
Start != 1)
1436-
return Op;
1432+
if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1433+
unsigned NumElt = VT.getVectorNumElements();
1434+
unsigned NumSrcElt = SrcVT.getVectorNumElements();
1435+
assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
14371436

1438-
if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1439-
(SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1440-
(Start == 0 || Start == 4))
1441-
return Op;
1437+
// We have some TableGen patterns for when the extracted vector is exactly
1438+
// the low or high half of the operand.
1439+
if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt))
1440+
return Op;
14421441

1443-
if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
1444-
(SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
1445-
(Start == 0 || Start == 8))
1446-
return Op;
1442+
// Extract 32-bit registers at a time.
1443+
EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1444+
EVT NewVT = NumElt == 2
1445+
? MVT::i32
1446+
: EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1447+
SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1448+
1449+
DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1450+
if (NumElt == 2)
1451+
Tmp = Args[0];
1452+
else
1453+
Tmp = DAG.getBuildVector(NewVT, SL, Args);
1454+
1455+
return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1456+
}
14471457

14481458
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
14491459
VT.getVectorNumElements());
14501460

1451-
return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1461+
return DAG.getBuildVector(Op.getValueType(), SL, Args);
14521462
}
14531463

14541464
// TODO: Handle fabs too

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5762,6 +5762,35 @@ SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
57625762
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
57635763
SDLoc SL(Op);
57645764

5765+
if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
5766+
// Insert 32-bit registers at a time.
5767+
assert(InsNumElts % 2 == 0 && "expect legal vector types");
5768+
5769+
unsigned VecNumElts = VecVT.getVectorNumElements();
5770+
EVT NewVecVT =
5771+
EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
5772+
EVT NewInsVT = InsNumElts == 2 ? MVT::i32
5773+
: EVT::getVectorVT(*DAG.getContext(),
5774+
MVT::i32, InsNumElts / 2);
5775+
5776+
Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
5777+
Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
5778+
5779+
for (unsigned I = 0; I != InsNumElts / 2; ++I) {
5780+
SDValue Elt;
5781+
if (InsNumElts == 2) {
5782+
Elt = Ins;
5783+
} else {
5784+
Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
5785+
DAG.getConstant(I, SL, MVT::i32));
5786+
}
5787+
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
5788+
DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
5789+
}
5790+
5791+
return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
5792+
}
5793+
57655794
for (unsigned I = 0; I != InsNumElts; ++I) {
57665795
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
57675796
DAG.getConstant(I, SL, MVT::i32));

llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -213,8 +213,6 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
213213
; GCN-NEXT: v_mov_b32_e32 v0, 0
214214
; GCN-NEXT: v_mov_b32_e32 v1, 0
215215
; GCN-NEXT: .LBB4_3: ; %if.end
216-
; GCN-NEXT: s_mov_b32 s4, 0xffff
217-
; GCN-NEXT: v_bfi_b32 v0, s4, v0, v0
218216
; GCN-NEXT: global_store_short v[0:1], v1, off
219217
; GCN-NEXT: global_store_dword v[0:1], v0, off
220218
; GCN-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -947,16 +947,7 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
947947
; GFX9-NEXT: v_lshl_add_u32 v2, v1, 5, v0
948948
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
949949
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
950-
; GFX9-NEXT: s_mov_b32 s4, 0xffff
951-
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
952-
; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v0
953950
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
954-
; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v2
955-
; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
956-
; GFX9-NEXT: v_bfi_b32 v4, s4, v0, v4
957-
; GFX9-NEXT: v_bfi_b32 v5, s4, v2, v5
958-
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v5
959-
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
960951
; GFX9-NEXT: s_setpc_b64 s[30:31]
961952
%idx = shl i32 %idxp, 4
962953

llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -10,13 +10,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
1010
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1111
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1212
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
13-
; GFX900-NEXT: s_pack_lh_b32_b16 s4, s0, s0
1413
; GFX900-NEXT: v_mov_b32_e32 v5, s3
1514
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
15+
; GFX900-NEXT: v_mov_b32_e32 v0, s0
1616
; GFX900-NEXT: v_mov_b32_e32 v1, s1
17+
; GFX900-NEXT: v_mov_b32_e32 v2, s0
1718
; GFX900-NEXT: v_mov_b32_e32 v3, s0
18-
; GFX900-NEXT: v_mov_b32_e32 v0, s4
19-
; GFX900-NEXT: v_mov_b32_e32 v2, s4
2019
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
2120
; GFX900-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2221
; GFX900-NEXT: s_endpgm
@@ -26,13 +25,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
2625
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2726
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2827
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
29-
; GFX906-NEXT: s_pack_lh_b32_b16 s4, s0, s0
3028
; GFX906-NEXT: v_mov_b32_e32 v5, s3
3129
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
30+
; GFX906-NEXT: v_mov_b32_e32 v0, s0
3231
; GFX906-NEXT: v_mov_b32_e32 v1, s1
32+
; GFX906-NEXT: v_mov_b32_e32 v2, s0
3333
; GFX906-NEXT: v_mov_b32_e32 v3, s0
34-
; GFX906-NEXT: v_mov_b32_e32 v0, s4
35-
; GFX906-NEXT: v_mov_b32_e32 v2, s4
3634
; GFX906-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
3735
; GFX906-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3836
; GFX906-NEXT: s_endpgm
@@ -42,13 +40,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
4240
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4341
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
4442
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
45-
; GFX908-NEXT: s_pack_lh_b32_b16 s4, s0, s0
4643
; GFX908-NEXT: v_mov_b32_e32 v5, s3
4744
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
45+
; GFX908-NEXT: v_mov_b32_e32 v0, s0
4846
; GFX908-NEXT: v_mov_b32_e32 v1, s1
47+
; GFX908-NEXT: v_mov_b32_e32 v2, s0
4948
; GFX908-NEXT: v_mov_b32_e32 v3, s0
50-
; GFX908-NEXT: v_mov_b32_e32 v0, s4
51-
; GFX908-NEXT: v_mov_b32_e32 v2, s4
5249
; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
5350
; GFX908-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
5451
; GFX908-NEXT: s_endpgm
@@ -58,13 +55,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
5855
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
5956
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0
6057
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
61-
; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s0, s0
6258
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
6359
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
60+
; GFX90A-NEXT: v_mov_b32_e32 v0, s0
6461
; GFX90A-NEXT: v_mov_b32_e32 v1, s1
62+
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
6563
; GFX90A-NEXT: v_mov_b32_e32 v3, s0
66-
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
67-
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
6864
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
6965
; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
7066
; GFX90A-NEXT: s_endpgm

0 commit comments

Comments
 (0)