Skip to content

Commit ca98a3d

Browse files
authored
[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions (1/n) (#118505)
In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to #112213. Note: This PR does not include support for f64 <-> i32 conversions (like #112564), which needs a bit more work to support.
1 parent 9bb1d03 commit ca98a3d

7 files changed

+857
-254
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+59-1
Original file line numberDiff line numberDiff line change
@@ -19115,13 +19115,67 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
1911519115
return SDValue();
1911619116
}
1911719117

19118+
/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19119+
/// functions; this can help to reduce the number of fmovs to/from GPRs.
///
/// The scalar source is inserted into lane 0 of an otherwise-undef SVE vector,
/// the conversion is re-emitted on the vector type using N's own opcode, and
/// lane 0 of the result is extracted as the scalar result.
///
/// Returns an empty SDValue (no change) when N is a strict-FP opcode, when
/// called before operation legalization, when SVE/streaming SVE is unavailable
/// or the function is neither streaming nor streaming-compatible, when either
/// type is a vector, bf16 or f128, or when either vector type would be nxv2i32.
19120+
static SDValue
19121+
tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19122+
TargetLowering::DAGCombinerInfo &DCI,
19123+
const AArch64Subtarget *Subtarget) {
19124+
// Strict-FP opcodes are not rewritten.
if (N->isStrictFPOpcode())
19125+
return SDValue();
19126+
19127+
// Only run once operation legalization has started.
if (DCI.isBeforeLegalizeOps())
19128+
return SDValue();
19129+
19130+
// Requires SVE (or streaming SVE) and a streaming or streaming-compatible
// function; everything else keeps the existing lowering.
if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19131+
(!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19132+
return SDValue();
19133+
19134+
// Accept scalar types only, excluding bf16 and f128.
auto isSupportedType = [](EVT VT) {
19135+
return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19136+
};
19137+
19138+
SDValue SrcVal = N->getOperand(0);
19139+
EVT SrcTy = SrcVal.getValueType();
19140+
EVT DestTy = N->getValueType(0);
19141+
19142+
if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19143+
return SDValue();
19144+
19145+
// The wider of the two scalar types selects the packed SVE vector type; the
// other vector type is derived from it by swapping in the narrower element
// type.
EVT SrcVecTy;
19146+
EVT DestVecTy;
19147+
if (DestTy.bitsGT(SrcTy)) {
19148+
DestVecTy = getPackedSVEVectorVT(DestTy);
19149+
SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19150+
} else {
19151+
SrcVecTy = getPackedSVEVectorVT(SrcTy);
19152+
DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19153+
}
19154+
19155+
// Ensure the resulting src/dest vector type is legal.
// nxv2i32 is what an i32 paired with a 64-bit type produces; the commit
// description notes that f64 <-> i32 conversions are not supported yet.
19156+
if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19157+
return SDValue();
19158+
19159+
SDLoc DL(N);
19160+
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19161+
// Place the scalar in lane 0; the remaining lanes are undef.
SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19162+
DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19163+
// NOTE(review): only operand 0 of N is forwarded to the new node. The
// dispatch switch lists FP_TO_SINT_SAT next to FP_TO_SINT; confirm that
// opcodes carrying extra operands never reach this point or are handled.
SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19164+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19165+
}
19166+
1911819167
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19168+
TargetLowering::DAGCombinerInfo &DCI,
1911919169
const AArch64Subtarget *Subtarget) {
1912019170
// First try to optimize away the conversion when it's conditionally from
1912119171
// a constant. Vectors only.
1912219172
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
1912319173
return Res;
1912419174

19175+
if (SDValue Res =
19176+
tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19177+
return Res;
19178+
1912519179
EVT VT = N->getValueType(0);
1912619180
if (VT != MVT::f32 && VT != MVT::f64)
1912719181
return SDValue();
@@ -19160,6 +19214,10 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
1916019214
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
1916119215
TargetLowering::DAGCombinerInfo &DCI,
1916219216
const AArch64Subtarget *Subtarget) {
19217+
if (SDValue Res =
19218+
tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19219+
return Res;
19220+
1916319221
if (!Subtarget->isNeonAvailable())
1916419222
return SDValue();
1916519223

@@ -26240,7 +26298,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2624026298
return performMulCombine(N, DAG, DCI, Subtarget);
2624126299
case ISD::SINT_TO_FP:
2624226300
case ISD::UINT_TO_FP:
26243-
return performIntToFpCombine(N, DAG, Subtarget);
26301+
return performIntToFpCombine(N, DAG, DCI, Subtarget);
2624426302
case ISD::FP_TO_SINT:
2624526303
case ISD::FP_TO_UINT:
2624626304
case ISD::FP_TO_SINT_SAT:

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

+2-2
Original file line numberDiff line numberDiff line change
@@ -2348,8 +2348,8 @@ let Predicates = [HasSVEorSME] in {
23482348
defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
23492349
defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
23502350
defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
2351-
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2352-
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2351+
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2352+
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
23532353
defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
23542354
defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
23552355
defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;

llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll

+74-19
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,32 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -force-streaming-compatible < %s | FileCheck %s
3-
; RUN: llc -force-streaming-compatible -mattr=+sme2p2 < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
4-
; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
2+
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
3+
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
4+
; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
5+
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
56

67
target triple = "aarch64-unknown-linux-gnu"
78

89
define double @t1(double %x) {
910
; CHECK-LABEL: t1:
1011
; CHECK: // %bb.0: // %entry
11-
; CHECK-NEXT: fcvtzs x8, d0
12-
; CHECK-NEXT: scvtf d0, x8
12+
; CHECK-NEXT: ptrue p0.d
13+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
14+
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
15+
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
16+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1317
; CHECK-NEXT: ret
1418
;
1519
; USE-NEON-NO-GPRS-LABEL: t1:
1620
; USE-NEON-NO-GPRS: // %bb.0: // %entry
1721
; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0
1822
; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0
1923
; USE-NEON-NO-GPRS-NEXT: ret
24+
;
25+
; NONEON-NOSVE-LABEL: t1:
26+
; NONEON-NOSVE: // %bb.0: // %entry
27+
; NONEON-NOSVE-NEXT: fcvtzs x8, d0
28+
; NONEON-NOSVE-NEXT: scvtf d0, x8
29+
; NONEON-NOSVE-NEXT: ret
2030
entry:
2131
%conv = fptosi double %x to i64
2232
%conv1 = sitofp i64 %conv to double
@@ -26,15 +36,24 @@ entry:
2636
define float @t2(float %x) {
2737
; CHECK-LABEL: t2:
2838
; CHECK: // %bb.0: // %entry
29-
; CHECK-NEXT: fcvtzs w8, s0
30-
; CHECK-NEXT: scvtf s0, w8
39+
; CHECK-NEXT: ptrue p0.s
40+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
41+
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
42+
; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
43+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
3144
; CHECK-NEXT: ret
3245
;
3346
; USE-NEON-NO-GPRS-LABEL: t2:
3447
; USE-NEON-NO-GPRS: // %bb.0: // %entry
3548
; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0
3649
; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0
3750
; USE-NEON-NO-GPRS-NEXT: ret
51+
;
52+
; NONEON-NOSVE-LABEL: t2:
53+
; NONEON-NOSVE: // %bb.0: // %entry
54+
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
55+
; NONEON-NOSVE-NEXT: scvtf s0, w8
56+
; NONEON-NOSVE-NEXT: ret
3857
entry:
3958
%conv = fptosi float %x to i32
4059
%conv1 = sitofp i32 %conv to float
@@ -44,11 +63,20 @@ entry:
4463
define half @t3(half %x) {
4564
; CHECK-LABEL: t3:
4665
; CHECK: // %bb.0: // %entry
47-
; CHECK-NEXT: fcvt s0, h0
48-
; CHECK-NEXT: fcvtzs w8, s0
49-
; CHECK-NEXT: scvtf s0, w8
50-
; CHECK-NEXT: fcvt h0, s0
66+
; CHECK-NEXT: ptrue p0.s
67+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
68+
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
69+
; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
70+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
5171
; CHECK-NEXT: ret
72+
;
73+
; NONEON-NOSVE-LABEL: t3:
74+
; NONEON-NOSVE: // %bb.0: // %entry
75+
; NONEON-NOSVE-NEXT: fcvt s0, h0
76+
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
77+
; NONEON-NOSVE-NEXT: scvtf s0, w8
78+
; NONEON-NOSVE-NEXT: fcvt h0, s0
79+
; NONEON-NOSVE-NEXT: ret
5280
entry:
5381
%conv = fptosi half %x to i32
5482
%conv1 = sitofp i32 %conv to half
@@ -58,15 +86,24 @@ entry:
5886
define double @t4(double %x) {
5987
; CHECK-LABEL: t4:
6088
; CHECK: // %bb.0: // %entry
61-
; CHECK-NEXT: fcvtzu x8, d0
62-
; CHECK-NEXT: ucvtf d0, x8
89+
; CHECK-NEXT: ptrue p0.d
90+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
91+
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
92+
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
93+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
6394
; CHECK-NEXT: ret
6495
;
6596
; USE-NEON-NO-GPRS-LABEL: t4:
6697
; USE-NEON-NO-GPRS: // %bb.0: // %entry
6798
; USE-NEON-NO-GPRS-NEXT: fcvtzu d0, d0
6899
; USE-NEON-NO-GPRS-NEXT: ucvtf d0, d0
69100
; USE-NEON-NO-GPRS-NEXT: ret
101+
;
102+
; NONEON-NOSVE-LABEL: t4:
103+
; NONEON-NOSVE: // %bb.0: // %entry
104+
; NONEON-NOSVE-NEXT: fcvtzu x8, d0
105+
; NONEON-NOSVE-NEXT: ucvtf d0, x8
106+
; NONEON-NOSVE-NEXT: ret
70107
entry:
71108
%conv = fptoui double %x to i64
72109
%conv1 = uitofp i64 %conv to double
@@ -76,15 +113,24 @@ entry:
76113
define float @t5(float %x) {
77114
; CHECK-LABEL: t5:
78115
; CHECK: // %bb.0: // %entry
79-
; CHECK-NEXT: fcvtzu w8, s0
80-
; CHECK-NEXT: ucvtf s0, w8
116+
; CHECK-NEXT: ptrue p0.s
117+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
118+
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
119+
; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
120+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
81121
; CHECK-NEXT: ret
82122
;
83123
; USE-NEON-NO-GPRS-LABEL: t5:
84124
; USE-NEON-NO-GPRS: // %bb.0: // %entry
85125
; USE-NEON-NO-GPRS-NEXT: fcvtzu s0, s0
86126
; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0
87127
; USE-NEON-NO-GPRS-NEXT: ret
128+
;
129+
; NONEON-NOSVE-LABEL: t5:
130+
; NONEON-NOSVE: // %bb.0: // %entry
131+
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
132+
; NONEON-NOSVE-NEXT: ucvtf s0, w8
133+
; NONEON-NOSVE-NEXT: ret
88134
entry:
89135
%conv = fptoui float %x to i32
90136
%conv1 = uitofp i32 %conv to float
@@ -94,11 +140,20 @@ entry:
94140
define half @t6(half %x) {
95141
; CHECK-LABEL: t6:
96142
; CHECK: // %bb.0: // %entry
97-
; CHECK-NEXT: fcvt s0, h0
98-
; CHECK-NEXT: fcvtzu w8, s0
99-
; CHECK-NEXT: ucvtf s0, w8
100-
; CHECK-NEXT: fcvt h0, s0
143+
; CHECK-NEXT: ptrue p0.s
144+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
145+
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
146+
; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
147+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
101148
; CHECK-NEXT: ret
149+
;
150+
; NONEON-NOSVE-LABEL: t6:
151+
; NONEON-NOSVE: // %bb.0: // %entry
152+
; NONEON-NOSVE-NEXT: fcvt s0, h0
153+
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
154+
; NONEON-NOSVE-NEXT: ucvtf s0, w8
155+
; NONEON-NOSVE-NEXT: fcvt h0, s0
156+
; NONEON-NOSVE-NEXT: ret
102157
entry:
103158
%conv = fptoui half %x to i32
104159
%conv1 = uitofp i32 %conv to half

0 commit comments

Comments
 (0)