Skip to content

Commit 0919ab3

Browse files
authored
[AArch64] Don't try to vectorize fixed point to fp narrowing conversion (llvm#130665)
GCC correctly does not vectorize this case. AArch64 has no direct vector instruction that converts a wider fixed-point (integer) element to a narrower floating-point type, so the vectorized sequence must convert and then narrow, rounding twice; this double rounding leads to subtle result differences across ISAs. See https://godbolt.org/z/ssEchMWrE. Co-authored-by: @echristo
1 parent 8fb802e commit 0919ab3

13 files changed

+1460
-715
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+23
Original file line numberDiff line numberDiff line change
@@ -5106,6 +5106,29 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
51065106
uint64_t VTSize = VT.getFixedSizeInBits();
51075107
uint64_t InVTSize = InVT.getFixedSizeInBits();
51085108
if (VTSize < InVTSize) {
5109+
// AArch64 doesn't have a direct vector instruction to convert
5110+
// fixed point to floating point AND narrow it at the same time.
5111+
// Additional rounding when the target is f32/f64 causes double
5112+
// rounding issues. Conversion to f16 is fine due to narrow width.
5113+
bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5114+
bool IsTargetf16 = false;
5115+
if (Op.hasOneUse() &&
5116+
Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5117+
// Some vector types are split during legalization into half, followed by
5118+
// concatenation, followed by rounding to the original vector type. If we
5119+
// end up resolving to f16 type, we shouldn't worry about rounding errors.
5120+
SDNode *U = *Op->user_begin();
5121+
if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5122+
EVT TmpVT = U->user_begin()->getValueType(0);
5123+
if (TmpVT.getScalarType() == MVT::f16)
5124+
IsTargetf16 = true;
5125+
}
5126+
}
5127+
5128+
if (IsTargetf32 && !IsTargetf16) {
5129+
return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5130+
}
5131+
51095132
MVT CastVT =
51105133
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
51115134
InVT.getVectorNumElements());

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

+19-13
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,9 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
148148
; CHECK-NEXT: movi.2d v0, #0000000000000000
149149
; CHECK-NEXT: str xzr, [x0, #16]
150150
; CHECK-NEXT: uaddlv.4s d1, v0
151-
; CHECK-NEXT: mov.d v0[0], v1[0]
152-
; CHECK-NEXT: ucvtf.2d v0, v0
153-
; CHECK-NEXT: fcvtn v0.2s, v0.2d
151+
; CHECK-NEXT: fmov x8, d1
152+
; CHECK-NEXT: ucvtf s1, x8
153+
; CHECK-NEXT: mov.s v0[0], v1[0]
154154
; CHECK-NEXT: str q0, [x0]
155155
; CHECK-NEXT: ret
156156

@@ -166,10 +166,11 @@ define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) {
166166
; CHECK-LABEL: insert_vec_v2i64_uaddlv_from_v4i32:
167167
; CHECK: ; %bb.0: ; %entry
168168
; CHECK-NEXT: movi.2d v0, #0000000000000000
169-
; CHECK-NEXT: uaddlv.4s d1, v0
170-
; CHECK-NEXT: mov.d v0[0], v1[0]
171-
; CHECK-NEXT: ucvtf.2d v0, v0
172-
; CHECK-NEXT: fcvtn v0.2s, v0.2d
169+
; CHECK-NEXT: uaddlv.4s d0, v0
170+
; CHECK-NEXT: fmov x8, d0
171+
; CHECK-NEXT: movi d0, #0000000000000000
172+
; CHECK-NEXT: ucvtf s1, x8
173+
; CHECK-NEXT: mov.s v0[0], v1[0]
173174
; CHECK-NEXT: str d0, [x0]
174175
; CHECK-NEXT: ret
175176

@@ -187,9 +188,9 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
187188
; CHECK-NEXT: movi.2d v0, #0000000000000000
188189
; CHECK-NEXT: str wzr, [x0, #16]
189190
; CHECK-NEXT: uaddlv.4s d1, v0
190-
; CHECK-NEXT: mov.d v0[0], v1[0]
191-
; CHECK-NEXT: ucvtf.2d v0, v0
192-
; CHECK-NEXT: fcvtn v0.2s, v0.2d
191+
; CHECK-NEXT: fmov x8, d1
192+
; CHECK-NEXT: ucvtf s1, x8
193+
; CHECK-NEXT: mov.s v0[0], v1[0]
193194
; CHECK-NEXT: str q0, [x0]
194195
; CHECK-NEXT: ret
195196

@@ -254,9 +255,14 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
254255
; CHECK-NEXT: uaddlv.4h s1, v0
255256
; CHECK-NEXT: stp q0, q0, [x0, #32]
256257
; CHECK-NEXT: mov.s v2[0], v1[0]
257-
; CHECK-NEXT: ucvtf.2d v1, v2
258-
; CHECK-NEXT: fcvtn v1.2s, v1.2d
259-
; CHECK-NEXT: stp q1, q0, [x0]
258+
; CHECK-NEXT: fmov x8, d2
259+
; CHECK-NEXT: mov.d x9, v2[1]
260+
; CHECK-NEXT: movi.2d v2, #0000000000000000
261+
; CHECK-NEXT: ucvtf s1, x8
262+
; CHECK-NEXT: ucvtf s3, x9
263+
; CHECK-NEXT: mov.s v2[0], v1[0]
264+
; CHECK-NEXT: mov.s v2[1], v3[0]
265+
; CHECK-NEXT: stp q2, q0, [x0]
260266
; CHECK-NEXT: ret
261267

262268
entry:

llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll

+20-13
Original file line numberDiff line numberDiff line change
@@ -53,20 +53,27 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
5353
define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
5454
; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
5555
; CHECK: // %bb.0:
56-
; CHECK-NEXT: ldp q0, q1, [x0]
56+
; CHECK-NEXT: ldp q0, q2, [x0]
57+
; CHECK-NEXT: mov x8, v0.d[1]
58+
; CHECK-NEXT: fmov x9, d0
59+
; CHECK-NEXT: ucvtf s1, x9
60+
; CHECK-NEXT: mov x9, v2.d[1]
61+
; CHECK-NEXT: ucvtf s0, x8
62+
; CHECK-NEXT: fmov x8, d2
63+
; CHECK-NEXT: ucvtf s2, x8
64+
; CHECK-NEXT: mov v1.s[1], v0.s[0]
65+
; CHECK-NEXT: ucvtf s0, x9
66+
; CHECK-NEXT: mov v1.s[2], v2.s[0]
5767
; CHECK-NEXT: movi v2.4s, #127, msl #8
58-
; CHECK-NEXT: ucvtf v0.2d, v0.2d
59-
; CHECK-NEXT: ucvtf v1.2d, v1.2d
60-
; CHECK-NEXT: fcvtn v0.2s, v0.2d
61-
; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
62-
; CHECK-NEXT: movi v1.4s, #1
63-
; CHECK-NEXT: ushr v3.4s, v0.4s, #16
64-
; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
65-
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
66-
; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
67-
; CHECK-NEXT: orr v0.4s, #64, lsl #16
68-
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
69-
; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
68+
; CHECK-NEXT: mov v1.s[3], v0.s[0]
69+
; CHECK-NEXT: movi v0.4s, #1
70+
; CHECK-NEXT: ushr v3.4s, v1.4s, #16
71+
; CHECK-NEXT: add v2.4s, v1.4s, v2.4s
72+
; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
73+
; CHECK-NEXT: fcmeq v3.4s, v1.4s, v1.4s
74+
; CHECK-NEXT: orr v1.4s, #64, lsl #16
75+
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
76+
; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b
7077
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
7178
; CHECK-NEXT: ret
7279
%tmp1 = load <4 x i64>, ptr %ptr

llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll

+64-36
Original file line numberDiff line numberDiff line change
@@ -310,29 +310,43 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 {
310310
define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 {
311311
; CHECK-CVT-LABEL: sitofp_i64:
312312
; CHECK-CVT: // %bb.0:
313-
; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
314-
; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
315-
; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
316-
; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
317-
; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
318-
; CHECK-CVT-NEXT: movi v1.4s, #1
319-
; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
320-
; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
321-
; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
322-
; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
323-
; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
324-
; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
325-
; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
313+
; CHECK-CVT-NEXT: mov x8, v0.d[1]
314+
; CHECK-CVT-NEXT: fmov x9, d0
315+
; CHECK-CVT-NEXT: scvtf s2, x9
316+
; CHECK-CVT-NEXT: mov x9, v1.d[1]
317+
; CHECK-CVT-NEXT: scvtf s0, x8
318+
; CHECK-CVT-NEXT: fmov x8, d1
319+
; CHECK-CVT-NEXT: scvtf s1, x8
320+
; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0]
321+
; CHECK-CVT-NEXT: scvtf s0, x9
322+
; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0]
323+
; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
324+
; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0]
325+
; CHECK-CVT-NEXT: movi v0.4s, #1
326+
; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16
327+
; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
328+
; CHECK-CVT-NEXT: and v0.16b, v3.16b, v0.16b
329+
; CHECK-CVT-NEXT: fcmeq v3.4s, v2.4s, v2.4s
330+
; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
331+
; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s
332+
; CHECK-CVT-NEXT: bif v0.16b, v2.16b, v3.16b
326333
; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
327334
; CHECK-CVT-NEXT: ret
328335
;
329336
; CHECK-BF16-LABEL: sitofp_i64:
330337
; CHECK-BF16: // %bb.0:
331-
; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
332-
; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
333-
; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
334-
; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
335-
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
338+
; CHECK-BF16-NEXT: mov x8, v0.d[1]
339+
; CHECK-BF16-NEXT: fmov x9, d0
340+
; CHECK-BF16-NEXT: scvtf s2, x9
341+
; CHECK-BF16-NEXT: mov x9, v1.d[1]
342+
; CHECK-BF16-NEXT: scvtf s0, x8
343+
; CHECK-BF16-NEXT: fmov x8, d1
344+
; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0]
345+
; CHECK-BF16-NEXT: scvtf s0, x8
346+
; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0]
347+
; CHECK-BF16-NEXT: scvtf s0, x9
348+
; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0]
349+
; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
336350
; CHECK-BF16-NEXT: ret
337351
%1 = sitofp <4 x i64> %a to <4 x bfloat>
338352
ret <4 x bfloat> %1
@@ -413,29 +427,43 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 {
413427
define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
414428
; CHECK-CVT-LABEL: uitofp_i64:
415429
; CHECK-CVT: // %bb.0:
416-
; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
417-
; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
418-
; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
419-
; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
420-
; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
421-
; CHECK-CVT-NEXT: movi v1.4s, #1
422-
; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
423-
; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
424-
; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
425-
; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
426-
; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
427-
; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
428-
; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
430+
; CHECK-CVT-NEXT: mov x8, v0.d[1]
431+
; CHECK-CVT-NEXT: fmov x9, d0
432+
; CHECK-CVT-NEXT: ucvtf s2, x9
433+
; CHECK-CVT-NEXT: mov x9, v1.d[1]
434+
; CHECK-CVT-NEXT: ucvtf s0, x8
435+
; CHECK-CVT-NEXT: fmov x8, d1
436+
; CHECK-CVT-NEXT: ucvtf s1, x8
437+
; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0]
438+
; CHECK-CVT-NEXT: ucvtf s0, x9
439+
; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0]
440+
; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
441+
; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0]
442+
; CHECK-CVT-NEXT: movi v0.4s, #1
443+
; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16
444+
; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
445+
; CHECK-CVT-NEXT: and v0.16b, v3.16b, v0.16b
446+
; CHECK-CVT-NEXT: fcmeq v3.4s, v2.4s, v2.4s
447+
; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
448+
; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s
449+
; CHECK-CVT-NEXT: bif v0.16b, v2.16b, v3.16b
429450
; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
430451
; CHECK-CVT-NEXT: ret
431452
;
432453
; CHECK-BF16-LABEL: uitofp_i64:
433454
; CHECK-BF16: // %bb.0:
434-
; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
435-
; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
436-
; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
437-
; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
438-
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
455+
; CHECK-BF16-NEXT: mov x8, v0.d[1]
456+
; CHECK-BF16-NEXT: fmov x9, d0
457+
; CHECK-BF16-NEXT: ucvtf s2, x9
458+
; CHECK-BF16-NEXT: mov x9, v1.d[1]
459+
; CHECK-BF16-NEXT: ucvtf s0, x8
460+
; CHECK-BF16-NEXT: fmov x8, d1
461+
; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0]
462+
; CHECK-BF16-NEXT: ucvtf s0, x8
463+
; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0]
464+
; CHECK-BF16-NEXT: ucvtf s0, x9
465+
; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0]
466+
; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
439467
; CHECK-BF16-NEXT: ret
440468
%1 = uitofp <4 x i64> %a to <4 x bfloat>
441469
ret <4 x bfloat> %1

0 commit comments

Comments
 (0)