Skip to content

Commit 9aa3948

Browse files
committed
[AArch64] Prefer to fold dup into fmul/fma as opposed to ld1r
There is a fold to create LD1DUPpost from dup(load) that can be post-incremented. If the dup is used by a "by element" operation such as fmul or fma, then it can be slightly better to fold the dup into the fmul instead, which produces slightly faster code:
    ld1r { v1.4s }, [x0], #4
    fmul v0.4s, v1.4s, v0.4s
vs
    ldr s1, [x0], #4
    fmul v0.4s, v0.4s, v1.s[0]
This could also be done with integer operations such as smull/umull, so long as the load/dup gets correctly combined into the mul operation. Currently this only operates on floating-point types. Differential Revision: https://reviews.llvm.org/D145184
1 parent 912404d commit 9aa3948

File tree

2 files changed

+21
-12
lines changed

2 files changed

+21
-12
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19348,6 +19348,15 @@ static SDValue performPostLD1Combine(SDNode *N,
1934819348
return SDValue();
1934919349
}
1935019350

19351+
// If there is one use and it can splat the value, prefer that operation.
19352+
// TODO: This could be expanded to more operations if they reliably use the
19353+
// index variants.
19354+
if (N->hasOneUse()) {
19355+
unsigned UseOpc = N->use_begin()->getOpcode();
19356+
if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
19357+
return SDValue();
19358+
}
19359+
1935119360
SDValue Addr = LD->getOperand(1);
1935219361
SDValue Vector = N->getOperand(0);
1935319362
// Search for a use of the address operand that is an increment.

llvm/test/CodeGen/AArch64/ld1postmul.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ define ptr @fmul_v4f16(ptr %p, ptr %ps, <4 x half> %t) {
6363
;
6464
; CHECK-FP16-LABEL: fmul_v4f16:
6565
; CHECK-FP16: // %bb.0:
66-
; CHECK-FP16-NEXT: ld1r { v1.4h }, [x0], #2
67-
; CHECK-FP16-NEXT: fmul v0.4h, v1.4h, v0.4h
66+
; CHECK-FP16-NEXT: ldr h1, [x0], #2
67+
; CHECK-FP16-NEXT: fmul v0.4h, v0.4h, v1.h[0]
6868
; CHECK-FP16-NEXT: str d0, [x1]
6969
; CHECK-FP16-NEXT: ret
7070
%l = load half, ptr %p
@@ -93,8 +93,8 @@ define ptr @fmla_v4f16(ptr %p, ptr %ps, <4 x half> %t, <4 x half> %u) {
9393
;
9494
; CHECK-FP16-LABEL: fmla_v4f16:
9595
; CHECK-FP16: // %bb.0:
96-
; CHECK-FP16-NEXT: ld1r { v2.4h }, [x0], #2
97-
; CHECK-FP16-NEXT: fmla v1.4h, v0.4h, v2.4h
96+
; CHECK-FP16-NEXT: ldr h2, [x0], #2
97+
; CHECK-FP16-NEXT: fmla v1.4h, v0.4h, v2.h[0]
9898
; CHECK-FP16-NEXT: str d1, [x1]
9999
; CHECK-FP16-NEXT: ret
100100
%l = load half, ptr %p
@@ -110,8 +110,8 @@ define ptr @fmla_v4f16(ptr %p, ptr %ps, <4 x half> %t, <4 x half> %u) {
110110
define ptr @fmul_v4f32(ptr %p, ptr %ps, <4 x float> %t) {
111111
; CHECK-LABEL: fmul_v4f32:
112112
; CHECK: // %bb.0:
113-
; CHECK-NEXT: ld1r { v1.4s }, [x0], #4
114-
; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s
113+
; CHECK-NEXT: ldr s1, [x0], #4
114+
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.s[0]
115115
; CHECK-NEXT: str q0, [x1]
116116
; CHECK-NEXT: ret
117117
%l = load float, ptr %p
@@ -126,8 +126,8 @@ define ptr @fmul_v4f32(ptr %p, ptr %ps, <4 x float> %t) {
126126
define ptr @fmla_v4f32(ptr %p, ptr %ps, <4 x float> %t, <4 x float> %u) {
127127
; CHECK-LABEL: fmla_v4f32:
128128
; CHECK: // %bb.0:
129-
; CHECK-NEXT: ld1r { v2.4s }, [x0], #4
130-
; CHECK-NEXT: fmla v1.4s, v0.4s, v2.4s
129+
; CHECK-NEXT: ldr s2, [x0], #4
130+
; CHECK-NEXT: fmla v1.4s, v0.4s, v2.s[0]
131131
; CHECK-NEXT: str q1, [x1]
132132
; CHECK-NEXT: ret
133133
%l = load float, ptr %p
@@ -143,8 +143,8 @@ define ptr @fmla_v4f32(ptr %p, ptr %ps, <4 x float> %t, <4 x float> %u) {
143143
define ptr @fmul_v2f64(ptr %p, ptr %ps, <2 x double> %t) {
144144
; CHECK-LABEL: fmul_v2f64:
145145
; CHECK: // %bb.0:
146-
; CHECK-NEXT: ld1r { v1.2d }, [x0], #8
147-
; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d
146+
; CHECK-NEXT: ldr d1, [x0], #8
147+
; CHECK-NEXT: fmul v0.2d, v0.2d, v1.d[0]
148148
; CHECK-NEXT: str q0, [x1]
149149
; CHECK-NEXT: ret
150150
%l = load double, ptr %p
@@ -159,8 +159,8 @@ define ptr @fmul_v2f64(ptr %p, ptr %ps, <2 x double> %t) {
159159
define ptr @fmla_v2f64(ptr %p, ptr %ps, <2 x double> %t, <2 x double> %u) {
160160
; CHECK-LABEL: fmla_v2f64:
161161
; CHECK: // %bb.0:
162-
; CHECK-NEXT: ld1r { v2.2d }, [x0], #8
163-
; CHECK-NEXT: fmla v1.2d, v0.2d, v2.2d
162+
; CHECK-NEXT: ldr d2, [x0], #8
163+
; CHECK-NEXT: fmla v1.2d, v0.2d, v2.d[0]
164164
; CHECK-NEXT: str q1, [x1]
165165
; CHECK-NEXT: ret
166166
%l = load double, ptr %p

0 commit comments

Comments
 (0)