Skip to content

Commit 34f9ddf

Browse files
authored
[AArch64][SVE] Fold ADD+CNTB to INCB/DECB (#118280)
Currently, given:

```cpp
uint64_t incb(uint64_t x) {
  return x+svcntb();
}
```

LLVM generates:

```gas
incb:
  addvl x0, x0, #1
  ret
```

Which is equivalent to:

```gas
incb:
  incb x0
  ret
```

However, on microarchitectures like the Neoverse V2 and Neoverse V3, the second form (with INCB) can have significantly better latency and throughput (according to their SWOG). On the Neoverse V2, for example, ADDVL has a latency and throughput of 2, whereas some forms of INCB have a latency of 1 and a throughput of 4. The same applies to DECB.

This patch adds patterns to prefer the cheaper INCB/DECB forms over ADDVL where applicable.
1 parent 62d32c2 commit 34f9ddf

File tree

9 files changed

+203
-42
lines changed

9 files changed

+203
-42
lines changed

llvm/lib/Target/AArch64/AArch64Features.td

+5
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,11 @@ def FeatureUseFixedOverScalableIfEqualCost : SubtargetFeature<"use-fixed-over-sc
820820
def FeatureAvoidLDAPUR : SubtargetFeature<"avoid-ldapur", "AvoidLDAPUR", "true",
821821
"Prefer add+ldapr to offset ldapur">;
822822

823+
// Some INC/DEC forms have better latency and throughput than ADDVL.
824+
def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
825+
"HasDisableFastIncVL", "true",
826+
"Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
827+
823828
//===----------------------------------------------------------------------===//
824829
// Architectures.
825830
//

llvm/lib/Target/AArch64/AArch64InstrInfo.td

+2
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,8 @@ def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;
389389

390390
def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
391391

392+
def HasFastIncVL : Predicate<"!Subtarget->hasDisableFastIncVL()">;
393+
392394
def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
393395

394396
def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">;

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

+23
Original file line numberDiff line numberDiff line change
@@ -2677,6 +2677,29 @@ let Predicates = [HasSVE_or_SME] in {
26772677
(DECD_ZPiI ZPR:$op, 31, $imm)>;
26782678
}
26792679

2680+
// Some INCB/DECB forms have better latency and throughput than ADDVL, so we
2681+
// prefer using them here.
2682+
// We could extend this to other INC/DEC (scalar) instructions.
2683+
let Predicates = [HasSVE_or_SME, UseScalarIncVL, HasFastIncVL], AddedComplexity = 6 in {
2684+
foreach imm = [ 1, 2, 4 ] in {
2685+
def : Pat<(add GPR64:$op, (vscale !mul(imm, 16))),
2686+
(INCB_XPiI GPR64:$op, 31, imm)>;
2687+
2688+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale !mul(imm, 16))))),
2689+
(EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
2690+
GPR32:$op, sub_32), 31, imm),
2691+
sub_32)>;
2692+
2693+
def : Pat<(add GPR64:$op, (vscale !mul(imm, -16))),
2694+
(DECB_XPiI GPR64:$op, 31, imm)>;
2695+
2696+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale !mul(imm, -16))))),
2697+
(EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
2698+
GPR32:$op, sub_32), 31, imm),
2699+
sub_32)>;
2700+
}
2701+
}
2702+
26802703
let Predicates = [HasSVE_or_SME, UseScalarIncVL], AddedComplexity = 5 in {
26812704
def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
26822705
(ADDVL_XXI GPR64:$op, $imm)>;

llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll

+4-2
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ define void @quux() #1 {
6565
; CHECK-NEXT: mov sp, x9
6666
; CHECK-NEXT: sub x10, x29, #104
6767
; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill
68-
; CHECK-NEXT: addvl x9, x8, #1
68+
; CHECK-NEXT: mov x9, x8
69+
; CHECK-NEXT: incb x9
6970
; CHECK-NEXT: mov w0, w9
7071
; CHECK-NEXT: // implicit-def: $x9
7172
; CHECK-NEXT: mov w9, w0
@@ -160,7 +161,8 @@ define void @quux() #1 {
160161
; CHECK-NEXT: mov x9, sp
161162
; CHECK-NEXT: subs x9, x9, #16
162163
; CHECK-NEXT: mov sp, x9
163-
; CHECK-NEXT: addvl x9, x8, #2
164+
; CHECK-NEXT: mov x9, x8
165+
; CHECK-NEXT: incb x9, all, mul #2
164166
; CHECK-NEXT: mov w0, w9
165167
; CHECK-NEXT: // implicit-def: $x9
166168
; CHECK-NEXT: mov w9, w0

llvm/test/CodeGen/AArch64/sve-lsrchain.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
8585
; CHECK-NEXT: ldr z5, [x4, #3, mul vl]
8686
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
8787
; CHECK-NEXT: str z4, [x16, #3, mul vl]
88-
; CHECK-NEXT: addvl x16, x16, #4
88+
; CHECK-NEXT: incb x16, all, mul #4
8989
; CHECK-NEXT: cmp x16, x11
9090
; CHECK-NEXT: b.lo .LBB0_4
9191
; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us

llvm/test/CodeGen/AArch64/sve-vl-arith.ll

+131-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
22
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_SCALAR_INC
33
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -sve-use-scalar-inc-vl=true -verify-machineinstrs < %s | FileCheck %s
4+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,disable-fast-inc-vl -sve-use-scalar-inc-vl=true -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_FAST_INC
45
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s
56
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -sve-use-scalar-inc-vl=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_SCALAR_INC
7+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2,disable-fast-inc-vl -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_FAST_INC
68

79
define <vscale x 8 x i16> @inch_vec(<vscale x 8 x i16> %a) {
810
; NO_SCALAR_INC-LABEL: inch_vec:
@@ -14,6 +16,11 @@ define <vscale x 8 x i16> @inch_vec(<vscale x 8 x i16> %a) {
1416
; CHECK: // %bb.0:
1517
; CHECK-NEXT: inch z0.h
1618
; CHECK-NEXT: ret
19+
;
20+
; NO_FAST_INC-LABEL: inch_vec:
21+
; NO_FAST_INC: // %bb.0:
22+
; NO_FAST_INC-NEXT: inch z0.h
23+
; NO_FAST_INC-NEXT: ret
1724
%vscale = call i16 @llvm.vscale.i16()
1825
%mul = mul i16 %vscale, 8
1926
%vl = insertelement <vscale x 8 x i16> poison, i16 %mul, i32 0
@@ -32,6 +39,11 @@ define <vscale x 4 x i32> @incw_vec(<vscale x 4 x i32> %a) {
3239
; CHECK: // %bb.0:
3340
; CHECK-NEXT: incw z0.s
3441
; CHECK-NEXT: ret
42+
;
43+
; NO_FAST_INC-LABEL: incw_vec:
44+
; NO_FAST_INC: // %bb.0:
45+
; NO_FAST_INC-NEXT: incw z0.s
46+
; NO_FAST_INC-NEXT: ret
3547
%vscale = call i32 @llvm.vscale.i32()
3648
%mul = mul i32 %vscale, 4
3749
%vl = insertelement <vscale x 4 x i32> poison, i32 %mul, i32 0
@@ -50,6 +62,11 @@ define <vscale x 2 x i64> @incd_vec(<vscale x 2 x i64> %a) {
5062
; CHECK: // %bb.0:
5163
; CHECK-NEXT: incd z0.d
5264
; CHECK-NEXT: ret
65+
;
66+
; NO_FAST_INC-LABEL: incd_vec:
67+
; NO_FAST_INC: // %bb.0:
68+
; NO_FAST_INC-NEXT: incd z0.d
69+
; NO_FAST_INC-NEXT: ret
5370
%vscale = call i64 @llvm.vscale.i64()
5471
%mul = mul i64 %vscale, 2
5572
%vl = insertelement <vscale x 2 x i64> poison, i64 %mul, i32 0
@@ -68,6 +85,11 @@ define <vscale x 8 x i16> @dech_vec(<vscale x 8 x i16> %a) {
6885
; CHECK: // %bb.0:
6986
; CHECK-NEXT: dech z0.h, all, mul #2
7087
; CHECK-NEXT: ret
88+
;
89+
; NO_FAST_INC-LABEL: dech_vec:
90+
; NO_FAST_INC: // %bb.0:
91+
; NO_FAST_INC-NEXT: dech z0.h, all, mul #2
92+
; NO_FAST_INC-NEXT: ret
7193
%vscale = call i16 @llvm.vscale.i16()
7294
%mul = mul i16 %vscale, 16
7395
%vl = insertelement <vscale x 8 x i16> poison, i16 %mul, i32 0
@@ -86,6 +108,11 @@ define <vscale x 4 x i32> @decw_vec(<vscale x 4 x i32> %a) {
86108
; CHECK: // %bb.0:
87109
; CHECK-NEXT: decw z0.s, all, mul #4
88110
; CHECK-NEXT: ret
111+
;
112+
; NO_FAST_INC-LABEL: decw_vec:
113+
; NO_FAST_INC: // %bb.0:
114+
; NO_FAST_INC-NEXT: decw z0.s, all, mul #4
115+
; NO_FAST_INC-NEXT: ret
89116
%vscale = call i32 @llvm.vscale.i32()
90117
%mul = mul i32 %vscale, 16
91118
%vl = insertelement <vscale x 4 x i32> poison, i32 %mul, i32 0
@@ -104,6 +131,11 @@ define <vscale x 2 x i64> @decd_vec(<vscale x 2 x i64> %a) {
104131
; CHECK: // %bb.0:
105132
; CHECK-NEXT: decd z0.d, all, mul #8
106133
; CHECK-NEXT: ret
134+
;
135+
; NO_FAST_INC-LABEL: decd_vec:
136+
; NO_FAST_INC: // %bb.0:
137+
; NO_FAST_INC-NEXT: decd z0.d, all, mul #8
138+
; NO_FAST_INC-NEXT: ret
107139
%vscale = call i64 @llvm.vscale.i64()
108140
%mul = mul i64 %vscale, 16
109141
%vl = insertelement <vscale x 2 x i64> poison, i64 %mul, i32 0
@@ -123,8 +155,13 @@ define i64 @incb_scalar_i64(i64 %a) {
123155
;
124156
; CHECK-LABEL: incb_scalar_i64:
125157
; CHECK: // %bb.0:
126-
; CHECK-NEXT: addvl x0, x0, #1
158+
; CHECK-NEXT: incb x0
127159
; CHECK-NEXT: ret
160+
;
161+
; NO_FAST_INC-LABEL: incb_scalar_i64:
162+
; NO_FAST_INC: // %bb.0:
163+
; NO_FAST_INC-NEXT: addvl x0, x0, #1
164+
; NO_FAST_INC-NEXT: ret
128165
%vscale = call i64 @llvm.vscale.i64()
129166
%mul = mul i64 %vscale, 16
130167
%add = add i64 %a, %mul
@@ -142,6 +179,11 @@ define i64 @inch_scalar_i64(i64 %a) {
142179
; CHECK: // %bb.0:
143180
; CHECK-NEXT: inch x0
144181
; CHECK-NEXT: ret
182+
;
183+
; NO_FAST_INC-LABEL: inch_scalar_i64:
184+
; NO_FAST_INC: // %bb.0:
185+
; NO_FAST_INC-NEXT: inch x0
186+
; NO_FAST_INC-NEXT: ret
145187
%vscale = call i64 @llvm.vscale.i64()
146188
%mul = mul i64 %vscale, 8
147189
%add = add i64 %a, %mul
@@ -159,6 +201,11 @@ define i64 @incw_scalar_i64(i64 %a) {
159201
; CHECK: // %bb.0:
160202
; CHECK-NEXT: incw x0
161203
; CHECK-NEXT: ret
204+
;
205+
; NO_FAST_INC-LABEL: incw_scalar_i64:
206+
; NO_FAST_INC: // %bb.0:
207+
; NO_FAST_INC-NEXT: incw x0
208+
; NO_FAST_INC-NEXT: ret
162209
%vscale = call i64 @llvm.vscale.i64()
163210
%mul = mul i64 %vscale, 4
164211
%add = add i64 %a, %mul
@@ -176,6 +223,11 @@ define i64 @incd_scalar_i64(i64 %a) {
176223
; CHECK: // %bb.0:
177224
; CHECK-NEXT: incd x0
178225
; CHECK-NEXT: ret
226+
;
227+
; NO_FAST_INC-LABEL: incd_scalar_i64:
228+
; NO_FAST_INC: // %bb.0:
229+
; NO_FAST_INC-NEXT: incd x0
230+
; NO_FAST_INC-NEXT: ret
179231
%vscale = call i64 @llvm.vscale.i64()
180232
%mul = mul i64 %vscale, 2
181233
%add = add i64 %a, %mul
@@ -193,8 +245,13 @@ define i64 @decb_scalar_i64(i64 %a) {
193245
;
194246
; CHECK-LABEL: decb_scalar_i64:
195247
; CHECK: // %bb.0:
196-
; CHECK-NEXT: addvl x0, x0, #-2
248+
; CHECK-NEXT: decb x0, all, mul #2
197249
; CHECK-NEXT: ret
250+
;
251+
; NO_FAST_INC-LABEL: decb_scalar_i64:
252+
; NO_FAST_INC: // %bb.0:
253+
; NO_FAST_INC-NEXT: addvl x0, x0, #-2
254+
; NO_FAST_INC-NEXT: ret
198255
%vscale = call i64 @llvm.vscale.i64()
199256
%mul = mul i64 %vscale, 32
200257
%sub = sub i64 %a, %mul
@@ -212,6 +269,11 @@ define i64 @dech_scalar_i64(i64 %a) {
212269
; CHECK: // %bb.0:
213270
; CHECK-NEXT: dech x0, all, mul #3
214271
; CHECK-NEXT: ret
272+
;
273+
; NO_FAST_INC-LABEL: dech_scalar_i64:
274+
; NO_FAST_INC: // %bb.0:
275+
; NO_FAST_INC-NEXT: dech x0, all, mul #3
276+
; NO_FAST_INC-NEXT: ret
215277
%vscale = call i64 @llvm.vscale.i64()
216278
%mul = mul i64 %vscale, 24
217279
%sub = sub i64 %a, %mul
@@ -229,6 +291,11 @@ define i64 @decw_scalar_i64(i64 %a) {
229291
; CHECK: // %bb.0:
230292
; CHECK-NEXT: decw x0, all, mul #3
231293
; CHECK-NEXT: ret
294+
;
295+
; NO_FAST_INC-LABEL: decw_scalar_i64:
296+
; NO_FAST_INC: // %bb.0:
297+
; NO_FAST_INC-NEXT: decw x0, all, mul #3
298+
; NO_FAST_INC-NEXT: ret
232299
%vscale = call i64 @llvm.vscale.i64()
233300
%mul = mul i64 %vscale, 12
234301
%sub = sub i64 %a, %mul
@@ -246,6 +313,11 @@ define i64 @decd_scalar_i64(i64 %a) {
246313
; CHECK: // %bb.0:
247314
; CHECK-NEXT: decd x0, all, mul #3
248315
; CHECK-NEXT: ret
316+
;
317+
; NO_FAST_INC-LABEL: decd_scalar_i64:
318+
; NO_FAST_INC: // %bb.0:
319+
; NO_FAST_INC-NEXT: decd x0, all, mul #3
320+
; NO_FAST_INC-NEXT: ret
249321
%vscale = call i64 @llvm.vscale.i64()
250322
%mul = mul i64 %vscale, 6
251323
%sub = sub i64 %a, %mul
@@ -267,6 +339,13 @@ define i32 @incb_scalar_i32(i32 %a) {
267339
; CHECK-NEXT: addvl x0, x0, #3
268340
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
269341
; CHECK-NEXT: ret
342+
;
343+
; NO_FAST_INC-LABEL: incb_scalar_i32:
344+
; NO_FAST_INC: // %bb.0:
345+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
346+
; NO_FAST_INC-NEXT: addvl x0, x0, #3
347+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
348+
; NO_FAST_INC-NEXT: ret
270349

271350
%vscale = call i64 @llvm.vscale.i64()
272351
%mul = mul i64 %vscale, 48
@@ -288,6 +367,13 @@ define i32 @inch_scalar_i32(i32 %a) {
288367
; CHECK-NEXT: inch x0, all, mul #7
289368
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
290369
; CHECK-NEXT: ret
370+
;
371+
; NO_FAST_INC-LABEL: inch_scalar_i32:
372+
; NO_FAST_INC: // %bb.0:
373+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
374+
; NO_FAST_INC-NEXT: inch x0, all, mul #7
375+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
376+
; NO_FAST_INC-NEXT: ret
291377

292378
%vscale = call i64 @llvm.vscale.i64()
293379
%mul = mul i64 %vscale, 56
@@ -309,6 +395,13 @@ define i32 @incw_scalar_i32(i32 %a) {
309395
; CHECK-NEXT: incw x0, all, mul #7
310396
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
311397
; CHECK-NEXT: ret
398+
;
399+
; NO_FAST_INC-LABEL: incw_scalar_i32:
400+
; NO_FAST_INC: // %bb.0:
401+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
402+
; NO_FAST_INC-NEXT: incw x0, all, mul #7
403+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
404+
; NO_FAST_INC-NEXT: ret
312405

313406
%vscale = call i64 @llvm.vscale.i64()
314407
%mul = mul i64 %vscale, 28
@@ -330,6 +423,13 @@ define i32 @incd_scalar_i32(i32 %a) {
330423
; CHECK-NEXT: incd x0, all, mul #7
331424
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
332425
; CHECK-NEXT: ret
426+
;
427+
; NO_FAST_INC-LABEL: incd_scalar_i32:
428+
; NO_FAST_INC: // %bb.0:
429+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
430+
; NO_FAST_INC-NEXT: incd x0, all, mul #7
431+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
432+
; NO_FAST_INC-NEXT: ret
333433

334434
%vscale = call i64 @llvm.vscale.i64()
335435
%mul = mul i64 %vscale, 14
@@ -350,9 +450,16 @@ define i32 @decb_scalar_i32(i32 %a) {
350450
; CHECK-LABEL: decb_scalar_i32:
351451
; CHECK: // %bb.0:
352452
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
353-
; CHECK-NEXT: addvl x0, x0, #-4
453+
; CHECK-NEXT: decb x0, all, mul #4
354454
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
355455
; CHECK-NEXT: ret
456+
;
457+
; NO_FAST_INC-LABEL: decb_scalar_i32:
458+
; NO_FAST_INC: // %bb.0:
459+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
460+
; NO_FAST_INC-NEXT: addvl x0, x0, #-4
461+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
462+
; NO_FAST_INC-NEXT: ret
356463

357464
%vscale = call i64 @llvm.vscale.i64()
358465
%mul = mul i64 %vscale, 64
@@ -374,6 +481,13 @@ define i32 @dech_scalar_i32(i32 %a) {
374481
; CHECK-NEXT: dech x0
375482
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
376483
; CHECK-NEXT: ret
484+
;
485+
; NO_FAST_INC-LABEL: dech_scalar_i32:
486+
; NO_FAST_INC: // %bb.0:
487+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
488+
; NO_FAST_INC-NEXT: dech x0
489+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
490+
; NO_FAST_INC-NEXT: ret
377491

378492
%vscale = call i64 @llvm.vscale.i64()
379493
%mul = mul i64 %vscale, 8
@@ -395,6 +509,13 @@ define i32 @decw_scalar_i32(i32 %a) {
395509
; CHECK-NEXT: decw x0
396510
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
397511
; CHECK-NEXT: ret
512+
;
513+
; NO_FAST_INC-LABEL: decw_scalar_i32:
514+
; NO_FAST_INC: // %bb.0:
515+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
516+
; NO_FAST_INC-NEXT: decw x0
517+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
518+
; NO_FAST_INC-NEXT: ret
398519

399520
%vscale = call i64 @llvm.vscale.i64()
400521
%mul = mul i64 %vscale, 4
@@ -416,6 +537,13 @@ define i32 @decd_scalar_i32(i32 %a) {
416537
; CHECK-NEXT: decd x0
417538
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
418539
; CHECK-NEXT: ret
540+
;
541+
; NO_FAST_INC-LABEL: decd_scalar_i32:
542+
; NO_FAST_INC: // %bb.0:
543+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0
544+
; NO_FAST_INC-NEXT: decd x0
545+
; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0
546+
; NO_FAST_INC-NEXT: ret
419547
%vscale = call i64 @llvm.vscale.i64()
420548
%mul = mul i64 %vscale, 2
421549
%vl = trunc i64 %mul to i32

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ define <vscale x 4 x i32> @test_svld1uwq_i32_si(<vscale x 1 x i1> %pred, ptr %ba
3333
define <vscale x 4 x i32> @test_svld1uwq_i32_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
3434
; CHECK-LABEL: test_svld1uwq_i32_out_of_bound:
3535
; CHECK: // %bb.0:
36-
; CHECK-NEXT: addvl x8, x0, #2
37-
; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8]
36+
; CHECK-NEXT: incb x0, all, mul #2
37+
; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0]
3838
; CHECK-NEXT: ret
3939
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
4040
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep)

0 commit comments

Comments
 (0)