Skip to content

Commit c980a20

Browse files
authored
[AArch64][SVE] Enable max vector bandwidth for SVE (#109671)
shouldMaximizeVectorBandwidth now also returns true when the register kind is a scalable vector and SVE or streaming SVE is available.
1 parent cf046c8 commit c980a20

8 files changed

+213
-126
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -333,8 +333,10 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
333333
bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
334334
TargetTransformInfo::RegisterKind K) const {
335335
assert(K != TargetTransformInfo::RGK_Scalar);
336-
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
337-
ST->isNeonAvailable());
336+
return ((K == TargetTransformInfo::RGK_FixedWidthVector &&
337+
ST->isNeonAvailable()) ||
338+
(K == TargetTransformInfo::RGK_ScalableVector &&
339+
ST->isSVEorStreamingSVEAvailable()));
338340
}
339341

340342
/// Calculate the cost of materializing a 64-bit value. This helper

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 55 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -732,30 +732,60 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
732732
; DEFAULT-LABEL: define void @multiple_exit_conditions(
733733
; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
734734
; DEFAULT-NEXT: entry:
735-
; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
735+
; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
736+
; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32
737+
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP8]]
738+
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
736739
; DEFAULT: vector.ph:
737-
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
740+
; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
741+
; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
742+
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
743+
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
744+
; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC]], 8
745+
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
746+
; DEFAULT-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
747+
; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
748+
; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32
738749
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
739750
; DEFAULT: vector.body:
740751
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
741752
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
742753
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
743754
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
744755
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
745-
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
746-
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
747-
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
748-
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
756+
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP1]], i64 0
757+
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
758+
; DEFAULT-NEXT: [[TMP9:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
759+
; DEFAULT-NEXT: [[TMP10:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
760+
; DEFAULT-NEXT: [[TMP11:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
761+
; DEFAULT-NEXT: [[TMP12:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
762+
; DEFAULT-NEXT: [[TMP13:%.*]] = uitofp <vscale x 8 x i16> [[TMP9]] to <vscale x 8 x double>
763+
; DEFAULT-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP10]] to <vscale x 8 x double>
764+
; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp <vscale x 8 x i16> [[TMP11]] to <vscale x 8 x double>
765+
; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp <vscale x 8 x i16> [[TMP12]] to <vscale x 8 x double>
749766
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
750-
; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
751-
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
752-
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
753-
; DEFAULT-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
767+
; DEFAULT-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
768+
; DEFAULT-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
769+
; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP19]]
770+
; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
771+
; DEFAULT-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
772+
; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP22]]
773+
; DEFAULT-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
774+
; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24
775+
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP25]]
776+
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP13]], ptr [[TMP4]], align 8
777+
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP14]], ptr [[TMP20]], align 8
778+
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP15]], ptr [[TMP23]], align 8
779+
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP16]], ptr [[TMP26]], align 8
780+
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
781+
; DEFAULT-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
782+
; DEFAULT-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
754783
; DEFAULT: middle.block:
755-
; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
784+
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
785+
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
756786
; DEFAULT: scalar.ph:
757787
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ]
758-
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
788+
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
759789
; DEFAULT-NEXT: br label [[LOOP:%.*]]
760790
; DEFAULT: vector.scevcheck:
761791
; DEFAULT-NEXT: unreachable
@@ -780,7 +810,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
780810
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
781811
; PRED: vector.ph:
782812
; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
783-
; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
813+
; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
784814
; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
785815
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
786816
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
@@ -789,31 +819,31 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
789819
; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
790820
; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
791821
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
792-
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
822+
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
793823
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
794-
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
824+
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
795825
; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]]
796826
; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]]
797827
; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
798-
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257)
828+
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 257)
799829
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
800830
; PRED: vector.body:
801831
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
802-
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
832+
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
803833
; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
804834
; PRED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
805835
; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
806836
; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2
807-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP12]], i64 0
808-
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
809-
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i64 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer)
810-
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
837+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP12]], i64 0
838+
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
839+
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
840+
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP13]] to <vscale x 8 x double>
811841
; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
812-
; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
842+
; PRED-NEXT: call void @llvm.masked.store.nxv8f64.p0(<vscale x 8 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
813843
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
814-
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
815-
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
816-
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
844+
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]])
845+
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
846+
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 8 x i1> [[TMP16]], i32 0
817847
; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
818848
; PRED: middle.block:
819849
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]

llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,23 +1,23 @@
11
; REQUIRES: asserts
22
; RUN: opt -mtriple=aarch64 -mattr=+sve \
33
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
4-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
4+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
55

66
; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
77
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
8-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
8+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
99

1010
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
1111
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
12-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
12+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
1313

1414
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
1515
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
16-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
16+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
1717

1818
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
1919
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
20-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
20+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
2121

2222
; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
2323
; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
@@ -29,7 +29,7 @@
2929
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
3030

3131
; VF-4: <4 x i32>
32-
; VF-VSCALE4: <16 x i32>
32+
; VF-VSCALE16: <vscale x 16 x i32>
3333
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
3434
entry:
3535
br label %loop

llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@
88
; (maximized bandwidth for i8 in the loop).
99
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
1010
; CHECK: LV: Checking a loop in 'test0'
11-
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
12-
; CHECK_SCALABLE_ON: LV: Selecting VF: 16
11+
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16
12+
; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16
1313
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
1414
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
1515
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16

0 commit comments

Comments
 (0)