Commit 34cdd67

[VPlan] Use VPWidenIntrinsicRecipe to vp.select. (#110489)
Use VPWidenIntrinsicRecipe (#110486) to create vp.select intrinsics. This potentially offers an alternative to duplicating EVL recipes for all existing recipes. Some recipes will still need dedicated EVL variants (at least for now) because of extra code-gen requirements (e.g. widening loads and stores), but where the intrinsic can be used directly, creating the widened intrinsic avoids the need to duplicate those recipes.

PR: #110489
1 parent: dd47920
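
For illustration, a minimal IR-level sketch of the rewrite; value names such as %cmp, %x, %y and %evl are placeholders, not taken from the commit. A widened select becomes a call to llvm.vp.select with the same operands plus the explicit vector length appended as the final argument, mirroring the test update further down:

  ; Widened select without EVL folding:
  %r = select <vscale x 4 x i1> %cmp, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y
  ; EVL-folded form emitted via VPWidenIntrinsicRecipe; %evl is the explicit
  ; vector length and is the trailing operand of the VP intrinsic:
  %r = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> %cmp, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y, i32 %evl)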

File tree: 7 files changed (+101, -2 lines)

llvm/lib/Transforms/Vectorize/VPlan.h (+12)

@@ -1669,6 +1669,16 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
         MayWriteToMemory(CI.mayWriteToMemory()),
         MayHaveSideEffects(CI.mayHaveSideEffects()) {}

+  VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
+                         ArrayRef<VPValue *> CallArguments, Type *Ty,
+                         bool MayReadFromMemory, bool MayWriteToMemory,
+                         bool MayHaveSideEffects, DebugLoc DL = {})
+      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+        VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
+        MayReadFromMemory(MayReadFromMemory),
+        MayWriteToMemory(MayWriteToMemory),
+        MayHaveSideEffects(MayHaveSideEffects) {}
+
   ~VPWidenIntrinsicRecipe() override = default;

   VPWidenIntrinsicRecipe *clone() override {
@@ -1706,6 +1716,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override;
 };

 /// A recipe for widening Call instructions using library calls.

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+2)

@@ -61,6 +61,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case Instruction::ICmp:
   case VPInstruction::ActiveLaneMask:
     return inferScalarType(R->getOperand(1));
+  case VPInstruction::ExplicitVectorLength:
+    return Type::getIntNTy(Ctx, 32);
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::Not:
     return SetResultTyFromOp();

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+8, -1)

@@ -79,7 +79,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return !cast<VPWidenCallRecipe>(this)
                 ->getCalledScalarFunction()
                 ->onlyReadsMemory();
-  case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
@@ -1042,6 +1041,14 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
 }

+bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+  // Vector predication intrinsics only demand the first lane of the last
+  // operand (the EVL operand).
+  return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
+         Op == getOperand(getNumOperands() - 1);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+9)

@@ -1353,6 +1353,7 @@ void VPlanTransforms::addActiveLaneMask(
 /// Replace recipes with their EVL variants.
 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
       auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
@@ -1384,6 +1385,14 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                 VPValue *NewMask = GetNewMask(Red->getCondOp());
                 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
               })
+              .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
+                SmallVector<VPValue *> Ops(Sel->operands());
+                Ops.push_back(&EVL);
+                return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
+                                                  TypeInfo.inferScalarType(Sel),
+                                                  false, false, false);
+              })
+
               .Default([&](VPRecipeBase *R) { return nullptr; });

       if (!NewRecipe)

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (+4)

@@ -138,6 +138,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
   };
   for (const VPUser *U : EVL.users()) {
     if (!TypeSwitch<const VPUser *, bool>(U)
+             .Case<VPWidenIntrinsicRecipe>(
+                 [&](const VPWidenIntrinsicRecipe *S) {
+                   return VerifyEVLUse(*S, S->getNumOperands() - 1);
+                 })
             .Case<VPWidenStoreEVLRecipe>([&](const VPWidenStoreEVLRecipe *S) {
               return VerifyEVLUse(*S, 2);
            })
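
The verifier case above relies on the same convention as onlyFirstLaneUsed in VPlanRecipes.cpp: VP intrinsics carry the explicit vector length as their last operand. For reference, a couple of call shapes in that form, written with placeholder value names (%p, %mask, %c, %a, %b, %evl) rather than names from the tests:

  ; The EVL is always the trailing i32 argument.
  %v = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %p, <vscale x 4 x i1> %mask, i32 %evl)
  %s = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> %c, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl)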

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll (+1, -1)

@@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
 ; IF-EVL-INLOOP-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
+; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
New test file (+65)

@@ -0,0 +1,65 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT:   vector.body:
+; IF-EVL-NEXT:     EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT:     EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT:     vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT:     CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
+; IF-EVL-NEXT:     WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>)
+; IF-EVL-NEXT:     WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT:   No successors
+; IF-EVL-NEXT: }
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx3, align 4
+  %cmp4 = icmp sgt i32 %0, %1
+  %2 = sub i32 0, %1
+  %cond.p = select i1 %cmp4, i32 %1, i32 %2
+  %cond = add i32 %cond.p, %0
+  %arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %cond, ptr %arrayidx15, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
