[VPlan] Use VPWidenIntrinsicRecipe to vp.select. #110489

Merged (4 commits, Oct 15, 2024)
12 changes: 12 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1649,6 +1649,16 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
MayWriteToMemory(CI.mayWriteToMemory()),
MayHaveSideEffects(CI.mayHaveSideEffects()) {}

VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
bool MayReadFromMemory, bool MayWriteToMemory,
bool MayHaveSideEffects, DebugLoc DL = {})
: VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
MayReadFromMemory(MayReadFromMemory),
MayWriteToMemory(MayWriteToMemory),
MayHaveSideEffects(MayHaveSideEffects) {}

~VPWidenIntrinsicRecipe() override = default;

VPWidenIntrinsicRecipe *clone() override {
@@ -1686,6 +1696,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

bool onlyFirstLaneUsed(const VPValue *Op) const override;
};

/// A recipe for widening Call instructions using library calls.
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -61,6 +61,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case Instruction::ICmp:
case VPInstruction::ActiveLaneMask:
return inferScalarType(R->getOperand(1));
case VPInstruction::ExplicitVectorLength:
return Type::getIntNTy(Ctx, 32);
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::Not:
return SetResultTyFromOp();
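For context on the new ExplicitVectorLength case: the EVL computed each iteration ultimately comes from llvm.experimental.get.vector.length, whose result type is always i32, so that is the scalar type the analysis reports. A minimal IR sketch, with %avl and the VF immediates chosen purely for illustration:

    ; %avl is the remaining trip count; the immediates encode VF = 4, scalable.
    ; The intrinsic returns an i32, matching the type inferred above.
    %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)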
9 changes: 8 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -79,7 +79,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
return !cast<VPWidenCallRecipe>(this)
->getCalledScalarFunction()
->onlyReadsMemory();
case VPWidenIntrinsicSC:
return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
case VPBranchOnMaskSC:
case VPScalarIVStepsSC:
@@ -1042,6 +1041,14 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}

bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
// Vector predication intrinsics only demand the first lane of the last
// operand (the EVL operand).
return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
Op == getOperand(getNumOperands() - 1);
Comment on lines +1048 to +1049

Contributor:

I ran into an implementation divergence when rebasing #101641:

  return all_of(enumerate(operands()), [this, Op](const auto &I) {
    return Op != I.value() || 
           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()) ||
           (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) && 
            I.index() == getNumOperands() - 1);
  });

Maybe we need to consider isVectorIntrinsicWithScalarOpAtArg?
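For reference, the kind of operand isVectorIntrinsicWithScalarOpAtArg identifies, taking llvm.powi's exponent as an assumed example: the argument stays scalar when the call is widened, so it too would only have its first lane used.

    ; The exponent operand is not widened; only its first lane is meaningful.
    %p = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %x, i32 %e)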

Contributor Author:

I don't think I see those changes in the current version of the PR? It would probably be good to improve this (I assume a scalar arg here means we only request the first lane). If there are unrelated tests that improve, it would be good to prepare that as a separate patch.

Contributor:

I dropped the related changes while resolving conflicts, since the patch still functions without considering isVectorIntrinsicWithScalarOpAtArg. So far, I haven't encountered any related cases.

}
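For intuition about onlyFirstLaneUsed: in a VP intrinsic call, the data operands and the mask are per-lane vectors, while the trailing EVL operand is a single scalar i32, so only its first lane can ever be demanded. A sketch using llvm.vp.add as the example:

    ; %x, %y and the mask %m are per-lane operands; the trailing %evl is
    ; scalar, so the recipe only needs the first lane of that operand.
    %r = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y, <vscale x 4 x i1> %m, i32 %evl)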

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
9 changes: 9 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1353,6 +1353,7 @@ void VPlanTransforms::addActiveLaneMask(
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
@@ -1384,6 +1385,14 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
})
.Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
SmallVector<VPValue *> Ops(Sel->operands());
Ops.push_back(&EVL);
return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
TypeInfo.inferScalarType(Sel),
false, false, false);
})

.Default([&](VPRecipeBase *R) { return nullptr; });

if (!NewRecipe)
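The effect of the new VPWidenSelectRecipe case, sketched in IR with types borrowed from the cond_add test below: the widened select keeps its condition and both arms, and the EVL is appended as the trailing operand.

    ; Before: a plain widened select with no notion of the active length.
    %old = select <vscale x 4 x i1> %cmp, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b
    ; After: the EVL-aware form; result lanes at positions >= %evl are poison.
    %new = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> %cmp, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl)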
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -138,6 +138,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
};
for (const VPUser *U : EVL.users()) {
if (!TypeSwitch<const VPUser *, bool>(U)
.Case<VPWidenIntrinsicRecipe>(
[&](const VPWidenIntrinsicRecipe *S) {
return VerifyEVLUse(*S, S->getNumOperands() - 1);
})
.Case<VPWidenStoreEVLRecipe>([&](const VPWidenStoreEVLRecipe *S) {
return VerifyEVLUse(*S, 2);
})
@@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
@@ -0,0 +1,65 @@
; REQUIRES: asserts

; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
; RUN: -force-tail-folding-style=data-with-evl \
; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s

define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
; IF-EVL-NEXT: Live-in ir<%N> = original trip-count

; IF-EVL: vector.ph:
; IF-EVL-NEXT: Successor(s): vector loop

; IF-EVL: <x1> vector loop: {
; IF-EVL-NEXT: vector.body:
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>)
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
; IF-EVL-NEXT: No successors
; IF-EVL-NEXT: }

entry:
br label %for.body

for.body:
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
%1 = load i32, ptr %arrayidx3, align 4
%cmp4 = icmp sgt i32 %0, %1
%2 = sub i32 0, %1
%cond.p = select i1 %cmp4, i32 %1, i32 %2
%cond = add i32 %cond.p, %0
%arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
store i32 %cond, ptr %arrayidx15, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %N
br i1 %exitcond.not, label %exit, label %for.body

exit:
ret void
}