Skip to content

Commit dcef154

Browse files
authored
[VPlan] Replace VPRegionBlock with explicit CFG before execute (NFCI). (#117506)
Building on top of #114305, replace VPRegionBlocks with explicit CFG before executing. This brings the final VPlan closer to the IR that is generated and helps to simplify codegen. It will also enable further simplifications of phi handling during execution, as well as transformations that do not have to preserve the canonical IV required by loop regions. For example, this could include replacing the canonical IV with an EVL-based phi while completely removing the original canonical IV. PR: #117506
1 parent adaf170 commit dcef154

16 files changed

+299
-246
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2781,13 +2781,13 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
27812781
PSE.getSE()->forgetLoop(OrigLoop);
27822782
PSE.getSE()->forgetBlockAndLoopDispositions();
27832783

2784-
// Don't apply optimizations below when no vector region remains, as they all
2785-
// require a vector loop at the moment.
2786-
if (!State.Plan->getVectorLoopRegion())
2784+
// Don't apply optimizations below when no (vector) loop remains, as they all
2785+
// require one at the moment.
2786+
VPBasicBlock *HeaderVPBB =
2787+
vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2788+
if (!HeaderVPBB)
27872789
return;
27882790

2789-
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2790-
VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
27912791
BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
27922792

27932793
// Remove redundant induction instructions.
@@ -2812,7 +2812,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
28122812
}
28132813

28142814
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
2815-
auto Iter = vp_depth_first_deep(Plan.getEntry());
2815+
auto Iter = vp_depth_first_shallow(Plan.getEntry());
28162816
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
28172817
for (VPRecipeBase &P : VPBB->phis()) {
28182818
VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
@@ -7623,6 +7623,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
76237623
BestVPlan, BestVF,
76247624
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
76257625
VPlanTransforms::removeDeadRecipes(BestVPlan);
7626+
7627+
// Retrieve and store the middle block before dissolving regions. Regions are
7628+
// dissolved after optimizing for VF and UF, which completely removes unneeded
7629+
// loop regions first.
7630+
VPBasicBlock *MiddleVPBB =
7631+
BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
7632+
VPlanTransforms::dissolveLoopRegions(BestVPlan);
76267633
VPlanTransforms::convertToConcreteRecipes(BestVPlan,
76277634
*Legal->getWidestInductionType());
76287635
// Perform the actual loop transformation.
@@ -7720,14 +7727,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77207727
// 2.6. Maintain Loop Hints
77217728
// Keep all loop hints from the original loop on the vector loop (we'll
77227729
// replace the vectorizer-specific hints below).
7723-
if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7730+
VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
7731+
if (HeaderVPBB) {
77247732
MDNode *OrigLoopID = OrigLoop->getLoopID();
77257733

77267734
std::optional<MDNode *> VectorizedLoopID =
77277735
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
77287736
LLVMLoopVectorizeFollowupVectorized});
77297737

7730-
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
77317738
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
77327739
if (VectorizedLoopID) {
77337740
L->setLoopID(*VectorizedLoopID);
@@ -7773,8 +7780,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77737780
ILV.printDebugTracesAtEnd();
77747781

77757782
// 4. Adjust branch weight of the branch in the middle block.
7776-
if (BestVPlan.getVectorLoopRegion()) {
7777-
auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7783+
if (HeaderVPBB) {
77787784
auto *MiddleTerm =
77797785
cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
77807786
if (MiddleTerm->isConditional() &&

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 84 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,32 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
207207
return Parent->getEnclosingBlockWithPredecessors();
208208
}
209209

210+
bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
211+
const VPDominatorTree &VPDT) {
212+
auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
213+
if (!VPBB)
214+
return false;
215+
216+
// If VPBB is in a region R, VPBB is a loop header if R is a loop region with
217+
// VPBB as its entry, i.e., free of predecessors.
218+
if (auto *R = VPBB->getParent())
219+
return !R->isReplicator() && VPBB->getNumPredecessors() == 0;
220+
221+
// A header dominates its second predecessor (the latch), with the other
222+
// predecessor being the preheader.
223+
return VPB->getPredecessors().size() == 2 &&
224+
VPDT.dominates(VPB, VPB->getPredecessors()[1]);
225+
}
226+
227+
bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
228+
const VPDominatorTree &VPDT) {
229+
// A latch has a header as its second successor, with its other successor
230+
// leaving the loop. A preheader OTOH has a header as its first (and only)
231+
// successor.
232+
return VPB->getNumSuccessors() == 2 &&
233+
VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
234+
}
235+
210236
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
211237
iterator It = begin();
212238
while (It != end() && It->isPhi())
@@ -424,13 +450,21 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
424450
if (ParentLoop && !State.LI->getLoopFor(NewBB))
425451
ParentLoop->addBasicBlockToLoop(NewBB, *State.LI);
426452

453+
SmallVector<VPBlockBase *> Preds;
454+
if (VPBlockUtils::isHeader(this, State.VPDT)) {
455+
// There's no block for the latch yet, connect to the preheader only.
456+
Preds = {getPredecessors()[0]};
457+
} else {
458+
Preds = to_vector(getPredecessors());
459+
}
460+
427461
// Hook up the new basic block to its predecessors.
428-
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
462+
for (VPBlockBase *PredVPBlock : Preds) {
429463
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
430464
auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
465+
assert(CFG.VPBB2IRBB.contains(PredVPBB) &&
466+
"Predecessor basic-block not found building successor.");
431467
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
432-
433-
assert(PredBB && "Predecessor basic-block not found building successor.");
434468
auto *PredBBTerminator = PredBB->getTerminator();
435469
LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
436470

@@ -491,11 +525,25 @@ void VPBasicBlock::execute(VPTransformState *State) {
491525
bool Replica = bool(State->Lane);
492526
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
493527

528+
if (VPBlockUtils::isHeader(this, State->VPDT)) {
529+
// Create and register the new vector loop.
530+
Loop *PrevParentLoop = State->CurrentParentLoop;
531+
State->CurrentParentLoop = State->LI->AllocateLoop();
532+
533+
// Insert the new loop into the loop nest and register the new basic blocks
534+
// before calling any utilities such as SCEV that require valid LoopInfo.
535+
if (PrevParentLoop)
536+
PrevParentLoop->addChildLoop(State->CurrentParentLoop);
537+
else
538+
State->LI->addTopLevelLoop(State->CurrentParentLoop);
539+
}
540+
494541
auto IsReplicateRegion = [](VPBlockBase *BB) {
495542
auto *R = dyn_cast_or_null<VPRegionBlock>(BB);
496-
return R && R->isReplicator();
543+
assert((!R || R->isReplicator()) &&
544+
"only replicate region blocks should remain");
545+
return R;
497546
};
498-
499547
// 1. Create an IR basic block.
500548
if ((Replica && this == getParent()->getEntry()) ||
501549
IsReplicateRegion(getSingleHierarchicalPredecessor())) {
@@ -518,6 +566,10 @@ void VPBasicBlock::execute(VPTransformState *State) {
518566

519567
// 2. Fill the IR basic block with IR instructions.
520568
executeRecipes(State, NewBB);
569+
570+
// If this block is a latch, update CurrentParentLoop.
571+
if (VPBlockUtils::isLatch(this, State->VPDT))
572+
State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
521573
}
522574

523575
VPBasicBlock *VPBasicBlock::clone() {
@@ -729,35 +781,13 @@ VPRegionBlock *VPRegionBlock::clone() {
729781
}
730782

731783
void VPRegionBlock::execute(VPTransformState *State) {
732-
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
733-
RPOT(Entry);
734-
735-
if (!isReplicator()) {
736-
// Create and register the new vector loop.
737-
Loop *PrevParentLoop = State->CurrentParentLoop;
738-
State->CurrentParentLoop = State->LI->AllocateLoop();
739-
740-
// Insert the new loop into the loop nest and register the new basic blocks
741-
// before calling any utilities such as SCEV that require valid LoopInfo.
742-
if (PrevParentLoop)
743-
PrevParentLoop->addChildLoop(State->CurrentParentLoop);
744-
else
745-
State->LI->addTopLevelLoop(State->CurrentParentLoop);
746-
747-
// Visit the VPBlocks connected to "this", starting from it.
748-
for (VPBlockBase *Block : RPOT) {
749-
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
750-
Block->execute(State);
751-
}
752-
753-
State->CurrentParentLoop = PrevParentLoop;
754-
return;
755-
}
756-
784+
assert(isReplicator() &&
785+
"Loop regions should have been lowered to plain CFG");
757786
assert(!State->Lane && "Replicating a Region with non-null instance.");
758-
759-
// Enter replicating mode.
760787
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
788+
789+
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
790+
Entry);
761791
State->Lane = VPLane(0);
762792
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
763793
++Lane) {
@@ -851,6 +881,22 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
851881
}
852882
#endif
853883

884+
void VPRegionBlock::dissolveToCFGLoop() {
885+
auto *Header = cast<VPBasicBlock>(getEntry());
886+
VPBlockBase *Preheader = getSinglePredecessor();
887+
auto *ExitingLatch = cast<VPBasicBlock>(getExiting());
888+
VPBlockBase *Middle = getSingleSuccessor();
889+
VPBlockUtils::disconnectBlocks(Preheader, this);
890+
VPBlockUtils::disconnectBlocks(this, Middle);
891+
892+
for (VPBlockBase *VPB : vp_depth_first_shallow(Entry))
893+
VPB->setParent(getParent());
894+
895+
VPBlockUtils::connectBlocks(Preheader, Header);
896+
VPBlockUtils::connectBlocks(ExitingLatch, Middle);
897+
VPBlockUtils::connectBlocks(ExitingLatch, Header);
898+
}
899+
854900
VPlan::VPlan(Loop *L) {
855901
setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
856902
ScalarHeader = createVPIRBasicBlock(L->getHeader());
@@ -962,16 +1008,15 @@ void VPlan::execute(VPTransformState *State) {
9621008

9631009
State->CFG.DTU.flush();
9641010

965-
auto *LoopRegion = getVectorLoopRegion();
966-
if (!LoopRegion)
1011+
VPBasicBlock *Header = vputils::getFirstLoopHeader(*this, State->VPDT);
1012+
if (!Header)
9671013
return;
9681014

969-
VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
1015+
auto *LatchVPBB = cast<VPBasicBlock>(Header->getPredecessors()[1]);
9701016
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
9711017

9721018
// Fix the latch value of canonical, reduction and first-order recurrences
9731019
// phis in the vector loop.
974-
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
9751020
for (VPRecipeBase &R : Header->phis()) {
9761021
// Skip phi-like recipes that generate their backedge values themselves.
9771022
if (isa<VPWidenPHIRecipe>(&R))
@@ -1007,8 +1052,10 @@ void VPlan::execute(VPTransformState *State) {
10071052
bool NeedsScalar = isa<VPInstruction>(PhiR) ||
10081053
(isa<VPReductionPHIRecipe>(PhiR) &&
10091054
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
1055+
10101056
Value *Phi = State->get(PhiR, NeedsScalar);
1011-
// VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does not.
1057+
// VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does
1058+
// not.
10121059
Value *Val = State->get(PhiR->getOperand(1), NeedsScalar);
10131060
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
10141061
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3872,6 +3872,10 @@ class VPRegionBlock : public VPBlockBase {
38723872
/// Clone all blocks in the single-entry single-exit region of the block and
38733873
/// their recipes without updating the operands of the cloned recipes.
38743874
VPRegionBlock *clone() override;
3875+
3876+
/// Remove the current region from its VPlan, connecting its predecessor to
3877+
/// its entry, and its exiting block to its successor and back to its entry.
3878+
void dissolveToCFGLoop();
38753879
};
38763880

38773881
/// VPlan models a candidate for vectorization, encoding various decisions take

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 22 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,26 @@ Value *VPInstruction::generatePerLane(VPTransformState &State,
462462
State.get(getOperand(1), Lane), Name);
463463
}
464464

465+
/// Create a conditional branch using \p Cond branching to the successors of \p
466+
/// VPBB. Note that the first successor is always forward (i.e. not created yet)
467+
/// while the second successor may already have been created (if it is a header
468+
/// block and VPBB is a latch).
469+
static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
470+
VPTransformState &State) {
471+
// Replace the temporary unreachable terminator with a new conditional
472+
// branch, hooking it up to backward destination (header) for latch blocks
473+
// now, and to forward destination(s) later when they are created.
474+
// Second successor may be backwards - iff it is already in VPBB2IRBB.
475+
VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
476+
BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
477+
BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
478+
BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
479+
// First successor is always forward, reset it to nullptr
480+
CondBr->setSuccessor(0, nullptr);
481+
IRBB->getTerminator()->eraseFromParent();
482+
return CondBr;
483+
}
484+
465485
Value *VPInstruction::generate(VPTransformState &State) {
466486
IRBuilderBase &Builder = State.Builder;
467487

@@ -581,43 +601,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
581601
}
582602
case VPInstruction::BranchOnCond: {
583603
Value *Cond = State.get(getOperand(0), VPLane(0));
584-
// Replace the temporary unreachable terminator with a new conditional
585-
// branch, hooking it up to backward destination for exiting blocks now and
586-
// to forward destination(s) later when they are created.
587-
BranchInst *CondBr =
588-
Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
589-
CondBr->setSuccessor(0, nullptr);
590-
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
591-
592-
if (!getParent()->isExiting())
593-
return CondBr;
594-
595-
VPRegionBlock *ParentRegion = getParent()->getParent();
596-
VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
597-
CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
598-
return CondBr;
604+
return createCondBranch(Cond, getParent(), State);
599605
}
600606
case VPInstruction::BranchOnCount: {
601607
// First create the compare.
602608
Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
603609
Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
604610
Value *Cond = Builder.CreateICmpEQ(IV, TC);
605-
606-
// Now create the branch.
607-
auto *Plan = getParent()->getPlan();
608-
VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
609-
VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
610-
611-
// Replace the temporary unreachable terminator with a new conditional
612-
// branch, hooking it up to backward destination (the header) now and to the
613-
// forward destination (the exit/middle block) later when it is created.
614-
// Note that CreateCondBr expects a valid BB as first argument, so we need
615-
// to set it to nullptr later.
616-
BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
617-
State.CFG.VPBB2IRBB[Header]);
618-
CondBr->setSuccessor(0, nullptr);
619-
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
620-
return CondBr;
611+
return createCondBranch(Cond, getParent(), State);
621612
}
622613
case VPInstruction::Broadcast: {
623614
return Builder.CreateVectorSplat(
@@ -1127,10 +1118,6 @@ void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
11271118

11281119
void VPPhi::execute(VPTransformState &State) {
11291120
State.setDebugLocFrom(getDebugLoc());
1130-
assert(getParent() ==
1131-
getParent()->getPlan()->getVectorLoopRegion()->getEntry() &&
1132-
"VPInstructions with PHI opcodes must be used for header phis only "
1133-
"at the moment");
11341121
BasicBlock *VectorPH = State.CFG.VPBB2IRBB.at(getIncomingBlock(0));
11351122
Value *Start = State.get(getIncomingValue(0), VPLane(0));
11361123
PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, getName());

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2506,6 +2506,18 @@ void VPlanTransforms::createInterleaveGroups(
25062506
}
25072507
}
25082508

2509+
void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
2510+
// Replace loop regions with explicit CFG.
2511+
SmallVector<VPRegionBlock *> LoopRegions;
2512+
for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
2513+
vp_depth_first_deep(Plan.getEntry()))) {
2514+
if (!R->isReplicator())
2515+
LoopRegions.push_back(R);
2516+
}
2517+
for (VPRegionBlock *R : LoopRegions)
2518+
R->dissolveToCFGLoop();
2519+
}
2520+
25092521
// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe.
25102522
static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) {
25112523
VPWidenCastRecipe *Ext;

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,9 @@ struct VPlanTransforms {
184184
VPBasicBlock *LatchVPBB,
185185
VFRange &Range);
186186

187+
/// Replace loop regions with explicit CFG.
188+
static void dissolveLoopRegions(VPlan &Plan);
189+
187190
/// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
188191
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
189192
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);

0 commit comments

Comments
 (0)