-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[LV] Introduce the EVLIVSimplify Pass for EVL-vectorized loops #91796
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
157f995
855cca9
6bc5c94
14f4a7e
3f6b660
9f8bbd7
c833e9e
8259007
97c3c8a
9da476f
73c651f
8f04feb
34dd59e
14a0c0d
07d2c2c
284f733
e701452
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
//===- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV-*- C++ -*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// This pass rewrites a vectorized loop that uses a canonical IV so that it | ||
// uses an EVL-based IV instead, if the loop was tail-folded by predicated EVL. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLVM_CODEGEN_EVLINDVARSIMPLIFY_H | ||
#define LLVM_CODEGEN_EVLINDVARSIMPLIFY_H | ||
|
||
#include "llvm/Analysis/LoopAnalysisManager.h" | ||
#include "llvm/IR/PassManager.h" | ||
|
||
namespace llvm { | ||
class Loop; | ||
class LPMUpdater; | ||
|
||
/// Turn vectorized loops with canonical induction variables into loops that | ||
/// only use a single EVL-based induction variable. | ||
struct EVLIndVarSimplifyPass : public PassInfoMixin<EVLIndVarSimplifyPass> { | ||
PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM, | ||
LoopStandardAnalysisResults &AR, LPMUpdater &U); | ||
}; | ||
} // namespace llvm | ||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
//===------ EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV----===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// This pass rewrites a vectorized loop that uses a canonical IV so that it | ||
// uses an EVL-based IV instead, if the loop was tail-folded by predicated EVL. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "llvm/CodeGen/EVLIndVarSimplify.h" | ||
#include "llvm/ADT/Statistic.h" | ||
#include "llvm/Analysis/IVDescriptors.h" | ||
#include "llvm/Analysis/LoopInfo.h" | ||
#include "llvm/Analysis/LoopPass.h" | ||
#include "llvm/Analysis/ScalarEvolution.h" | ||
#include "llvm/Analysis/ScalarEvolutionExpressions.h" | ||
#include "llvm/Analysis/ValueTracking.h" | ||
#include "llvm/IR/IRBuilder.h" | ||
#include "llvm/IR/PatternMatch.h" | ||
#include "llvm/Support/CommandLine.h" | ||
#include "llvm/Support/Debug.h" | ||
#include "llvm/Support/MathExtras.h" | ||
#include "llvm/Support/raw_ostream.h" | ||
#include "llvm/Transforms/Scalar/LoopPassManager.h" | ||
#include "llvm/Transforms/Utils/Local.h" | ||
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" | ||
|
||
#define DEBUG_TYPE "evl-iv-simplify" | ||
|
||
using namespace llvm; | ||
|
||
STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated"); | ||
|
||
static cl::opt<bool> EnableEVLIndVarSimplify( | ||
"enable-evl-indvar-simplify", | ||
cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden, | ||
cl::init(true)); | ||
|
||
namespace { | ||
struct EVLIndVarSimplifyImpl { | ||
ScalarEvolution &SE; | ||
|
||
explicit EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR) | ||
: SE(LAR.SE) {} | ||
|
||
explicit EVLIndVarSimplifyImpl(ScalarEvolution &SE) : SE(SE) {} | ||
|
||
// Returns true if modify the loop. | ||
bool run(Loop &L); | ||
}; | ||
} // anonymous namespace | ||
|
||
static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) { | ||
if (!Step) | ||
return 0U; | ||
|
||
// Looking for loops with IV step value in the form of `(<constant VF> x | ||
// vscale)`. | ||
if (auto *Mul = dyn_cast<SCEVMulExpr>(Step)) { | ||
if (Mul->getNumOperands() == 2) { | ||
const SCEV *LHS = Mul->getOperand(0); | ||
const SCEV *RHS = Mul->getOperand(1); | ||
if (auto *Const = dyn_cast<SCEVConstant>(LHS)) { | ||
uint64_t V = Const->getAPInt().getLimitedValue(); | ||
if (isa<SCEVVScale>(RHS) && llvm::isUInt<32>(V)) | ||
return static_cast<uint32_t>(V); | ||
} | ||
} | ||
} | ||
|
||
// If not, see if the vscale_range of the parent function is a fixed value, | ||
// which makes the step value to be replaced by a constant. | ||
if (F.hasFnAttribute(Attribute::VScaleRange)) | ||
if (auto *ConstStep = dyn_cast<SCEVConstant>(Step)) { | ||
APInt V = ConstStep->getAPInt().abs(); | ||
ConstantRange CR = llvm::getVScaleRange(&F, 64); | ||
if (const APInt *Fixed = CR.getSingleElement()) { | ||
V = V.zextOrTrunc(Fixed->getBitWidth()); | ||
uint64_t VF = V.udiv(*Fixed).getLimitedValue(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible the constant isn't evenly divisible by vscale? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a check here. |
||
if (VF && llvm::isUInt<32>(VF) && | ||
// Make sure step is divisible by vscale. | ||
V.urem(*Fixed).isZero()) | ||
return static_cast<uint32_t>(VF); | ||
} | ||
} | ||
|
||
return 0U; | ||
} | ||
|
||
/// Switch the latch condition of \p L from the canonical induction variable | ||
/// to the EVL-based induction variable, then try to delete the canonical one. | ||
/// | ||
/// Bails out (returning false, loop untouched) when the pass is disabled, when | ||
/// the loop has no latch / latch compare / induction variable / unique | ||
/// incoming and backedge blocks / bounds, when no VF can be inferred from the | ||
/// IV step, or when no header PHI matches the EVL-based IV pattern. | ||
/// | ||
/// \returns true if the latch compare was replaced. | ||
bool EVLIndVarSimplifyImpl::run(Loop &L) { | ||
if (!EnableEVLIndVarSimplify) | ||
return false; | ||
|
||
BasicBlock *LatchBlock = L.getLoopLatch(); | ||
ICmpInst *OrigLatchCmp = L.getLatchCmpInst(); | ||
if (!LatchBlock || !OrigLatchCmp) | ||
return false; | ||
|
||
InductionDescriptor IVD; | ||
PHINode *IndVar = L.getInductionVariable(SE); | ||
if (!IndVar || !L.getInductionDescriptor(SE, IVD)) { | ||
LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName() | ||
<< "\n"); | ||
return false; | ||
} | ||
|
||
BasicBlock *InitBlock, *BackEdgeBlock; | ||
if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) { | ||
LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in " | ||
<< L.getName() << "\n"); | ||
return false; | ||
} | ||
|
||
// Retrieve the loop bounds. | ||
std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using the getBounds API is a slightly weird choice here. getBounds doesn't use SCEV, and has a bunch of restrictions that SCEV does not. You're immediately turning around and converting the results to SCEV, so maybe just use that to begin with? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The SCEV of initial and final values are probably going to be gone, since they're only used in the ad-hoc check I wrote, which is subject to be replaced. Also, I don't quite understand what you mean by saying getBounds doesn't use SCEV, because judging from its code it's using ScalarEvolution to calculate the initial and final values. Could you elaborate a little more? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IVDescriptors has limited use of SCEV to identify addrecs. LoopBounds::getBounds works solely on IR, matching existing instructions. It doesn't produce the limit or start in terms of SCEV values. |
||
if (!Bounds) { | ||
LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName() | ||
<< "\n"); | ||
return false; | ||
} | ||
Value *CanonicalIVInit = &Bounds->getInitialIVValue(); | ||
Value *CanonicalIVFinal = &Bounds->getFinalIVValue(); | ||
|
||
// The VF inferred from the canonical IV step must equal the VF operand of | ||
// the get.vector.length call matched further down. | ||
const SCEV *StepV = IVD.getStep(); | ||
uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent()); | ||
if (!VF) { | ||
LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV | ||
<< "'\n"); | ||
return false; | ||
} | ||
LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName() | ||
<< "\n"); | ||
|
||
// Try to find the EVL-based induction variable. | ||
using namespace PatternMatch; | ||
BasicBlock *BB = IndVar->getParent(); | ||
|
||
Value *EVLIndVar = nullptr; | ||
Value *RemTC = nullptr; | ||
Value *TC = nullptr; | ||
// Matches experimental.get.vector.length(RemTC, VF, /*Scalable=*/1) and | ||
// captures its first operand in RemTC. | ||
auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>( | ||
m_Value(RemTC), m_SpecificInt(VF), | ||
/*Scalable=*/m_SpecificInt(1)); | ||
for (auto &PN : BB->phis()) { | ||
if (&PN == IndVar) | ||
continue; | ||
|
||
// Check 1: it has to contain both incoming (init) & backedge blocks | ||
// from IndVar. | ||
if (PN.getBasicBlockIndex(InitBlock) < 0 || | ||
PN.getBasicBlockIndex(BackEdgeBlock) < 0) | ||
continue; | ||
// Check 2: EVL index is always increasing, thus its initial value has to be | ||
// equal to either the initial IV value (when the canonical IV is also | ||
// increasing) or the last IV value (when canonical IV is decreasing). | ||
Value *Init = PN.getIncomingValueForBlock(InitBlock); | ||
using Direction = Loop::LoopBounds::Direction; | ||
switch (Bounds->getDirection()) { | ||
case Direction::Increasing: | ||
if (Init != CanonicalIVInit) | ||
continue; | ||
break; | ||
case Direction::Decreasing: | ||
if (Init != CanonicalIVFinal) | ||
continue; | ||
break; | ||
case Direction::Unknown: | ||
// To be more permissive and see if either the initial or final IV value | ||
// matches PN's init value. | ||
if (Init != CanonicalIVInit && Init != CanonicalIVFinal) | ||
continue; | ||
break; | ||
} | ||
Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock); | ||
assert(RecValue); | ||
|
||
LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN | ||
<< "\n"); | ||
|
||
// Check 3: Pattern match to find the EVL-based index and total trip count | ||
// (TC). | ||
// Expected shape: RecValue = PN + zext-or-self(get.vector.length(RemTC, ..)) | ||
// with RemTC = TC - PN. | ||
if (match(RecValue, | ||
m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) && | ||
match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) { | ||
EVLIndVar = RecValue; | ||
break; | ||
} | ||
} | ||
|
||
if (!EVLIndVar || !TC) | ||
return false; | ||
|
||
LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n"); | ||
|
||
// Create an EVL-based comparison and replace the branch to use it as | ||
This comment was marked as resolved.
Sorry, something went wrong. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for the suggestion, I think I have one question here though:
Is it supposed to be EVLStep = min(IV.Step, sub_nsw(EVL_TC, IV))? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, it wasn't. The active_vector_length has the VF parameter. That may be the same as the step, but in principle doesn't have to be. Note that my (1) above (which your code already does) is proving VF=IV.Step, but that is a step you need to prove. It's not something you can assume. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there is another problem here: the backedge taken count cannot be computed when vscale is not constant. Namely, IV.Step is in the most commonly seen form of However, alternatively I think we can get exit value via this:
What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Ok, this seems weird. I don't know of any fundamental reason that SCEV shouldn't be able to figure out a trip count for a loop invariant step - even if non constant. Would you mind sharing a piece of IR which demonstrates this? I want to see if this is something easy to fix, or I'm forgetting some subtle interaction. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Thank you! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See #94411 Though this is with two changes to your test:
I think both should apply to your original (i.e. your test is slightly over-reduced), but please confirm. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Thank you for the quick fix. The major concern being that the instructions for computing exit value expanded from the SCEV expressions we crafted is pretty verbose, long, a more importantly, contain division instructions. Let me start with the formula we have:
In your original formula EVLStep is Now, since only SCEVAddRec has
Because IV.Step and EVL_TC are both loop invariant.
And those divisions seen in the expressions will be expanded into division instructions. I'm concern whether we should spend so many instructions to compute something that will almost certain be I also run the same algorithm on the I've also tried to use Another thing is that I'm still a little confused about how to do Step (1) you described: it's true that I already did such a check (line 212 ~ 223) -- but only on cases where both trip count and vscale are constant. For every other cases we only have SCEV expressions, which we cannot know it value during compile time. Even we expand the check into runtime check, what should we do if the check fails during runtime? Do we fall back the original canonical IV? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Clang emits the vscale_range by default. vscale is bounded by the values of possible VLEN, and VLEN has a specific upper bound in the RISCV vector specification. So while annoying that SCEV can't just ask the backend for this, this is a non-problem for any real c/c++ input. For the rest of your response, one quick comment - the divisions are by 4 * vscale. We know that vscale is a power of two by definition, so these are right shifts. You do need a popcount to know which power of two, but we could reasonable emit one popcount and a bunch of shifts. (Not saying we do so today - I'm saying it's possible.) For the rest, I feel we are talking past each other - and frankly, I'm having trouble following the detail in github PR interface - can I suggest we setup a brief phone call to chat this through instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That will be really helpful! I'll send you an email to your "public" email address to coordinate this. |
||
// predicate. | ||
|
||
// Loop::getLatchCmpInst check at the beginning of this function has ensured | ||
// that latch block ends in a conditional branch. | ||
auto *LatchBranch = cast<BranchInst>(LatchBlock->getTerminator()); | ||
assert(LatchBranch->getNumSuccessors() == 2); | ||
// If the first successor is the header, the branch is taken to keep looping, | ||
// so loop while EVLIndVar < TC; otherwise the first successor is taken to | ||
// leave the loop, so leave once EVLIndVar >= TC. | ||
ICmpInst::Predicate Pred; | ||
if (LatchBranch->getSuccessor(0) == L.getHeader()) | ||
Pred = ICmpInst::ICMP_ULT; | ||
else | ||
Pred = ICmpInst::ICMP_UGE; | ||
|
||
// The new compare is inserted right before the original latch compare. | ||
IRBuilder<> Builder(OrigLatchCmp); | ||
auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC); | ||
OrigLatchCmp->replaceAllUsesWith(NewLatchCmp); | ||
|
||
// llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are | ||
// not used outside the cycles. However, in this case the now-RAUW-ed | ||
// OrigLatchCmp will be considered a use outside the cycle while in reality | ||
// it's practically dead. Thus we need to remove it before calling | ||
// RecursivelyDeleteDeadPHINode. | ||
(void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp); | ||
if (llvm::RecursivelyDeleteDeadPHINode(IndVar)) | ||
LLVM_DEBUG(dbgs() << "Removed original IndVar\n"); | ||
|
||
// Counted whenever the latch compare was replaced, whether or not the | ||
// canonical IV PHI could actually be deleted above. | ||
++NumEliminatedCanonicalIV; | ||
|
||
return true; | ||
} | ||
|
||
/// New pass manager entry point. Forwards to EVLIndVarSimplifyImpl and reports
/// that only the analyses in the CFGAnalyses set survive when the loop was
/// changed; everything is preserved otherwise.
PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
                                             LoopStandardAnalysisResults &AR,
                                             LPMUpdater &U) {
  EVLIndVarSimplifyImpl Impl(AR);
  const bool Changed = Impl.run(L);
  if (!Changed)
    return PreservedAnalyses::all();
  return PreservedAnalyses::allInSet<CFGAnalyses>();
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add an option to control that pass?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.