|
| 1 | +//===---- EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV------===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// This pass optimizes a vectorized loop with canonical IV to using EVL-based |
| 10 | +// IV if it was tail-folded by predicated EVL. |
| 11 | +// |
| 12 | +//===----------------------------------------------------------------------===// |
| 13 | + |
| 14 | +#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" |
| 15 | +#include "llvm/ADT/Statistic.h" |
| 16 | +#include "llvm/Analysis/IVDescriptors.h" |
| 17 | +#include "llvm/Analysis/LoopInfo.h" |
| 18 | +#include "llvm/Analysis/LoopPass.h" |
| 19 | +#include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 20 | +#include "llvm/Analysis/ScalarEvolution.h" |
| 21 | +#include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 22 | +#include "llvm/Analysis/ValueTracking.h" |
| 23 | +#include "llvm/IR/IRBuilder.h" |
| 24 | +#include "llvm/IR/PatternMatch.h" |
| 25 | +#include "llvm/Support/CommandLine.h" |
| 26 | +#include "llvm/Support/Debug.h" |
| 27 | +#include "llvm/Support/MathExtras.h" |
| 28 | +#include "llvm/Support/raw_ostream.h" |
| 29 | +#include "llvm/Transforms/Scalar/LoopPassManager.h" |
| 30 | +#include "llvm/Transforms/Utils/Local.h" |
| 31 | +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
| 32 | + |
| 33 | +#define DEBUG_TYPE "evl-iv-simplify" |
| 34 | + |
| 35 | +using namespace llvm; |
| 36 | + |
| 37 | +STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated"); |
| 38 | + |
| 39 | +static cl::opt<bool> EnableEVLIndVarSimplify( |
| 40 | + "enable-evl-indvar-simplify", |
| 41 | + cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden, |
| 42 | + cl::init(true)); |
| 43 | + |
| 44 | +namespace { |
| 45 | +struct EVLIndVarSimplifyImpl { |
| 46 | + ScalarEvolution &SE; |
| 47 | + OptimizationRemarkEmitter *ORE = nullptr; |
| 48 | + |
| 49 | + EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR, |
| 50 | + OptimizationRemarkEmitter *ORE) |
| 51 | + : SE(LAR.SE), ORE(ORE) {} |
| 52 | + |
| 53 | + /// Returns true if modify the loop. |
| 54 | + bool run(Loop &L); |
| 55 | +}; |
| 56 | +} // anonymous namespace |
| 57 | + |
| 58 | +/// Returns the constant part of vectorization factor from the induction |
| 59 | +/// variable's step value SCEV expression. |
| 60 | +static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) { |
| 61 | + if (!Step) |
| 62 | + return 0U; |
| 63 | + |
| 64 | + // Looking for loops with IV step value in the form of `(<constant VF> x |
| 65 | + // vscale)`. |
| 66 | + if (const auto *Mul = dyn_cast<SCEVMulExpr>(Step)) { |
| 67 | + if (Mul->getNumOperands() == 2) { |
| 68 | + const SCEV *LHS = Mul->getOperand(0); |
| 69 | + const SCEV *RHS = Mul->getOperand(1); |
| 70 | + if (const auto *Const = dyn_cast<SCEVConstant>(LHS); |
| 71 | + Const && isa<SCEVVScale>(RHS)) { |
| 72 | + uint64_t V = Const->getAPInt().getLimitedValue(); |
| 73 | + if (llvm::isUInt<32>(V)) |
| 74 | + return V; |
| 75 | + } |
| 76 | + } |
| 77 | + } |
| 78 | + |
| 79 | + // If not, see if the vscale_range of the parent function is a fixed value, |
| 80 | + // which makes the step value to be replaced by a constant. |
| 81 | + if (F.hasFnAttribute(Attribute::VScaleRange)) |
| 82 | + if (const auto *ConstStep = dyn_cast<SCEVConstant>(Step)) { |
| 83 | + APInt V = ConstStep->getAPInt().abs(); |
| 84 | + ConstantRange CR = llvm::getVScaleRange(&F, 64); |
| 85 | + if (const APInt *Fixed = CR.getSingleElement()) { |
| 86 | + V = V.zextOrTrunc(Fixed->getBitWidth()); |
| 87 | + uint64_t VF = V.udiv(*Fixed).getLimitedValue(); |
| 88 | + if (VF && llvm::isUInt<32>(VF) && |
| 89 | + // Make sure step is divisible by vscale. |
| 90 | + V.urem(*Fixed).isZero()) |
| 91 | + return VF; |
| 92 | + } |
| 93 | + } |
| 94 | + |
| 95 | + return 0U; |
| 96 | +} |
| 97 | + |
| 98 | +bool EVLIndVarSimplifyImpl::run(Loop &L) { |
| 99 | + if (!EnableEVLIndVarSimplify) |
| 100 | + return false; |
| 101 | + |
| 102 | + if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized")) |
| 103 | + return false; |
| 104 | + const MDOperand *EVLMD = |
| 105 | + findStringMetadataForLoop(&L, "llvm.loop.isvectorized.tailfoldingstyle") |
| 106 | + .value_or(nullptr); |
| 107 | + if (!EVLMD || !EVLMD->equalsStr("evl")) |
| 108 | + return false; |
| 109 | + |
| 110 | + BasicBlock *LatchBlock = L.getLoopLatch(); |
| 111 | + ICmpInst *OrigLatchCmp = L.getLatchCmpInst(); |
| 112 | + if (!LatchBlock || !OrigLatchCmp) |
| 113 | + return false; |
| 114 | + |
| 115 | + InductionDescriptor IVD; |
| 116 | + PHINode *IndVar = L.getInductionVariable(SE); |
| 117 | + if (!IndVar || !L.getInductionDescriptor(SE, IVD)) { |
| 118 | + const char *Reason = (IndVar ? "induction descriptor is not available" |
| 119 | + : "cannot recognize induction variable"); |
| 120 | + LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName() |
| 121 | + << " because" << Reason << "\n"); |
| 122 | + if (ORE) { |
| 123 | + ORE->emit([&]() { |
| 124 | + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", |
| 125 | + L.getStartLoc(), L.getHeader()) |
| 126 | + << "Cannot retrieve IV because " << ore::NV("Reason", Reason); |
| 127 | + }); |
| 128 | + } |
| 129 | + return false; |
| 130 | + } |
| 131 | + |
| 132 | + BasicBlock *InitBlock, *BackEdgeBlock; |
| 133 | + if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) { |
| 134 | + LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in " |
| 135 | + << L.getName() << "\n"); |
| 136 | + if (ORE) { |
| 137 | + ORE->emit([&]() { |
| 138 | + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", |
| 139 | + L.getStartLoc(), L.getHeader()) |
| 140 | + << "Does not have a unique incoming and backedge"; |
| 141 | + }); |
| 142 | + } |
| 143 | + return false; |
| 144 | + } |
| 145 | + |
| 146 | + // Retrieve the loop bounds. |
| 147 | + std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE); |
| 148 | + if (!Bounds) { |
| 149 | + LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName() |
| 150 | + << "\n"); |
| 151 | + if (ORE) { |
| 152 | + ORE->emit([&]() { |
| 153 | + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", |
| 154 | + L.getStartLoc(), L.getHeader()) |
| 155 | + << "Could not obtain the loop bounds"; |
| 156 | + }); |
| 157 | + } |
| 158 | + return false; |
| 159 | + } |
| 160 | + Value *CanonicalIVInit = &Bounds->getInitialIVValue(); |
| 161 | + Value *CanonicalIVFinal = &Bounds->getFinalIVValue(); |
| 162 | + |
| 163 | + const SCEV *StepV = IVD.getStep(); |
| 164 | + uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent()); |
| 165 | + if (!VF) { |
| 166 | + LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV |
| 167 | + << "'\n"); |
| 168 | + if (ORE) { |
| 169 | + ORE->emit([&]() { |
| 170 | + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", |
| 171 | + L.getStartLoc(), L.getHeader()) |
| 172 | + << "Could not infer VF from IndVar step " |
| 173 | + << ore::NV("Step", StepV); |
| 174 | + }); |
| 175 | + } |
| 176 | + return false; |
| 177 | + } |
| 178 | + LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName() |
| 179 | + << "\n"); |
| 180 | + |
| 181 | + // Try to find the EVL-based induction variable. |
| 182 | + using namespace PatternMatch; |
| 183 | + BasicBlock *BB = IndVar->getParent(); |
| 184 | + |
| 185 | + Value *EVLIndVar = nullptr; |
| 186 | + Value *RemTC = nullptr; |
| 187 | + Value *TC = nullptr; |
| 188 | + auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>( |
| 189 | + m_Value(RemTC), m_SpecificInt(VF), |
| 190 | + /*Scalable=*/m_SpecificInt(1)); |
| 191 | + for (PHINode &PN : BB->phis()) { |
| 192 | + if (&PN == IndVar) |
| 193 | + continue; |
| 194 | + |
| 195 | + // Check 1: it has to contain both incoming (init) & backedge blocks |
| 196 | + // from IndVar. |
| 197 | + if (PN.getBasicBlockIndex(InitBlock) < 0 || |
| 198 | + PN.getBasicBlockIndex(BackEdgeBlock) < 0) |
| 199 | + continue; |
| 200 | + // Check 2: EVL index is always increasing, thus its inital value has to be |
| 201 | + // equal to either the initial IV value (when the canonical IV is also |
| 202 | + // increasing) or the last IV value (when canonical IV is decreasing). |
| 203 | + Value *Init = PN.getIncomingValueForBlock(InitBlock); |
| 204 | + using Direction = Loop::LoopBounds::Direction; |
| 205 | + switch (Bounds->getDirection()) { |
| 206 | + case Direction::Increasing: |
| 207 | + if (Init != CanonicalIVInit) |
| 208 | + continue; |
| 209 | + break; |
| 210 | + case Direction::Decreasing: |
| 211 | + if (Init != CanonicalIVFinal) |
| 212 | + continue; |
| 213 | + break; |
| 214 | + case Direction::Unknown: |
| 215 | + // To be more permissive and see if either the initial or final IV value |
| 216 | + // matches PN's init value. |
| 217 | + if (Init != CanonicalIVInit && Init != CanonicalIVFinal) |
| 218 | + continue; |
| 219 | + break; |
| 220 | + } |
| 221 | + Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock); |
| 222 | + assert(RecValue && "expect recurrent IndVar value"); |
| 223 | + |
| 224 | + LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN |
| 225 | + << "\n"); |
| 226 | + |
| 227 | + // Check 3: Pattern match to find the EVL-based index and total trip count |
| 228 | + // (TC). |
| 229 | + if (match(RecValue, |
| 230 | + m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) && |
| 231 | + match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) { |
| 232 | + EVLIndVar = RecValue; |
| 233 | + break; |
| 234 | + } |
| 235 | + } |
| 236 | + |
| 237 | + if (!EVLIndVar || !TC) |
| 238 | + return false; |
| 239 | + |
| 240 | + LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n"); |
| 241 | + if (ORE) { |
| 242 | + ORE->emit([&]() { |
| 243 | + DebugLoc DL; |
| 244 | + BasicBlock *Region = nullptr; |
| 245 | + if (auto *I = dyn_cast<Instruction>(EVLIndVar)) { |
| 246 | + DL = I->getDebugLoc(); |
| 247 | + Region = I->getParent(); |
| 248 | + } else { |
| 249 | + DL = L.getStartLoc(); |
| 250 | + Region = L.getHeader(); |
| 251 | + } |
| 252 | + return OptimizationRemark(DEBUG_TYPE, "UseEVLIndVar", DL, Region) |
| 253 | + << "Using " << ore::NV("EVLIndVar", EVLIndVar) |
| 254 | + << " for EVL-based IndVar"; |
| 255 | + }); |
| 256 | + } |
| 257 | + |
| 258 | + // Create an EVL-based comparison and replace the branch to use it as |
| 259 | + // predicate. |
| 260 | + |
| 261 | + // Loop::getLatchCmpInst check at the beginning of this function has ensured |
| 262 | + // that latch block ends in a conditional branch. |
| 263 | + auto *LatchBranch = cast<BranchInst>(LatchBlock->getTerminator()); |
| 264 | + assert(LatchBranch->isConditional() && |
| 265 | + "expect the loop latch to be ended with a conditional branch"); |
| 266 | + ICmpInst::Predicate Pred; |
| 267 | + if (LatchBranch->getSuccessor(0) == L.getHeader()) |
| 268 | + Pred = ICmpInst::ICMP_NE; |
| 269 | + else |
| 270 | + Pred = ICmpInst::ICMP_EQ; |
| 271 | + |
| 272 | + IRBuilder<> Builder(OrigLatchCmp); |
| 273 | + auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC); |
| 274 | + OrigLatchCmp->replaceAllUsesWith(NewLatchCmp); |
| 275 | + |
| 276 | + // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are |
| 277 | + // not used outside the cycles. However, in this case the now-RAUW-ed |
| 278 | + // OrigLatchCmp will be considered a use outside the cycle while in reality |
| 279 | + // it's practically dead. Thus we need to remove it before calling |
| 280 | + // RecursivelyDeleteDeadPHINode. |
| 281 | + (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp); |
| 282 | + if (llvm::RecursivelyDeleteDeadPHINode(IndVar)) |
| 283 | + LLVM_DEBUG(dbgs() << "Removed original IndVar\n"); |
| 284 | + |
| 285 | + ++NumEliminatedCanonicalIV; |
| 286 | + |
| 287 | + return true; |
| 288 | +} |
| 289 | + |
| 290 | +PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM, |
| 291 | + LoopStandardAnalysisResults &AR, |
| 292 | + LPMUpdater &U) { |
| 293 | + Function &F = *L.getHeader()->getParent(); |
| 294 | + auto &FAMProxy = LAM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR); |
| 295 | + OptimizationRemarkEmitter *ORE = |
| 296 | + FAMProxy.getCachedResult<OptimizationRemarkEmitterAnalysis>(F); |
| 297 | + |
| 298 | + if (EVLIndVarSimplifyImpl(AR, ORE).run(L)) |
| 299 | + return PreservedAnalyses::allInSet<CFGAnalyses>(); |
| 300 | + return PreservedAnalyses::all(); |
| 301 | +} |
0 commit comments