Skip to content

Commit c385bf3

Browse files
authored
Merge pull request #8320 from nilanjana87/loop_interleaving_changes_cherry_picked
[LV] Allow loop interleaving for loops with low trip count. (llvm#67725)
2 parents 46f35c9 + 7fe2c56 commit c385bf3

22 files changed

+3307
-1763
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 53 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -264,11 +264,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
264264
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265265
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
266266

267-
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
268-
"tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
269-
cl::desc("We don't interleave loops with a estimated constant trip count "
270-
"below this number"));
271-
272267
static cl::opt<unsigned> ForceTargetNumScalarRegs(
273268
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274269
cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -316,12 +311,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
316311
cl::desc(
317312
"Enable runtime interleaving until load/store ports are saturated"));
318313

319-
/// Interleave small loops with scalar reductions.
320-
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
321-
"interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
322-
cl::desc("Enable interleaving for loops with small iteration counts that "
323-
"contain scalar reductions to expose ILP."));
324-
325314
/// The number of stores in a loop that are allowed to need predication.
326315
static cl::opt<unsigned> NumberOfStoresToPredicate(
327316
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5823,14 +5812,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
58235812

58245813
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
58255814
const bool HasReductions = !Legal->getReductionVars().empty();
5826-
// Do not interleave loops with a relatively small known or estimated trip
5827-
// count. But we will interleave when InterleaveSmallLoopScalarReduction is
5828-
// enabled, and the code has scalar reductions(HasReductions && VF = 1),
5829-
// because with the above conditions interleaving can expose ILP and break
5830-
// cross iteration dependences for reductions.
5831-
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5832-
!(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5833-
return 1;
58345815

58355816
// If we did not calculate the cost for VF (because the user selected the VF)
58365817
// then we calculate the cost of VF here.
@@ -5903,21 +5884,58 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
59035884
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
59045885
}
59055886

5906-
// If trip count is known or estimated compile time constant, limit the
5907-
// interleave count to be less than the trip count divided by VF, provided it
5908-
// is at least 1.
5909-
//
5910-
// For scalable vectors we can't know if interleaving is beneficial. It may
5911-
// not be beneficial for small loops if none of the lanes in the second vector
5912-
// iterations is enabled. However, for larger loops, there is likely to be a
5913-
// similar benefit as for fixed-width vectors. For now, we choose to leave
5914-
// the InterleaveCount as if vscale is '1', although if some information about
5915-
// the vector is known (e.g. min vector size), we can make a better decision.
5916-
if (BestKnownTC) {
5917-
MaxInterleaveCount =
5918-
std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5919-
// Make sure MaxInterleaveCount is greater than 0.
5920-
MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5887+
unsigned EstimatedVF = VF.getKnownMinValue();
5888+
if (VF.isScalable()) {
5889+
if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5890+
EstimatedVF *= *VScale;
5891+
}
5892+
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5893+
5894+
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5895+
if (KnownTC > 0) {
5896+
// At least one iteration must be scalar when a scalar epilogue is required,
5897+
// so the maximum number of iterations available for interleaving is one less.
5898+
unsigned AvailableTC =
5899+
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5900+
5901+
// If trip count is known we select between two prospective ICs, where
5902+
// 1) the aggressive IC is capped by the trip count divided by VF
5903+
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
5904+
// The final IC is selected in a way that the epilogue loop trip count is
5905+
// minimized while maximizing the IC itself, so that we either run the
5906+
// vector loop at least once if it generates a small epilogue loop, or else
5907+
// we run the vector loop at least twice.
5908+
5909+
unsigned InterleaveCountUB = bit_floor(
5910+
std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5911+
unsigned InterleaveCountLB = bit_floor(std::max(
5912+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5913+
MaxInterleaveCount = InterleaveCountLB;
5914+
5915+
if (InterleaveCountUB != InterleaveCountLB) {
5916+
unsigned TailTripCountUB =
5917+
(AvailableTC % (EstimatedVF * InterleaveCountUB));
5918+
unsigned TailTripCountLB =
5919+
(AvailableTC % (EstimatedVF * InterleaveCountLB));
5920+
// If both produce the same scalar tail, maximize the IC to do the same work
5921+
// in fewer vector loop iterations.
5922+
if (TailTripCountUB == TailTripCountLB)
5923+
MaxInterleaveCount = InterleaveCountUB;
5924+
}
5925+
} else if (BestKnownTC && *BestKnownTC > 0) {
5926+
// At least one iteration must be scalar when a scalar epilogue is required,
5927+
// so the maximum number of iterations available for interleaving is one less.
5928+
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5929+
? (*BestKnownTC) - 1
5930+
: *BestKnownTC;
5931+
5932+
// If trip count is an estimated compile time constant, limit the
5933+
// IC to be capped by the trip count divided by VF * 2, such that the vector
5934+
// loop runs at least twice to make interleaving seem profitable when there
5935+
// is an epilogue loop present. Since the exact trip count is not known, we
5936+
// choose to be conservative in our IC estimate.
5937+
MaxInterleaveCount = bit_floor(std::max(
5938+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
59215939
}
59225940

59235941
assert(MaxInterleaveCount > 0 &&
@@ -6021,8 +6039,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
60216039

60226040
// If there are scalar reductions and TTI has enabled aggressive
60236041
// interleaving for reductions, we will interleave to expose ILP.
6024-
if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6025-
AggressivelyInterleaveReductions) {
6042+
if (VF.isScalar() && AggressivelyInterleaveReductions) {
60266043
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
60276044
// Interleave no less than SmallIC but not as aggressive as the normal IC
60286045
// to satisfy the rare situation when resources are too limited.

llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
; REQUIRES: x86-registered-target
12
; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
23
; RUN: < %s | FileCheck %s
34

@@ -79,6 +80,8 @@ entry:
7980

8081

8182
; VECTORIZE: mul <4 x i32>
83+
; VECTORIZE: mul <4 x i32>
84+
; VECTORIZE-NOT: mul <4 x i32>
8285

8386
for.body: ; preds = %for.body, %entry
8487
%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

0 commit comments

Comments
 (0)