@@ -264,11 +264,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
264
264
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265
265
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
266
266
267
- static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
268
- "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
269
- cl::desc("We don't interleave loops with a estimated constant trip count "
270
- "below this number"));
271
-
272
267
static cl::opt<unsigned> ForceTargetNumScalarRegs(
273
268
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274
269
cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -316,12 +311,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
316
311
cl::desc(
317
312
"Enable runtime interleaving until load/store ports are saturated"));
318
313
319
- /// Interleave small loops with scalar reductions.
320
- static cl::opt<bool> InterleaveSmallLoopScalarReduction(
321
- "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
322
- cl::desc("Enable interleaving for loops with small iteration counts that "
323
- "contain scalar reductions to expose ILP."));
324
-
325
314
/// The number of stores in a loop that are allowed to need predication.
326
315
static cl::opt<unsigned> NumberOfStoresToPredicate(
327
316
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5823,14 +5812,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5823
5812
5824
5813
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5825
5814
const bool HasReductions = !Legal->getReductionVars().empty();
5826
- // Do not interleave loops with a relatively small known or estimated trip
5827
- // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5828
- // enabled, and the code has scalar reductions(HasReductions && VF = 1),
5829
- // because with the above conditions interleaving can expose ILP and break
5830
- // cross iteration dependences for reductions.
5831
- if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5832
- !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5833
- return 1;
5834
5815
5835
5816
// If we did not calculate the cost for VF (because the user selected the VF)
5836
5817
// then we calculate the cost of VF here.
@@ -5903,21 +5884,58 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5903
5884
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5904
5885
}
5905
5886
5906
- // If trip count is known or estimated compile time constant, limit the
5907
- // interleave count to be less than the trip count divided by VF, provided it
5908
- // is at least 1.
5909
- //
5910
- // For scalable vectors we can't know if interleaving is beneficial. It may
5911
- // not be beneficial for small loops if none of the lanes in the second vector
5912
- // iterations is enabled. However, for larger loops, there is likely to be a
5913
- // similar benefit as for fixed-width vectors. For now, we choose to leave
5914
- // the InterleaveCount as if vscale is '1', although if some information about
5915
- // the vector is known (e.g. min vector size), we can make a better decision.
5916
- if (BestKnownTC) {
5917
- MaxInterleaveCount =
5918
- std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5919
- // Make sure MaxInterleaveCount is greater than 0.
5920
- MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5887
+ unsigned EstimatedVF = VF.getKnownMinValue();
5888
+ if (VF.isScalable()) {
5889
+ if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5890
+ EstimatedVF *= *VScale;
5891
+ }
5892
+ assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5893
+
5894
+ unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5895
+ if (KnownTC > 0) {
5896
+ // If a scalar epilogue is required, at least one iteration must be scalar,
5897
+ // so the maximum number of iterations left for interleaving is one less.
5898
+ unsigned AvailableTC =
5899
+ requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5900
+
5901
+ // If the trip count is known, we select between two prospective ICs, where
5902
+ // 1) the aggressive IC is capped by the trip count divided by VF
5903
+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5904
+ // The final IC is selected in a way that the epilogue loop trip count is
5905
+ // minimized while maximizing the IC itself, so that we either run the
5906
+ // vector loop at least once if it generates a small epilogue loop, or else
5907
+ // we run the vector loop at least twice.
5908
+
5909
+ unsigned InterleaveCountUB = bit_floor(
5910
+ std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5911
+ unsigned InterleaveCountLB = bit_floor(std::max(
5912
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5913
+ MaxInterleaveCount = InterleaveCountLB;
5914
+
5915
+ if (InterleaveCountUB != InterleaveCountLB) {
5916
+ unsigned TailTripCountUB =
5917
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
5918
+ unsigned TailTripCountLB =
5919
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
5920
+ // If both produce the same scalar tail, maximize the IC to do the same work
5921
+ // in fewer vector loop iterations.
5922
+ if (TailTripCountUB == TailTripCountLB)
5923
+ MaxInterleaveCount = InterleaveCountUB;
5924
+ }
5925
+ } else if (BestKnownTC && *BestKnownTC > 0) {
5926
+ // If a scalar epilogue is required, at least one iteration must be scalar,
5927
+ // so the maximum number of iterations left for interleaving is one less.
5928
+ unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5929
+ ? (*BestKnownTC) - 1
5930
+ : *BestKnownTC;
5931
+
5932
+ // If the trip count is an estimated compile-time constant, cap the IC at
5933
+ // the trip count divided by (VF * 2), so that the vector loop runs at least
5934
+ // twice and interleaving is likely to be profitable when an epilogue loop
5935
+ // is present. Since the exact trip count is not known, we choose to be
5936
+ // conservative in our IC estimate.
5937
+ MaxInterleaveCount = bit_floor(std::max(
5938
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5921
5939
}
5922
5940
5923
5941
assert(MaxInterleaveCount > 0 &&
@@ -6021,8 +6039,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6021
6039
6022
6040
// If there are scalar reductions and TTI has enabled aggressive
6023
6041
// interleaving for reductions, we will interleave to expose ILP.
6024
- if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6025
- AggressivelyInterleaveReductions) {
6042
+ if (VF.isScalar() && AggressivelyInterleaveReductions) {
6026
6043
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6027
6044
// Interleave no less than SmallIC but not as aggressive as the normal IC
6028
6045
// to satisfy the rare situation when resources are too limited.
0 commit comments