@@ -5911,7 +5911,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5911
5911
assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
5912
5912
5913
5913
unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
5914
- if (KnownTC) {
5914
+ if (KnownTC > 0 ) {
5915
+ // At least one iteration must be scalar when this constraint holds. So the
5916
+ // maximum available iterations for interleaving is one less.
5917
+ unsigned AvailableTC =
5918
+ requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5919
+
5915
5920
// If trip count is known we select between two prospective ICs, where
5916
5921
// 1) the aggressive IC is capped by the trip count divided by VF
5917
5922
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5921,27 +5926,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5921
5926
// we run the vector loop at least twice.
5922
5927
5923
5928
unsigned InterleaveCountUB = bit_floor (
5924
- std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5929
+ std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5925
5930
unsigned InterleaveCountLB = bit_floor (std::max (
5926
- 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5931
+ 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5927
5932
MaxInterleaveCount = InterleaveCountLB;
5928
5933
5929
5934
if (InterleaveCountUB != InterleaveCountLB) {
5930
- unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5931
- unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5935
+ unsigned TailTripCountUB =
5936
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
5937
+ unsigned TailTripCountLB =
5938
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
5932
5939
// If both produce same scalar tail, maximize the IC to do the same work
5933
5940
// in fewer vector loop iterations
5934
5941
if (TailTripCountUB == TailTripCountLB)
5935
5942
MaxInterleaveCount = InterleaveCountUB;
5936
5943
}
5937
- } else if (BestKnownTC) {
5944
+ } else if (BestKnownTC && *BestKnownTC > 0 ) {
5945
+ // At least one iteration must be scalar when this constraint holds. So the
5946
+ // maximum available iterations for interleaving is one less.
5947
+ unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
5948
+ ? (*BestKnownTC) - 1
5949
+ : *BestKnownTC;
5950
+
5938
5951
// If trip count is an estimated compile time constant, limit the
5939
5952
// IC to be capped by the trip count divided by VF * 2, such that the vector
5940
5953
// loop runs at least twice to make interleaving seem profitable when there
5941
5954
// is an epilogue loop present. Since exact Trip count is not known we
5942
5955
// choose to be conservative in our IC estimate.
5943
5956
MaxInterleaveCount = bit_floor (std::max (
5944
- 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5957
+ 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5945
5958
}
5946
5959
5947
5960
assert (MaxInterleaveCount > 0 &&
0 commit comments