Skip to content

Commit 0c62e58

Browse files
committed
[LV] Update interleaving count computation when scalar epilogue loop needs to run at least once (llvm#79651)
Update the loop interleave count computation to handle loops that require at least one scalar iteration to run in the epilogue loop. In that case, the trip count available for interleaving is one less than the loop's trip count.
1 parent 38e3f26 commit 0c62e58

File tree

3 files changed: +31 -18 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5911,7 +5911,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
59115911
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
59125912

59135913
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5914-
if (KnownTC) {
5914+
if (KnownTC > 0) {
5915+
// At least one iteration must be scalar when this constraint holds. So the
5916+
// maximum available iterations for interleaving is one less.
5917+
unsigned AvailableTC =
5918+
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5919+
59155920
// If trip count is known we select between two prospective ICs, where
59165921
// 1) the aggressive IC is capped by the trip count divided by VF
59175922
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5921,27 +5926,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
59215926
// we run the vector loop at least twice.
59225927

59235928
unsigned InterleaveCountUB = bit_floor(
5924-
std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5929+
std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
59255930
unsigned InterleaveCountLB = bit_floor(std::max(
5926-
1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5931+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
59275932
MaxInterleaveCount = InterleaveCountLB;
59285933

59295934
if (InterleaveCountUB != InterleaveCountLB) {
5930-
unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5931-
unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5935+
unsigned TailTripCountUB =
5936+
(AvailableTC % (EstimatedVF * InterleaveCountUB));
5937+
unsigned TailTripCountLB =
5938+
(AvailableTC % (EstimatedVF * InterleaveCountLB));
59325939
// If both produce same scalar tail, maximize the IC to do the same work
59335940
// in fewer vector loop iterations
59345941
if (TailTripCountUB == TailTripCountLB)
59355942
MaxInterleaveCount = InterleaveCountUB;
59365943
}
5937-
} else if (BestKnownTC) {
5944+
} else if (BestKnownTC && *BestKnownTC > 0) {
5945+
// At least one iteration must be scalar when this constraint holds. So the
5946+
// maximum available iterations for interleaving is one less.
5947+
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5948+
? (*BestKnownTC) - 1
5949+
: *BestKnownTC;
5950+
59385951
// If trip count is an estimated compile time constant, limit the
59395952
// IC to be capped by the trip count divided by VF * 2, such that the vector
59405953
// loop runs at least twice to make interleaving seem profitable when there
59415954
// is an epilogue loop present. Since exact Trip count is not known we
59425955
// choose to be conservative in our IC estimate.
59435956
MaxInterleaveCount = bit_floor(std::max(
5944-
1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5957+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
59455958
}
59465959

59475960
assert(MaxInterleaveCount > 0 &&

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ for.end:
129129
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
130130
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
131131
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
132-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
132+
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
133133
; remainder than IC 2
134-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
134+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
135135
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
136136
entry:
137137
br label %for.body
@@ -211,17 +211,17 @@ for.end:
211211
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
212212
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
213213
; interleaving.
214-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
214+
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
215215
; remainder than IC 4
216-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
216+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
217217
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
218218
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
219219
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
220220
; CHECK-IR-NEXT: iter.check:
221221
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
222222
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
223223
; CHECK-IR: vector.main.loop.iter.check:
224-
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
224+
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
225225
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
226226
;
227227
entry:

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ for.end:
3333
; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
3434
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
3535
; correctness, making at most 31 iterations available for interleaving.
36-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
36+
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
3737
; than IC 2
38-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
38+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
3939
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
4040
entry:
4141
br label %for.body
@@ -229,15 +229,15 @@ for.end:
229229
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
230230
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
231231
; correctness, making at most 127 iterations available for interleaving.
232-
; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
233-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
232+
; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
233+
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
234234
; remainder than IC 4
235-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
235+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
236236
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
237237
; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
238238
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
239239
; CHECK-IR-NEXT: entry:
240-
; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
240+
; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
241241
entry:
242242
br label %for.body
243243

0 commit comments

Comments
 (0)