Skip to content

Commit 884aef2

Browse files
committed
[LV] Update interleaving count computation when scalar epilogue loop needs to run at least once
1 parent 155f24b commit 884aef2

File tree

3 files changed

+31
-18
lines changed

3 files changed

+31
-18
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
54365436
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
54375437

54385438
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5439-
if (KnownTC) {
5439+
if (KnownTC > 0) {
5440+
// When a scalar epilogue is required, at least one iteration must run in
5441+
// the scalar loop, so one fewer iteration is available for interleaving.
5442+
unsigned AvailableTC =
5443+
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5444+
54405445
// If trip count is known we select between two prospective ICs, where
54415446
// 1) the aggressive IC is capped by the trip count divided by VF
54425447
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
54465451
// we run the vector loop at least twice.
54475452

54485453
unsigned InterleaveCountUB = bit_floor(
5449-
std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5454+
std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
54505455
unsigned InterleaveCountLB = bit_floor(std::max(
5451-
1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5456+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
54525457
MaxInterleaveCount = InterleaveCountLB;
54535458

54545459
if (InterleaveCountUB != InterleaveCountLB) {
5455-
unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5456-
unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5460+
unsigned TailTripCountUB =
5461+
(AvailableTC % (EstimatedVF * InterleaveCountUB));
5462+
unsigned TailTripCountLB =
5463+
(AvailableTC % (EstimatedVF * InterleaveCountLB));
54575464
// If both produce same scalar tail, maximize the IC to do the same work
54585465
// in fewer vector loop iterations
54595466
if (TailTripCountUB == TailTripCountLB)
54605467
MaxInterleaveCount = InterleaveCountUB;
54615468
}
5462-
} else if (BestKnownTC) {
5469+
} else if (BestKnownTC && *BestKnownTC > 0) {
5470+
// When a scalar epilogue is required, at least one iteration must run in
5471+
// the scalar loop, so one fewer iteration is available for interleaving.
5472+
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5473+
? (*BestKnownTC) - 1
5474+
: *BestKnownTC;
5475+
54635476
// If trip count is an estimated compile time constant, limit the
54645477
// IC to be capped by the trip count divided by VF * 2, such that the vector
54655478
// loop runs at least twice to make interleaving seem profitable when there
54665479
// is an epilogue loop present. Since the exact trip count is not known, we
54675480
// choose to be conservative in our IC estimate.
54685481
MaxInterleaveCount = bit_floor(std::max(
5469-
1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5482+
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
54705483
}
54715484

54725485
assert(MaxInterleaveCount > 0 &&

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ for.end:
129129
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
130130
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
131131
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
132-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
132+
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
133133
; remainder than IC 2
134-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
134+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
135135
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
136136
entry:
137137
br label %for.body
@@ -211,17 +211,17 @@ for.end:
211211
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
212212
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
213213
; interleaving.
214-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
214+
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
215215
; remainder than IC 4
216-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
216+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
217217
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
218218
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
219219
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
220220
; CHECK-IR-NEXT: iter.check:
221221
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
222222
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
223223
; CHECK-IR: vector.main.loop.iter.check:
224-
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
224+
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
225225
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
226226
;
227227
entry:

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ for.end:
3333
; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
3434
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
3535
; correctness, making at most 31 iterations available for interleaving.
36-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
36+
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
3737
; than IC 2
38-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
38+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
3939
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
4040
entry:
4141
br label %for.body
@@ -229,15 +229,15 @@ for.end:
229229
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
230230
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
231231
; correctness, making at most 127 iterations available for interleaving.
232-
; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
233-
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
232+
; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
233+
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
234234
; remainder than IC 4
235-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
235+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
236236
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
237237
; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
238238
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
239239
; CHECK-IR-NEXT: entry:
240-
; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
240+
; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
241241
entry:
242242
br label %for.body
243243

0 commit comments

Comments
 (0)