Skip to content

Commit 3f59fd9

Browse files
committed
Updated Loop IC calculation to minimize the epilogue loop iterations when exact trip count is known.
1 parent 8a2ed43 commit 3f59fd9

File tree

3 files changed

+220
-40
lines changed

3 files changed

+220
-40
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5728,34 +5728,45 @@ LoopVectorizationCostModel::selectInterleaveCount(Loop *L, ElementCount VF,
57285728
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
57295729
}
57305730

5731-
// If trip count is known or estimated compile time constant, limit the
5732-
// interleave count to be less than the trip count divided by VF * 2,
5733-
// provided VF is at least 1 and the trip count is not an exact multiple of
5734-
// VF, such that the vector loop runs at least twice to make interleaving seem
5735-
// profitable when there is an epilogue loop present. When
5736-
// InterleaveSmallLoopScalarReduction is true or trip count is an exact
5737-
// multiple of VF, we allow interleaving even when the vector loop runs once.
5738-
//
5739-
// For scalable vectors we can't know if interleaving is beneficial. It may
5740-
// not be beneficial for small loops if none of the lanes in the second vector
5741-
// iterations is enabled. However, for larger loops, there is likely to be a
5742-
// similar benefit as for fixed-width vectors. For now, we choose to leave
5743-
// the InterleaveCount as if vscale is '1', although if some information about
5744-
// the vector is known (e.g. min vector size), we can make a better decision.
5745-
if (BestKnownTC) {
5746-
unsigned EstimatedVF = VF.getKnownMinValue();
5747-
if (VF.isScalable()) {
5748-
if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
5749-
EstimatedVF *= *VScale;
5731+
unsigned EstimatedVF = VF.getKnownMinValue();
5732+
if (VF.isScalable()) {
5733+
if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
5734+
EstimatedVF *= *VScale;
5735+
}
5736+
assert((EstimatedVF >= 1) && "Estimated VF shouldn't be less than 1");
5737+
5738+
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(L);
5739+
if (KnownTC) {
5740+
// If trip count is known we select between two prospective ICs, where
5741+
// 1) the aggressive IC is capped by the trip count divided by VF
5742+
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
5743+
// The final IC is selected in a way that the epilogue loop trip count is
5744+
// minimized while maximizing the IC itself, so that we either run the
5745+
// vector loop at least once if it generates a small epilogue loop, or else
5746+
// we run the vector loop at least twice.
5747+
5748+
unsigned InterleaveCountUB = bit_floor(
5749+
std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5750+
unsigned InterleaveCountLB = bit_floor(std::max(
5751+
1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5752+
MaxInterleaveCount = InterleaveCountLB;
5753+
5754+
if (InterleaveCountUB != InterleaveCountLB) {
5755+
unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5756+
unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5757+
// If both produce the same scalar tail, maximize the IC to do the same work
5758+
// in fewer vector loop iterations
5759+
if (TailTripCountUB == TailTripCountLB)
5760+
MaxInterleaveCount = InterleaveCountUB;
57505761
}
5751-
if (InterleaveSmallLoopScalarReduction || (*BestKnownTC % EstimatedVF == 0))
5752-
MaxInterleaveCount =
5753-
std::min(*BestKnownTC / EstimatedVF, MaxInterleaveCount);
5754-
else
5755-
MaxInterleaveCount =
5756-
std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount);
5757-
// Make sure MaxInterleaveCount is greater than 0 & a power of 2.
5758-
MaxInterleaveCount = llvm::bit_floor(std::max(1u, MaxInterleaveCount));
5762+
} else if (BestKnownTC) {
5763+
// If trip count is an estimated compile time constant, cap the
5764+
// IC at the trip count divided by (VF * 2), such that the vector
5765+
// loop runs at least twice to make interleaving seem profitable when there
5766+
// is an epilogue loop present. Since the exact trip count is not known, we
5767+
// choose to be conservative in our IC estimate.
5768+
MaxInterleaveCount = bit_floor(std::max(
5769+
1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
57595770
}
57605771

57615772
assert(MaxInterleaveCount > 0 &&

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll

Lines changed: 179 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
1+
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
22
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
33

44
target triple = "aarch64-linux-gnu"
55

66
%pair = type { i8, i8 }
77

88
; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
9-
; IC 2 since there is no remainder loop run needed when the vector loop runs.
9+
; IC 2 since there is no remainder loop run needed after the vector loop runs.
1010
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
1111
define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
1212
entry:
@@ -30,8 +30,8 @@ for.end:
3030
}
3131

3232
; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
33-
; IC 1 since there may be a remainder loop that needs to run after the vector loop.
34-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
33+
; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
34+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
3535
define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
3636
entry:
3737
br label %for.body
@@ -53,10 +53,105 @@ for.end:
5353
ret void
5454
}
5555

56-
; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
57-
; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
58-
; won't need to run
56+
; For this loop with known TC of 39, when the auto-vectorizer chooses VF 16, it should choose
57+
; IC 2 since there is a small remainder loop that needs to run after the vector loop.
5958
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
59+
define void @loop_with_tc_39(ptr noalias %p, ptr noalias %q) {
60+
entry:
61+
br label %for.body
62+
63+
for.body:
64+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
65+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
66+
%tmp1 = load i8, ptr %tmp0, align 1
67+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
68+
%tmp3 = load i8, ptr %tmp2, align 1
69+
%add = add i8 %tmp1, %tmp3
70+
%qi = getelementptr i8, ptr %q, i64 %i
71+
store i8 %add, ptr %qi, align 1
72+
%i.next = add nuw nsw i64 %i, 1
73+
%cond = icmp eq i64 %i.next, 39
74+
br i1 %cond, label %for.end, label %for.body
75+
76+
for.end:
77+
ret void
78+
}
79+
80+
; For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
81+
; IC 1 since there will be no remainder loop that needs to run after the vector loop.
82+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
83+
define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) {
84+
entry:
85+
br label %for.body
86+
87+
for.body:
88+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
89+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
90+
%tmp1 = load i8, ptr %tmp0, align 1
91+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
92+
%tmp3 = load i8, ptr %tmp2, align 1
93+
%add = add i8 %tmp1, %tmp3
94+
%qi = getelementptr i8, ptr %q, i64 %i
95+
store i8 %add, ptr %qi, align 1
96+
%i.next = add nuw nsw i64 %i, 1
97+
%cond = icmp eq i64 %i.next, 48
98+
br i1 %cond, label %for.end, label %for.body
99+
100+
for.end:
101+
ret void
102+
}
103+
104+
; For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
105+
; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2
106+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
107+
define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) {
108+
entry:
109+
br label %for.body
110+
111+
for.body:
112+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
113+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
114+
%tmp1 = load i8, ptr %tmp0, align 1
115+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
116+
%tmp3 = load i8, ptr %tmp2, align 1
117+
%add = add i8 %tmp1, %tmp3
118+
%qi = getelementptr i8, ptr %q, i64 %i
119+
store i8 %add, ptr %qi, align 1
120+
%i.next = add nuw nsw i64 %i, 1
121+
%cond = icmp eq i64 %i.next, 49
122+
br i1 %cond, label %for.end, label %for.body
123+
124+
for.end:
125+
ret void
126+
}
127+
128+
; For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
129+
; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2
130+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
131+
define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) {
132+
entry:
133+
br label %for.body
134+
135+
for.body:
136+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
137+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
138+
%tmp1 = load i8, ptr %tmp0, align 1
139+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
140+
%tmp3 = load i8, ptr %tmp2, align 1
141+
%add = add i8 %tmp1, %tmp3
142+
%qi = getelementptr i8, ptr %q, i64 %i
143+
store i8 %add, ptr %qi, align 1
144+
%i.next = add nuw nsw i64 %i, 1
145+
%cond = icmp eq i64 %i.next, 55
146+
br i1 %cond, label %for.end, label %for.body
147+
148+
for.end:
149+
ret void
150+
}
151+
152+
; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
153+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
154+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
60155
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
61156
entry:
62157
br label %for.body
@@ -78,9 +173,8 @@ for.end:
78173
ret void
79174
}
80175

81-
; For a loop with unknown trip count but a profile showing an approx TC estimate of 33,
82-
; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the
83-
; remainder loop will need to run
176+
; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
177+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
84178
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
85179
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
86180
entry:
@@ -103,5 +197,80 @@ for.end:
103197
ret void
104198
}
105199

200+
; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
201+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
202+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
203+
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
204+
entry:
205+
br label %for.body
206+
207+
for.body:
208+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
209+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
210+
%tmp1 = load i8, ptr %tmp0, align 1
211+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
212+
%tmp3 = load i8, ptr %tmp2, align 1
213+
%add = add i8 %tmp1, %tmp3
214+
%qi = getelementptr i8, ptr %q, i64 %i
215+
store i8 %add, ptr %qi, align 1
216+
%i.next = add nuw nsw i64 %i, 1
217+
%cond = icmp eq i64 %i.next, %n
218+
br i1 %cond, label %for.end, label %for.body, !prof !2
219+
220+
for.end:
221+
ret void
222+
}
223+
224+
; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
225+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
226+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
227+
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
228+
entry:
229+
br label %for.body
230+
231+
for.body:
232+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
233+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
234+
%tmp1 = load i8, ptr %tmp0, align 1
235+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
236+
%tmp3 = load i8, ptr %tmp2, align 1
237+
%add = add i8 %tmp1, %tmp3
238+
%qi = getelementptr i8, ptr %q, i64 %i
239+
store i8 %add, ptr %qi, align 1
240+
%i.next = add nuw nsw i64 %i, 1
241+
%cond = icmp eq i64 %i.next, %n
242+
br i1 %cond, label %for.end, label %for.body, !prof !3
243+
244+
for.end:
245+
ret void
246+
}
247+
248+
; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
249+
; it should conservatively choose IC 2 so that the vector loop runs at least twice
250+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
251+
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
252+
entry:
253+
br label %for.body
254+
255+
for.body:
256+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
257+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
258+
%tmp1 = load i8, ptr %tmp0, align 1
259+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
260+
%tmp3 = load i8, ptr %tmp2, align 1
261+
%add = add i8 %tmp1, %tmp3
262+
%qi = getelementptr i8, ptr %q, i64 %i
263+
store i8 %add, ptr %qi, align 1
264+
%i.next = add nuw nsw i64 %i, 1
265+
%cond = icmp eq i64 %i.next, %n
266+
br i1 %cond, label %for.end, label %for.body, !prof !4
267+
268+
for.end:
269+
ret void
270+
}
271+
106272
!0 = !{!"branch_weights", i32 1, i32 31}
107273
!1 = !{!"branch_weights", i32 1, i32 32}
274+
!2 = !{!"branch_weights", i32 1, i32 47}
275+
!3 = !{!"branch_weights", i32 1, i32 62}
276+
!4 = !{!"branch_weights", i32 1, i32 63}

llvm/test/Transforms/LoopVectorize/X86/interleave_short_tc.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; Check that we won't interleave by more than "best known" estimated trip count.
1+
; Check that we won't interleave by more than half the "best known" estimated trip count.
22

33
; The loop is expected to be vectorized by 4 and interleaving suppressed due to
44
; short trip count which is controlled by "tiny-trip-count-interleave-threshold".
@@ -12,7 +12,7 @@
1212
; Thus the resulting step is 4.
1313
; RUN: opt -passes=loop-vectorize -force-vector-width=2 -vectorizer-min-trip-count=4 -tiny-trip-count-interleave-threshold=4 -S < %s | FileCheck %s
1414

15-
; Check that we won't interleave by more than "best known" estimated trip count.
15+
; Check that we won't interleave by more than half the "best known" estimated trip count.
1616

1717
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
1818
target triple = "x86_64-unknown-linux-gnu"
@@ -56,4 +56,4 @@ for.body: ; preds = %for.body, %for.body
5656
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !prof !1
5757
}
5858

59-
!1 = !{!"branch_weights", i32 1, i32 5}
59+
!1 = !{!"branch_weights", i32 1, i32 9}

0 commit comments

Comments
 (0)