Skip to content

Commit 09d12e9

Browse files
committed
[LV] Pre-committing tests for changing loop interleaving count computation (llvm#70272)
Added tests to evaluate upcoming changes to the loop interleave count computation, and the removal of the loop interleaving threshold, in subsequent patches.
1 parent 2eabcea commit 09d12e9

File tree

2 files changed

+344
-17
lines changed

2 files changed

+344
-17
lines changed
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2+
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
3+
4+
target triple = "aarch64-linux-gnu"
5+
6+
%pair = type { i8, i8 }
7+
8+
; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
9+
; IC 2 since there is no remainder loop run needed when the vector loop runs.
10+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
11+
define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
12+
entry:
13+
br label %for.body
14+
15+
for.body:
16+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
17+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
18+
%tmp1 = load i8, ptr %tmp0, align 1
19+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
20+
%tmp3 = load i8, ptr %tmp2, align 1
21+
%add = add i8 %tmp1, %tmp3
22+
%qi = getelementptr i8, ptr %q, i64 %i
23+
store i8 %add, ptr %qi, align 1
24+
%i.next = add nuw nsw i64 %i, 1
25+
%cond = icmp eq i64 %i.next, 32
26+
br i1 %cond, label %for.end, label %for.body
27+
28+
for.end:
29+
ret void
30+
}
31+
32+
; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
33+
; IC 1 since there may be a remainder loop that needs to run after the vector loop.
34+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
35+
define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
36+
entry:
37+
br label %for.body
38+
39+
for.body:
40+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
41+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
42+
%tmp1 = load i8, ptr %tmp0, align 1
43+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
44+
%tmp3 = load i8, ptr %tmp2, align 1
45+
%add = add i8 %tmp1, %tmp3
46+
%qi = getelementptr i8, ptr %q, i64 %i
47+
store i8 %add, ptr %qi, align 1
48+
%i.next = add nuw nsw i64 %i, 1
49+
%cond = icmp eq i64 %i.next, 33
50+
br i1 %cond, label %for.end, label %for.body
51+
52+
for.end:
53+
ret void
54+
}
55+
56+
; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
57+
; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
58+
; won't need to run.
59+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
60+
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
61+
entry:
62+
br label %for.body
63+
64+
for.body:
65+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
66+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
67+
%tmp1 = load i8, ptr %tmp0, align 1
68+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
69+
%tmp3 = load i8, ptr %tmp2, align 1
70+
%add = add i8 %tmp1, %tmp3
71+
%qi = getelementptr i8, ptr %q, i64 %i
72+
store i8 %add, ptr %qi, align 1
73+
%i.next = add nuw nsw i64 %i, 1
74+
%cond = icmp eq i64 %i.next, %n
75+
br i1 %cond, label %for.end, label %for.body, !prof !0
76+
77+
for.end:
78+
ret void
79+
}
80+
81+
; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33,
82+
; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the
83+
; remainder loop will need to run.
84+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
85+
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
86+
entry:
87+
br label %for.body
88+
89+
for.body:
90+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
91+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
92+
%tmp1 = load i8, ptr %tmp0, align 1
93+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
94+
%tmp3 = load i8, ptr %tmp2, align 1
95+
%add = add i8 %tmp1, %tmp3
96+
%qi = getelementptr i8, ptr %q, i64 %i
97+
store i8 %add, ptr %qi, align 1
98+
%i.next = add nuw nsw i64 %i, 1
99+
%cond = icmp eq i64 %i.next, %n
100+
br i1 %cond, label %for.end, label %for.body, !prof !1
101+
102+
for.end:
103+
ret void
104+
}
105+
106+
!0 = !{!"branch_weights", i32 1, i32 31}
107+
!1 = !{!"branch_weights", i32 1, i32 32}

llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll

Lines changed: 237 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,37 +6,257 @@
66
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
77
target triple = "x86_64-apple-macosx10.8.0"
88

9-
; We don't unroll this loop because it has a small constant trip count.
9+
; We don't unroll this loop because it has a small constant trip count
10+
; for which generating a scalar epilogue is not profitable.
1011
;
11-
; CHECK-VECTOR-LABEL: @foo(
12+
; CHECK-VECTOR-LABEL: @foo_trip_count_8(
1213
; CHECK-VECTOR: load <4 x i32>
1314
; CHECK-VECTOR-NOT: load <4 x i32>
1415
; CHECK-VECTOR: store <4 x i32>
1516
; CHECK-VECTOR-NOT: store <4 x i32>
1617
; CHECK-VECTOR: ret
1718
;
18-
; CHECK-SCALAR-LABEL: @foo(
19+
; CHECK-SCALAR-LABEL: @foo_trip_count_8(
1920
; CHECK-SCALAR: load i32, ptr
2021
; CHECK-SCALAR-NOT: load i32, ptr
2122
; CHECK-SCALAR: store i32
2223
; CHECK-SCALAR-NOT: store i32
2324
; CHECK-SCALAR: ret
24-
define i32 @foo(ptr nocapture %A) nounwind uwtable ssp {
25-
br label %1
25+
define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp {
26+
entry:
27+
br label %for.body
2628

27-
; <label>:1 ; preds = %1, %0
28-
%indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
29-
%2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
30-
%3 = load i32, ptr %2, align 4
31-
%4 = add nsw i32 %3, 6
32-
store i32 %4, ptr %2, align 4
29+
for.body: ; preds = %for.body, %entry
30+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
31+
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
32+
%1 = load i32, ptr %0, align 4
33+
%2 = add nsw i32 %1, 6
34+
store i32 %2, ptr %0, align 4
35+
%indvars.iv.next = add i64 %indvars.iv, 1
36+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
37+
%exitcond = icmp eq i32 %lftr.wideiv, 8
38+
br i1 %exitcond, label %for.end, label %for.body
39+
40+
for.end: ; preds = %for.body
41+
ret void
42+
}
43+
44+
; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
45+
; that the epilogue loop may not need to run, making it profitable for
46+
; the vector loop to run even once.
47+
;
48+
; CHECK-VECTOR-LABEL: @foo_trip_count_16(
49+
; CHECK-VECTOR: load <4 x i32>
50+
; CHECK-VECTOR-NOT: load <4 x i32>
51+
; CHECK-VECTOR: store <4 x i32>
52+
; CHECK-VECTOR-NOT: store <4 x i32>
53+
; CHECK-VECTOR: ret
54+
;
55+
; CHECK-SCALAR-LABEL: @foo_trip_count_16(
56+
; CHECK-SCALAR: load i32, ptr
57+
; CHECK-SCALAR-NOT: load i32, ptr
58+
; CHECK-SCALAR: store i32
59+
; CHECK-SCALAR-NOT: store i32
60+
; CHECK-SCALAR: ret
61+
define void @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp {
62+
entry:
63+
br label %for.body
64+
65+
for.body: ; preds = %for.body, %entry
66+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
67+
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
68+
%1 = load i32, ptr %0, align 4
69+
%2 = add nsw i32 %1, 6
70+
store i32 %2, ptr %0, align 4
71+
%indvars.iv.next = add i64 %indvars.iv, 1
72+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
73+
%exitcond = icmp eq i32 %lftr.wideiv, 16
74+
br i1 %exitcond, label %for.end, label %for.body
75+
76+
for.end: ; preds = %for.body
77+
ret void
78+
}
79+
80+
; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
81+
; the epilogue loop to run, making it profitable when the vector loop runs
82+
; at least twice.
83+
;
84+
; CHECK-VECTOR-LABEL: @foo_trip_count_17(
85+
; CHECK-VECTOR: load <4 x i32>
86+
; CHECK-VECTOR-NOT: load <4 x i32>
87+
; CHECK-VECTOR: store <4 x i32>
88+
; CHECK-VECTOR-NOT: store <4 x i32>
89+
; CHECK-VECTOR: ret
90+
;
91+
; CHECK-SCALAR-LABEL: @foo_trip_count_17(
92+
; CHECK-SCALAR: load i32, ptr
93+
; CHECK-SCALAR-NOT: load i32, ptr
94+
; CHECK-SCALAR: store i32
95+
; CHECK-SCALAR-NOT: store i32
96+
; CHECK-SCALAR: ret
97+
define void @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp {
98+
entry:
99+
br label %for.body
100+
101+
for.body: ; preds = %for.body, %entry
102+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
103+
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
104+
%1 = load i32, ptr %0, align 4
105+
%2 = add nsw i32 %1, 6
106+
store i32 %2, ptr %0, align 4
107+
%indvars.iv.next = add i64 %indvars.iv, 1
108+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
109+
%exitcond = icmp eq i32 %lftr.wideiv, 17
110+
br i1 %exitcond, label %for.end, label %for.body
111+
112+
for.end: ; preds = %for.body
113+
ret void
114+
}
115+
116+
; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
117+
; that the epilogue loop may not need to run, making it profitable for
118+
; the vector loop to run even once. The IC is restricted to 4 since
119+
; that is the maximum supported for the target.
120+
;
121+
; CHECK-VECTOR-LABEL: @foo_trip_count_24(
122+
; CHECK-VECTOR: load <4 x i32>
123+
; CHECK-VECTOR-NOT: load <4 x i32>
124+
; CHECK-VECTOR: store <4 x i32>
125+
; CHECK-VECTOR-NOT: store <4 x i32>
126+
; CHECK-VECTOR: ret
127+
;
128+
; CHECK-SCALAR-LABEL: @foo_trip_count_24(
129+
; CHECK-SCALAR: load i32, ptr
130+
; CHECK-SCALAR-NOT: load i32, ptr
131+
; CHECK-SCALAR: store i32
132+
; CHECK-SCALAR-NOT: store i32
133+
; CHECK-SCALAR: ret
134+
define void @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp {
135+
entry:
136+
br label %for.body
137+
138+
for.body: ; preds = %for.body, %entry
139+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
140+
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
141+
%1 = load i32, ptr %0, align 4
142+
%2 = add nsw i32 %1, 6
143+
store i32 %2, ptr %0, align 4
144+
%indvars.iv.next = add i64 %indvars.iv, 1
145+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
146+
%exitcond = icmp eq i32 %lftr.wideiv, 24
147+
br i1 %exitcond, label %for.end, label %for.body
148+
149+
for.end: ; preds = %for.body
150+
ret void
151+
}
152+
153+
; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
154+
; the epilogue loop to run, making it profitable when the vector loop runs
155+
; at least twice.
156+
;
157+
; CHECK-VECTOR-LABEL: @foo_trip_count_25(
158+
; CHECK-VECTOR: load <4 x i32>
159+
; CHECK-VECTOR-NOT: load <4 x i32>
160+
; CHECK-VECTOR: store <4 x i32>
161+
; CHECK-VECTOR-NOT: store <4 x i32>
162+
; CHECK-VECTOR: ret
163+
;
164+
; CHECK-SCALAR-LABEL: @foo_trip_count_25(
165+
; CHECK-SCALAR: load i32, ptr
166+
; CHECK-SCALAR-NOT: load i32, ptr
167+
; CHECK-SCALAR: store i32
168+
; CHECK-SCALAR-NOT: store i32
169+
; CHECK-SCALAR: ret
170+
define void @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp {
171+
entry:
172+
br label %for.body
173+
174+
for.body: ; preds = %for.body, %entry
175+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
176+
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
177+
%1 = load i32, ptr %0, align 4
178+
%2 = add nsw i32 %1, 6
179+
store i32 %2, ptr %0, align 4
180+
%indvars.iv.next = add i64 %indvars.iv, 1
181+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
182+
%exitcond = icmp eq i32 %lftr.wideiv, 25
183+
br i1 %exitcond, label %for.end, label %for.body
184+
185+
for.end: ; preds = %for.body
186+
ret void
187+
}
188+
189+
; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
190+
; the epilogue loop to run, making it profitable when the vector loop runs
191+
; at least twice.
192+
;
193+
; CHECK-VECTOR-LABEL: @foo_trip_count_33(
194+
; CHECK-VECTOR: load <4 x i32>
195+
; CHECK-VECTOR-NOT: load <4 x i32>
196+
; CHECK-VECTOR: store <4 x i32>
197+
; CHECK-VECTOR-NOT: store <4 x i32>
198+
; CHECK-VECTOR: ret
199+
;
200+
; CHECK-SCALAR-LABEL: @foo_trip_count_33(
201+
; CHECK-SCALAR: load i32, ptr
202+
; CHECK-SCALAR-NOT: load i32, ptr
203+
; CHECK-SCALAR: store i32
204+
; CHECK-SCALAR-NOT: store i32
205+
; CHECK-SCALAR: ret
206+
define void @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp {
207+
entry:
208+
br label %for.body
209+
210+
for.body: ; preds = %for.body, %entry
211+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
212+
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
213+
%1 = load i32, ptr %0, align 4
214+
%2 = add nsw i32 %1, 6
215+
store i32 %2, ptr %0, align 4
33216
%indvars.iv.next = add i64 %indvars.iv, 1
34217
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
35-
%exitcond = icmp eq i32 %lftr.wideiv, 100
36-
br i1 %exitcond, label %5, label %1
218+
%exitcond = icmp eq i32 %lftr.wideiv, 33
219+
br i1 %exitcond, label %for.end, label %for.body
37220

38-
; <label>:5 ; preds = %1
39-
ret i32 undef
221+
for.end: ; preds = %for.body
222+
ret void
223+
}
224+
225+
; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
226+
; the epilogue loop to run, making it profitable when the vector loop runs
227+
; at least twice. The IC is restricted to 4 since that is the maximum supported
228+
; for the target.
229+
;
230+
; CHECK-VECTOR-LABEL: @foo_trip_count_101(
231+
; CHECK-VECTOR: load <4 x i32>
232+
; CHECK-VECTOR-NOT: load <4 x i32>
233+
; CHECK-VECTOR: store <4 x i32>
234+
; CHECK-VECTOR-NOT: store <4 x i32>
235+
; CHECK-VECTOR: ret
236+
;
237+
; CHECK-SCALAR-LABEL: @foo_trip_count_101(
238+
; CHECK-SCALAR: load i32, ptr
239+
; CHECK-SCALAR-NOT: load i32, ptr
240+
; CHECK-SCALAR: store i32
241+
; CHECK-SCALAR-NOT: store i32
242+
; CHECK-SCALAR: ret
243+
define void @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp {
244+
entry:
245+
br label %for.body
246+
247+
for.body: ; preds = %for.body, %entry
248+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
249+
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
250+
%1 = load i32, ptr %0, align 4
251+
%2 = add nsw i32 %1, 6
252+
store i32 %2, ptr %0, align 4
253+
%indvars.iv.next = add i64 %indvars.iv, 1
254+
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
255+
%exitcond = icmp eq i32 %lftr.wideiv, 101
256+
br i1 %exitcond, label %for.end, label %for.body
257+
258+
for.end: ; preds = %for.body
259+
ret void
40260
}
41261

42262
; But this is a good small loop to unroll as we don't know of a bound on its
@@ -53,7 +273,7 @@ define i32 @foo(ptr nocapture %A) nounwind uwtable ssp {
53273
; CHECK-SCALAR: store i32
54274
; CHECK-SCALAR-NOT: store i32
55275
; CHECK-SCALAR: ret
56-
define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
276+
define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
57277
%1 = icmp sgt i32 %n, 0
58278
br i1 %1, label %.lr.ph, label %._crit_edge
59279

@@ -69,7 +289,7 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
69289
br i1 %exitcond, label %._crit_edge, label %.lr.ph
70290

71291
._crit_edge: ; preds = %.lr.ph, %0
72-
ret i32 undef
292+
ret void
73293
}
74294

75295
; Also unroll if we need a runtime check but it was going to be added for

0 commit comments

Comments
 (0)