Skip to content

Commit 38e3f26

Browse files
committed
[Tests][LV][AArch64] Pre-commit tests for changing loop interleaving count computation for loops that need to run scalar iterations (llvm#79640)
This patch adds a set of pre-commit tests ahead of a subsequent patch that changes the loop interleave count computation to account for loops that must execute at least one scalar iteration in the epilogue.
1 parent 6e8dc4a commit 38e3f26

File tree

2 files changed

+126
-0
lines changed

2 files changed

+126
-0
lines changed

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2+
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
23
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
34

45
target triple = "aarch64-linux-gnu"
@@ -125,6 +126,30 @@ for.end:
125126
ret void
126127
}
127128

129+
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
130+
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
131+
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
132+
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
133+
; remainder than IC 2
134+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
135+
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
; Copies one byte per iteration from %p to %q (both declared noalias) and
; leaves the loop once the incremented counter equals %n.
136+
entry:
137+
br label %for.body
138+
139+
for.body:
140+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Source address: byte 0 of the %i-th [3 x i8] element of %p, so consecutive
; iterations read addresses that are 3 bytes apart.
141+
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
142+
%l = load i8, ptr %gep.src, align 1
; Destination address: the %i-th byte of %q (contiguous stores).
143+
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
144+
store i8 %l, ptr %gep.dst, align 1
145+
%i.next = add nuw nsw i64 %i, 1
146+
%cond = icmp eq i64 %i.next, %n
; NOTE(review): !4 is defined outside this excerpt; per the description above
; this function it presumably encodes an estimated trip count of 64 - confirm.
147+
br i1 %cond, label %for.end, label %for.body, !prof !4
148+
149+
for.end:
150+
ret void
151+
}
152+
128153
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
129154
; it should choose conservatively IC 2 so that the vector loop runs twice at least
130155
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -153,6 +178,15 @@ for.end:
153178
; it should choose conservatively IC 4 so that the vector loop runs twice at least
154179
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
155180
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
181+
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128(
182+
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
183+
; CHECK-IR-NEXT: iter.check:
184+
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
185+
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
186+
; CHECK-IR: vector.main.loop.iter.check:
187+
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
188+
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
189+
;
156190
entry:
157191
br label %for.body
158192

@@ -173,6 +207,40 @@ for.end:
173207
ret void
174208
}
175209

210+
; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
211+
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
212+
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
213+
; interleaving.
214+
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
215+
; remainder than IC 4
216+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
217+
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
; The IR expectations that follow pin two minimum-iteration guards on %n, both
; using an unsigned less-or-equal compare: against 8 in iter.check and against
; 64 in vector.main.loop.iter.check. 64 equals VF 16 x IC 4 from the remark
; expectation above this function.
; NOTE(review): the 8 is presumably the step of the vectorized epilogue loop,
; and `ule` (rather than `ult`) presumably reserves the iteration needed by
; the scalar epilogue - confirm against the LoopVectorize implementation.
218+
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
219+
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
220+
; CHECK-IR-NEXT: iter.check:
221+
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
222+
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
223+
; CHECK-IR: vector.main.loop.iter.check:
224+
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
225+
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
226+
;
227+
entry:
228+
br label %for.body
229+
230+
for.body:
231+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Source address: byte 0 of the %i-th [3 x i8] element of %p, so consecutive
; iterations read addresses that are 3 bytes apart.
232+
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
233+
%l = load i8, ptr %gep.src, align 1
; Destination address: the %i-th byte of %q (contiguous stores).
234+
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
235+
store i8 %l, ptr %gep.dst, align 1
236+
%i.next = add nuw nsw i64 %i, 1
237+
%cond = icmp eq i64 %i.next, %n
; NOTE(review): !6 is defined outside this excerpt; per the description above
; this function it presumably encodes an estimated trip count of 128 - confirm.
238+
br i1 %cond, label %for.end, label %for.body, !prof !6
239+
240+
for.end:
241+
ret void
242+
}
243+
176244
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
177245
; it should choose conservatively IC 4 so that the vector loop runs twice at least
178246
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2+
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
23
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
34

45
target triple = "aarch64-linux-gnu"
@@ -29,6 +30,30 @@ for.end:
2930
ret void
3031
}
3132

33+
; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
34+
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
35+
; correctness, making at most 31 iterations available for interleaving.
36+
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
37+
; than IC 2
38+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
39+
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
; Copies one byte per iteration from %p to %q (both declared noalias) for a
; compile-time-constant 32 iterations.
40+
entry:
41+
br label %for.body
42+
43+
for.body:
44+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Source address: byte 0 of the %i-th [3 x i8] element of %p, so consecutive
; iterations read addresses that are 3 bytes apart.
45+
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
46+
%l = load i8, ptr %gep.src, align 1
; Destination address: the %i-th byte of %q (contiguous stores).
47+
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
48+
store i8 %l, ptr %gep.dst, align 1
49+
%i.next = add nuw nsw i64 %i, 1
; The counter starts at 0 and the loop exits when the incremented value
; reaches 32, so the body runs exactly 32 times.
50+
%cond = icmp eq i64 %i.next, 32
51+
br i1 %cond, label %for.end, label %for.body
52+
53+
for.end:
54+
ret void
55+
}
56+
3257
; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
3358
; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
3459
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -177,6 +202,10 @@ for.end:
177202
; IC 8 since there is no remainder loop run needed after the vector loop runs
178203
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
179204
define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
205+
; CHECK-IR-LABEL: define void @loop_with_tc_128(
206+
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
207+
; CHECK-IR-NEXT: entry:
208+
; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
180209
entry:
181210
br label %for.body
182211

@@ -197,6 +226,35 @@ for.end:
197226
ret void
198227
}
199228

229+
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
230+
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
231+
; correctness, making at most 127 iterations available for interleaving.
232+
; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
233+
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
234+
; remainder than IC 4 (note: the remark expectation below currently reports IC 8)
235+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
236+
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
; The IR expectations that follow pin the entry branch to a constant-true
; condition whose taken target is the scalar preheader, i.e. the vector loop
; is expected to be bypassed; the TODO above this function tracks that.
; NOTE(review): VF 16 x IC 8 (from the remark expectation above) is 128, the
; full trip count, which presumably leaves no iteration for the required
; scalar epilogue - confirm against the LoopVectorize implementation.
237+
; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
238+
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
239+
; CHECK-IR-NEXT: entry:
240+
; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
241+
entry:
242+
br label %for.body
243+
244+
for.body:
245+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Source address: byte 0 of the %i-th [3 x i8] element of %p, so consecutive
; iterations read addresses that are 3 bytes apart.
246+
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
247+
%l = load i8, ptr %gep.src, align 1
; Destination address: the %i-th byte of %q (contiguous stores).
248+
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
249+
store i8 %l, ptr %gep.dst, align 1
250+
%i.next = add nuw nsw i64 %i, 1
; The counter starts at 0 and the loop exits when the incremented value
; reaches 128, so the body runs exactly 128 times.
251+
%cond = icmp eq i64 %i.next, 128
252+
br i1 %cond, label %for.end, label %for.body
253+
254+
for.end:
255+
ret void
256+
}
257+
200258
; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
201259
; IC 8 since there is a small remainder loop that needs to run after the vector loop
202260
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)

0 commit comments

Comments
 (0)