1
1
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2
+ ; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
2
3
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
3
4
4
5
target triple = "aarch64-linux-gnu"
@@ -125,6 +126,30 @@ for.end:
125
126
ret void
126
127
}
127
128
129
+ ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
130
+ ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
131
+ ; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
132
+ ; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
133
+ ; remainder than IC 2 (IC 2 covers 32 of the 63 available iterations, IC 1 covers 48).
134
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
135
+ define void @loop_with_profile_tc_64_scalar_epilogue_reqd (ptr noalias %p , ptr noalias %q , i64 %n ) { ; copies byte 0 of every [3 x i8] element of %p into consecutive bytes of %q
136
+ entry:
137
+ br label %for.body
138
+
139
+ for.body:
140
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ] ; induction variable: 0 on entry, %i.next on the back edge
141
+ %gep.src = getelementptr inbounds [3 x i8 ], ptr %p , i64 %i , i64 0 ; address of byte 0 of the %i-th 3-byte element (stride-3 source access)
142
+ %l = load i8 , ptr %gep.src , align 1 ; only byte 0 of each 3-byte element is read
143
+ %gep.dst = getelementptr inbounds i8 , ptr %q , i64 %i ; address of the %i-th destination byte (unit stride)
144
+ store i8 %l , ptr %gep.dst , align 1
145
+ %i.next = add nuw nsw i64 %i , 1
146
+ %cond = icmp eq i64 %i.next , %n ; exit once %i.next reaches %n
147
+ br i1 %cond , label %for.end , label %for.body , !prof !4 ; !4 is defined outside this excerpt; presumably the branch weights giving the estimated trip count of 64 (confirm)
148
+
149
+ for.end:
150
+ ret void
151
+ }
152
+
128
153
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
129
154
; it should choose conservatively IC 2 so that the vector loop runs twice at least
130
155
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -153,6 +178,15 @@ for.end:
153
178
; it should choose conservatively IC 4 so that the vector loop runs twice at least
154
179
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
155
180
define void @loop_with_profile_tc_128 (ptr noalias %p , ptr noalias %q , i64 %n ) {
181
+ ; CHECK-IR-LABEL: define void @loop_with_profile_tc_128(
182
+ ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
183
+ ; CHECK-IR-NEXT: iter.check:
184
+ ; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
185
+ ; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
186
+ ; CHECK-IR: vector.main.loop.iter.check:
187
+ ; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
188
+ ; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
189
+ ;
156
190
entry:
157
191
br label %for.body
158
192
@@ -173,6 +207,40 @@ for.end:
173
207
ret void
174
208
}
175
209
210
+ ; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
211
+ ; the resulting interleaved group in this case may access memory out-of-bounds, it requires
212
+ ; a scalar epilogue iteration for correctness, making at most 127 iterations available for
213
+ ; interleaving.
214
+ ; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
215
+ ; remainder than IC 4 (IC 4 covers 64 of the 127 available iterations, IC 2 covers 96).
216
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
217
+ define void @loop_with_profile_tc_128_scalar_epilogue_reqd (ptr noalias %p , ptr noalias %q , i64 %n ) { ; copies byte 0 of every [3 x i8] element of %p into consecutive bytes of %q
218
+ ; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
219
+ ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
220
+ ; CHECK-IR-NEXT: iter.check:
221
+ ; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
222
+ ; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
223
+ ; CHECK-IR: vector.main.loop.iter.check:
224
+ ; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
225
+ ; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
226
+ ;
227
+ entry:
228
+ br label %for.body
229
+
230
+ for.body:
231
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ] ; induction variable: 0 on entry, %i.next on the back edge
232
+ %gep.src = getelementptr inbounds [3 x i8 ], ptr %p , i64 %i , i64 0 ; address of byte 0 of the %i-th 3-byte element (stride-3 source access)
233
+ %l = load i8 , ptr %gep.src , align 1 ; only byte 0 of each 3-byte element is read
234
+ %gep.dst = getelementptr inbounds i8 , ptr %q , i64 %i ; address of the %i-th destination byte (unit stride)
235
+ store i8 %l , ptr %gep.dst , align 1
236
+ %i.next = add nuw nsw i64 %i , 1
237
+ %cond = icmp eq i64 %i.next , %n ; exit once %i.next reaches %n
238
+ br i1 %cond , label %for.end , label %for.body , !prof !6 ; !6 is defined outside this excerpt; presumably the branch weights giving the estimated trip count of 128 (confirm)
239
+
240
+ for.end:
241
+ ret void
242
+ }
243
+
176
244
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
177
245
; it should choose conservatively IC 4 so that the vector loop runs twice at least
178
246
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
0 commit comments