1
- ; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
1
+ ; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2
2
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
3
3
4
4
target triple = "aarch64-linux-gnu"
5
5
6
6
%pair = type { i8 , i8 }
7
7
8
8
; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
9
- ; IC 2 since there is no remainder loop run needed when the vector loop runs.
9
+ ; IC 2 since no remainder loop needs to run after the vector loop.
10
10
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
11
11
define void @loop_with_tc_32 (ptr noalias %p , ptr noalias %q ) {
12
12
entry:
@@ -30,8 +30,8 @@ for.end:
30
30
}
31
31
32
32
; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
33
- ; IC 1 since there may be a remainder loop that needs to run after the vector loop.
34
- ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1 )
33
+ ; IC 2 since only a small remainder loop needs to run after the vector loop.
34
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2 )
35
35
define void @loop_with_tc_33 (ptr noalias %p , ptr noalias %q ) {
36
36
entry:
37
37
br label %for.body
@@ -53,10 +53,105 @@ for.end:
53
53
ret void
54
54
}
55
55
56
- ; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
57
- ; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
58
- ; won't need to run
56
+ ; For this loop with known TC of 39, when the auto-vectorizer chooses VF 16, it should choose
57
+ ; IC 2 since there is a small remainder loop that needs to run after the vector loop.
59
58
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
59
+ define void @loop_with_tc_39 (ptr noalias %p , ptr noalias %q ) {
60
+ entry:
61
+ br label %for.body
62
+
63
+ for.body:
64
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
65
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
66
+ %tmp1 = load i8 , ptr %tmp0 , align 1
67
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
68
+ %tmp3 = load i8 , ptr %tmp2 , align 1
69
+ %add = add i8 %tmp1 , %tmp3
70
+ %qi = getelementptr i8 , ptr %q , i64 %i
71
+ store i8 %add , ptr %qi , align 1
72
+ %i.next = add nuw nsw i64 %i , 1
73
+ %cond = icmp eq i64 %i.next , 39
74
+ br i1 %cond , label %for.end , label %for.body
75
+
76
+ for.end:
77
+ ret void
78
+ }
79
+
80
+ ; For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
81
+ ; IC 1 since there will be no remainder loop that needs to run after the vector loop.
82
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
83
+ define void @loop_with_tc_48 (ptr noalias %p , ptr noalias %q ) {
84
+ entry:
85
+ br label %for.body
86
+
87
+ for.body:
88
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
89
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
90
+ %tmp1 = load i8 , ptr %tmp0 , align 1
91
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
92
+ %tmp3 = load i8 , ptr %tmp2 , align 1
93
+ %add = add i8 %tmp1 , %tmp3
94
+ %qi = getelementptr i8 , ptr %q , i64 %i
95
+ store i8 %add , ptr %qi , align 1
96
+ %i.next = add nuw nsw i64 %i , 1
97
+ %cond = icmp eq i64 %i.next , 48
98
+ br i1 %cond , label %for.end , label %for.body
99
+
100
+ for.end:
101
+ ret void
102
+ }
103
+
104
+ ; For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
105
+ ; IC 1 since a remainder loop TC of 1 is more efficient than a remainder loop TC of 17 with IC 2.
106
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
107
+ define void @loop_with_tc_49 (ptr noalias %p , ptr noalias %q ) {
108
+ entry:
109
+ br label %for.body
110
+
111
+ for.body:
112
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
113
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
114
+ %tmp1 = load i8 , ptr %tmp0 , align 1
115
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
116
+ %tmp3 = load i8 , ptr %tmp2 , align 1
117
+ %add = add i8 %tmp1 , %tmp3
118
+ %qi = getelementptr i8 , ptr %q , i64 %i
119
+ store i8 %add , ptr %qi , align 1
120
+ %i.next = add nuw nsw i64 %i , 1
121
+ %cond = icmp eq i64 %i.next , 49
122
+ br i1 %cond , label %for.end , label %for.body
123
+
124
+ for.end:
125
+ ret void
126
+ }
127
+
128
+ ; For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
129
+ ; IC 1 since a remainder loop TC of 7 is more efficient than a remainder loop TC of 23 with IC 2.
130
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
131
+ define void @loop_with_tc_55 (ptr noalias %p , ptr noalias %q ) {
132
+ entry:
133
+ br label %for.body
134
+
135
+ for.body:
136
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
137
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
138
+ %tmp1 = load i8 , ptr %tmp0 , align 1
139
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
140
+ %tmp3 = load i8 , ptr %tmp2 , align 1
141
+ %add = add i8 %tmp1 , %tmp3
142
+ %qi = getelementptr i8 , ptr %q , i64 %i
143
+ store i8 %add , ptr %qi , align 1
144
+ %i.next = add nuw nsw i64 %i , 1
145
+ %cond = icmp eq i64 %i.next , 55
146
+ br i1 %cond , label %for.end , label %for.body
147
+
148
+ for.end:
149
+ ret void
150
+ }
151
+
152
+ ; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
153
+ ; it should conservatively choose IC 1 so that the vector loop runs at least twice.
154
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
60
155
define void @loop_with_profile_tc_32 (ptr noalias %p , ptr noalias %q , i64 %n ) {
61
156
entry:
62
157
br label %for.body
@@ -78,9 +173,8 @@ for.end:
78
173
ret void
79
174
}
80
175
81
- ; For a loop with unknown trip count but a profile showing an approx TC estimate of 33,
82
- ; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the
83
- ; remainder loop will need to run
176
+ ; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
177
+ ; it should conservatively choose IC 1 so that the vector loop runs at least twice.
84
178
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
85
179
define void @loop_with_profile_tc_33 (ptr noalias %p , ptr noalias %q , i64 %n ) {
86
180
entry:
@@ -103,5 +197,80 @@ for.end:
103
197
ret void
104
198
}
105
199
200
+ ; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
201
+ ; it should conservatively choose IC 1 so that the vector loop runs at least twice.
202
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
203
+ define void @loop_with_profile_tc_48 (ptr noalias %p , ptr noalias %q , i64 %n ) {
204
+ entry:
205
+ br label %for.body
206
+
207
+ for.body:
208
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
209
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
210
+ %tmp1 = load i8 , ptr %tmp0 , align 1
211
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
212
+ %tmp3 = load i8 , ptr %tmp2 , align 1
213
+ %add = add i8 %tmp1 , %tmp3
214
+ %qi = getelementptr i8 , ptr %q , i64 %i
215
+ store i8 %add , ptr %qi , align 1
216
+ %i.next = add nuw nsw i64 %i , 1
217
+ %cond = icmp eq i64 %i.next , %n
218
+ br i1 %cond , label %for.end , label %for.body , !prof !2
219
+
220
+ for.end:
221
+ ret void
222
+ }
223
+
224
+ ; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
225
+ ; it should conservatively choose IC 1 so that the vector loop runs at least twice.
226
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
227
+ define void @loop_with_profile_tc_63 (ptr noalias %p , ptr noalias %q , i64 %n ) {
228
+ entry:
229
+ br label %for.body
230
+
231
+ for.body:
232
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
233
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
234
+ %tmp1 = load i8 , ptr %tmp0 , align 1
235
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
236
+ %tmp3 = load i8 , ptr %tmp2 , align 1
237
+ %add = add i8 %tmp1 , %tmp3
238
+ %qi = getelementptr i8 , ptr %q , i64 %i
239
+ store i8 %add , ptr %qi , align 1
240
+ %i.next = add nuw nsw i64 %i , 1
241
+ %cond = icmp eq i64 %i.next , %n
242
+ br i1 %cond , label %for.end , label %for.body , !prof !3
243
+
244
+ for.end:
245
+ ret void
246
+ }
247
+
248
+ ; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
249
+ ; it should conservatively choose IC 2 so that the vector loop runs at least twice.
250
+ ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
251
+ define void @loop_with_profile_tc_64 (ptr noalias %p , ptr noalias %q , i64 %n ) {
252
+ entry:
253
+ br label %for.body
254
+
255
+ for.body:
256
+ %i = phi i64 [ 0 , %entry ], [ %i.next , %for.body ]
257
+ %tmp0 = getelementptr %pair , ptr %p , i64 %i , i32 0
258
+ %tmp1 = load i8 , ptr %tmp0 , align 1
259
+ %tmp2 = getelementptr %pair , ptr %p , i64 %i , i32 1
260
+ %tmp3 = load i8 , ptr %tmp2 , align 1
261
+ %add = add i8 %tmp1 , %tmp3
262
+ %qi = getelementptr i8 , ptr %q , i64 %i
263
+ store i8 %add , ptr %qi , align 1
264
+ %i.next = add nuw nsw i64 %i , 1
265
+ %cond = icmp eq i64 %i.next , %n
266
+ br i1 %cond , label %for.end , label %for.body , !prof !4
267
+
268
+ for.end:
269
+ ret void
270
+ }
271
+
106
272
!0 = !{!"branch_weights" , i32 1 , i32 31 }
107
273
!1 = !{!"branch_weights" , i32 1 , i32 32 }
274
+ !2 = !{!"branch_weights" , i32 1 , i32 47 }
275
+ !3 = !{!"branch_weights" , i32 1 , i32 62 }
276
+ !4 = !{!"branch_weights" , i32 1 , i32 63 }
0 commit comments