6
6
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
7
7
target triple = "x86_64-apple-macosx10.8.0"
8
8
9
- ; We don't unroll this loop because it has a small constant trip count.
9
+ ; We don't unroll this loop because it has a small constant trip count
10
+ ; that is not profitable for generating a scalar epilogue.
10
11
;
11
- ; CHECK-VECTOR-LABEL: @foo (
12
+ ; CHECK-VECTOR-LABEL: @foo_trip_count_8(
12
13
; CHECK-VECTOR: load <4 x i32>
13
14
; CHECK-VECTOR-NOT: load <4 x i32>
14
15
; CHECK-VECTOR: store <4 x i32>
15
16
; CHECK-VECTOR-NOT: store <4 x i32>
16
17
; CHECK-VECTOR: ret
17
18
;
18
- ; CHECK-SCALAR-LABEL: @foo (
19
+ ; CHECK-SCALAR-LABEL: @foo_trip_count_8(
19
20
; CHECK-SCALAR: load i32, ptr
20
21
; CHECK-SCALAR-NOT: load i32, ptr
21
22
; CHECK-SCALAR: store i32
22
23
; CHECK-SCALAR-NOT: store i32
23
24
; CHECK-SCALAR: ret
24
- define i32 @foo (ptr nocapture %A ) nounwind uwtable ssp {
25
- br label %1
25
+ define void @foo_trip_count_8 (ptr nocapture %A ) nounwind uwtable ssp { ; adds 6 to each of A[0] .. A[7]; the loop has a constant trip count of 8
26
+ entry:
27
+ br label %for.body ; fall into the loop
26
28
27
- ; <label>:1 ; preds = %1, %0
28
- %indvars.iv = phi i64 [ 0 , %0 ], [ %indvars.iv.next , %1 ]
29
- %2 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv
30
- %3 = load i32 , ptr %2 , align 4
31
- %4 = add nsw i32 %3 , 6
32
- store i32 %4 , ptr %2 , align 4
29
+ for.body: ; preds = %for.body, %entry
30
+ %indvars.iv = phi i64 [ 0 , %entry ], [ %indvars.iv.next , %for.body ] ; element index: 0 on entry, previous index + 1 on the back edge
31
+ %0 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv ; address of the i32 element A[index]
32
+ %1 = load i32 , ptr %0 , align 4 ; current value of A[index]
33
+ %2 = add nsw i32 %1 , 6 ; value + 6
34
+ store i32 %2 , ptr %0 , align 4 ; write the sum back to A[index]
35
+ %indvars.iv.next = add i64 %indvars.iv , 1 ; index + 1
36
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; 32-bit copy of the next index for the exit compare
37
+ %exitcond = icmp eq i32 %lftr.wideiv , 8 ; true once the next index reaches 8
38
+ br i1 %exitcond , label %for.end , label %for.body ; exit when %exitcond is true, otherwise loop again
39
+
40
+ for.end: ; preds = %for.body
41
+ ret void
42
+ }
43
+
44
+ ; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
45
+ ; that the epilogue loop may not need to run, making it profitable for
46
+ ; the vector loop to run even once.
47
+ ;
48
+ ; CHECK-VECTOR-LABEL: @foo_trip_count_16(
49
+ ; CHECK-VECTOR: load <4 x i32>
50
+ ; CHECK-VECTOR-NOT: load <4 x i32>
51
+ ; CHECK-VECTOR: store <4 x i32>
52
+ ; CHECK-VECTOR-NOT: store <4 x i32>
53
+ ; CHECK-VECTOR: ret
54
+ ;
55
+ ; CHECK-SCALAR-LABEL: @foo_trip_count_16(
56
+ ; CHECK-SCALAR: load i32, ptr
57
+ ; CHECK-SCALAR-NOT: load i32, ptr
58
+ ; CHECK-SCALAR: store i32
59
+ ; CHECK-SCALAR-NOT: store i32
60
+ ; CHECK-SCALAR: ret
61
+ define void @foo_trip_count_16 (ptr nocapture %A ) nounwind uwtable ssp { ; adds 6 to each of A[0] .. A[15]; the loop has a constant trip count of 16
61
+ entry:
62
+ br label %for.body ; fall into the loop
63
+
64
+ for.body: ; preds = %for.body, %entry
65
+ %indvars.iv = phi i64 [ 0 , %entry ], [ %indvars.iv.next , %for.body ] ; element index: 0 on entry, previous index + 1 on the back edge
66
+ %0 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv ; address of the i32 element A[index]
67
+ %1 = load i32 , ptr %0 , align 4 ; current value of A[index]
68
+ %2 = add nsw i32 %1 , 6 ; value + 6
69
+ store i32 %2 , ptr %0 , align 4 ; write the sum back to A[index]
70
+ %indvars.iv.next = add i64 %indvars.iv , 1 ; index + 1
71
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; 32-bit copy of the next index for the exit compare
72
+ %exitcond = icmp eq i32 %lftr.wideiv , 16 ; true once the next index reaches 16
73
+ br i1 %exitcond , label %for.end , label %for.body ; exit when %exitcond is true, otherwise loop again
74
+
75
+ for.end: ; preds = %for.body
76
+ ret void
77
+ }
79
+
80
+ ; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
81
+ ; the epilogue loop to run, making it profitable when the vector loop runs
82
+ ; at least twice.
83
+ ;
84
+ ; CHECK-VECTOR-LABEL: @foo_trip_count_17(
85
+ ; CHECK-VECTOR: load <4 x i32>
86
+ ; CHECK-VECTOR-NOT: load <4 x i32>
87
+ ; CHECK-VECTOR: store <4 x i32>
88
+ ; CHECK-VECTOR-NOT: store <4 x i32>
89
+ ; CHECK-VECTOR: ret
90
+ ;
91
+ ; CHECK-SCALAR-LABEL: @foo_trip_count_17(
92
+ ; CHECK-SCALAR: load i32, ptr
93
+ ; CHECK-SCALAR-NOT: load i32, ptr
94
+ ; CHECK-SCALAR: store i32
95
+ ; CHECK-SCALAR-NOT: store i32
96
+ ; CHECK-SCALAR: ret
97
+ define void @foo_trip_count_17 (ptr nocapture %A ) nounwind uwtable ssp { ; adds 6 to each of A[0] .. A[16]; the loop has a constant trip count of 17
97
+ entry:
98
+ br label %for.body ; fall into the loop
99
+
100
+ for.body: ; preds = %for.body, %entry
101
+ %indvars.iv = phi i64 [ 0 , %entry ], [ %indvars.iv.next , %for.body ] ; element index: 0 on entry, previous index + 1 on the back edge
102
+ %0 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv ; address of the i32 element A[index]
103
+ %1 = load i32 , ptr %0 , align 4 ; current value of A[index]
104
+ %2 = add nsw i32 %1 , 6 ; value + 6
105
+ store i32 %2 , ptr %0 , align 4 ; write the sum back to A[index]
106
+ %indvars.iv.next = add i64 %indvars.iv , 1 ; index + 1
107
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; 32-bit copy of the next index for the exit compare
108
+ %exitcond = icmp eq i32 %lftr.wideiv , 17 ; true once the next index reaches 17
109
+ br i1 %exitcond , label %for.end , label %for.body ; exit when %exitcond is true, otherwise loop again
110
+
111
+ for.end: ; preds = %for.body
112
+ ret void
113
+ }
115
+
116
+ ; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
117
+ ; that the epilogue loop may not need to run, making it profitable for
118
+ ; the vector loop to run even once. The IC is restricted to 4 since
119
+ ; that is the maximum supported for the target.
120
+ ;
121
+ ; CHECK-VECTOR-LABEL: @foo_trip_count_24(
122
+ ; CHECK-VECTOR: load <4 x i32>
123
+ ; CHECK-VECTOR-NOT: load <4 x i32>
124
+ ; CHECK-VECTOR: store <4 x i32>
125
+ ; CHECK-VECTOR-NOT: store <4 x i32>
126
+ ; CHECK-VECTOR: ret
127
+ ;
128
+ ; CHECK-SCALAR-LABEL: @foo_trip_count_24(
129
+ ; CHECK-SCALAR: load i32, ptr
130
+ ; CHECK-SCALAR-NOT: load i32, ptr
131
+ ; CHECK-SCALAR: store i32
132
+ ; CHECK-SCALAR-NOT: store i32
133
+ ; CHECK-SCALAR: ret
134
+ define void @foo_trip_count_24 (ptr nocapture %A ) nounwind uwtable ssp { ; adds 6 to each of A[0] .. A[23]; the loop has a constant trip count of 24
134
+ entry:
135
+ br label %for.body ; fall into the loop
136
+
137
+ for.body: ; preds = %for.body, %entry
138
+ %indvars.iv = phi i64 [ 0 , %entry ], [ %indvars.iv.next , %for.body ] ; element index: 0 on entry, previous index + 1 on the back edge
139
+ %0 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv ; address of the i32 element A[index]
140
+ %1 = load i32 , ptr %0 , align 4 ; current value of A[index]
141
+ %2 = add nsw i32 %1 , 6 ; value + 6
142
+ store i32 %2 , ptr %0 , align 4 ; write the sum back to A[index]
143
+ %indvars.iv.next = add i64 %indvars.iv , 1 ; index + 1
144
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; 32-bit copy of the next index for the exit compare
145
+ %exitcond = icmp eq i32 %lftr.wideiv , 24 ; true once the next index reaches 24
146
+ br i1 %exitcond , label %for.end , label %for.body ; exit when %exitcond is true, otherwise loop again
147
+
148
+ for.end: ; preds = %for.body
149
+ ret void
150
+ }
152
+
153
+ ; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
154
+ ; the epilogue loop to run, making it profitable when the vector loop runs
155
+ ; at least twice.
156
+ ;
157
+ ; CHECK-VECTOR-LABEL: @foo_trip_count_25(
158
+ ; CHECK-VECTOR: load <4 x i32>
159
+ ; CHECK-VECTOR-NOT: load <4 x i32>
160
+ ; CHECK-VECTOR: store <4 x i32>
161
+ ; CHECK-VECTOR-NOT: store <4 x i32>
162
+ ; CHECK-VECTOR: ret
163
+ ;
164
+ ; CHECK-SCALAR-LABEL: @foo_trip_count_25(
165
+ ; CHECK-SCALAR: load i32, ptr
166
+ ; CHECK-SCALAR-NOT: load i32, ptr
167
+ ; CHECK-SCALAR: store i32
168
+ ; CHECK-SCALAR-NOT: store i32
169
+ ; CHECK-SCALAR: ret
170
+ define void @foo_trip_count_25 (ptr nocapture %A ) nounwind uwtable ssp { ; adds 6 to each of A[0] .. A[24]; the loop has a constant trip count of 25
170
+ entry:
171
+ br label %for.body ; fall into the loop
172
+
173
+ for.body: ; preds = %for.body, %entry
174
+ %indvars.iv = phi i64 [ 0 , %entry ], [ %indvars.iv.next , %for.body ] ; element index: 0 on entry, previous index + 1 on the back edge
175
+ %0 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv ; address of the i32 element A[index]
176
+ %1 = load i32 , ptr %0 , align 4 ; current value of A[index]
177
+ %2 = add nsw i32 %1 , 6 ; value + 6
178
+ store i32 %2 , ptr %0 , align 4 ; write the sum back to A[index]
179
+ %indvars.iv.next = add i64 %indvars.iv , 1 ; index + 1
180
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; 32-bit copy of the next index for the exit compare
181
+ %exitcond = icmp eq i32 %lftr.wideiv , 25 ; true once the next index reaches 25
182
+ br i1 %exitcond , label %for.end , label %for.body ; exit when %exitcond is true, otherwise loop again
183
+
184
+ for.end: ; preds = %for.body
185
+ ret void
186
+ }
188
+
189
+ ; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
190
+ ; the epilogue loop to run, making it profitable when the vector loop runs
191
+ ; at least twice.
192
+ ;
193
+ ; CHECK-VECTOR-LABEL: @foo_trip_count_33(
194
+ ; CHECK-VECTOR: load <4 x i32>
195
+ ; CHECK-VECTOR-NOT: load <4 x i32>
196
+ ; CHECK-VECTOR: store <4 x i32>
197
+ ; CHECK-VECTOR-NOT: store <4 x i32>
198
+ ; CHECK-VECTOR: ret
199
+ ;
200
+ ; CHECK-SCALAR-LABEL: @foo_trip_count_33(
201
+ ; CHECK-SCALAR: load i32, ptr
202
+ ; CHECK-SCALAR-NOT: load i32, ptr
203
+ ; CHECK-SCALAR: store i32
204
+ ; CHECK-SCALAR-NOT: store i32
205
+ ; CHECK-SCALAR: ret
206
+ define void @foo_trip_count_33 (ptr nocapture %A ) nounwind uwtable ssp { ; adds 6 to each of A[0] .. A[32]; the loop has a constant trip count of 33
207
+ entry:
208
+ br label %for.body ; fall into the loop
209
+
210
+ for.body: ; preds = %for.body, %entry
211
+ %indvars.iv = phi i64 [ 0 , %entry ], [ %indvars.iv.next , %for.body ] ; element index: 0 on entry, previous index + 1 on the back edge
212
+ %0 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv ; address of the i32 element A[index]
213
+ %1 = load i32 , ptr %0 , align 4 ; current value of A[index]
214
+ %2 = add nsw i32 %1 , 6 ; value + 6
215
+ store i32 %2 , ptr %0 , align 4 ; write the sum back to A[index]
33
216
%indvars.iv.next = add i64 %indvars.iv , 1 ; index + 1
34
217
%lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; 32-bit copy of the next index for the exit compare
35
- %exitcond = icmp eq i32 %lftr.wideiv , 100
36
- br i1 %exitcond , label %5 , label %1
218
+ %exitcond = icmp eq i32 %lftr.wideiv , 33 ; true once the next index reaches 33
219
+ br i1 %exitcond , label %for.end , label %for.body ; exit when %exitcond is true, otherwise loop again
37
220
38
- ; <label>:5 ; preds = %1
39
- ret i32 undef
221
+ for.end: ; preds = %for.body
222
+ ret void
223
+ }
224
+
225
+ ; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
226
+ ; the epilogue loop to run, making it profitable when the vector loop runs
227
+ ; at least twice. The IC is restricted to 4 since that is the maximum supported
228
+ ; for the target.
229
+ ;
230
+ ; CHECK-VECTOR-LABEL: @foo_trip_count_101(
231
+ ; CHECK-VECTOR: load <4 x i32>
232
+ ; CHECK-VECTOR-NOT: load <4 x i32>
233
+ ; CHECK-VECTOR: store <4 x i32>
234
+ ; CHECK-VECTOR-NOT: store <4 x i32>
235
+ ; CHECK-VECTOR: ret
236
+ ;
237
+ ; CHECK-SCALAR-LABEL: @foo_trip_count_101(
238
+ ; CHECK-SCALAR: load i32, ptr
239
+ ; CHECK-SCALAR-NOT: load i32, ptr
240
+ ; CHECK-SCALAR: store i32
241
+ ; CHECK-SCALAR-NOT: store i32
242
+ ; CHECK-SCALAR: ret
243
+ define void @foo_trip_count_101 (ptr nocapture %A ) nounwind uwtable ssp { ; adds 6 to each of A[0] .. A[100]; the loop has a constant trip count of 101
243
+ entry:
244
+ br label %for.body ; fall into the loop
245
+
246
+ for.body: ; preds = %for.body, %entry
247
+ %indvars.iv = phi i64 [ 0 , %entry ], [ %indvars.iv.next , %for.body ] ; element index: 0 on entry, previous index + 1 on the back edge
248
+ %0 = getelementptr inbounds i32 , ptr %A , i64 %indvars.iv ; address of the i32 element A[index]
249
+ %1 = load i32 , ptr %0 , align 4 ; current value of A[index]
250
+ %2 = add nsw i32 %1 , 6 ; value + 6
251
+ store i32 %2 , ptr %0 , align 4 ; write the sum back to A[index]
252
+ %indvars.iv.next = add i64 %indvars.iv , 1 ; index + 1
253
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; 32-bit copy of the next index for the exit compare
254
+ %exitcond = icmp eq i32 %lftr.wideiv , 101 ; true once the next index reaches 101
255
+ br i1 %exitcond , label %for.end , label %for.body ; exit when %exitcond is true, otherwise loop again
256
+
257
+ for.end: ; preds = %for.body
258
+ ret void
40
260
}
41
261
42
262
; But this is a good small loop to unroll as we don't know of a bound on its
@@ -53,7 +273,7 @@ define i32 @foo(ptr nocapture %A) nounwind uwtable ssp {
53
273
; CHECK-SCALAR: store i32
54
274
; CHECK-SCALAR-NOT: store i32
55
275
; CHECK-SCALAR: ret
56
- define i32 @bar (ptr nocapture %A , i32 %n ) nounwind uwtable ssp {
276
+ define void @bar (ptr nocapture %A , i32 %n ) nounwind uwtable ssp {
57
277
%1 = icmp sgt i32 %n , 0
58
278
br i1 %1 , label %.lr.ph , label %._crit_edge
59
279
@@ -69,7 +289,7 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
69
289
br i1 %exitcond , label %._crit_edge , label %.lr.ph
70
290
71
291
._crit_edge: ; preds = %.lr.ph, %0
72
- ret i32 undef
292
+ ret void
73
293
}
74
294
75
295
; Also unroll if we need a runtime check but it was going to be added for
0 commit comments