Skip to content

[LV] Pre-committing tests for changing loop interleaving count computation #70272

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 18, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize 2>&1 | FileCheck %s
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed

target triple = "aarch64-linux-gnu"

%pair = type { i8, i8 }

; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
; IC 2 since no remainder loop needs to run after the vector loop.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; Stores (field 0 + field 1) of the i-th %pair element of %p into byte i of %q,
; for i = 0..31. The exit compare against the constant 32 makes the trip count
; a compile-time constant.
define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load both i8 fields of the i-th %pair element.
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
%tmp1 = load i8, ptr %tmp0, align 1
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
%tmp3 = load i8, ptr %tmp2, align 1
; Sum the two fields and store the i8 result to %q[i].
%add = add i8 %tmp1, %tmp3
%qi = getelementptr i8, ptr %q, i64 %i
store i8 %add, ptr %qi, align 1
%i.next = add nuw nsw i64 %i, 1
; Leave the loop once the incremented induction variable equals 32.
%cond = icmp eq i64 %i.next, 32
br i1 %cond, label %for.end, label %for.body

for.end:
ret void
}

; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
; IC 1 since there may be a remainder loop that needs to run after the vector loop.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; Same loop body as the constant-32 variant: stores (field 0 + field 1) of the
; i-th %pair element of %p into byte i of %q. Here the exit compare is against
; the constant 33, so the loop runs for i = 0..32.
define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load both i8 fields of the i-th %pair element.
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
%tmp1 = load i8, ptr %tmp0, align 1
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
%tmp3 = load i8, ptr %tmp2, align 1
; Sum the two fields and store the i8 result to %q[i].
%add = add i8 %tmp1, %tmp3
%qi = getelementptr i8, ptr %q, i64 %i
store i8 %add, ptr %qi, align 1
%i.next = add nuw nsw i64 %i, 1
; Leave the loop once the incremented induction variable equals 33.
%cond = icmp eq i64 %i.next, 33
br i1 %cond, label %for.end, label %for.body

for.end:
ret void
}

; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
; won't need to run.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; Stores (field 0 + field 1) of the i-th %pair element of %p into byte i of %q.
; The loop bound is the runtime argument %n, so the trip count is not a
; compile-time constant; the only trip-count hint is the !prof metadata on the
; latch branch.
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load both i8 fields of the i-th %pair element.
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
%tmp1 = load i8, ptr %tmp0, align 1
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
%tmp3 = load i8, ptr %tmp2, align 1
; Sum the two fields and store the i8 result to %q[i].
%add = add i8 %tmp1, %tmp3
%qi = getelementptr i8, ptr %q, i64 %i
store i8 %add, ptr %qi, align 1
%i.next = add nuw nsw i64 %i, 1
; Exit when the incremented induction variable equals %n. !0 supplies branch
; weights 1 for the exit edge (%for.end) and 31 for the back-edge (%for.body).
%cond = icmp eq i64 %i.next, %n
br i1 %cond, label %for.end, label %for.body, !prof !0

for.end:
ret void
}

; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33,
; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the
; remainder loop will need to run.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; Stores (field 0 + field 1) of the i-th %pair element of %p into byte i of %q.
; The loop bound is the runtime argument %n, so the trip count is not a
; compile-time constant; the only trip-count hint is the !prof metadata on the
; latch branch.
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load both i8 fields of the i-th %pair element.
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
%tmp1 = load i8, ptr %tmp0, align 1
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
%tmp3 = load i8, ptr %tmp2, align 1
; Sum the two fields and store the i8 result to %q[i].
%add = add i8 %tmp1, %tmp3
%qi = getelementptr i8, ptr %q, i64 %i
store i8 %add, ptr %qi, align 1
%i.next = add nuw nsw i64 %i, 1
; Exit when the incremented induction variable equals %n. !1 supplies branch
; weights 1 for the exit edge (%for.end) and 32 for the back-edge (%for.body).
%cond = icmp eq i64 %i.next, %n
br i1 %cond, label %for.end, label %for.body, !prof !1

for.end:
ret void
}

!0 = !{!"branch_weights", i32 1, i32 31}
!1 = !{!"branch_weights", i32 1, i32 32}
254 changes: 237 additions & 17 deletions llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,257 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; We don't unroll this loop because it has a small constant trip count.
; We don't unroll this loop because it has a small constant trip count
; that is not profitable for generating a scalar epilogue
;
; CHECK-VECTOR-LABEL: @foo(
; CHECK-VECTOR-LABEL: @foo_trip_count_8(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo(
; CHECK-SCALAR-LABEL: @foo_trip_count_8(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
; NOTE(review): the next two lines (the `i32 @foo` header and its entry branch)
; look like the pre-change side of this diff, rendered without -/+ markers;
; confirm against the raw patch before treating them as live IR.
define i32 @foo(ptr nocapture %A) nounwind uwtable ssp {
br label %1
; Adds 6 to each of the first 8 i32 elements of %A in place. The exit compare
; against the constant 8 gives the loop a constant trip count.
define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp {
entry:
br label %for.body

; NOTE(review): the following six lines use the old numbered labels (%0, %1)
; and appear to be the removed body of the old `@foo` — confirm.
; <label>:1 ; preds = %1, %0
%indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
%2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%3 = load i32, ptr %2, align 4
%4 = add nsw i32 %3, 6
store i32 %4, ptr %2, align 4
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load A[i], add 6, and store the result back to the same address.
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%1 = load i32, ptr %0, align 4
%2 = add nsw i32 %1, 6
store i32 %2, ptr %0, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; The exit test compares the low 32 bits of the incremented i64 counter with 8.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 8
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
; that the epilogue loop may not need to run, making it profitable for
; the vector loop to run even once
;
; CHECK-VECTOR-LABEL: @foo_trip_count_16(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_16(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
; Adds 6 to each of the first 16 i32 elements of %A in place. The exit compare
; against the constant 16 gives the loop a constant trip count.
define void @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load A[i], add 6, and store the result back to the same address.
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%1 = load i32, ptr %0, align 4
%2 = add nsw i32 %1, 6
store i32 %2, ptr %0, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; The exit test compares the low 32 bits of the incremented i64 counter with 16.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 16
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
; the epilogue loop to run, making it profitable when the vector loop runs
; at least twice.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_17(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_17(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
; Adds 6 to each of the first 17 i32 elements of %A in place. The exit compare
; against the constant 17 gives the loop a constant trip count.
define void @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load A[i], add 6, and store the result back to the same address.
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%1 = load i32, ptr %0, align 4
%2 = add nsw i32 %1, 6
store i32 %2, ptr %0, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; The exit test compares the low 32 bits of the incremented i64 counter with 17.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 17
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
; that the epilogue loop may not need to run, making it profitable for
; the vector loop to run even once. The IC is restricted to 4 since
; that is the maximum supported for the target.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_24(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_24(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
; Adds 6 to each of the first 24 i32 elements of %A in place. The exit compare
; against the constant 24 gives the loop a constant trip count.
define void @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load A[i], add 6, and store the result back to the same address.
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%1 = load i32, ptr %0, align 4
%2 = add nsw i32 %1, 6
store i32 %2, ptr %0, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; The exit test compares the low 32 bits of the incremented i64 counter with 24.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 24
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
; the epilogue loop to run, making it profitable when the vector loop runs
; at least twice.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_25(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_25(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
; Adds 6 to each of the first 25 i32 elements of %A in place. The exit compare
; against the constant 25 gives the loop a constant trip count.
define void @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load A[i], add 6, and store the result back to the same address.
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%1 = load i32, ptr %0, align 4
%2 = add nsw i32 %1, 6
store i32 %2, ptr %0, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; The exit test compares the low 32 bits of the incremented i64 counter with 25.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 25
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
; the epilogue loop to run, making it profitable when the vector loop runs
; at least twice.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_33(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_33(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
; Adds 6 to each of the first 33 i32 elements of %A in place. The exit compare
; against the constant 33 gives the loop a constant trip count.
define void @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load A[i], add 6, and store the result back to the same address.
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%1 = load i32, ptr %0, align 4
%2 = add nsw i32 %1, 6
store i32 %2, ptr %0, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
; NOTE(review): the next two lines (compare against 100, branch to %5/%1) look
; like the pre-change side of this diff, rendered without -/+ markers; they
; redefine %exitcond and name labels this function does not define — confirm.
%exitcond = icmp eq i32 %lftr.wideiv, 100
br i1 %exitcond, label %5, label %1
; The exit test compares the low 32 bits of the incremented i64 counter with 33.
%exitcond = icmp eq i32 %lftr.wideiv, 33
br i1 %exitcond, label %for.end, label %for.body

; NOTE(review): the old `<label>:5` block and its `ret i32 undef` below also
; appear to be removed lines from the diff — confirm.
; <label>:5 ; preds = %1
ret i32 undef
for.end: ; preds = %for.body
ret void
}

; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
; the epilogue loop to run, making it profitable when the vector loop runs
; at least twice. The IC is restricted to 4 since that is the maximum supported
; for the target.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_101(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_101(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
; Adds 6 to each of the first 101 i32 elements of %A in place. The exit compare
; against the constant 101 gives the loop a constant trip count.
define void @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load A[i], add 6, and store the result back to the same address.
%0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%1 = load i32, ptr %0, align 4
%2 = add nsw i32 %1, 6
store i32 %2, ptr %0, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
; The exit test compares the low 32 bits of the incremented i64 counter with 101.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 101
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

; But this is a good small loop to unroll as we don't know of a bound on its
Expand All @@ -53,7 +273,7 @@ define i32 @foo(ptr nocapture %A) nounwind uwtable ssp {
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge

Expand All @@ -69,7 +289,7 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge: ; preds = %.lr.ph, %0
ret i32 undef
ret void
}

; Also unroll if we need a runtime check but it was going to be added for
Expand Down