-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[Tests][LV][AArch64] Pre-commit tests for changing loop interleaving count computation for loops that need to run scalar iterations #79640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Tests][LV][AArch64] Pre-commit tests for changing loop interleaving count computation for loops that need to run scalar iterations #79640
Conversation
…unt computation for loops that require a scalar epilogue run
@llvm/pr-subscribers-llvm-transforms Author: Nilanjana Basu (nilanjana87) Changes: This patch contains a set of pre-commit tests for changing the loop interleaving count computation in a subsequent patch in order to address loops that need to execute at least a single scalar iteration in the epilogue. Full diff: https://github.com/llvm/llvm-project/pull/79640.diff 3 Files Affected:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 5552f9dd70c954e..d557ad1ead2563d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -125,6 +125,30 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_profile_tc_64 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 63 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+ ret void
+}
+
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 2 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -173,6 +197,30 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !6
+
+for.end:
+ ret void
+}
+
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 0569bfb2ae4e027..0a90798a3437110 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -29,6 +29,30 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 31 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 32
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -197,6 +221,28 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
; IC 8 since there is a small remainder loop that needs to run after the vector loop
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
new file mode 100644
index 000000000000000..b1b9e6d1e20a953
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -pass-remarks=loop-vectorize -disable-output -S 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For this loop with known TC of 128, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since there is no remainder loop run needed after the vector loop runs
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %load.src = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %load.dst = load i8, ptr %gep.dst, align 1
+ %add = add i8 %load.src, %load.dst
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; This function has the same trip count as loop_with_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: The entry block should branch into the vector loop, instead of the scalar epilogue.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than when using IC 4.
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
+; it should conservatively choose IC 4 so that the vector loop runs at least twice
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %load.src = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %load.dst = load i8, ptr %gep.dst, align 1
+ %add = add i8 %load.src, %load.dst
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+ ret void
+}
+
+; This function has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than IC 4.
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}
|
llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
Outdated
Show resolved
Hide resolved
.../test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
Outdated
Show resolved
Hide resolved
…ent for testing loop ic computation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
…count computation for loops that need to run scalar iterations (llvm#79640) This patch contains a set of pre-commit tests for changing the loop interleaving count computation in a subsequent patch in order to address loops that need to execute at least a single scalar iteration in the epilogue.
This patch contains a set of pre-commit tests for changing the loop interleaving count computation in a subsequent patch (#79651) in order to address loops that need to execute at least a single scalar iteration in the epilogue.