diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll index 5552f9dd70c95..97c6d2a819615 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR ; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed target triple = "aarch64-linux-gnu" @@ -125,6 +126,30 @@ for.end: ret void } +; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the +; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar +; epilogue iteration for correctness, making at most 63 iterations available for interleaving. 
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar +; remainder than IC 2 +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0 + %l = load i8, ptr %gep.src, align 1 + %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i + store i8 %l, ptr %gep.dst, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body, !prof !4 + +for.end: + ret void +} + ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 2 so that the vector loop runs twice at least ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) @@ -153,6 +178,15 @@ for.end: ; it should choose conservatively IC 4 so that the vector loop runs twice at least ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4) define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) { +; CHECK-IR-LABEL: define void @loop_with_profile_tc_128( +; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) { +; CHECK-IR-NEXT: iter.check: +; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]] +; CHECK-IR: vector.main.loop.iter.check: +; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64 +; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]] +; entry: br label %for.body @@ -173,6 +207,40 @@ for.end: ret void } +; This has the 
same profile-guided estimated trip count as loop_with_profile_tc_128 but since +; the resulting interleaved group in this case may access memory out-of-bounds, it requires +; a scalar epilogue iteration for correctness, making at most 127 iterations available for +; interleaving. +; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar +; remainder than IC 4 +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4) +define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) { +; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd( +; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) { +; CHECK-IR-NEXT: iter.check: +; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8 +; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]] +; CHECK-IR: vector.main.loop.iter.check: +; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64 +; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]] +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0 + %l = load i8, ptr %gep.src, align 1 + %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i + store i8 %l, ptr %gep.dst, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body, !prof !6 + +for.end: + ret void +} + ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, ; it should choose conservatively IC 4 so that the vector loop runs twice at least ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4) diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll index 0569bfb2ae4e0..526fe0dc0910d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR ; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed target triple = "aarch64-linux-gnu" @@ -29,6 +30,30 @@ for.end: ret void } +; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group +; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for +; correctness, making at most 31 iterations available for interleaving. 
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder +; than IC 2 +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0 + %l = load i8, ptr %gep.src, align 1 + %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i + store i8 %l, ptr %gep.dst, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, 32 + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + ; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose ; IC 2 since there is a small remainder loop TC that needs to run after the vector loop. ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) @@ -177,6 +202,10 @@ for.end: ; IC 8 since there is no remainder loop run needed after the vector loop runs ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8) define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) { +; CHECK-IR-LABEL: define void @loop_with_tc_128( +; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) { +; CHECK-IR-NEXT: entry: +; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] entry: br label %for.body @@ -197,6 +226,35 @@ for.end: ret void } +; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group +; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for +; correctness, making at most 127 iterations available for interleaving. +; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue. 
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar +; remainder than IC 4 +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8) +define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) { +; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd( +; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) { +; CHECK-IR-NEXT: entry: +; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0 + %l = load i8, ptr %gep.src, align 1 + %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i + store i8 %l, ptr %gep.dst, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, 128 + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + ; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose ; IC 8 since there is a small remainder loop that needs to run after the vector loop ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)