diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
deleted file mode 100644
index 061cdb5643671..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
+++ /dev/null
@@ -1,107 +0,0 @@
-; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
-; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
-
-target triple = "aarch64-linux-gnu"
-
-%pair = type { i8, i8 }
-
-; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
-; IC 2 since there is no remainder loop run needed when the vector loop runs.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, 32
-  br i1 %cond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
-; IC 1 since there may be a remainder loop that needs to run after the vector loop.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, 33
-  br i1 %cond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
-; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
-; won't need to run
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, %n
-  br i1 %cond, label %for.end, label %for.body, !prof !0
-
-for.end:
-  ret void
-}
-
-; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33, 
-; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the 
-; remainder loop will need to run
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, %n
-  br i1 %cond, label %for.end, label %for.body, !prof !1
-
-for.end:
-  ret void
-}
-
-!0 = !{!"branch_weights", i32 1, i32 31}
-!1 = !{!"branch_weights", i32 1, i32 32}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
new file mode 100644
index 0000000000000..6d49d7159998c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -0,0 +1,282 @@
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; TODO: For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !1
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !2
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !3
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16, 
+; it should choose conservatively IC 2 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, 
+; it should choose conservatively IC 2 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
+define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !5
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16, 
+; it should choose conservatively IC 4 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !6
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, 
+; it should choose conservatively IC 4 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !7
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16, 
+; it should choose conservatively IC 4 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !8
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16, 
+; it should choose conservatively IC 4 so that the vector loop runs twice at least
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !9
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16, 
+; the IC will be capped by the target-specific maximum interleave count
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !10
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 31}
+!1 = !{!"branch_weights", i32 1, i32 32}
+!2 = !{!"branch_weights", i32 1, i32 47}
+!3 = !{!"branch_weights", i32 1, i32 62}
+!4 = !{!"branch_weights", i32 1, i32 63}
+!5 = !{!"branch_weights", i32 1, i32 99}
+!6 = !{!"branch_weights", i32 1, i32 127}
+!7 = !{!"branch_weights", i32 1, i32 128}
+!8 = !{!"branch_weights", i32 1, i32 179}
+!9 = !{!"branch_weights", i32 1, i32 192}
+!10 = !{!"branch_weights", i32 1, i32 999}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
new file mode 100644
index 0000000000000..828cbe76489a3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -0,0 +1,294 @@
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since there is no remainder loop run needed after the vector loop runs.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 32
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 33
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 39, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since there is a small remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_39(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 39
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since there will be no remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 48
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 49
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 55
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since a remainder loop TC of 4 is more efficient than remainder loop TC of 36 with IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
+define void @loop_with_tc_100(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 100
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 128, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since there is no remainder loop run needed after the vector loop runs
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since there is a small remainder loop that needs to run after the vector loop
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_129(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 129
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 180, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since the remainder loop of TC 52 cannot be reduced by choosing IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_180(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 180
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose
+; IC 4 since a remainder loop TC of 1 is more efficient than remainder loop TC of 65 with IC 8
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_193(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 193
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with high known TC of 1000, when the auto-vectorizer chooses VF 16, the IC will
+; be capped by the target-specific maximum interleave count
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_1000(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 1000
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}