Updated Loop IC calculation to minimize the epilogue loop iterations when exact trip count is known.

nilanjana87 · nilanjana87 · commit 3f59fd9a9e52 · 2023-12-06T17:45:56.000-08:00
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5728,34 +5728,45 @@ LoopVectorizationCostModel::selectInterleaveCount(Loop *L, ElementCount VF,
       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   }
 
-  // If trip count is known or estimated compile time constant, limit the
-  // interleave count to be less than the trip count divided by VF * 2,
-  // provided VF is at least 1 and the trip count is not an exact multiple of
-  // VF, such that the vector loop runs at least twice to make interleaving seem
-  // profitable when there is an epilogue loop present. When
-  // InterleaveSmallLoopScalarReduction is true or trip count is an exact
-  // multiple of VF, we allow interleaving even when the vector loop runs once.
-  //
-  // For scalable vectors we can't know if interleaving is beneficial. It may
-  // not be beneficial for small loops if none of the lanes in the second vector
-  // iterations is enabled. However, for larger loops, there is likely to be a
-  // similar benefit as for fixed-width vectors. For now, we choose to leave
-  // the InterleaveCount as if vscale is '1', although if some information about
-  // the vector is known (e.g. min vector size), we can make a better decision.
-  if (BestKnownTC) {
-    unsigned EstimatedVF = VF.getKnownMinValue();
-    if (VF.isScalable()) {
-      if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
-        EstimatedVF *= *VScale;
+  unsigned EstimatedVF = VF.getKnownMinValue();
+  if (VF.isScalable()) {
+    if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
+      EstimatedVF *= *VScale;
+  }
+  assert((EstimatedVF >= 1) && "Estimated VF shouldn't be less than 1");
+
+  unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(L);
+  if (KnownTC) {
+    // If trip count is known we select between two prospective ICs, where
+    // 1) the aggressive IC is capped by the trip count divided by VF
+    // 2) the conservative IC is capped by the trip count divided by (VF * 2)
+    // The final IC is selected in a way that the epilogue loop trip count is
+    // minimized while maximizing the IC itself, so that we either run the
+    // vector loop at least once if it generates a small epilogue loop, or else
+    // we run the vector loop at least twice.
+
+    unsigned InterleaveCountUB = bit_floor(
+        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+    unsigned InterleaveCountLB = bit_floor(std::max(
+        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+    MaxInterleaveCount = InterleaveCountLB;
+
+    if (InterleaveCountUB != InterleaveCountLB) {
+      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
+      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+      // If both produce the same scalar tail, maximize the IC to do the same
+      // work in fewer vector loop iterations.
+      if (TailTripCountUB == TailTripCountLB)
+        MaxInterleaveCount = InterleaveCountUB;
     }
-    if (InterleaveSmallLoopScalarReduction || (*BestKnownTC % EstimatedVF == 0))
-      MaxInterleaveCount =
-          std::min(*BestKnownTC / EstimatedVF, MaxInterleaveCount);
-    else
-      MaxInterleaveCount =
-          std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount);
-    // Make sure MaxInterleaveCount is greater than 0 & a power of 2.
-    MaxInterleaveCount = llvm::bit_floor(std::max(1u, MaxInterleaveCount));
+  } else if (BestKnownTC) {
+    // If the trip count is only an estimated compile-time constant, cap the IC
+    // at the trip count divided by (VF * 2), such that the vector loop runs at
+    // least twice to make interleaving seem profitable when there is an
+    // epilogue loop present. Since the exact trip count is not known, we
+    // choose to be conservative in our IC estimate.
+    MaxInterleaveCount = bit_floor(std::max(
+        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
   }
 
   assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -1,12 +1,12 @@
-; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
 ; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
 
 target triple = "aarch64-linux-gnu"
 
 %pair = type { i8, i8 }
 
 ; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
-; IC 2 since there is no remainder loop run needed when the vector loop runs.
+; IC 2 since there is no remainder loop run needed after the vector loop runs.
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
 entry:
@@ -30,8 +30,8 @@ for.end:
 }
 
 ; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
-; IC 1 since there may be a remainder loop that needs to run after the vector loop.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
+; IC 2 since the remainder loop TC of 1 is the same as with IC 1.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
 entry:
   br label %for.body
@@ -53,10 +53,105 @@ for.end:
   ret void
 }
 
-; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
-; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
-; won't need to run
+; For this loop with known TC of 39, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since the remainder loop TC of 7 is the same as with IC 1.
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_39(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 39
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since it needs no remainder loop, whereas IC 2 would leave a remainder loop TC of 16.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
+define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 48
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
+define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 49
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
+define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 55
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
+; it should conservatively choose IC 1 so that the vector loop runs at least twice.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
   br label %for.body
@@ -78,9 +173,8 @@ for.end:
   ret void
 }
 
-; For a loop with unknown trip count but a profile showing an approx TC estimate of 33, 
-; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the 
-; remainder loop will need to run
+; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
+; it should conservatively choose IC 1 so that the vector loop runs at least twice.
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
@@ -103,5 +197,80 @@ for.end:
   ret void
 }
 
+; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
+; it should conservatively choose IC 1 so that the vector loop runs at least twice.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
+define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !2
+
+for.end:
+  ret void
+}
+
+; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
+; it should conservatively choose IC 1 so that the vector loop runs at least twice.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
+define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !3
+
+for.end:
+  ret void
+}
+
+; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
+; it should conservatively choose IC 2 so that the vector loop runs at least twice.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+  ret void
+}
+
 !0 = !{!"branch_weights", i32 1, i32 31}
 !1 = !{!"branch_weights", i32 1, i32 32}
+!2 = !{!"branch_weights", i32 1, i32 47}
+!3 = !{!"branch_weights", i32 1, i32 62}
+!4 = !{!"branch_weights", i32 1, i32 63}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave_short_tc.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave_short_tc.ll
@@ -1,4 +1,4 @@
-; Check that we won't interleave by more than "best known" estimated trip count.
+; Check that we won't interleave by more than half the "best known" estimated trip count.
 
 ; The loop is expected to be vectorized by 4 and interleaving suppresed due to
 ; short trip count which is controled by "tiny-trip-count-interleave-threshold".
@@ -12,7 +12,7 @@
 ; Thus the resulting step is 4.
 ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -vectorizer-min-trip-count=4 -tiny-trip-count-interleave-threshold=4 -S < %s |  FileCheck %s
 
-; Check that we won't interleave by more than "best known" estimated trip count.
+; Check that we won't interleave by more than half the "best known" estimated trip count.
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -56,4 +56,4 @@ for.body:                                         ; preds = %for.body, %for.body
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !prof !1
 }
 
-!1 = !{!"branch_weights", i32 1, i32 5}
+!1 = !{!"branch_weights", i32 1, i32 9}