[LV] Change loops' interleave count computation

nilanjana87 · nilanjana87 · commit 2ef770ddff79 · 2023-10-24T15:50:54.000-07:00
A set of microbenchmarks in llvm-test-suite (llvm/llvm-test-suite#26), when tested on a AArch64 platform, demonstrates that loop interleaving is beneficial in two cases: 1) when TC > 2 * VW * IC, such that the interleaved vectorized portion of the loop runs at least twice 2) when TC is an exact multiple of VW * IC, such that there is no epilogue loop to run where, TC = trip count, VW = vectorization width, IC = interleaving count We change the interleave count computation based on this information but we leave it the same when the flag InterleaveSmallLoopScalarReductionTrue is set to true, since it handles a special case (https://reviews.llvm.org/D81416).
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5745,8 +5745,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   }
 
   // If trip count is known or estimated compile time constant, limit the
-  // interleave count to be less than the trip count divided by VF, provided it
-  // is at least 1.
+  // interleave count to be less than the trip count divided by VF * 2,
+  // provided VF is at least 1 and the trip count is not an exact multiple of
+  // VF, such that the vector loop runs at least twice to make interleaving seem
+  // profitable when there is an epilogue loop present. When
+  // InterleaveSmallLoopScalarReduction is true or trip count is an exact
+  // multiple of VF, we allow interleaving even when the vector loop runs once.
   //
   // For scalable vectors we can't know if interleaving is beneficial. It may
   // not be beneficial for small loops if none of the lanes in the second vector
@@ -5755,10 +5759,15 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   // the InterleaveCount as if vscale is '1', although if some information about
   // the vector is known (e.g. min vector size), we can make a better decision.
   if (BestKnownTC) {
-    MaxInterleaveCount =
-        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
-    // Make sure MaxInterleaveCount is greater than 0.
-    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+    if (InterleaveSmallLoopScalarReduction ||
+        (*BestKnownTC % VF.getKnownMinValue() == 0))
+      MaxInterleaveCount =
+          std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
+    else
+      MaxInterleaveCount = std::min(*BestKnownTC / (VF.getKnownMinValue() * 2),
+                                    MaxInterleaveCount);
+    // Make sure MaxInterleaveCount is greater than 0 & a power of 2.
+    MaxInterleaveCount = llvm::bit_floor(std::max(1u, MaxInterleaveCount));
   }
 
   assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -30,7 +30,7 @@ for.end:
 
 ; For a loop with known trip count of 129, when we force VF 64, it should use
 ; IC 1, since there may be a remainder loop that needs to run after the vector loop.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
 define void @loop_with_tc_129(ptr %p, ptr %q) {
 entry:
   br label %for.body
@@ -80,7 +80,7 @@ for.end:
 ; For a loop with unknown trip count but a profile showing an approx TC estimate of 129, 
 ; when we force VF 64, it should use IC 1, since chances are high that the remainder loop
 ; will need to run
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
 define void @loop_with_profile_tc_129(ptr %p, ptr %q, i64 %n) {
 entry:
   br label %for.body