[sil-loop-unroll] Do not unroll loops whose bodies contain calls to big functions

swiftix · swiftix · commit 2e5b820063f4 · 2017-08-22T12:45:22.000-07:00
In most cases it is more profitable to inline the big callee than to unroll the loop, because unrolling would create many calls that cannot be optimized further due to the increased size of the function containing the loop.
diff --git a/lib/SILOptimizer/LoopTransforms/LoopUnroll.cpp b/lib/SILOptimizer/LoopTransforms/LoopUnroll.cpp
@@ -19,6 +19,7 @@
 #include "swift/SILOptimizer/Analysis/LoopAnalysis.h"
 #include "swift/SILOptimizer/PassManager/Passes.h"
 #include "swift/SILOptimizer/PassManager/Transforms.h"
+#include "swift/SILOptimizer/Utils/PerformanceInlinerUtils.h"
 #include "swift/SILOptimizer/Utils/SILInliner.h"
 #include "swift/SILOptimizer/Utils/SILSSAUpdater.h"
 
@@ -185,12 +186,24 @@ static bool canAndShouldUnrollLoop(SILLoop *Loop, uint64_t TripCount) {
 
   // We can unroll a loop if we can duplicate the instructions it holds.
   uint64_t Cost = 0;
+  // Assumed average number of instructions per basic block.
+  // It is used to estimate the cost of a callee that is
+  // invoked inside the loop.
+  const uint64_t InsnsPerBB = 4;
   for (auto *BB : Loop->getBlocks()) {
     for (auto &Inst : *BB) {
       if (!Loop->canDuplicate(&Inst))
         return false;
       if (instructionInlineCost(Inst) != InlineCost::Free)
         ++Cost;
+      if (auto AI = FullApplySite::isa(&Inst)) {
+        auto Callee = AI.getCalleeFunction();
+        if (Callee && getEligibleFunction(AI, InlineSelection::Everything)) {
+          // If callee is rather big and potentially inlineable, it may be better
+          // not to unroll, so that the body of the callee can be inlined later.
+          Cost += Callee->size() * InsnsPerBB;
+        }
+      }
       if (Cost * TripCount > SILLoopUnrollThreshold)
         return false;
   }
diff --git a/test/SILOptimizer/loop_unroll.sil b/test/SILOptimizer/loop_unroll.sil
@@ -210,3 +210,74 @@ bb3:
   %8 = tuple()
   return %8 : $()
 }
+
+sil @big_func: $@convention(thin) () -> Builtin.Int64 {
+bb0:
+   %x0 = integer_literal $Builtin.Int64, 1
+   %overflow_check = integer_literal $Builtin.Int1, 0
+   %sum1 = builtin "sadd_with_overflow_Int64"(%x0 : $Builtin.Int64, %x0 : $Builtin.Int64, %overflow_check : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+   %x1 = tuple_extract %sum1 : $(Builtin.Int64, Builtin.Int1), 0
+   br bb1
+
+bb1:
+   %sum2 = builtin "sadd_with_overflow_Int64"(%x1 : $Builtin.Int64, %x1 : $Builtin.Int64, %overflow_check : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+   %x2 = tuple_extract %sum2 : $(Builtin.Int64, Builtin.Int1), 0
+   br bb2
+
+bb2:
+   %sum3 = builtin "sadd_with_overflow_Int64"(%x2 : $Builtin.Int64, %x2 : $Builtin.Int64, %overflow_check : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+   %x3 = tuple_extract %sum3 : $(Builtin.Int64, Builtin.Int1), 0
+   br bb3
+
+bb3:
+   %sum4 = builtin "sadd_with_overflow_Int64"(%x3 : $Builtin.Int64, %x3 : $Builtin.Int64, %overflow_check : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+   %x4 = tuple_extract %sum4 : $(Builtin.Int64, Builtin.Int1), 0
+   br bb4
+
+bb4:
+   %sum5 = builtin "sadd_with_overflow_Int64"(%x4 : $Builtin.Int64, %x4 : $Builtin.Int64, %overflow_check : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+   %x5 = tuple_extract %sum5 : $(Builtin.Int64, Builtin.Int1), 0
+   br bb5
+
+bb5:
+   %sum6 = builtin "sadd_with_overflow_Int64"(%x5 : $Builtin.Int64, %x5 : $Builtin.Int64, %overflow_check : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+   %x6 = tuple_extract %sum6 : $(Builtin.Int64, Builtin.Int1), 0
+   br bb6
+
+bb6:
+   return %x6 : $Builtin.Int64 
+}
+
+// Check that the compiler does not unroll loops containing calls
+// of big inlineable functions.
+//
+// CHECK-LABEL: sil @unroll_with_apply
+// CHECK: apply
+// CHECK: // end sil function 'unroll_with_apply'
+sil @unroll_with_apply : $@convention(thin) () -> () {
+bb0:
+ %0 = integer_literal $Builtin.Int64, 0
+ %1 = integer_literal $Builtin.Int64, 1
+ %2 = integer_literal $Builtin.Int64, 20
+ %3 = integer_literal $Builtin.Int1, 1
+ %f = function_ref @big_func: $@convention(thin) () -> Builtin.Int64 
+ br bb1(%0 : $Builtin.Int64)
+
+bb1(%4 : $Builtin.Int64):
+  %r = apply %f() : $@convention(thin) () -> Builtin.Int64
+  br bb2
+
+bb2:
+  %5 = builtin "sadd_with_overflow_Int64"(%4 : $Builtin.Int64, %1 : $Builtin.Int64, %3 : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+  %6 = tuple_extract %5 : $(Builtin.Int64, Builtin.Int1), 0
+  %7 = builtin "cmp_eq_Int64"(%6 : $Builtin.Int64, %2 : $Builtin.Int64) : $Builtin.Int1
+  cond_br %7, bb4, bb3
+
+bb3:
+  br bb1(%6 : $Builtin.Int64)
+
+bb4:
+  %8 = tuple()
+  return %8 : $()
+}
+