-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[coro][pgo] Do not insert counters in the suspend
block
#71262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-pgo @llvm/pr-subscribers-coroutines Author: Mircea Trofin (mtrofin) ChangesIf we did, we couldn't lower the suspend call to a tail call. If this happened in a loop, it can lead to stack overflow (this was encountered in a benchmark, as an extreme case) We can instrument the other 2 edges instead, as long as they also don't point to the same basic block. Full diff: https://github.com/llvm/llvm-project/pull/71262.diff 13 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
index 6ed8a6c6eaf0197..eddfbd8a8e45b7a 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
@@ -19,6 +19,8 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -121,31 +123,70 @@ template <class Edge, class BBInfo> class CFGMST {
static const uint32_t CriticalEdgeMultiplier = 1000;
+ auto GetCoroSuspendSwitch =
+ [&](const Instruction *TI) -> const SwitchInst * {
+ if (!F.isPresplitCoroutine())
+ return nullptr;
+ if (auto *SWInst = dyn_cast<SwitchInst>(TI))
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(SWInst->getCondition()))
+ if (Intrinsic->getIntrinsicID() == Intrinsic::coro_suspend)
+ return SWInst;
+ return nullptr;
+ };
+
for (BasicBlock &BB : F) {
Instruction *TI = BB.getTerminator();
+ const SwitchInst *CoroSuspendSwitch = GetCoroSuspendSwitch(TI);
uint64_t BBWeight =
(BFI != nullptr ? BFI->getBlockFreq(&BB).getFrequency() : 2);
uint64_t Weight = 2;
if (int successors = TI->getNumSuccessors()) {
for (int i = 0; i != successors; ++i) {
BasicBlock *TargetBB = TI->getSuccessor(i);
- bool Critical = isCriticalEdge(TI, i);
- uint64_t scaleFactor = BBWeight;
- if (Critical) {
- if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
- scaleFactor *= CriticalEdgeMultiplier;
- else
- scaleFactor = UINT64_MAX;
+ const bool Critical = isCriticalEdge(TI, i);
+ const bool IsCoroSuspendTarget =
+ CoroSuspendSwitch &&
+ CoroSuspendSwitch->getDefaultDest() == TargetBB;
+ // We must not add instrumentation to the BB representing the
+ // "suspend" path, else CoroSplit won't be able to lower
+ // llvm.coro.suspend to a tail call. We do want profiling info for
+ // the other branches (resume/destroy). So we do 2 things:
+ // 1. we prefer instrumenting those other edges by setting the weight
+ // of the "suspend" edge to max, and
+ // 2. we mark the edge as "Removed" to guarantee it is not considered
+ // for instrumentation. That could technically happen:
+ // (from test/Transforms/Coroutines/coro-split-musttail.ll)
+ //
+ // %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ // switch i8 %suspend, label %exit [
+ // i8 0, label %await.ready
+ // i8 1, label %exit
+ // ]
+ if (IsCoroSuspendTarget) {
+ Weight = UINT64_MAX;
+ } else {
+ bool Critical = isCriticalEdge(TI, i);
+ uint64_t scaleFactor = BBWeight;
+ if (Critical) {
+ if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+ scaleFactor *= CriticalEdgeMultiplier;
+ else
+ scaleFactor = UINT64_MAX;
+ }
+ if (BPI != nullptr)
+ Weight =
+ BPI->getEdgeProbability(&BB, TargetBB).scale(scaleFactor);
+ if (Weight == 0)
+ Weight++;
}
- if (BPI != nullptr)
- Weight = BPI->getEdgeProbability(&BB, TargetBB).scale(scaleFactor);
- if (Weight == 0)
- Weight++;
auto *E = &addEdge(&BB, TargetBB, Weight);
E->IsCritical = Critical;
+ // See comment above - we must guarantee the coro suspend BB isn't
+ // instrumented.
+ if (IsCoroSuspendTarget)
+ E->Removed = true;
LLVM_DEBUG(dbgs() << " Edge: from " << BB.getName() << " to "
- << TargetBB->getName() << " w=" << Weight << "\n");
-
+ << TargetBB->getName() << " w=" << Weight << "\n");
// Keep track of entry/exit edges:
if (&BB == Entry) {
if (Weight > MaxEntryOutWeight) {
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll
index 0406135687904bf..825e44471db27ae 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s
define void @f() #0 {
entry:
@@ -40,7 +41,9 @@ exit:
; Verify that in the resume part resume call is marked with musttail.
; CHECK-LABEL: @f.resume(
; CHECK: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr2]](ptr null)
+; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr null)
+; PGO: call void @llvm.instrprof
+; PGO-NEXT: musttail call fastcc void %[[addr2]](ptr null)
; CHECK-NEXT: ret void
declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll
index cd1635b93d2cc24..d0d11fc4495e480 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s
define void @f() #0 {
entry:
@@ -63,14 +64,17 @@ unreach:
; CHECK-LABEL: @f.resume(
; CHECK: %[[hdl:.+]] = call ptr @g()
; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
; CHECK-NEXT: ret void
; CHECK: %[[hdl2:.+]] = call ptr @h()
; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
; CHECK-NEXT: ret void
; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; PGO: musttail call fastcc void %[[addr4]](ptr null)
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
index 9d73c8bbc57b81a..cdd58b2a084fcd8 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
@@ -1,6 +1,7 @@
; Tests that we would convert coro.resume to a musttail call if the target is
; Wasm64 with tail-call support.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
target triple = "wasm64-unknown-unknown"
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
index 9bc5b4f0c65d91e..da5d868280e9671 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
@@ -1,6 +1,7 @@
; Tests that we would convert coro.resume to a musttail call if the target is
; Wasm32 with tail-call support.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
target triple = "wasm32-unknown-unknown"
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll
index e7f4bcb9b0ff29a..5baec378876bb1e 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll
@@ -1,5 +1,6 @@
; Tests that coro-split won't convert the cmp instruction prematurely.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr)
declare void @print()
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll
index 2384f9382685bd0..0290e42339e2ad4 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll
@@ -1,5 +1,6 @@
; Tests that coro-split won't fall in infinite loop when simplify the terminators leading to ret.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr)
declare void @may_throw(ptr)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll
index 38fc12815c033e7..2f27f79480ab1b4 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
define void @fakeresume1(ptr) {
entry:
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll
index b777f000e33a6d3..4778e3dcaf9957b 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s
define void @f() #0 {
entry:
@@ -59,14 +60,17 @@ unreach:
; CHECK-LABEL: @f.resume(
; CHECK: %[[hdl:.+]] = call ptr @g()
; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
; CHECK-NEXT: ret void
; CHECK: %[[hdl2:.+]] = call ptr @h()
; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
; CHECK-NEXT: ret void
; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; PGO: musttail call fastcc void %[[addr4]](ptr null)
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
index 1e0fcdb87a72d30..00ee422ce5863df 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert a call before coro.suspend to a musttail call
; while the user of the coro.suspend is a icmpinst.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
define void @fakeresume1(ptr) {
entry:
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
index d19606491335e50..9afc79abbe88cd8 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
@@ -1,6 +1,7 @@
; Tests that sinked lifetime markers wouldn't provent optimization
; to convert a resuming call to a musttail call.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr align 8)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
index eea711861c488c5..9c2b1ece1624bc9 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
@@ -4,6 +4,7 @@
; an extra bitcast instruction in the path, which makes it harder to
; optimize.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr align 8)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
index c32fe9b0ee304c2..860032bd3cf8e52 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
@@ -4,6 +4,7 @@
; is that this contains dead instruction generated during the transformation,
; which makes the optimization harder.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr align 8)
|
@llvm/pr-subscribers-llvm-transforms Author: Mircea Trofin (mtrofin) ChangesIf we did, we couldn't lower the suspend call to a tail call. If this happened in a loop, it can lead to stack overflow (this was encountered in a benchmark, as an extreme case) We can instrument the other 2 edges instead, as long as they also don't point to the same basic block. Full diff: https://github.com/llvm/llvm-project/pull/71262.diff 13 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
index 6ed8a6c6eaf0197..eddfbd8a8e45b7a 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
@@ -19,6 +19,8 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -121,31 +123,70 @@ template <class Edge, class BBInfo> class CFGMST {
static const uint32_t CriticalEdgeMultiplier = 1000;
+ auto GetCoroSuspendSwitch =
+ [&](const Instruction *TI) -> const SwitchInst * {
+ if (!F.isPresplitCoroutine())
+ return nullptr;
+ if (auto *SWInst = dyn_cast<SwitchInst>(TI))
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(SWInst->getCondition()))
+ if (Intrinsic->getIntrinsicID() == Intrinsic::coro_suspend)
+ return SWInst;
+ return nullptr;
+ };
+
for (BasicBlock &BB : F) {
Instruction *TI = BB.getTerminator();
+ const SwitchInst *CoroSuspendSwitch = GetCoroSuspendSwitch(TI);
uint64_t BBWeight =
(BFI != nullptr ? BFI->getBlockFreq(&BB).getFrequency() : 2);
uint64_t Weight = 2;
if (int successors = TI->getNumSuccessors()) {
for (int i = 0; i != successors; ++i) {
BasicBlock *TargetBB = TI->getSuccessor(i);
- bool Critical = isCriticalEdge(TI, i);
- uint64_t scaleFactor = BBWeight;
- if (Critical) {
- if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
- scaleFactor *= CriticalEdgeMultiplier;
- else
- scaleFactor = UINT64_MAX;
+ const bool Critical = isCriticalEdge(TI, i);
+ const bool IsCoroSuspendTarget =
+ CoroSuspendSwitch &&
+ CoroSuspendSwitch->getDefaultDest() == TargetBB;
+ // We must not add instrumentation to the BB representing the
+ // "suspend" path, else CoroSplit won't be able to lower
+ // llvm.coro.suspend to a tail call. We do want profiling info for
+ // the other branches (resume/destroy). So we do 2 things:
+ // 1. we prefer instrumenting those other edges by setting the weight
+ // of the "suspend" edge to max, and
+ // 2. we mark the edge as "Removed" to guarantee it is not considered
+ // for instrumentation. That could technically happen:
+ // (from test/Transforms/Coroutines/coro-split-musttail.ll)
+ //
+ // %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ // switch i8 %suspend, label %exit [
+ // i8 0, label %await.ready
+ // i8 1, label %exit
+ // ]
+ if (IsCoroSuspendTarget) {
+ Weight = UINT64_MAX;
+ } else {
+ bool Critical = isCriticalEdge(TI, i);
+ uint64_t scaleFactor = BBWeight;
+ if (Critical) {
+ if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+ scaleFactor *= CriticalEdgeMultiplier;
+ else
+ scaleFactor = UINT64_MAX;
+ }
+ if (BPI != nullptr)
+ Weight =
+ BPI->getEdgeProbability(&BB, TargetBB).scale(scaleFactor);
+ if (Weight == 0)
+ Weight++;
}
- if (BPI != nullptr)
- Weight = BPI->getEdgeProbability(&BB, TargetBB).scale(scaleFactor);
- if (Weight == 0)
- Weight++;
auto *E = &addEdge(&BB, TargetBB, Weight);
E->IsCritical = Critical;
+ // See comment above - we must guarantee the coro suspend BB isn't
+ // instrumented.
+ if (IsCoroSuspendTarget)
+ E->Removed = true;
LLVM_DEBUG(dbgs() << " Edge: from " << BB.getName() << " to "
- << TargetBB->getName() << " w=" << Weight << "\n");
-
+ << TargetBB->getName() << " w=" << Weight << "\n");
// Keep track of entry/exit edges:
if (&BB == Entry) {
if (Weight > MaxEntryOutWeight) {
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll
index 0406135687904bf..825e44471db27ae 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s
define void @f() #0 {
entry:
@@ -40,7 +41,9 @@ exit:
; Verify that in the resume part resume call is marked with musttail.
; CHECK-LABEL: @f.resume(
; CHECK: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr2]](ptr null)
+; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr null)
+; PGO: call void @llvm.instrprof
+; PGO-NEXT: musttail call fastcc void %[[addr2]](ptr null)
; CHECK-NEXT: ret void
declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll
index cd1635b93d2cc24..d0d11fc4495e480 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s
define void @f() #0 {
entry:
@@ -63,14 +64,17 @@ unreach:
; CHECK-LABEL: @f.resume(
; CHECK: %[[hdl:.+]] = call ptr @g()
; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
; CHECK-NEXT: ret void
; CHECK: %[[hdl2:.+]] = call ptr @h()
; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
; CHECK-NEXT: ret void
; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; PGO: musttail call fastcc void %[[addr4]](ptr null)
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
index 9d73c8bbc57b81a..cdd58b2a084fcd8 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
@@ -1,6 +1,7 @@
; Tests that we would convert coro.resume to a musttail call if the target is
; Wasm64 with tail-call support.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
target triple = "wasm64-unknown-unknown"
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
index 9bc5b4f0c65d91e..da5d868280e9671 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
@@ -1,6 +1,7 @@
; Tests that we would convert coro.resume to a musttail call if the target is
; Wasm32 with tail-call support.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
target triple = "wasm32-unknown-unknown"
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll
index e7f4bcb9b0ff29a..5baec378876bb1e 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll
@@ -1,5 +1,6 @@
; Tests that coro-split won't convert the cmp instruction prematurely.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr)
declare void @print()
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll
index 2384f9382685bd0..0290e42339e2ad4 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll
@@ -1,5 +1,6 @@
; Tests that coro-split won't fall in infinite loop when simplify the terminators leading to ret.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr)
declare void @may_throw(ptr)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll
index 38fc12815c033e7..2f27f79480ab1b4 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
define void @fakeresume1(ptr) {
entry:
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll
index b777f000e33a6d3..4778e3dcaf9957b 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert coro.resume followed by a suspend to a
; musttail call.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s
define void @f() #0 {
entry:
@@ -59,14 +60,17 @@ unreach:
; CHECK-LABEL: @f.resume(
; CHECK: %[[hdl:.+]] = call ptr @g()
; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
+; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
; CHECK-NEXT: ret void
; CHECK: %[[hdl2:.+]] = call ptr @h()
; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
+; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
; CHECK-NEXT: ret void
; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null)
+; PGO: musttail call fastcc void %[[addr4]](ptr null)
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
index 1e0fcdb87a72d30..00ee422ce5863df 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll
@@ -1,6 +1,7 @@
; Tests that coro-split will convert a call before coro.suspend to a musttail call
; while the user of the coro.suspend is a icmpinst.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
define void @fakeresume1(ptr) {
entry:
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
index d19606491335e50..9afc79abbe88cd8 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll
@@ -1,6 +1,7 @@
; Tests that sinked lifetime markers wouldn't provent optimization
; to convert a resuming call to a musttail call.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr align 8)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
index eea711861c488c5..9c2b1ece1624bc9 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
@@ -4,6 +4,7 @@
; an extra bitcast instruction in the path, which makes it harder to
; optimize.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr align 8)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
index c32fe9b0ee304c2..860032bd3cf8e52 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
@@ -4,6 +4,7 @@
; is that this contains dead instruction generated during the transformation,
; which makes the optimization harder.
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
declare void @fakeresume1(ptr align 8)
|
If we do, we can't lower the suspend call to a tail call. If this happened in a loop, it can lead to stack overflow (this was encountered in a benchmark, as an extreme case) We can instrument the other 2 edges instead, as long as they also don't point to the same basic block.
✅ With the latest revision this PR passed the C/C++ code formatter. |
Gentle reminder. Thanks! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can a testcase be added in instrumentation test dir as well?
auto *E = &addEdge(&BB, TargetBB, Weight); | ||
E->IsCritical = Critical; | ||
// See comment above - we must guarantee the coro suspend BB isn't |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest move all co-routine handling code in one helper function and call it after Edge creation:
void HandleCoroutine(Edge *E, ....) {
if (!F.notPresplitCoroutine)
return;
if (!NotSwitch(SI))
return;
...
E->Weight = UINT64_MAX;
E->Removed = true;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
// See comment above - we must guarantee the coro suspend BB isn't | ||
// instrumented. | ||
if (IsCoroSuspendTarget) | ||
E->Removed = true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is setting MAX weighted needed if the removed flag is set?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIUC yes (as per @xur-llvm) because it helps pick the other edges for instrumentation. If there's a more explicit way to do that, I'd prefer that instead.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I take that back. Fixing.
I could copy the ones I edited over, is that what you mean? |
Still trying to understand the details here. Is the problem here that new instructions are inserted between a |
No, it's inserted in the Without pgo instrumentation, the end of the
Note how the call to OK, now with pgo instrumentation. First, right after
The effect after
so the call to
|
Removed the non-PGO clause in the copy.
Copied them, and removed the non-PGO I think it makes sense to keep both the original test cases and the copies, at least for awareness, but can be convinced otherwise. |
// for instrumentation. That could technically happen: | ||
// (from test/Transforms/Coroutines/coro-split-musttail.ll) | ||
// | ||
// %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how is suspend lowered? will a tail call be inserted in the dest BB?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIUC, llvm.coro.suspend
is really a marker that says "return. Upon resumption, here's where you go if there's more to execute, and here's where you go if we're done". So it's not really a call, and the switch isn't really a switch, and the "default" path of the switch really needs to reduce to ret
.
Adding @GorNishanov to check my understanding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
...or @Flakebi
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
llvm.coro.suspend
is lowered to either 0 in .resume
function or 1 in .destroy/.cleanup function.
When await_suspend
returns a handle, we always want the following call to its .resume
function to be a tail call in order to have symmetric transfer to work.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, and moreover the -1
case should reduce to ret
, correct?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically, it depends on the frontend. For C++, yes.
The fancy part here is that symmetric transfer is a C++ feature. We just tried to implement it in the middle end. (This is not good. I know). So we can only consider C++ since the motivation of the patch to save symmetric transfer under pgo.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Thanks.
// for instrumentation. That could technically happen: | ||
// (from test/Transforms/Coroutines/coro-split-musttail.ll) | ||
// | ||
// %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically, it depends on the frontend. For C++, yes.
The fancy part here is that symmetric transfer is a C++ feature. We just tried to implement it in the middle end. (This is not good. I know). So we can only consider C++ since the motivation of the patch to save symmetric transfer under pgo.
suspend
blocksuspend
block
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please also update the comment about tail call and symmetric transfer.
If we did, we couldn't lower symmetric transfer resumes to tail calls. We can instrument the other 2 edges instead, as long as they also don't point to the same basic block.
If we did, we couldn't lower symmetric transfer resumes to tail calls.
We can instrument the other 2 edges instead, as long as they also don't point to the same basic block.