Skip to content

Commit 10605a7

Browse files
committed
[AMDGPU][NPM] Port SIInsertWaitcnts to NPM
1 parent 54641a8 commit 10605a7

7 files changed

+76 -34 lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -369,6 +369,13 @@ class SIMemoryLegalizerPass : public PassInfoMixin<SIMemoryLegalizerPass> {
369369
MachineFunctionAnalysisManager &MFAM);
370370
};
371371

372+
class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> {
373+
public:
374+
PreservedAnalyses run(MachineFunction &MF,
375+
MachineFunctionAnalysisManager &MFAM);
376+
static bool isRequired() { return true; }
377+
};
378+
372379
FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();
373380

374381
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -445,7 +452,7 @@ extern char &AMDGPUInsertDelayAluID;
445452
void initializeSIInsertHardClausesPass(PassRegistry &);
446453
extern char &SIInsertHardClausesID;
447454

448-
void initializeSIInsertWaitcntsPass(PassRegistry&);
455+
void initializeSIInsertWaitcntsLegacyPass(PassRegistry &);
449456
extern char &SIInsertWaitcntsID;
450457

451458
void initializeSIFormMemoryClausesLegacyPass(PassRegistry &);

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
109109
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
110110
MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
111111
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
112+
MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
112113
MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
113114
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
114115
MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
@@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial
131132
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
132133

133134
DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
134-
DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
135135
DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
136136
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
137137
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -535,7 +535,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
535535
initializeSIAnnotateControlFlowLegacyPass(*PR);
536536
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
537537
initializeSIInsertHardClausesPass(*PR);
538-
initializeSIInsertWaitcntsPass(*PR);
538+
initializeSIInsertWaitcntsLegacyPass(*PR);
539539
initializeSIModeRegisterLegacyPass(*PR);
540540
initializeSIWholeQuadModeLegacyPass(*PR);
541541
initializeSILowerControlFlowLegacyPass(*PR);
@@ -2153,7 +2153,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
21532153
}
21542154

21552155
addPass(SIMemoryLegalizerPass());
2156-
// TODO: addPass(SIInsertWaitcntsPass());
2156+
addPass(SIInsertWaitcntsPass());
21572157

21582158
// TODO: addPass(SIModeRegisterPass());
21592159

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 61 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@
3333
#include "llvm/ADT/Sequence.h"
3434
#include "llvm/Analysis/AliasAnalysis.h"
3535
#include "llvm/CodeGen/MachineLoopInfo.h"
36+
#include "llvm/CodeGen/MachinePassManager.h"
3637
#include "llvm/CodeGen/MachinePostDominators.h"
3738
#include "llvm/Support/DebugCounter.h"
3839
#include "llvm/TargetParser/TargetParser.h"
@@ -594,7 +595,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
594595
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
595596
};
596597

597-
class SIInsertWaitcnts : public MachineFunctionPass {
598+
class SIInsertWaitcnts {
598599
private:
599600
const GCNSubtarget *ST = nullptr;
600601
const SIInstrInfo *TII = nullptr;
@@ -633,9 +634,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
633634
InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
634635

635636
public:
636-
static char ID;
637-
638-
SIInsertWaitcnts() : MachineFunctionPass(ID) {
637+
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
638+
AliasAnalysis *AA)
639+
: MLI(MLI), PDT(PDT), AA(AA) {
639640
(void)ForceExpCounter;
640641
(void)ForceLgkmCounter;
641642
(void)ForceVMCounter;
@@ -645,20 +646,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
645646
bool isPreheaderToFlush(MachineBasicBlock &MBB,
646647
WaitcntBrackets &ScoreBrackets);
647648
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
648-
bool runOnMachineFunction(MachineFunction &MF) override;
649-
650-
StringRef getPassName() const override {
651-
return "SI insert wait instructions";
652-
}
653-
654-
void getAnalysisUsage(AnalysisUsage &AU) const override {
655-
AU.setPreservesCFG();
656-
AU.addRequired<MachineLoopInfoWrapperPass>();
657-
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
658-
AU.addUsedIfAvailable<AAResultsWrapperPass>();
659-
AU.addPreserved<AAResultsWrapperPass>();
660-
MachineFunctionPass::getAnalysisUsage(AU);
661-
}
649+
bool run(MachineFunction &MF);
662650

663651
bool isForceEmitWaitcnt() const {
664652
for (auto T : inst_counter_types())
@@ -742,6 +730,36 @@ class SIInsertWaitcnts : public MachineFunctionPass {
742730
WaitcntBrackets &ScoreBrackets);
743731
};
744732

733+
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
734+
public:
735+
static char ID;
736+
SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
737+
738+
bool runOnMachineFunction(MachineFunction &MF) override {
739+
auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
740+
auto *PDT =
741+
&getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
742+
AliasAnalysis *AA = nullptr;
743+
if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
744+
AA = &AAR->getAAResults();
745+
746+
return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
747+
}
748+
749+
StringRef getPassName() const override {
750+
return "SI insert wait instructions";
751+
}
752+
753+
void getAnalysisUsage(AnalysisUsage &AU) const override {
754+
AU.setPreservesCFG();
755+
AU.addRequired<MachineLoopInfoWrapperPass>();
756+
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
757+
AU.addUsedIfAvailable<AAResultsWrapperPass>();
758+
AU.addPreserved<AAResultsWrapperPass>();
759+
MachineFunctionPass::getAnalysisUsage(AU);
760+
}
761+
};
762+
745763
} // end anonymous namespace
746764

747765
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
@@ -1124,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
11241142
return hasMixedPendingEvents(T);
11251143
}
11261144

1127-
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1128-
false)
1145+
INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1146+
false, false)
11291147
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
11301148
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1131-
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1132-
false)
1149+
INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1150+
false, false)
11331151

1134-
char SIInsertWaitcnts::ID = 0;
1152+
char SIInsertWaitcntsLegacy::ID = 0;
11351153

1136-
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1154+
char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
11371155

11381156
FunctionPass *llvm::createSIInsertWaitcntsPass() {
1139-
return new SIInsertWaitcnts();
1157+
return new SIInsertWaitcntsLegacy();
11401158
}
11411159

11421160
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
@@ -2406,16 +2424,29 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
24062424
return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
24072425
}
24082426

2409-
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2427+
PreservedAnalyses
2428+
SIInsertWaitcntsPass::run(MachineFunction &MF,
2429+
MachineFunctionAnalysisManager &MFAM) {
2430+
auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2431+
auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2432+
auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2433+
.getManager()
2434+
.getCachedResult<AAManager>(MF.getFunction());
2435+
2436+
if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2437+
return PreservedAnalyses::all();
2438+
2439+
return getMachineFunctionPassPreservedAnalyses()
2440+
.preserveSet<CFGAnalyses>()
2441+
.preserve<AAManager>();
2442+
}
2443+
2444+
bool SIInsertWaitcnts::run(MachineFunction &MF) {
24102445
ST = &MF.getSubtarget<GCNSubtarget>();
24112446
TII = ST->getInstrInfo();
24122447
TRI = &TII->getRegisterInfo();
24132448
MRI = &MF.getRegInfo();
24142449
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2415-
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2416-
PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2417-
if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2418-
AA = &AAR->getAAResults();
24192450

24202451
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
24212452

llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
22
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
34

45
# $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so
56
# that the return address is not clobbered in the callee by the outstanding load.

llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
22
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s
34

45
---
56
name: test

llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
33
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
44
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
5+
6+
# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
57
---
68
# CHECK-LABEL: name: vccz_corrupt_workaround
79
# CHECK: $vcc = V_CMP_EQ_F32

0 commit comments

Comments
 (0)