Skip to content

[AMDGPU][NPM] Port SIInsertWaitcnts to NPM #130061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,13 @@ class AMDGPUMarkLastScratchLoadPass
MachineFunctionAnalysisManager &AM);
};

/// New pass manager wrapper for the waitcnt insertion pass (registered under
/// the name "si-insert-waitcnts" in AMDGPUPassRegistry.def).
class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> {
public:
/// Runs the pass on \p MF, looking up the analyses it needs through \p MFAM,
/// and returns the set of analyses that are still valid afterwards.
PreservedAnalyses run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM);
/// Always reports the pass as required. NOTE(review): in the new pass manager
/// this is the hook that keeps pass instrumentation from skipping a pass;
/// confirm against the pass manager documentation.
static bool isRequired() { return true; }
};

FunctionPass *createAMDGPUAnnotateUniformValuesLegacy();

ModulePass *createAMDGPUPrintfRuntimeBinding();
Expand Down Expand Up @@ -454,7 +461,7 @@ extern char &AMDGPUInsertDelayAluID;
void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;

void initializeSIInsertWaitcntsPass(PassRegistry&);
void initializeSIInsertWaitcntsLegacyPass(PassRegistry &);
extern char &SIInsertWaitcntsID;

void initializeSIFormMemoryClausesLegacyPass(PassRegistry &);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass())
Expand All @@ -133,7 +134,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())

DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
// TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIInsertWaitcntsLegacyPass(*PR);
initializeSIModeRegisterLegacyPass(*PR);
initializeSIWholeQuadModeLegacyPass(*PR);
initializeSILowerControlFlowLegacyPass(*PR);
Expand Down Expand Up @@ -2158,7 +2158,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
}

addPass(SIMemoryLegalizerPass());
// TODO: addPass(SIInsertWaitcntsPass());
addPass(SIInsertWaitcntsPass());

// TODO: addPass(SIModeRegisterPass());

Expand Down
93 changes: 63 additions & 30 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"
Expand Down Expand Up @@ -597,7 +598,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};

class SIInsertWaitcnts : public MachineFunctionPass {
class SIInsertWaitcnts {
private:
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
Expand Down Expand Up @@ -636,9 +637,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;

public:
static char ID;

SIInsertWaitcnts() : MachineFunctionPass(ID) {
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
AliasAnalysis *AA)
: MLI(MLI), PDT(PDT), AA(AA) {
(void)ForceExpCounter;
(void)ForceLgkmCounter;
(void)ForceVMCounter;
Expand All @@ -648,20 +649,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool isPreheaderToFlush(MachineBasicBlock &MBB,
WaitcntBrackets &ScoreBrackets);
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {
return "SI insert wait instructions";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineLoopInfoWrapperPass>();
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
bool run(MachineFunction &MF);

bool isForceEmitWaitcnt() const {
for (auto T : inst_counter_types())
Expand Down Expand Up @@ -749,6 +737,27 @@ class SIInsertWaitcnts : public MachineFunctionPass {
WaitcntBrackets &ScoreBrackets);
};

/// Legacy pass manager wrapper around the SIInsertWaitcnts implementation
/// class. It carries only the pass-manager boilerplate (ID, name, analysis
/// usage); the wait-insertion logic itself lives in SIInsertWaitcnts.
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
/// Pass identifier handed to the MachineFunctionPass base constructor.
static char ID;
SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

/// Defined out of line: gathers the analyses declared in getAnalysisUsage()
/// and forwards to SIInsertWaitcnts::run().
bool runOnMachineFunction(MachineFunction &MF) override;

/// Human-readable pass name.
StringRef getPassName() const override {
return "SI insert wait instructions";
}

/// Declares the analyses this pass depends on and preserves.
void getAnalysisUsage(AnalysisUsage &AU) const override {
// The pass declares that it leaves the CFG intact.
AU.setPreservesCFG();
// Loop info and the post-dominator tree are mandatory inputs.
AU.addRequired<MachineLoopInfoWrapperPass>();
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
// Alias analysis is optional: used when already available, and declared
// preserved.
AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};

} // end anonymous namespace

RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
Expand Down Expand Up @@ -1133,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
return hasMixedPendingEvents(T);
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
false, false)

char SIInsertWaitcnts::ID = 0;
char SIInsertWaitcntsLegacy::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
return new SIInsertWaitcnts();
return new SIInsertWaitcntsLegacy();
}

static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
Expand Down Expand Up @@ -2481,16 +2490,40 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
/// Legacy pass manager entry point. Collects the analyses consumed by the
/// waitcnt insertion logic and delegates to the pass-manager-independent
/// SIInsertWaitcnts implementation.
///
/// \returns whatever SIInsertWaitcnts::run() reports for \p MF.
bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
  // Mandatory analyses.
  auto &LoopInfo = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  auto &PostDomTree =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();

  // Alias analysis is optional; the implementation receives a null pointer
  // when the wrapper pass is not available.
  auto *AAWrapper = getAnalysisIfAvailable<AAResultsWrapperPass>();
  AliasAnalysis *AliasInfo = AAWrapper ? &AAWrapper->getAAResults() : nullptr;

  SIInsertWaitcnts Impl(&LoopInfo, &PostDomTree, AliasInfo);
  return Impl.run(MF);
}

/// New pass manager entry point. Looks up loop info and the post-dominator
/// tree for \p MF, picks up alias analysis results only if they are already
/// cached for the underlying IR function, and runs the shared
/// SIInsertWaitcnts implementation.
///
/// \returns PreservedAnalyses::all() when the implementation reports no
/// change; otherwise the default machine-function-pass preserved set extended
/// with the CFG analyses and AAManager.
PreservedAnalyses
SIInsertWaitcntsPass::run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
  auto &LoopInfo = MFAM.getResult<MachineLoopAnalysis>(MF);
  auto &PostDomTree = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);

  // Alias analysis is queried via getCachedResult, so it is never computed
  // here; the pointer is null when no cached result exists.
  auto &FuncAM =
      MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
          .getManager();
  auto *AliasInfo = FuncAM.getCachedResult<AAManager>(MF.getFunction());

  const bool Changed =
      SIInsertWaitcnts(&LoopInfo, &PostDomTree, AliasInfo).run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses Preserved = getMachineFunctionPassPreservedAnalyses();
  Preserved.preserveSet<CFGAnalyses>();
  Preserved.preserve<AAManager>();
  return Preserved;
}

bool SIInsertWaitcnts::run(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
AA = &AAR->getAAResults();

AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());

Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s

# $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so
# that the return address is not clobbered in the callee by the outstanding load.
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s

---
name: test
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s

# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
---
# CHECK-LABEL: name: vccz_corrupt_workaround
# CHECK: $vcc = V_CMP_EQ_F32
Expand Down