Skip to content

Commit 1d286ad

Browse files
authored
[AMDGPU] Add mark last scratch load pass (#75512)
1 parent 37c87d5 commit 1d286ad

12 files changed

+766
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ extern char &SILowerI1CopiesID;
166166
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
167167
extern char &AMDGPUGlobalISelDivergenceLoweringID;
168168

169+
void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &);
170+
extern char &AMDGPUMarkLastScratchLoadID;
171+
169172
void initializeSILowerSGPRSpillsPass(PassRegistry &);
170173
extern char &SILowerSGPRSpillsID;
171174

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
//===-- AMDGPUMarkLastScratchLoad.cpp -------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Mark scratch loads of spill slots that are guaranteed to be the last use of
10+
// their scratch slot, so that the slot can be evicted from caches.
11+
//
12+
// TODO: Handle general stack accesses not just spilling.
13+
//
14+
//===----------------------------------------------------------------------===//
15+
16+
#include "AMDGPU.h"
17+
#include "GCNSubtarget.h"
18+
#include "llvm/CodeGen/LiveIntervals.h"
19+
#include "llvm/CodeGen/LiveStacks.h"
20+
#include "llvm/CodeGen/MachineOperand.h"
21+
22+
using namespace llvm;
23+
24+
#define DEBUG_TYPE "amdgpu-mark-last-scratch-load"
25+
26+
namespace {
27+
28+
class AMDGPUMarkLastScratchLoad : public MachineFunctionPass {
29+
private:
30+
LiveStacks *LS = nullptr;
31+
LiveIntervals *LIS = nullptr;
32+
SlotIndexes *SI = nullptr;
33+
const SIInstrInfo *SII = nullptr;
34+
35+
public:
36+
static char ID;
37+
38+
AMDGPUMarkLastScratchLoad() : MachineFunctionPass(ID) {
39+
initializeAMDGPUMarkLastScratchLoadPass(*PassRegistry::getPassRegistry());
40+
}
41+
42+
bool runOnMachineFunction(MachineFunction &MF) override;
43+
44+
void getAnalysisUsage(AnalysisUsage &AU) const override {
45+
AU.addRequired<SlotIndexes>();
46+
AU.addRequired<LiveIntervals>();
47+
AU.addRequired<LiveStacks>();
48+
AU.setPreservesAll();
49+
MachineFunctionPass::getAnalysisUsage(AU);
50+
}
51+
52+
StringRef getPassName() const override {
53+
return "AMDGPU Mark Last Scratch Load";
54+
}
55+
};
56+
57+
} // end anonymous namespace
58+
59+
/// Walk every live stack-slot interval and, for each segment that ends inside
/// a basic block, find the closest preceding load from that slot and set
/// MOLastUse on its first memory operand.
///
/// Does nothing when the function is skipped, when the subtarget generation
/// is older than GFX12, or when there are no live stack intervals.
///
/// \returns true if at least one memory operand was flagged.
bool AMDGPUMarkLastScratchLoad::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getGeneration() < AMDGPUSubtarget::GFX12)
    return false;

  LS = &getAnalysis<LiveStacks>();
  LIS = &getAnalysis<LiveIntervals>();
  SI = &getAnalysis<SlotIndexes>();
  SII = ST.getInstrInfo();
  SlotIndexes &Slots = *LIS->getSlotIndexes();

  const unsigned NumSlots = LS->getNumIntervals();
  if (NumSlots == 0) {
    LLVM_DEBUG(dbgs() << "No live slots, skipping\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << NumSlots << " intervals\n");

  bool Changed = false;

  for (auto &[SS, LI] : *LS) {
    // The frame index depends only on the interval, not on the segment, so
    // compute it once per interval.
    const int FrameIndex = Register::stackSlot2Index(LI.reg());

    for (const LiveRange::Segment &Segment : LI.segments) {

      // Ignore segments that run to the end of basic block because in this case
      // slot is still live at the end of it.
      if (Segment.end.isBlock())
        continue;

      MachineInstr *MISegmentEnd = SI->getInstructionFromIndex(Segment.end);

      // If there is no instruction at this slot because it was deleted take the
      // instruction from the next slot.
      if (!MISegmentEnd) {
        SlotIndex NextSlot = Slots.getNextNonNullIndex(Segment.end);
        MISegmentEnd = SI->getInstructionFromIndex(NextSlot);
      }

      // The fallback lookup can come back empty as well. Without an
      // instruction there is nothing to anchor the backwards scan on, so skip
      // the segment instead of dereferencing a null pointer. Leaving a load
      // unmarked only forgoes the hint.
      if (!MISegmentEnd)
        continue;

      MachineInstr *MISegmentStart = SI->getInstructionFromIndex(Segment.start);
      MachineBasicBlock *BB = MISegmentEnd->getParent();

      // Start iteration backwards from segment end until the start of basic
      // block or start of segment if it is in the same basic block.
      auto End = BB->rend();
      if (MISegmentStart && MISegmentStart->getParent() == BB)
        End = MISegmentStart->getReverseIterator();

      MachineInstr *LastLoad = nullptr;
      for (auto MI = MISegmentEnd->getReverseIterator(); MI != End; ++MI) {
        int LoadFI = 0;

        if (SII->isLoadFromStackSlot(*MI, LoadFI) && LoadFI == FrameIndex) {
          LastLoad = &*MI;
          break;
        }
      }

      // Only a load that carries a memory operand can be flagged.
      if (LastLoad && !LastLoad->memoperands_empty()) {
        MachineMemOperand *MMO = *LastLoad->memoperands_begin();
        MMO->setFlags(MOLastUse);
        Changed = true;
        LLVM_DEBUG(dbgs() << "  Found last load: " << *LastLoad);
      }
    }
  }

  return Changed;
}
132+
133+
char AMDGPUMarkLastScratchLoad::ID = 0;
134+
135+
char &llvm::AMDGPUMarkLastScratchLoadID = AMDGPUMarkLastScratchLoad::ID;
136+
137+
INITIALIZE_PASS_BEGIN(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
138+
"AMDGPU Mark last scratch load", false, false)
139+
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
140+
INITIALIZE_PASS_DEPENDENCY(LiveStacks)
141+
INITIALIZE_PASS_END(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
142+
"AMDGPU Mark last scratch load", false, false)

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
382382
initializeSILowerI1CopiesPass(*PR);
383383
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
384384
initializeSILowerWWMCopiesPass(*PR);
385+
initializeAMDGPUMarkLastScratchLoadPass(*PR);
385386
initializeSILowerSGPRSpillsPass(*PR);
386387
initializeSIFixSGPRCopiesPass(*PR);
387388
initializeSIFixVGPRCopiesPass(*PR);
@@ -1424,6 +1425,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
14241425
addPreRewrite();
14251426
addPass(&VirtRegRewriterID);
14261427

1428+
addPass(&AMDGPUMarkLastScratchLoadID);
1429+
14271430
return true;
14281431
}
14291432

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
7979
AMDGPUMCInstLower.cpp
8080
AMDGPUIGroupLP.cpp
8181
AMDGPUInsertSingleUseVDST.cpp
82+
AMDGPUMarkLastScratchLoad.cpp
8283
AMDGPUMIRFormatter.cpp
8384
AMDGPUOpenCLEnqueuedBlockLowering.cpp
8485
AMDGPUPerfHintAnalysis.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8756,6 +8756,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
87568756
static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
87578757
{
87588758
{MONoClobber, "amdgpu-noclobber"},
8759+
{MOLastUse, "amdgpu-last-use"},
87598760
};
87608761

87618762
return ArrayRef(TargetFlags);

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ class ScheduleHazardRecognizer;
4141
static const MachineMemOperand::Flags MONoClobber =
4242
MachineMemOperand::MOTargetFlag1;
4343

44+
/// Mark the MMO of a load as the last use.
45+
static const MachineMemOperand::Flags MOLastUse =
46+
MachineMemOperand::MOTargetFlag2;
47+
4448
/// Utility to store machine instructions worklist.
4549
struct SIInstrWorklist {
4650
SIInstrWorklist() = default;

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1657,8 +1657,12 @@ void SIRegisterInfo::buildSpillLoadStore(
16571657
} else {
16581658
MIB.addReg(SOffset, SOffsetRegState);
16591659
}
1660-
MIB.addImm(Offset + RegOffset)
1661-
.addImm(0); // cpol
1660+
1661+
MIB.addImm(Offset + RegOffset);
1662+
1663+
bool LastUse = MMO->getFlags() & MOLastUse;
1664+
MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1665+
16621666
if (!IsFlat)
16631667
MIB.addImm(0); // swz
16641668
MIB.addMemOperand(NewMMO);
@@ -2241,6 +2245,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
22412245
TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
22422246
RS->isRegUsed(AMDGPU::SCC));
22432247
}
2248+
22442249
buildSpillLoadStore(
22452250
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
22462251
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@
359359
; GCN-O1-NEXT: SI Lower WWM Copies
360360
; GCN-O1-NEXT: GCN NSA Reassign
361361
; GCN-O1-NEXT: Virtual Register Rewriter
362+
; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load
362363
; GCN-O1-NEXT: Stack Slot Coloring
363364
; GCN-O1-NEXT: Machine Copy Propagation Pass
364365
; GCN-O1-NEXT: Machine Loop Invariant Code Motion
@@ -655,6 +656,7 @@
655656
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
656657
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
657658
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
659+
; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load
658660
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
659661
; GCN-O1-OPTS-NEXT: Machine Copy Propagation Pass
660662
; GCN-O1-OPTS-NEXT: Machine Loop Invariant Code Motion
@@ -957,6 +959,7 @@
957959
; GCN-O2-NEXT: SI Lower WWM Copies
958960
; GCN-O2-NEXT: GCN NSA Reassign
959961
; GCN-O2-NEXT: Virtual Register Rewriter
962+
; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load
960963
; GCN-O2-NEXT: Stack Slot Coloring
961964
; GCN-O2-NEXT: Machine Copy Propagation Pass
962965
; GCN-O2-NEXT: Machine Loop Invariant Code Motion
@@ -1271,6 +1274,7 @@
12711274
; GCN-O3-NEXT: SI Lower WWM Copies
12721275
; GCN-O3-NEXT: GCN NSA Reassign
12731276
; GCN-O3-NEXT: Virtual Register Rewriter
1277+
; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load
12741278
; GCN-O3-NEXT: Stack Slot Coloring
12751279
; GCN-O3-NEXT: Machine Copy Propagation Pass
12761280
; GCN-O3-NEXT: Machine Loop Invariant Code Motion

llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
; DEFAULT-NEXT: SI Lower WWM Copies
2626
; DEFAULT-NEXT: GCN NSA Reassign
2727
; DEFAULT-NEXT: Virtual Register Rewriter
28+
; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
2829
; DEFAULT-NEXT: Stack Slot Coloring
2930

3031
; O0: Fast Register Allocator
@@ -61,6 +62,7 @@
6162
; BASIC-DEFAULT-NEXT: SI Lower WWM Copies
6263
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
6364
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
65+
; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
6466
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
6567

6668

@@ -75,6 +77,7 @@
7577
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
7678
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
7779
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
80+
; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load
7881
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
7982

8083

@@ -95,6 +98,7 @@
9598
; BASIC-BASIC-NEXT: SI Lower WWM Copies
9699
; BASIC-BASIC-NEXT: GCN NSA Reassign
97100
; BASIC-BASIC-NEXT: Virtual Register Rewriter
101+
; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load
98102
; BASIC-BASIC-NEXT: Stack Slot Coloring
99103

100104

0 commit comments

Comments
 (0)