Commit 3319049

[AArch64] merge index address with large offset into base address
A case for this transformation: https://gcc.godbolt.org/z/nhYcWq1WE

Fold
  mov  w8, #56952
  movk w8, #15, lsl #16
  ldrb w0, [x0, x8]
into
  add  x0, x0, 1036288
  ldrb w0, [x0, 3704]

Only LDRBBroX is supported for now. Fix #71917.

Note: this PR relands commit 32878c2 with a fix for the crash reported in
PR79756, which is exposed when a MOVKWi instruction appears at the head of a
block without a preceding MOVZWi.
1 parent 36231a5 commit 3319049
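
For concreteness, the arithmetic behind the fold described above can be replayed in isolation. The following is a standalone sketch (not part of the commit) that splits the movz/movk constant from the godbolt example into the add and ldrb immediates:

  // Standalone sketch of the constant split behind the fold (Scale = 1,
  // i.e. the LDRBBroX -> ADDXri + LDRBBui case handled by this commit).
  #include <cassert>

  int main() {
    unsigned Offset = (15u << 16) + 56952; // movk #15, lsl #16 plus movz #56952
    assert(Offset == 1039992);             // the getelementptr index in the test
    unsigned Low = Offset & 0xFFF;         // fits the ui-form's unsigned imm12
    unsigned High = Offset - Low;          // multiple of 4096
    assert(High == 1036288 && High >> 12 == 253); // add x0, x0, #253, lsl #12
    assert(Low == 3704);                          // ldrb w0, [x0, #3704]
    return 0;
  }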

File tree

5 files changed: +273 −12 lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

+10
@@ -4516,6 +4516,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
   return MI.getOperand(Idx);
 }
 
+const MachineOperand &
+AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case AArch64::LDRBBroX:
+    return MI.getOperand(4);
+  }
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)

llvm/lib/Target/AArch64/AArch64InstrInfo.h

+3
@@ -254,6 +254,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   /// Returns whether the physical register is FP or NEON.
   static bool isFpOrNEON(Register Reg);
 
+  /// Returns the shift amount operand of a load/store.
+  static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

+232
@@ -64,6 +64,8 @@ STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
 STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
                                    "not passed the alignment check");
+STATISTIC(NumConstOffsetFolded,
+          "Number of const offset of index address folded");
 
 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
               "Controls which pairs are considered for renaming");
@@ -77,6 +79,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                      cl::Hidden);
 
+// The LdStConstLimit limits how far we search for const offset instructions
+// when we form index address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+                                        cl::init(10), cl::Hidden);
+
 // Enable register renaming to find additional store pairing opportunities.
 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                     cl::init(true), cl::Hidden);
@@ -173,6 +180,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                 int UnscaledOffset, unsigned Limit);
 
+  // Scan the instruction list to find a register assigned with a const
+  // value that can be combined with the current instruction (a load or store)
+  // using base addressing with writeback. Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+                                  unsigned &Offset);
+
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan backwards.
@@ -184,11 +198,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+                              unsigned IndexReg, unsigned &Offset);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  MachineBasicBlock::iterator
+  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                       MachineBasicBlock::iterator Update, unsigned Offset,
+                       int Scale);
+
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -201,6 +223,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge an index ldr/st instruction into a base ld/st instruction.
+  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -483,6 +508,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
   }
 }
 
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+  // TODO: Add more index address loads/stores.
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no base address equivalent!");
+  case AArch64::LDRBBroX:
+    return AArch64::LDRBBui;
+  }
+}
+
 static unsigned getPostIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
@@ -724,6 +759,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
   }
 }
 
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  // TODO: Add more index address loads/stores.
+  case AArch64::LDRBBroX:
+    Scale = 1;
+    return true;
+  }
+}
+
 static bool isRewritableImplicitDef(unsigned Opc) {
   switch (Opc) {
   default:
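
The two TODOs above leave room for more reg+reg forms. A purely hypothetical sketch of how a halfword case might slot in next to the byte case; LDRHHroX/LDRHHui and Scale = 2 are assumptions based on the existing LDRBB handling, not something this commit adds:

  // Hypothetical extension sketch (assumes the AArch64 generated opcode enums);
  // only the LDRBBroX case below is actually supported by this commit.
  static unsigned getBaseAddressOpcodeSketch(unsigned Opc) {
    switch (Opc) {
    case AArch64::LDRBBroX: // byte load, Scale = 1
      return AArch64::LDRBBui;
    case AArch64::LDRHHroX: // halfword load, Scale would be 2
      return AArch64::LDRHHui;
    default:
      return 0; // no base-address equivalent handled
    }
  }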
@@ -2053,6 +2102,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   return NextI;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator Update,
+                                          unsigned Offset, int Scale) {
+  assert((Update->getOpcode() == AArch64::MOVKWi) &&
+         "Unexpected const mov instruction to merge!");
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+  MachineInstr &MemMI = *I;
+  unsigned Mask = (1 << 12) * Scale - 1;
+  unsigned Low = Offset & Mask;
+  unsigned High = Offset - Low;
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+  MachineInstrBuilder AddMIB, MemMIB;
+
+  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+  AddMIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+          .addDef(IndexReg)
+          .addUse(BaseReg)
+          .addImm(High >> 12) // shifted value
+          .addImm(12);        // shift 12
+  (void)AddMIB;
+  // Ld/St DestReg, IndexReg, Imm12
+  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+  MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+               .add(getLdStRegOp(MemMI))
+               .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+               .addImm(Low / Scale)
+               .setMemRefs(I->memoperands())
+               .setMIFlags(I->mergeFlagsWith(*Update));
+  (void)MemMIB;
+
+  ++NumConstOffsetFolded;
+  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  LLVM_DEBUG(PrevI->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(Update->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(I->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
+  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  PrevI->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                                MachineInstr &MI,
                                                unsigned BaseReg, int Offset) {
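
Two details of mergeConstOffsetInsn are worth calling out. Reusing IndexReg as the ADDXri destination is only sound because the load kills that register, which findMatchingConstOffsetBackward checks below. And the Mask/Low/High split is scale-aware so that Low / Scale stays a legal unsigned imm12 for the ui-form opcode. A standalone sketch of that split for a hypothetical Scale = 4 word-sized load (the commit only instantiates Scale = 1):

  // Standalone sketch of the scale-aware split; Scale = 4 is hypothetical here.
  #include <cassert>

  int main() {
    int Scale = 4;
    unsigned Offset = 0x12344;              // assumed 4-byte-aligned offset
    unsigned Mask = (1u << 12) * Scale - 1; // 0x3FFF
    unsigned Low = Offset & Mask;           // 0x2344
    unsigned High = Offset - Low;           // 0x10000, a multiple of 4096
    assert(High >> 12 == 16);               // ADDXri immediate with lsl #12
    assert(Low / Scale == 0x8D1);           // ui-form imm12, within 0..0xFFF
    return 0;
  }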
@@ -2100,6 +2206,34 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
   return false;
 }
 
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+                                                 MachineInstr &MI,
+                                                 unsigned IndexReg,
+                                                 unsigned &Offset) {
+  // The update instruction source and destination register must be the
+  // same as the load/store index register.
+  if (MI.getOpcode() == AArch64::MOVKWi &&
+      TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+    // movz + movk hold a large offset of a Ld/St instruction.
+    MachineBasicBlock::iterator B = MI.getParent()->begin();
+    MachineBasicBlock::iterator MBBI = &MI;
+    // Skip the scene when the MI is the first instruction of a block.
+    if (MBBI == B)
+      return false;
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MovzMI = *MBBI;
+    if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+      unsigned Low = MovzMI.getOperand(1).getImm();
+      unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+      Offset = High + Low;
+      // 12-bit optionally shifted immediates are legal for adds.
+      return Offset >> 24 == 0;
+    }
+  }
+  return false;
+}
+
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
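
The final check above is what keeps both halves of the later split encodable: with Offset below 1 << 24, the high part fits ADDXri's shifted 12-bit immediate and, for the Scale = 1 case this commit enables, the low part fits the load's unsigned 12-bit offset. A small sanity sketch (not part of the commit) using the commit-message constants:

  // Sanity sketch for the Offset >> 24 == 0 bound.
  #include <cassert>

  int main() {
    unsigned Offset = (15u << 16) + 56952; // the movz/movk pair being matched
    assert(Offset >> 24 == 0);             // accepted: splits into two imm12 fields
    unsigned TooBig = 1u << 24;            // e.g. movk #256, lsl #16
    assert(TooBig >> 24 != 0);             // rejected: add #imm, lsl #12 plus a
                                           // 12-bit offset tops out at (1 << 24) - 1
    return 0;
  }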
@@ -2255,6 +2389,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  // If the load is the first instruction in the block, there's obviously
+  // not any matching load or store.
+  if (MBBI == B)
+    return E;
+
+  // Make sure the IndexReg is killed and the shift amount is zero.
+  // TODO: Relax this restriction to extend, simplify processing now.
+  if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+      !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+      (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+    return E;
+
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+  // Track which register units have been modified and used between the first
+  // insn (inclusive) and the second insn.
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+  unsigned Count = 0;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+      return MBBI;
+    }
+
+    // Update the status of what the instruction clobbered and used.
+    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+    // Otherwise, if the index register is used or modified, we have no match,
+    // so return early.
+    if (!ModifiedRegUnits.available(IndexReg) ||
+        !UsedRegUnits.available(IndexReg))
+      return E;
+
+  } while (MBBI != B && Count < Limit);
+  return E;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2443,6 +2631,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   return false;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
+                                              int Scale) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a const offset for index LdSt instruction. For
+  // example,
+  //   mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  //   ldr x1, [x0, x8]
+  // merged into:
+  //   add x8, x0, a * (1<<12)
+  //   ldr x1, [x8, imm12]
+  unsigned Offset;
+  Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
+  if (Update != E && (Offset & (Scale - 1)) == 0) {
+    // Merge the imm12 into the ld/st.
+    MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                         bool EnableNarrowZeroStOpt) {
 
@@ -2521,6 +2737,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // 5) Find a register assigned with a const value that can be combined
+  // into the load or store. e.g.,
+  //        mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  //        ldr x1, [x0, x8]
+  //        ; becomes
+  //        add x8, x0, a * (1<<12)
+  //        ldr x1, [x8, imm12]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    int Scale;
+    if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
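
The (Offset & (Scale - 1)) == 0 guard in tryToMergeIndexLdSt above only matters once scaled opcodes are added: the ui-form immediate is Low / Scale, so an offset that is not a multiple of the access size cannot be folded exactly, while with Scale = 1 the guard is always true. A sketch for a hypothetical Scale = 2 halfword case:

  // Alignment-guard sketch; Scale = 2 is hypothetical, this commit only uses 1.
  #include <cassert>

  int main() {
    int Scale = 2;
    unsigned Foldable = 0x1FFE;  // even offset: Low / Scale is exact
    unsigned Rejected = 0x1FFF;  // odd offset: Low / Scale would truncate
    assert((Foldable & (Scale - 1)) == 0);
    assert((Rejected & (Scale - 1)) != 0);
    return 0;
  }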

llvm/test/CodeGen/AArch64/arm64-addrmode.ll

+6 −9
@@ -214,9 +214,8 @@ define void @t17(i64 %a) {
 define i8 @LdOffset_i8(ptr %a) {
 ; CHECK-LABEL: LdOffset_i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #56952 // =0xde78
-; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
 ; CHECK-NEXT: ret
   %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
   %val = load i8, ptr %arrayidx, align 1
@@ -227,9 +226,8 @@ define i8 @LdOffset_i8(ptr %a) {
 define i32 @LdOffset_i8_zext32(ptr %a) {
 ; CHECK-LABEL: LdOffset_i8_zext32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #56952 // =0xde78
-; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
 ; CHECK-NEXT: ret
   %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
   %val = load i8, ptr %arrayidx, align 1
@@ -255,9 +253,8 @@ define i32 @LdOffset_i8_sext32(ptr %a) {
 define i64 @LdOffset_i8_zext64(ptr %a) {
 ; CHECK-LABEL: LdOffset_i8_zext64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #56952 // =0xde78
-; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
 ; CHECK-NEXT: ret
   %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
   %val = load i8, ptr %arrayidx, align 1
