@@ -64,6 +64,8 @@ STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
 STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
                                    "not passed the alignment check");
+STATISTIC(NumConstOffsetFolded,
+          "Number of const offset of index address folded");
 
 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
               "Controls which pairs are considered for renaming");
@@ -77,6 +79,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                      cl::Hidden);
 
+// The LdStConstLimit limits how far we search for const offset instructions
+// when we form index address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+                                        cl::init(10), cl::Hidden);
+
 // Enable register renaming to find additional store pairing opportunities.
 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                     cl::init(true), cl::Hidden);
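
Since LdStConstLimit is a cl::opt, the new scan cutoff can be tuned from the command line when the pass runs under llc; a hypothetical invocation (flag name taken from the option string above):

    llc -mtriple=aarch64 -aarch64-load-store-const-scan-limit=20 input.ll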
@@ -173,6 +180,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                 int UnscaledOffset, unsigned Limit);
 
+  // Scan the instruction list to find a register assigned with a const
+  // value that can be combined with the current instruction (a load or store)
+  // by folding the const into a base-plus-immediate addressing mode.
+  // Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+                                  unsigned &Offset);
+
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan backwards.
@@ -184,11 +198,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+                              unsigned IndexReg, unsigned &Offset);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  MachineBasicBlock::iterator
+  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                       MachineBasicBlock::iterator Update, unsigned Offset,
+                       int Scale);
+
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -201,6 +223,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge an index ld/st instruction into a base ld/st instruction.
+  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -483,6 +508,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
   }
 }
 
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+  // TODO: Add more index address loads/stores.
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no base address equivalent!");
+  case AArch64::LDRBBroX:
+    return AArch64::LDRBBui;
+  }
+}
+
 static unsigned getPostIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
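
The TODO invites more roX -> ui mappings. A hypothetical extension, not part of this patch (each new case would also need a matching Scale entry in isMergeableIndexLdSt below):

    static unsigned getBaseAddressOpcode(unsigned Opc) {
      switch (Opc) {
      default:
        llvm_unreachable("Opcode has no base address equivalent!");
      case AArch64::LDRBBroX:
        return AArch64::LDRBBui; // handled by this patch
      case AArch64::LDRHHroX:
        return AArch64::LDRHHui; // hypothetical: halfword load, Scale = 2
      case AArch64::LDRWroX:
        return AArch64::LDRWui;  // hypothetical: word load, Scale = 4
      }
    }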
@@ -724,6 +759,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
   }
 }
 
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  // TODO: Add more index address loads/stores.
+  case AArch64::LDRBBroX:
+    Scale = 1;
+    return true;
+  }
+}
+
 static bool isRewritableImplicitDef(unsigned Opc) {
   switch (Opc) {
   default:
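
Only the reg+reg (roX) byte load qualifies so far; the Scale it reports is the access size, which later divides the folded low part down to an imm12. Two hypothetical candidates:

    ldrb w1, [x0, x8]   ; LDRBBroX: matches, Scale = 1 (byte access)
    ldr  x1, [x0, x8]   ; LDRXroX: hits `default` above, not yet mergeable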
@@ -2053,6 +2102,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   return NextI;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator Update,
+                                          unsigned Offset, int Scale) {
+  assert((Update->getOpcode() == AArch64::MOVKWi) &&
+         "Unexpected const mov instruction to merge!");
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+  MachineInstr &MemMI = *I;
+  unsigned Mask = (1 << 12) * Scale - 1;
+  unsigned Low = Offset & Mask;
+  unsigned High = Offset - Low;
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+  MachineInstrBuilder AddMIB, MemMIB;
+
+  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+  AddMIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+          .addDef(IndexReg)
+          .addUse(BaseReg)
+          .addImm(High >> 12) // shifted value
+          .addImm(12);        // shift 12
+  (void)AddMIB;
+  // Ld/St DestReg, IndexReg, Imm12
+  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+  MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+               .add(getLdStRegOp(MemMI))
+               .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+               .addImm(Low / Scale)
+               .setMemRefs(I->memoperands())
+               .setMIFlags(I->mergeFlagsWith(*Update));
+  (void)MemMIB;
+
+  ++NumConstOffsetFolded;
+  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  LLVM_DEBUG(PrevI->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(Update->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(I->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
+  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  PrevI->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                                MachineInstr &MI,
                                                unsigned BaseReg, int Offset) {
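
The Mask/Low/High split is easiest to see with a number. A self-contained sketch in plain C++, outside LLVM (values hypothetical; Scale = 1 as in the LDRBBroX case):

    #include <cassert>
    #include <cstdio>

    int main() {
      unsigned Offset = 0x12345; // materialized by movz #0x2345 + movk #0x1, lsl #16
      int Scale = 1;             // access size; LDRBBui scales its imm12 by 1
      unsigned Mask = (1 << 12) * Scale - 1;
      unsigned Low = Offset & Mask; // 0x345: becomes the ui-form's imm12
      unsigned High = Offset - Low; // 0x12000: folded into the ADDXri
      assert(High >> 12 <= 0xFFF && "high part must fit ADD's shifted imm12");
      printf("add x8, x0, #0x%x, lsl #12\n", High >> 12); // prints #0x12
      printf("ldrb w1, [x8, #0x%x]\n", Low / Scale);      // prints #0x345
    }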
@@ -2100,6 +2206,34 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
   return false;
 }
 
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+                                                 MachineInstr &MI,
+                                                 unsigned IndexReg,
+                                                 unsigned &Offset) {
+  // The update instruction source and destination register must be the
+  // same as the load/store index register.
+  if (MI.getOpcode() == AArch64::MOVKWi &&
+      TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+    // movz + movk hold a large offset of a Ld/St instruction.
+    MachineBasicBlock::iterator B = MI.getParent()->begin();
+    MachineBasicBlock::iterator MBBI = &MI;
+    // Skip the case when the MI is the first instruction of a block.
+    if (MBBI == B)
+      return false;
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MovzMI = *MBBI;
+    if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+      unsigned Low = MovzMI.getOperand(1).getImm();
+      unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+      Offset = High + Low;
+      // 12-bit optionally shifted immediates are legal for adds.
+      return Offset >> 24 == 0;
+    }
+  }
+  return false;
+}
+
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
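
The matcher reassembles the constant from the movz/movk pair; a standalone sketch of its arithmetic (operand indices as in the code above, values hypothetical):

    #include <cstdio>

    int main() {
      // movz w8, #0x2345       -> MOVZWi imm16 = 0x2345 (operand 1)
      // movk w8, #0x1, lsl #16 -> MOVKWi imm16 = 0x1 (operand 2), shift = 16 (operand 3)
      unsigned Low = 0x2345;
      unsigned High = 0x1u << 16;
      unsigned Offset = High + Low; // 0x12345
      // Legal only if it fits "add #imm12, lsl #12" plus a ui-form imm12,
      // i.e. at most 24 bits:
      printf("Offset = %#x, legal = %d\n", Offset, (Offset >> 24) == 0);
    }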
@@ -2255,6 +2389,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  // If the load is the first instruction in the block, there's obviously
+  // not any matching load or store.
+  if (MBBI == B)
+    return E;
+
+  // Make sure the IndexReg is killed and the shift amount is zero.
+  // TODO: Relax this restriction to allow extends; keep the processing simple
+  // for now.
+  if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+      !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+      (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+    return E;
+
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+  // Track which register units have been modified and used between the first
+  // insn (inclusive) and the second insn.
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+  unsigned Count = 0;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+      return MBBI;
+    }
+
+    // Update the status of what the instruction clobbered and used.
+    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+    // Otherwise, if the index register is used or modified, we have no match,
+    // so return early.
+    if (!ModifiedRegUnits.available(IndexReg) ||
+        !UsedRegUnits.available(IndexReg))
+      return E;
+
+  } while (MBBI != B && Count < Limit);
+  return E;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
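
The backward walk is conservative. A hypothetical sequence it must reject: the index register is read between the movk and the memory op, so UsedRegUnits reports x8 unavailable and the scan returns E:

    movz w8, #0x2345
    movk w8, #0x1, lsl #16
    add  w9, w8, w7      ; reads w8 -> UsedRegUnits.available(x8) fails
    ldrb w1, [x0, x8]    ; the scan starts here and walks backwards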
@@ -2443,6 +2631,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   return false;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
+                                              int Scale) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a const offset for an index LdSt instruction.
+  // For example,
+  // mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  // ldr x1, [x0, x8]
+  // merged into:
+  // add x8, x0, a * (1<<12)
+  // ldr x1, [x8, imm12]
+  unsigned Offset;
+  Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
+  if (Update != E && (Offset & (Scale - 1)) == 0) {
+    // Merge the imm12 into the ld/st.
+    MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                         bool EnableNarrowZeroStOpt) {
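
Instantiating the comment's example with concrete, hypothetical values (Offset = 0x12345, Scale = 1; High = 0x12000 goes into the add, Low = 0x345 into the imm12):

    ; before
    movz w8, #0x2345
    movk w8, #0x1, lsl #16        ; x8 = 0x12345
    ldrb w1, [x0, x8]             ; x8 killed here
    ; after
    add  x8, x0, #0x12, lsl #12   ; x8 = x0 + 0x12000
    ldrb w1, [x8, #0x345]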
@@ -2521,6 +2737,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // 5) Find a register assigned with a const value that can be folded into
+  //    the load or store. e.g.,
+  //        mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  //        ldr x1, [x0, x8]
+  //        ; becomes
+  //        add x8, x0, a * (1<<12)
+  //        ldr x1, [x8, imm12]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    int Scale;
+    if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }