Commit 1e6a5ff
[AArch64] Optimize MOV to STP when loading symmetric constants (#51483)
This change looks for cases of symmetric constant loading: a `symmetric constant load` is one where the upper 32 bits and the lower 32 bits of a 64-bit register are loaded with the same value. When the pass finds such a sequence, it rewrites it to materialize only the lower 32 bits of the constant and then store that value into both halves of the destination at once with a store pair. For example:

```
renamable $x8 = MOVZXi 49370, 0
renamable $x8 = MOVKXi $x8, 320, 16
renamable $x8 = MOVKXi $x8, 49370, 32
renamable $x8 = MOVKXi $x8, 320, 48
STRXui killed renamable $x8, killed renamable $x0, 0
```

becomes:

```
$w8 = MOVZWi 49370, 0
$w8 = MOVKWi $w8, 320, 16
STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
```
1 parent a187e3b commit 1e6a5ff
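For context, the constant used in the updated test below, 90284035103834330, is 0x0140C0DA0140C0DA: both 32-bit halves are 0x0140C0DA (movz 49370 = 0xC0DA; movk 320 = 0x0140, lsl #16). A hypothetical C++ reproducer (not part of the commit) for this kind of store:

```cpp
#include <cstdint>

// 0x0140C0DA0140C0DA repeats 0x0140C0DA in both 32-bit halves, so storing
// it is a candidate for the MOVZWi/MOVKWi + STPWi rewrite in this patch.
void storeSymmetric(uint64_t *P) {
  *P = 0x0140C0DA0140C0DAULL; // 90284035103834330
}
```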

File tree

2 files changed (+176, -4 lines)

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 173 additions & 0 deletions
```diff
@@ -201,6 +201,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of repeated constant values.
+  bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
```
```diff
@@ -2252,6 +2259,151 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+  case AArch64::ORRWrs: {
+    MachineOperand &Imm = MI.getOperand(3);
+    unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+      return true;
+  }
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+          DstRegW)
+      .addImm(Accumulated & Mask)
+      .addImm(0);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+          DstRegW)
+      .addUse(DstRegW)
+      .addImm((Accumulated >> 16) & Mask)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  FirstMovI->eraseFromParent();
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  uint64_t Accumulated = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isRepeatable(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+      DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    uint64_t Mask = 0xFFFFUL;
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+
+    if (ValueOrder == 1 && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
+    if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
```
```diff
@@ -2518,6 +2670,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
     ++MBBI;
   }
 
+  // Optimize an STRXui of a 64-bit constant whose upper and lower 32 bits
+  // are identical: materialize only the 32-bit half once and store it to
+  // both halves of the destination with STPWi.
+  // Considering:
+  //   mov x8, 49370
+  //   movk x8, 320, lsl #16
+  //   movk x8, 49370, lsl #32
+  //   movk x8, 320, lsl #48
+  //   str x8, [x0]
+  // Transform:
+  //   mov w8, 49370
+  //   movk w8, 320, lsl #16
+  //   stp w8, w8, [x0]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
```

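As a sanity check on the bookkeeping in `foldRepeatedConstantLoads` above, here is a small standalone C++ model (mine, not part of the patch) applied in program order: MOVZ seeds the register, each MOVK overwrites one 16-bit lane, and the fold fires when the two 32-bit halves of the accumulated value match.

```cpp
#include <cstdint>
#include <cstdio>

// MOVZ zeroes the register and inserts one 16-bit chunk.
static uint64_t applyMovz(uint64_t Imm, unsigned Shift) {
  return Imm << Shift;
}

// MOVK overwrites a single 16-bit lane, leaving the rest intact.
static uint64_t applyMovk(uint64_t Acc, uint64_t Imm, unsigned Shift) {
  uint64_t Mask = 0xFFFFULL << Shift;
  return (Acc & ~Mask) | (Imm << Shift);
}

// The pass's success condition: a nonzero value with matching halves.
static bool isSymmetric(uint64_t Acc) {
  return Acc != 0 && (Acc >> 32) == (Acc & 0xFFFFFFFFULL);
}

int main() {
  // The commit-message example: the mov/movk chain building the constant.
  uint64_t Acc = applyMovz(49370, 0);
  Acc = applyMovk(Acc, 320, 16);
  Acc = applyMovk(Acc, 49370, 32);
  Acc = applyMovk(Acc, 320, 48);
  // Prints 0x0140c0da0140c0da symmetric=1: the halves match, so the
  // STRXui can become MOVZWi + MOVKWi + STPWi.
  std::printf("0x%016llx symmetric=%d\n", (unsigned long long)Acc,
              isSymmetric(Acc));
  return 0;
}
```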
llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir

Lines changed: 3 additions & 4 deletions
```diff
@@ -41,10 +41,9 @@ body: |
     ; CHECK-LABEL: name: test_fold_repeating_constant_load_store_twice
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
-    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
-    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
-    ; CHECK-NEXT: STRXui renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: $w8 = MOVZWi 49370, 0
+    ; CHECK-NEXT: $w8 = MOVKWi $w8, 320, 16
+    ; CHECK-NEXT: STPWi renamable $w8, renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x1, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVi64imm 90284035103834330
```

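As a usage note, MIR tests like this one are driven by feeding the file through the pass under test; the file's actual RUN line is not shown in this truncated diff, but an invocation along these lines exercises the fold:

```sh
llc -mtriple=aarch64 -run-pass=aarch64-ldst-opt -verify-machineinstrs \
    llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir -o - \
  | FileCheck llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
```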