
Commit 83ee099

[AArch64] Optimization of repeated constant loads (#51483)
This change looks for cases where the same 32-bit constant is loaded into both the upper and lower halves of a 64-bit register. When such a case is found, the constant is materialized only for the lower 32 bits, and the store is replaced with an instruction that writes the value to both halves at once. For example:

  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = MOVKXi $x8, 49370, 32
  renamable $x8 = MOVKXi $x8, 320, 48
  STRXui killed renamable $x8, killed renamable $x0, 0

becomes

  $w8 = MOVZWi 49370, 0
  $w8 = MOVKWi $w8, 320, 16
  STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
1 parent b5b34db commit 83ee099
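The trigger condition is that the 64-bit immediate's two 32-bit halves are equal. A minimal standalone sketch of that predicate (illustrative only, not the commit's code; the in-tree checks below compare `Imm >> 32` against `Imm & UINT_MAX`):

#include <cstdint>

// True when a 64-bit immediate repeats one 32-bit value in both halves,
// e.g. 0x0140C0DA0140C0DA (320 << 16 | 49370, duplicated).
static bool hasRepeatedHalves(uint64_t Imm) {
  return Imm != 0 && (Imm >> 32) == (Imm & 0xFFFFFFFFu);
}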

File tree

4 files changed: +258 -0 lines changed

llvm/lib/Target/AArch64/AArch64ExpandImm.cpp

Lines changed: 10 additions & 0 deletions
@@ -509,6 +509,16 @@ static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
     Imm = ~Imm;

   unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+  Shift += 16;
+  Imm16 = (Imm >> Shift) & Mask;
+  if (Imm16 != (isNeg ? Mask : 0))
+    Insn.push_back(
+        {Opc, Imm16, AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)});
+  if (Imm != 0 && (Imm >> 32) == (Imm & UINT_MAX)) {
+    Insn.push_back({BitSize == 32 ? AArch64::ORRWrs : AArch64::ORRXrs, 0, 32});
+    return;
+  }
+
   while (Shift < LastShift) {
     Shift += 16;
     Imm16 = (Imm >> Shift) & Mask;
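With this change, `expandMOVImmSimple` expands a repeating immediate as a MOVZ, at most one MOVK, and a single ORR with LSL #32, instead of the full four-instruction MOVZ/MOVK chain. A hedged sketch of the resulting sequence for the commit's example constant (the helper name and printed mnemonics are invented for illustration):

#include <cstdint>
#include <cstdio>

// Print the shortened expansion for an immediate with equal 32-bit halves.
static void printRepeatedExpansion(uint64_t Imm) {
  uint64_t Lo = Imm & 0xFFFFFFFFu;
  if (Imm == 0 || (Imm >> 32) != Lo)
    return; // general MOVZ/MOVK path, not sketched here
  std::printf("movz x8, #%llu\n", (unsigned long long)(Lo & 0xFFFF));
  if ((Lo >> 16) != 0)
    std::printf("movk x8, #%llu, lsl #16\n", (unsigned long long)(Lo >> 16));
  std::printf("orr  x8, x8, x8, lsl #32\n"); // duplicate low half into high
}

int main() {
  printRepeatedExpansion(0x0140C0DA0140C0DAull); // 90284035103834330
  return 0;
}

For 0x0140C0DA0140C0DA this prints the three-instruction form that the MIR tests below check for.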

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

Lines changed: 13 additions & 0 deletions
@@ -168,6 +168,19 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
             .addImm(I->Op2));
     }
     break;
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs: {
+    Register DstReg = MI.getOperand(0).getReg();
+    bool DstIsDead = MI.getOperand(0).isDead();
+    MIBS.push_back(
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+            .addReg(DstReg, RegState::Define |
+                                getDeadRegState(DstIsDead && LastItem) |
+                                RenamableState)
+            .addReg(DstReg)
+            .addReg(DstReg)
+            .addImm(I->Op2));
+  } break;
   case AArch64::ANDXri:
   case AArch64::EORXri:
     if (I->Op1 == 0) {
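The new `ORRWrs`/`ORRXrs` case expands the `{Opc, 0, 32}` entry pushed by `expandMOVImmSimple` into an `orr xD, xD, xD, lsl #32` self-copy, i.e. `Dst |= Dst << 32`. A quick self-contained check of that identity (illustrative only, not part of the commit):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x0140C0DAu; // after movz/movk, the high half is still zero
  X |= X << 32;             // effect of orr x8, x8, x8, lsl #32
  assert(X == 0x0140C0DA0140C0DAull);
  return 0;
}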

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 173 additions & 0 deletions
@@ -201,6 +201,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

+  // Finds and collapses loads of repeated constant values.
+  bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2259,151 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }

+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+  case AArch64::ORRWrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+      return true;
+  }
+
+  return false;
+}
+
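`MatchBaseReg(Count)` checks that every register among the first `Count` operands is the chain's register: the def for `MOVZXi`, the def plus the tied use for `MOVKXi`, and the def plus both sources for the `ORR` self-copy. A self-contained sketch of the same check over a hypothetical `Operand` struct (invented for illustration; LLVM's real `MachineOperand` API differs):

#include <vector>

struct Operand {
  bool IsReg;
  unsigned Reg; // meaningful only when IsReg is true
};

// Every register among the first Count operands must be BaseReg.
static bool matchBaseReg(const std::vector<Operand> &Ops, unsigned Count,
                         unsigned BaseReg) {
  for (unsigned I = 0; I < Count; I++)
    if (Ops[I].IsReg && Ops[I].Reg != BaseReg)
      return false;
  return true;
}

int main() {
  // MOVKXi $x8, $x8, 320, 16 -> def $x8, tied use $x8, imm, imm.
  std::vector<Operand> MovK = {{true, 8}, {true, 8}, {false, 0}, {false, 0}};
  return matchBaseReg(MovK, 2, /*BaseReg=*/8) ? 0 : 1;
}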
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+          DstRegW)
+      .addImm(Accumulated & Mask)
+      .addImm(0);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+          DstRegW)
+      .addUse(DstRegW)
+      .addImm((Accumulated >> 16) & Mask)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  FirstMovI->eraseFromParent();
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
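The rewrite above is justified by a byte-level equivalence: once the register's halves are equal, `stp w8, w8, [x0]` writes the same eight bytes as `str x8, [x0]` on little-endian AArch64. A host-side illustration (a sketch that assumes a little-endian host, as the comments note):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t A[8], B[8];
  uint64_t X = 0x0140C0DA0140C0DAull; // str x8, [x0]
  uint32_t W = 0x0140C0DAu;           // stp w8, w8, [x0]
  std::memcpy(A, &X, 8);
  std::memcpy(B, &W, 4);
  std::memcpy(B + 4, &W, 4);
  assert(std::memcmp(A, B, 8) == 0);  // identical bytes on little-endian
  return 0;
}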
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  uint64_t Accumulated = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isRepeatable(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+      DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    uint64_t mask = 0xFFFFUL;
+    Accumulated -= (Accumulated & (mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+
+    if (ValueOrder == 1 && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
+    if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
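The backward scan keeps `Accumulated` up to date by clearing each 16-bit slot and re-inserting the chunk it finds, and records in `SuccIndex` how much of the chain suffices once the two halves match. A hedged simulation of just that arithmetic for the example constant (the real pass walks `MachineInstr`s; this mirrors only the math):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Accumulated = 0;
  const uint64_t Mask = 0xFFFFull;
  // (value, shift) chunks as the scan visits the MOVZ/MOVK chain in reverse.
  const uint64_t Chunks[][2] = {{320, 48}, {49370, 32}, {320, 16}, {49370, 0}};
  for (auto &C : Chunks) {
    Accumulated -= Accumulated & (Mask << C[1]); // clear the 16-bit slot
    Accumulated += C[0] << C[1];                 // insert the new chunk
  }
  // Halves match, so SuccIndex would be set and the fold can fire.
  assert((Accumulated >> 32) == (Accumulated & 0xFFFFFFFFu));
  return 0;
}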
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2518,6 +2670,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }

+  // If a constant whose upper and lower 32 bits are equal is materialized and
+  // then stored with `STRXui`, we can materialize only the low 32 bits and
+  // store the register pair with `STPWi` instead.
+  // Considering :
+  //   mov x8, 49370
+  //   movk x8, 320, lsl #16
+  //   movk x8, 49370, lsl #32
+  //   movk x8, 320, lsl #48
+  //   str x8, [x0]
+  // Transform :
+  //   mov w8, 49370
+  //   movk w8, 320, lsl #16
+  //   stp w8, w8, [x0]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -run-pass=aarch64-expand-pseudo -run-pass=aarch64-ldst-opt -debug-only=aarch64-ldst-opt %s -o - | FileCheck %s
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load
+# CHECK: renamable $x0 = MOVZXi 49370, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 320, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load
+tracksRegLiveness: true
+body: |
+  bb.0:
+    renamable $x0 = MOVi64imm 90284035103834330
+    RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_neg
+# CHECK: renamable $x0 = MOVZXi 320, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 49370, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load_neg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    renamable $x0 = MOVi64imm -4550323095879417536
+    RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_store_twice
+# CHECK: $w8 = MOVZWi 49370, 0
+# CHECK: $w8 = MOVKWi $w8, 320, 16
+# CHECK: STPWi renamable $w8, renamable $w8, killed renamable $x0, 0
+# CHECK: STRXui killed renamable $x8, killed renamable $x1, 0
+name: test_fold_repeating_constant_load_store_twice
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    STRXui renamable $x8, killed renamable $x0, 0
+    STRXui killed renamable $x8, killed renamable $x1, 0
+    RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_use_reg_before_store
+# CHECK: renamable $x8 = MOVZXi 49370, 0
+# CHECK: renamable $x8 = MOVKXi $x8, 320, 16
+# CHECK: renamable $x8 = ORRXrs $x8, $x8, 32
+# CHECK: renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+# CHECK: STRXui renamable $x8, killed renamable $x0, 0
+name: test_fold_repeating_constant_load_use_reg_before_store
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+    STRXui renamable $x8, killed renamable $x0, 0
+    RET_ReallyLR
