
Commit 7082034

[AArch64] Optimization of repeated constant loads (#51483)
This change looks for cases where the same 32-bit constant is loaded into both the upper and lower halves of a 64-bit register. When such a pattern feeds a 64-bit store, we materialize only the lower 32-bit constant and replace the store with an instruction that writes it to the upper and lower halves simultaneously. For example:

    renamable $x8 = MOVZXi 49370, 0
    renamable $x8 = MOVKXi $x8, 320, 16
    renamable $x8 = MOVKXi $x8, 49370, 32
    renamable $x8 = MOVKXi $x8, 320, 48
    STRXui killed renamable $x8, killed renamable $x0, 0

becomes:

    $w8 = MOVZWi 49370, 0
    $w8 = MOVKWi $w8, 320, 16
    STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
1 parent c240aca commit 7082034

4 files changed: 258 additions, 0 deletions
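The kind of source that produces this pattern is a store of a 64-bit constant whose two 32-bit halves are equal. A minimal illustrative C++ snippet (not part of the commit; the constant matches the example in the commit message):

    #include <cstdint>

    void store_repeated(uint64_t *p) {
      // 0x0140C0DA0140C0DA repeats 0x0140C0DA in both halves
      // (0x0140 == 320, 0xC0DA == 49370).
      *p = 0x0140C0DA0140C0DAULL; // decimal 90284035103834330
    }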

llvm/lib/Target/AArch64/AArch64ExpandImm.cpp

Lines changed: 10 additions & 0 deletions
@@ -509,6 +509,16 @@ static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
     Imm = ~Imm;

   unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+  Shift += 16;
+  Imm16 = (Imm >> Shift) & Mask;
+  if (Imm16 != (isNeg ? Mask : 0))
+    Insn.push_back(
+        {Opc, Imm16, AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)});
+  if (Imm != 0 && (Imm >> 32) == (Imm & UINT_MAX)) {
+    Insn.push_back({BitSize == 32 ? AArch64::ORRWrs : AArch64::ORRXrs, 0, 32});
+    return;
+  }
+
   while (Shift < LastShift) {
     Shift += 16;
     Imm16 = (Imm >> Shift) & Mask;
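The early return added above fires when the two 32-bit halves of the immediate are identical: once MOVZ/MOVK have built the low half, a single shifted ORR copies it into the high half. A self-contained sketch of that condition (repeatedHalves is an illustrative helper, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // True when the upper and lower 32 bits of a nonzero 64-bit immediate
    // match, i.e. when "ORR Xd, Xd, Xd, LSL #32" can finish the expansion.
    static bool repeatedHalves(uint64_t Imm) {
      return Imm != 0 && (Imm >> 32) == (Imm & 0xFFFFFFFFu);
    }

    int main() {
      assert(repeatedHalves(0x0140C0DA0140C0DAULL)); // 90284035103834330
      assert(!repeatedHalves(0x123456789ULL));       // halves differ
    }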

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

Lines changed: 13 additions & 0 deletions
@@ -168,6 +168,19 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
               .addImm(I->Op2));
     }
     break;
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs: {
+    Register DstReg = MI.getOperand(0).getReg();
+    bool DstIsDead = MI.getOperand(0).isDead();
+    MIBS.push_back(
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+            .addReg(DstReg, RegState::Define |
+                                getDeadRegState(DstIsDead && LastItem) |
+                                RenamableState)
+            .addReg(DstReg)
+            .addReg(DstReg)
+            .addImm(I->Op2));
+  } break;
   case AArch64::ANDXri:
   case AArch64::EORXri:
     if (I->Op1 == 0) {
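This case expands the ORR entry emitted by AArch64ExpandImm into a self-referencing shifted ORR: destination, source, and shifted source are all the same register. The MIR tests below check exactly this sequence for MOVi64imm 90284035103834330:

    renamable $x0 = MOVZXi 49370, 0
    renamable $x0 = MOVKXi $x0, 320, 16
    renamable $x0 = ORRXrs $x0, $x0, 32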

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 173 additions & 0 deletions
@@ -199,6 +199,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

+  // Finds and collapses loads of repeated constant values.
+  bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2250,6 +2257,151 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }

+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+  case AArch64::ORRWrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+          DstRegW)
+      .addImm(Accumulated & Mask)
+      .addImm(0);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+          DstRegW)
+      .addUse(DstRegW)
+      .addImm((Accumulated >> 16) & Mask)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  FirstMovI->eraseFromParent();
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  uint64_t Accumulated = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isRepeatable(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+      DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    uint64_t mask = 0xFFFFUL;
+    Accumulated -= (Accumulated & (mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+
+    if (ValueOrder == 1 && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
+    if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }

+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
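foldRepeatedConstantLoads walks backward from the STRXui, reassembling the constant piece by piece and recording (via SuccIndex) the point at which the accumulated value has matching 32-bit halves. A standalone simulation of that bookkeeping, with the MOVZ/MOVK stream reduced to (value, shift) pairs and the ORR-duplication path omitted for brevity (MovPiece is an illustrative name, not from the patch):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // One MOVZ/MOVK piece: a 16-bit immediate at a 16-bit-aligned shift.
    struct MovPiece {
      uint64_t Value;
      unsigned Shift;
    };

    int main() {
      // Program order for MOVi64imm 90284035103834330 (0x0140C0DA0140C0DA).
      std::vector<MovPiece> Pieces = {
          {49370, 0}, {320, 16}, {49370, 32}, {320, 48}};

      // The pass scans backward from the store, so visit in reverse,
      // clearing each 16-bit field before inserting the new value.
      uint64_t Accumulated = 0;
      for (auto It = Pieces.rbegin(); It != Pieces.rend(); ++It) {
        uint64_t FieldMask = 0xFFFFULL << It->Shift;
        Accumulated = (Accumulated & ~FieldMask) | (It->Value << It->Shift);
      }

      // Foldable once the upper and lower 32 bits agree.
      bool Foldable = Accumulated != 0 &&
                      (Accumulated >> 32) == (Accumulated & 0xFFFFFFFFu);
      std::printf("0x%016llx foldable=%d\n",
                  (unsigned long long)Accumulated, (int)Foldable);
      return 0;
    }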
@@ -2512,6 +2664,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }

+  // We have an opportunity to optimize the `STRXui` instruction when the
+  // stored 64-bit value repeats the same 32-bit value in its upper and
+  // lower halves; `STPWi` lets us materialize the 32-bit value only once.
+  // Considering:
+  //   mov  x8, 49370
+  //   movk x8, 320, lsl #16
+  //   movk x8, 49370, lsl #32
+  //   movk x8, 320, lsl #48
+  //   str  x8, [x0]
+  // Transform:
+  //   mov  w8, 49370
+  //   movk w8, 320, lsl #16
+  //   stp  w8, w8, [x0]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }


Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -run-pass=aarch64-expand-pseudo -run-pass=aarch64-ldst-opt -debug-only=aarch64-ldst-opt %s -o - | FileCheck %s
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load
+# CHECK: renamable $x0 = MOVZXi 49370, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 320, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load
+tracksRegLiveness: true
+body: |
+  bb.0:
+    renamable $x0 = MOVi64imm 90284035103834330
+    RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_neg
+# CHECK: renamable $x0 = MOVZXi 320, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 49370, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load_neg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    renamable $x0 = MOVi64imm -4550323095879417536
+    RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_store_twice
+# CHECK: $w8 = MOVZWi 49370, 0
+# CHECK: $w8 = MOVKWi $w8, 320, 16
+# CHECK: STPWi renamable $w8, renamable $w8, killed renamable $x0, 0
+# CHECK: STRXui killed renamable $x8, killed renamable $x1, 0
+name: test_fold_repeating_constant_load_store_twice
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    STRXui renamable $x8, killed renamable $x0, 0
+    STRXui killed renamable $x8, killed renamable $x1, 0
+    RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_use_reg_before_store
+# CHECK: renamable $x8 = MOVZXi 49370, 0
+# CHECK: renamable $x8 = MOVKXi $x8, 320, 16
+# CHECK: renamable $x8 = ORRXrs $x8, $x8, 32
+# CHECK: renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+# CHECK: STRXui renamable $x8, killed renamable $x0, 0
+name: test_fold_repeating_constant_load_use_reg_before_store
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+    STRXui renamable $x8, killed renamable $x0, 0
+    RET_ReallyLR
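To reproduce these checks locally one would typically run llvm-lit on the new test; the file's path is not shown above, so the path here is a placeholder. Note the RUN line uses -debug-only, which requires an assertions-enabled build of llc.

    llvm-lit -v llvm/test/CodeGen/AArch64/<new-test-file>.mir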
