Commit 892af45

[ARM] Distribute MVE post-increments
This adds some extra processing into the Pre-RA ARM load/store optimizer to detect and merge MVE loads/stores and adds of the same base. We don't always turn these into post-increments during ISel because, the DAG being a graph, we don't always know an order for the nodes — which nodes to make post-inc and which to use the new post-incremented value of. After ISel, we have an order that we can use to post-inc the following instructions.

So this looks for a load/store with a starting offset of 0 and an add/sub from the same base, plus a number of other loads/stores. We then do some checks and convert the zero-offset load/store into a post-inc variant. Any loads/stores after it have the increment subtracted from their immediates. For example:

  LDR #4               LDR #4
  LDR #0               LDR_POSTINC #16
  LDR #8               LDR #-8
  LDR #12              LDR #-4
  ADD #16

It only handles MVE loads/stores at the moment. Normal loads/stores will be added in a followup patch; they just have some extra details to ensure that we keep generating LDRD/LDM successfully.

Differential Revision: https://reviews.llvm.org/D77813
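For context — a hypothetical illustration of mine, not from the commit — this pattern typically comes out of vectorized loops: with 128-bit MVE vectors and some unrolling, each iteration can issue several loads/stores at small offsets from one pointer, plus a single add stepping that pointer, which is exactly the shape the pass folds:

// Hypothetical source shape (not from the commit). Vectorized for MVE and
// unrolled, the loads from `src` can land at offsets #0/#16/#32/#48 off one
// base register, with a separate `src += ...` add per iteration -- the add
// this patch folds into the offset-#0 access as a post-increment.
void accumulate(float *__restrict dst, const float *__restrict src, int n) {
  for (int i = 0; i < n; i++)
    dst[i] += src[i];
}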
1 parent 4eca1c0 commit 892af45

14 files changed (+482 −279 lines)

llvm/lib/Target/ARM/ARMBaseInstrInfo.h

Lines changed: 43 additions & 0 deletions
@@ -788,6 +788,49 @@ bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
                                          const ARMSubtarget *Subtarget,
                                          bool ForCodesize = false);
 
+// Return the immediate if this is ADDri or SUBri, scaled as appropriate.
+// Returns 0 for unknown instructions.
+inline int getAddSubImmediate(MachineInstr &MI) {
+  int Scale = 1;
+  unsigned ImmOp;
+  switch (MI.getOpcode()) {
+  case ARM::t2ADDri:
+    ImmOp = 2;
+    break;
+  case ARM::t2SUBri:
+  case ARM::t2SUBri12:
+    ImmOp = 2;
+    Scale = -1;
+    break;
+  case ARM::tSUBi3:
+  case ARM::tSUBi8:
+    ImmOp = 3;
+    Scale = -1;
+    break;
+  default:
+    return 0;
+  }
+  return Scale * MI.getOperand(ImmOp).getImm();
+}
+
+// Given a memory access Opcode, check that the given Imm would be a valid
+// Offset for this instruction using its addressing mode.
+inline bool isLegalAddressImm(unsigned Opcode, int Imm,
+                              const TargetInstrInfo *TII) {
+  const MCInstrDesc &Desc = TII->get(Opcode);
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i7:
+    return std::abs(Imm) < (((1 << 7) * 1) - 1);
+  case ARMII::AddrModeT2_i7s2:
+    return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0;
+  case ARMII::AddrModeT2_i7s4:
+    return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
+  default:
+    llvm_unreachable("Unhandled Addressing mode");
+  }
+}
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
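The two helpers read together: getAddSubImmediate returns a signed, scale-adjusted immediate (negative for subtracts, 0 for anything unrecognised), and isLegalAddressImm checks an offset against the 7-bit scaled MVE encodings. A standalone sanity check of those ranges (my own sketch, not LLVM code — the scale is passed explicitly instead of being read from the instruction's addressing-mode flags):

#include <cassert>
#include <cstdlib>

// Standalone restatement of the T2_i7 offset checks above, with Scale
// (1, 2 or 4) passed explicitly. Legal offsets are multiples of Scale
// with magnitude below (1 << 7) * Scale - 1.
static bool legalT2i7Offset(int Imm, int Scale) {
  return std::abs(Imm) < ((1 << 7) * Scale - 1) && Imm % Scale == 0;
}

int main() {
  assert(legalT2i7Offset(16, 4));   // multiple of 4, in range
  assert(!legalT2i7Offset(14, 4));  // not a multiple of 4
  assert(legalT2i7Offset(508, 4));  // largest legal positive offset for s4
  assert(!legalT2i7Offset(512, 4)); // outside the 7-bit scaled range
  return 0;
}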

llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp

Lines changed: 238 additions & 3 deletions
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -50,6 +51,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Allocator.h"
@@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
   case ARM::t2STRi8:
   case ARM::t2STRi12:
     return ARM::t2STR_POST;
+
+  case ARM::MVE_VLDRBS16:
+    return ARM::MVE_VLDRBS16_post;
+  case ARM::MVE_VLDRBS32:
+    return ARM::MVE_VLDRBS32_post;
+  case ARM::MVE_VLDRBU16:
+    return ARM::MVE_VLDRBU16_post;
+  case ARM::MVE_VLDRBU32:
+    return ARM::MVE_VLDRBU32_post;
+  case ARM::MVE_VLDRHS32:
+    return ARM::MVE_VLDRHS32_post;
+  case ARM::MVE_VLDRHU32:
+    return ARM::MVE_VLDRHU32_post;
+  case ARM::MVE_VLDRBU8:
+    return ARM::MVE_VLDRBU8_post;
+  case ARM::MVE_VLDRHU16:
+    return ARM::MVE_VLDRHU16_post;
+  case ARM::MVE_VLDRWU32:
+    return ARM::MVE_VLDRWU32_post;
+  case ARM::MVE_VSTRB16:
+    return ARM::MVE_VSTRB16_post;
+  case ARM::MVE_VSTRB32:
+    return ARM::MVE_VSTRB32_post;
+  case ARM::MVE_VSTRH32:
+    return ARM::MVE_VSTRH32_post;
+  case ARM::MVE_VSTRBU8:
+    return ARM::MVE_VSTRBU8_post;
+  case ARM::MVE_VSTRHU16:
+    return ARM::MVE_VSTRHU16_post;
+  case ARM::MVE_VSTRWU32:
+    return ARM::MVE_VSTRWU32_post;
+
   default: llvm_unreachable("Unhandled opcode!");
   }
 }
@@ -2046,6 +2080,7 @@ namespace {
     const TargetRegisterInfo *TRI;
     const ARMSubtarget *STI;
     MachineRegisterInfo *MRI;
+    MachineDominatorTree *DT;
     MachineFunction *MF;
 
     ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -2058,6 +2093,8 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -2071,14 +2108,19 @@ namespace {
                               unsigned Base, bool isLd,
                               DenseMap<MachineInstr*, unsigned> &MI2LocMap);
     bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+    bool DistributeIncrements();
+    bool DistributeIncrements(Register Base);
   };
 
 } // end anonymous namespace
 
 char ARMPreAllocLoadStoreOpt::ID = 0;
 
-INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
-                ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+                      ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+                    ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
 
 // Limit the number of instructions to be rescheduled.
 // FIXME: tune this limit, and/or come up with some better heuristics.
@@ -2094,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = STI->getInstrInfo();
   TRI = STI->getRegisterInfo();
   MRI = &Fn.getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
   MF = &Fn;
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
-  bool Modified = false;
+  bool Modified = DistributeIncrements();
   for (MachineBasicBlock &MFI : Fn)
     Modified |= RescheduleLoadStoreInstrs(&MFI);
 
@@ -2475,6 +2518,198 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
   return RetVal;
 }
 
+// Get the Base register operand index from the memory access MachineInstr if
+// we should attempt to distribute postinc on it. Return -1 if not of a valid
+// instruction type. If it returns an index, it is assumed that instruction is
+// a r+i indexing mode, and getBaseOperandIndex() + 1 is the Offset index.
+static int getBaseOperandIndex(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case ARM::MVE_VLDRBS16:
+  case ARM::MVE_VLDRBS32:
+  case ARM::MVE_VLDRBU16:
+  case ARM::MVE_VLDRBU32:
+  case ARM::MVE_VLDRHS32:
+  case ARM::MVE_VLDRHU32:
+  case ARM::MVE_VLDRBU8:
+  case ARM::MVE_VLDRHU16:
+  case ARM::MVE_VLDRWU32:
+  case ARM::MVE_VSTRB16:
+  case ARM::MVE_VSTRB32:
+  case ARM::MVE_VSTRH32:
+  case ARM::MVE_VSTRBU8:
+  case ARM::MVE_VSTRHU16:
+  case ARM::MVE_VSTRWU32:
+    return 1;
+  }
+  return -1;
+}
+
+static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
+                                            Register NewReg,
+                                            const TargetInstrInfo *TII,
+                                            const TargetRegisterInfo *TRI) {
+  MachineFunction *MF = MI->getMF();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
+      MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);
+
+  const MCInstrDesc &MCID = TII->get(NewOpcode);
+  // Constrain the def register class
+  const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+  MRI.constrainRegClass(NewReg, TRC);
+  // And do the same for the base operand
+  TRC = TII->getRegClass(MCID, 2, TRI, *MF);
+  MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
+
+  return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+      .addReg(NewReg, RegState::Define)
+      .add(MI->getOperand(0))
+      .add(MI->getOperand(1))
+      .addImm(Offset)
+      .add(MI->getOperand(3))
+      .add(MI->getOperand(4))
+      .cloneMemRefs(*MI);
+}
+
+// Given a Base Register, optimise the load/store uses to attempt to create
+// more post-inc accesses. We do this by taking zero offset loads/stores with
+// an add, and converting them to a postinc load/store of the same type. Any
+// subsequent accesses will be adjusted to use and account for the post-inc
+// value.
+// For example:
+// LDR #0            LDR_POSTINC #16
+// LDR #4            LDR #-12
+// LDR #8            LDR #-8
+// LDR #12           LDR #-4
+// ADD #16
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
+  // We are looking for:
+  // One zero offset load/store that can become postinc
+  MachineInstr *BaseAccess = nullptr;
+  // An increment that can be folded in
+  MachineInstr *Increment = nullptr;
+  // Other accesses after BaseAccess that will need to be updated to use the
+  // postinc value
+  SmallPtrSet<MachineInstr *, 8> OtherAccesses;
+  for (auto &Use : MRI->use_nodbg_instructions(Base)) {
+    if (!Increment && getAddSubImmediate(Use) != 0) {
+      Increment = &Use;
+      continue;
+    }
+
+    int BaseOp = getBaseOperandIndex(Use);
+    if (BaseOp == -1)
+      return false;
+
+    if (!Use.getOperand(BaseOp).isReg() ||
+        Use.getOperand(BaseOp).getReg() != Base)
+      return false;
+    if (Use.getOperand(BaseOp + 1).getImm() == 0)
+      BaseAccess = &Use;
+    else
+      OtherAccesses.insert(&Use);
+  }
+
+  if (!BaseAccess || !Increment ||
+      BaseAccess->getParent() != Increment->getParent())
+    return false;
+  Register PredReg;
+  if (Increment->definesRegister(ARM::CPSR) ||
+      getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
+                    << Base.virtRegIndex() << "\n");
+
+  // Make sure that Increment has no uses before BaseAccess.
+  for (MachineInstr &Use :
+       MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
+    if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
+      LLVM_DEBUG(dbgs() << "  BaseAccess doesn't dominate use of increment\n");
+      return false;
+    }
+  }
+
+  // Make sure that Increment can be folded into Base
+  int IncrementOffset = getAddSubImmediate(*Increment);
+  unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
+      BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
+  if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
+    LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on postinc\n");
+    return false;
+  }
+
+  // And make sure that the negative value of the increment can be added to
+  // all other offsets after the BaseAccess. We rely on either
+  // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
+  // to keep things simple.
+  SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
+  for (auto *Use : OtherAccesses) {
+    if (DT->dominates(BaseAccess, Use)) {
+      SuccessorAccesses.insert(Use);
+      unsigned BaseOp = getBaseOperandIndex(*Use);
+      if (!isLegalAddressImm(
+              Use->getOpcode(),
+              Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
+        LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on use\n");
+        return false;
+      }
+    } else if (!DT->dominates(Use, BaseAccess)) {
+      LLVM_DEBUG(
+          dbgs() << "  Unknown dominance relation between Base and Use\n");
+      return false;
+    }
+  }
+
+  // Replace BaseAccess with a post inc
+  LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
+  LLVM_DEBUG(dbgs() << "  And   : "; Increment->dump());
+  Register NewBaseReg = Increment->getOperand(0).getReg();
+  MachineInstr *BaseAccessPost =
+      createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
+  BaseAccess->eraseFromParent();
+  Increment->eraseFromParent();
+  LLVM_DEBUG(dbgs() << "  To    : "; BaseAccessPost->dump());
+
+  for (auto *Use : SuccessorAccesses) {
+    LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
+    unsigned BaseOp = getBaseOperandIndex(*Use);
+    Use->getOperand(BaseOp).setReg(NewBaseReg);
+    int OldOffset = Use->getOperand(BaseOp + 1).getImm();
+    Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
+    LLVM_DEBUG(dbgs() << "  To    : "; Use->dump());
+  }
+
+  // Remove the kill flag from all uses of NewBaseReg, in case any old uses
+  // remain.
+  for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
+    Op.setIsKill(false);
+  return true;
+}
+
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
+  bool Changed = false;
+  SmallSetVector<Register, 4> Visited;
+  for (auto &MBB : *MF) {
+    for (auto &MI : MBB) {
+      int BaseOp = getBaseOperandIndex(MI);
+      if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
+        continue;
+
+      Register Base = MI.getOperand(BaseOp).getReg();
+      if (!Base.isVirtual() || Visited.count(Base))
+        continue;
+
+      Visited.insert(Base);
+    }
+  }
+
+  for (auto Base : Visited)
+    Changed |= DistributeIncrements(Base);
+
+  return Changed;
+}
+
 /// Returns an instance of the load / store optimization pass.
 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
   if (PreAlloc)
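To make the offset arithmetic concrete — a minimal standalone sketch of my own, not the pass itself: the zero-offset access absorbs the increment as a post-inc, and every access dominated by it gets IncrementOffset subtracted from its immediate, reproducing the example in the comment above.

#include <cstdio>
#include <vector>

// Toy model of DistributeIncrements' rewriting (illustration only):
// accesses at offsets {0, 4, 8, 12} from one base, plus an ADD #16 that
// gets folded into the offset-0 access as a post-increment.
int main() {
  const int IncrementOffset = 16; // the folded ADD #16
  const std::vector<int> Offsets = {0, 4, 8, 12};
  for (int Off : Offsets) {
    if (Off == 0)
      std::printf("LDR_POSTINC #%d\n", IncrementOffset); // was LDR #0 + ADD
    else
      std::printf("LDR #%d\n", Off - IncrementOffset);   // e.g. #4 -> #-12
  }
  return 0;
}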

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 2 additions & 15 deletions
@@ -463,21 +463,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   // that the predication will be equivalent. For this we need:
   // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
   // and we can also allow register copies within the chain too.
-  auto IsValidSub = [](MachineInstr *MI, unsigned ExpectedVecWidth) {
-    unsigned ImmOpIdx = 0;
-    switch (MI->getOpcode()) {
-    default:
-      llvm_unreachable("unhandled sub opcode");
-    case ARM::tSUBi3:
-    case ARM::tSUBi8:
-      ImmOpIdx = 3;
-      break;
-    case ARM::t2SUBri:
-    case ARM::t2SUBri12:
-      ImmOpIdx = 2;
-      break;
-    }
-    return MI->getOperand(ImmOpIdx).getImm() == ExpectedVecWidth;
+  auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
+    return -getAddSubImmediate(*MI) == ExpectedVecWidth;
   };
 
   MBB = VCTP->getParent();
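The refactor works because getAddSubImmediate returns subtract immediates negated, and 0 for opcodes it doesn't recognise — so unexpected instructions now fail the comparison instead of hitting llvm_unreachable. A standalone sketch of the condition, with made-up values:

#include <cassert>

// Illustration only (hypothetical values): getAddSubImmediate yields -4
// for a SUB of 4 and 0 for an unknown opcode, so negating it and comparing
// against the vector width reproduces the old hand-rolled check.
static bool isValidSubSketch(int ScaledImm, int ExpectedVecWidth) {
  return -ScaledImm == ExpectedVecWidth;
}

int main() {
  assert(isValidSubSketch(-4, 4));  // SUB #4 against a 4 x i32 VCTP: valid
  assert(!isValidSubSketch(-8, 4)); // wrong decrement: rejected
  assert(!isValidSubSketch(0, 4));  // unknown opcode: rejected, not asserted
  return 0;
}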

llvm/test/CodeGen/ARM/O3-pipeline.ll

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@
 ; CHECK-NEXT:      Remove dead machine instructions
 ; CHECK-NEXT:      MVE VPT Optimisation Pass
 ; CHECK-NEXT:      ARM MLA / MLS expansion pass
+; CHECK-NEXT:      MachineDominator Tree Construction
 ; CHECK-NEXT:      ARM pre- register allocation load / store optimization pass
 ; CHECK-NEXT:      ARM A15 S->D optimizer
 ; CHECK-NEXT:      Detect Dead Lanes
