Commit 1b11729

[AArch64][GlobalISel] Add support for post-indexed loads/stores. (#69532)
Gives small code size improvements across the board on CTMark at -Os. Much of the work is porting the existing heuristics from the DAGCombiner.
1 parent bac3808 commit 1b11729

16 files changed: +880, -827 lines

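For context, a minimal source-level sketch (not taken from the commit or its tests; names are made up) of the pointer-bump pattern this combine targets. At the GlobalISel level, the G_STORE plus the following G_PTR_ADD can be merged into a single G_INDEXED_STORE, which on AArch64 selects to a post-indexed store such as str x1, [x0], #8.

// Illustrative only.
void fill(long *p, long v, int n) {
  for (int i = 0; i < n; ++i) {
    *p = v;   // G_STORE %v(s64), %p(p0)
    p += 1;   // %p.next = G_PTR_ADD %p, 8  -- folds into a post-indexed store
  }
}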
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h

Lines changed: 5 additions & 1 deletion
@@ -58,6 +58,8 @@ struct IndexedLoadStoreMatchInfo {
   Register Addr;
   Register Base;
   Register Offset;
+  bool RematOffset; // True if Offset is a constant that needs to be
+                    // rematerialized before the new load/store.
   bool IsPre;
 };

@@ -814,12 +816,14 @@ class CombinerHelper {
   void applyCommuteBinOpOperands(MachineInstr &MI);

 private:
+  /// Checks for legality of an indexed variant of \p LdSt.
+  bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
   /// Given a non-indexed load or store instruction \p MI, find an offset that
   /// can be usefully and legally folded into it as a post-indexing operation.
   ///
   /// \returns true if a candidate is found.
   bool findPostIndexCandidate(GLoadStore &MI, Register &Addr, Register &Base,
-                              Register &Offset);
+                              Register &Offset, bool &RematOffset);

   /// Given a non-indexed load or store instruction \p MI, find an offset that
   /// can be usefully and legally folded into it as a pre-indexing operation.

llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ class GIndexedLoad : public GMemOperation {
   /// Get the offset register of the pointer value.
   Register getOffsetReg() const { return getOperand(3).getReg(); }

-  bool isPre() const { return getOperand(5).getImm() == 1; }
+  bool isPre() const { return getOperand(4).getImm() == 1; }
   bool isPost() const { return !isPre(); }

   static bool classof(const MachineInstr *MI) {
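The one-line fix above matches the generic G_INDEXED_LOAD operand layout: operand 0 is the loaded value, operand 1 the written-back pointer, operand 2 the base pointer, operand 3 the offset (what getOffsetReg() returns), and operand 4 the pre/post immediate. A hedged sketch of building such an instruction with MachineIRBuilder; the helper and register names are hypothetical, not part of this commit:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

// Hypothetical helper, shown only to make the operand indices concrete.
static MachineInstrBuilder buildPostIndexedLoad(MachineIRBuilder &B, Register Dst,
                                                Register NewAddr, Register Base,
                                                Register Offset) {
  return B.buildInstr(TargetOpcode::G_INDEXED_LOAD)
      .addDef(Dst)     // operand 0: loaded value
      .addDef(NewAddr) // operand 1: written-back address
      .addUse(Base)    // operand 2: base pointer
      .addUse(Offset)  // operand 3: offset register
      .addImm(0);      // operand 4: 0 = post-indexed, 1 = pre-indexed
}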

llvm/include/llvm/Target/GlobalISel/Combine.td

Lines changed: 1 addition & 1 deletion
@@ -1248,7 +1248,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,

 def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     extract_vec_elt_combines, combines_for_extload,
-    combine_indexed_load_store, undef_combines, identity_combines, phi_combines,
+    undef_combines, identity_combines, phi_combines,
     simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
     reassocs, ptr_add_immed_chain,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 158 additions & 48 deletions
@@ -945,42 +945,164 @@ void CombinerHelper::applySextInRegOfLoad(
   MI.eraseFromParent();
 }

+static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
+  if (Ty.isVector())
+    return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+                                Ty.getNumElements());
+  return IntegerType::get(C, Ty.getSizeInBits());
+}
+
+/// Return true if 'MI' is a load or a store that may be fold it's address
+/// operand into the load / store addressing mode.
+static bool canFoldInAddressingMode(GLoadStore *MI, const TargetLowering &TLI,
+                                    MachineRegisterInfo &MRI) {
+  TargetLowering::AddrMode AM;
+  auto *MF = MI->getMF();
+  auto *Addr = getOpcodeDef<GPtrAdd>(MI->getPointerReg(), MRI);
+  if (!Addr)
+    return false;
+
+  AM.HasBaseReg = true;
+  if (auto CstOff = getIConstantVRegVal(Addr->getOffsetReg(), MRI))
+    AM.BaseOffs = CstOff->getSExtValue(); // [reg +/- imm]
+  else
+    AM.Scale = 1; // [reg +/- reg]
+
+  return TLI.isLegalAddressingMode(
+      MF->getDataLayout(), AM,
+      getTypeForLLT(MI->getMMO().getMemoryType(),
+                    MF->getFunction().getContext()),
+      MI->getMMO().getAddrSpace());
+}
+
+static unsigned getIndexedOpc(unsigned LdStOpc) {
+  switch (LdStOpc) {
+  case TargetOpcode::G_LOAD:
+    return TargetOpcode::G_INDEXED_LOAD;
+  case TargetOpcode::G_STORE:
+    return TargetOpcode::G_INDEXED_STORE;
+  case TargetOpcode::G_ZEXTLOAD:
+    return TargetOpcode::G_INDEXED_ZEXTLOAD;
+  case TargetOpcode::G_SEXTLOAD:
+    return TargetOpcode::G_INDEXED_SEXTLOAD;
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
+  // Check for legality.
+  LLT PtrTy = MRI.getType(LdSt.getPointerReg());
+  LLT Ty = MRI.getType(LdSt.getReg(0));
+  LLT MemTy = LdSt.getMMO().getMemoryType();
+  SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
+      {{MemTy, MemTy.getSizeInBits(), AtomicOrdering::NotAtomic}});
+  unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode());
+  SmallVector<LLT> OpTys;
+  if (IndexedOpc == TargetOpcode::G_INDEXED_STORE)
+    OpTys = {PtrTy, Ty, Ty};
+  else
+    OpTys = {Ty, PtrTy}; // For G_INDEXED_LOAD, G_INDEXED_[SZ]EXTLOAD
+
+  LegalityQuery Q(IndexedOpc, OpTys, MemDescrs);
+  return isLegal(Q);
+}
+
+static cl::opt<unsigned> PostIndexUseThreshold(
+    "post-index-use-threshold", cl::Hidden, cl::init(32),
+    cl::desc("Number of uses of a base pointer to check before it is no longer "
+             "considered for post-indexing."));
+
 bool CombinerHelper::findPostIndexCandidate(GLoadStore &LdSt, Register &Addr,
-                                            Register &Base, Register &Offset) {
-  auto &MF = *LdSt.getParent()->getParent();
-  const auto &TLI = *MF.getSubtarget().getTargetLowering();
+                                            Register &Base, Register &Offset,
+                                            bool &RematOffset) {
+  // We're looking for the following pattern, for either load or store:
+  // %baseptr:_(p0) = ...
+  // G_STORE %val(s64), %baseptr(p0)
+  // %offset:_(s64) = G_CONSTANT i64 -256
+  // %new_addr:_(p0) = G_PTR_ADD %baseptr, %offset(s64)
+  const auto &TLI = getTargetLowering();
+
+  Register Ptr = LdSt.getPointerReg();
+  // If the store is the only use, don't bother.
+  if (MRI.hasOneNonDBGUse(Ptr))
+    return false;

-  Base = LdSt.getPointerReg();
+  if (!isIndexedLoadStoreLegal(LdSt))
+    return false;

-  if (getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Base, MRI))
+  if (getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Ptr, MRI))
     return false;

-  // FIXME: The following use traversal needs a bail out for patholigical cases.
-  for (auto &Use : MRI.use_nodbg_instructions(Base)) {
+  MachineInstr *StoredValDef = getDefIgnoringCopies(LdSt.getReg(0), MRI);
+  auto *PtrDef = MRI.getVRegDef(Ptr);
+
+  unsigned NumUsesChecked = 0;
+  for (auto &Use : MRI.use_nodbg_instructions(Ptr)) {
+    if (++NumUsesChecked > PostIndexUseThreshold)
+      return false; // Try to avoid exploding compile time.
+
     auto *PtrAdd = dyn_cast<GPtrAdd>(&Use);
-    if (!PtrAdd)
+    // The use itself might be dead. This can happen during combines if DCE
+    // hasn't had a chance to run yet. Don't allow it to form an indexed op.
+    if (!PtrAdd || MRI.use_nodbg_empty(PtrAdd->getReg(0)))
+      continue;
+
+    // Check the user of this isn't the store, otherwise we'd be generate a
+    // indexed store defining its own use.
+    if (StoredValDef == &Use)
       continue;

     Offset = PtrAdd->getOffsetReg();
     if (!ForceLegalIndexing &&
-        !TLI.isIndexingLegal(LdSt, Base, Offset, /*IsPre*/ false, MRI))
+        !TLI.isIndexingLegal(LdSt, PtrAdd->getBaseReg(), Offset,
+                             /*IsPre*/ false, MRI))
       continue;

     // Make sure the offset calculation is before the potentially indexed op.
     MachineInstr *OffsetDef = MRI.getVRegDef(Offset);
-    if (!dominates(*OffsetDef, LdSt))
-      continue;
+    RematOffset = false;
+    if (!dominates(*OffsetDef, LdSt)) {
+      // If the offset however is just a G_CONSTANT, we can always just
+      // rematerialize it where we need it.
+      if (OffsetDef->getOpcode() != TargetOpcode::G_CONSTANT)
+        continue;
+      RematOffset = true;
+    }

-    // FIXME: check whether all uses of Base are load/store with foldable
-    // addressing modes. If so, using the normal addr-modes is better than
-    // forming an indexed one.
-    if (any_of(MRI.use_nodbg_instructions(PtrAdd->getReg(0)),
-               [&](MachineInstr &PtrAddUse) {
-                 return !dominates(LdSt, PtrAddUse);
-               }))
-      continue;
+    for (auto &BasePtrUse : MRI.use_nodbg_instructions(PtrAdd->getBaseReg())) {
+      if (&BasePtrUse == PtrDef)
+        continue;
+
+      // If the user is a later load/store that can be post-indexed, then don't
+      // combine this one.
+      auto *BasePtrLdSt = dyn_cast<GLoadStore>(&BasePtrUse);
+      if (BasePtrLdSt && BasePtrLdSt != &LdSt &&
+          dominates(LdSt, *BasePtrLdSt) &&
+          isIndexedLoadStoreLegal(*BasePtrLdSt))
+        return false;
+
+      // Now we're looking for the key G_PTR_ADD instruction, which contains
+      // the offset add that we want to fold.
+      if (auto *BasePtrUseDef = dyn_cast<GPtrAdd>(&BasePtrUse)) {
+        Register PtrAddDefReg = BasePtrUseDef->getReg(0);
+        for (auto &BaseUseUse : MRI.use_nodbg_instructions(PtrAddDefReg)) {
+          // If the use is in a different block, then we may produce worse code
+          // due to the extra register pressure.
+          if (BaseUseUse.getParent() != LdSt.getParent())
+            return false;
+
+          if (auto *UseUseLdSt = dyn_cast<GLoadStore>(&BaseUseUse))
+            if (canFoldInAddressingMode(UseUseLdSt, TLI, MRI))
+              return false;
+        }
+        if (!dominates(LdSt, BasePtrUse))
+          return false; // All use must be dominated by the load/store.
+      }
+    }

     Addr = PtrAdd->getReg(0);
+    Base = PtrAdd->getBaseReg();
     return true;
   }

@@ -1001,6 +1123,9 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr,
       !TLI.isIndexingLegal(LdSt, Base, Offset, /*IsPre*/ true, MRI))
     return false;

+  if (!isIndexedLoadStoreLegal(LdSt))
+    return false;
+
   MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
   if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
     return false;

@@ -1027,16 +1152,14 @@ bool CombinerHelper::matchCombineIndexedLoadStore(
     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
   auto &LdSt = cast<GLoadStore>(MI);

-  // For now, no targets actually support these opcodes so don't waste time
-  // running these unless we're forced to for testing.
-  if (!ForceLegalIndexing)
+  if (LdSt.isAtomic())
     return false;

   MatchInfo.IsPre = findPreIndexCandidate(LdSt, MatchInfo.Addr, MatchInfo.Base,
                                           MatchInfo.Offset);
   if (!MatchInfo.IsPre &&
       !findPostIndexCandidate(LdSt, MatchInfo.Addr, MatchInfo.Base,
-                              MatchInfo.Offset))
+                              MatchInfo.Offset, MatchInfo.RematOffset))
     return false;

   return true;

@@ -1045,28 +1168,21 @@ bool CombinerHelper::matchCombineIndexedLoadStore(
 void CombinerHelper::applyCombineIndexedLoadStore(
     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
   MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
-  MachineIRBuilder MIRBuilder(MI);
+  Builder.setInstrAndDebugLoc(MI);
   unsigned Opcode = MI.getOpcode();
   bool IsStore = Opcode == TargetOpcode::G_STORE;
-  unsigned NewOpcode;
-  switch (Opcode) {
-  case TargetOpcode::G_LOAD:
-    NewOpcode = TargetOpcode::G_INDEXED_LOAD;
-    break;
-  case TargetOpcode::G_SEXTLOAD:
-    NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
-    break;
-  case TargetOpcode::G_ZEXTLOAD:
-    NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
-    break;
-  case TargetOpcode::G_STORE:
-    NewOpcode = TargetOpcode::G_INDEXED_STORE;
-    break;
-  default:
-    llvm_unreachable("Unknown load/store opcode");
+  unsigned NewOpcode = getIndexedOpc(Opcode);
+
+  // If the offset constant didn't happen to dominate the load/store, we can
+  // just clone it as needed.
+  if (MatchInfo.RematOffset) {
+    auto *OldCst = MRI.getVRegDef(MatchInfo.Offset);
+    auto NewCst = Builder.buildConstant(MRI.getType(MatchInfo.Offset),
+                                        *OldCst->getOperand(1).getCImm());
+    MatchInfo.Offset = NewCst.getReg(0);
   }

-  auto MIB = MIRBuilder.buildInstr(NewOpcode);
+  auto MIB = Builder.buildInstr(NewOpcode);
   if (IsStore) {
     MIB.addDef(MatchInfo.Addr);
     MIB.addUse(MI.getOperand(0).getReg());

@@ -1245,13 +1361,7 @@ void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI,
   Observer.changedInstr(*BrCond);
 }

-static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
-  if (Ty.isVector())
-    return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
-                                Ty.getNumElements());
-  return IntegerType::get(C, Ty.getSizeInBits());
-}
-
+
 bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) {
   MachineIRBuilder HelperBuilder(MI);
   GISelObserverWrapper DummyObserver;
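One of the ported DAGCombiner heuristics above (canFoldInAddressingMode) declines the transform when the base pointer's other memory users can already fold their offsets into a normal [reg + imm] addressing mode. A rough source-level illustration of that case; simplified, and not taken from the commit or its tests:

// Both accesses address off the same base: [p] and [p, #32] both fit the
// immediate addressing mode, so post-indexing the store would not remove an
// add and would only create an extra written-back register to keep live.
long touch(long *p, long v) {
  p[0] = v;     // G_STORE %v, %p
  return p[4];  // G_LOAD of (G_PTR_ADD %p, 32) -- foldable [reg + imm]
}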

llvm/lib/Target/AArch64/AArch64Combine.td

Lines changed: 1 addition & 0 deletions
@@ -246,6 +246,7 @@ def AArch64PostLegalizerLowering
 def AArch64PostLegalizerCombiner
     : GICombiner<"AArch64PostLegalizerCombinerImpl",
                  [copy_prop, combines_for_extload,
+                  combine_indexed_load_store,
                   sext_trunc_sextload, mutate_anyext_to_zext,
                   hoist_logic_op_with_same_opcode_hands,
                   redundant_and, xor_of_and_with_same_reg,

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 19 additions & 0 deletions
@@ -37,6 +37,8 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"

@@ -23694,6 +23696,23 @@ bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   return CI->isTailCall();
 }

+bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
+                                            Register Offset, bool IsPre,
+                                            MachineRegisterInfo &MRI) const {
+  // HACK
+  if (IsPre)
+    return false; // Until we implement.
+
+  auto CstOffset = getIConstantVRegVal(Offset, MRI);
+  if (!CstOffset || CstOffset->isZero())
+    return false;
+
+  // All of the indexed addressing mode instructions take a signed 9 bit
+  // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
+  // encodes the sign/indexing direction.
+  return isInt<9>(CstOffset->getSExtValue());
+}
+
 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
                                                    SDValue &Base,
                                                    SDValue &Offset,
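A quick standalone check of the range test above (a sketch, not part of the commit): isInt<9> accepts exactly the signed 9-bit offsets the post-indexed forms can encode, i.e. -256 through 255.

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::isInt<9>(-256));  // smallest encodable offset
  assert(llvm::isInt<9>(255));   // largest encodable offset
  assert(!llvm::isInt<9>(256));  // needs 10 bits as a signed value
  assert(!llvm::isInt<9>(-257));
  return 0;
}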

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
@@ -1201,6 +1201,8 @@ class AArch64TargetLowering : public TargetLowering {
   bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
                                   SDValue &Offset, ISD::MemIndexedMode &AM,
                                   SelectionDAG &DAG) const override;
+  bool isIndexingLegal(MachineInstr &MI, Register Base, Register Offset,
+                       bool IsPre, MachineRegisterInfo &MRI) const override;

   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
