Skip to content

Commit 6c32a1f

Browse files
authored
[SystemZ] Enable MachineCombiner for FP reassociation (#83546)
Enable the MachineCombiner for FP add, sub and mul. For this to work, the default instruction selection of reg/mem opcodes is disabled for ISD nodes that carry the flags that allow reassociation. The reg/mem folding is instead done after MachineCombiner by PeepholeOptimizer. To support this, the patch also implements SystemZInstrInfo::optimizeLoadInstr() and the "LoadMI version" of foldMemoryOperandImpl().
1 parent 622ec1f commit 6c32a1f

17 files changed

+1074
-23
lines changed

llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,11 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
350350
// Try to expand a boolean SELECT_CCMASK using an IPM sequence.
351351
SDValue expandSelectBoolean(SDNode *Node);
352352

353+
// Return true if the flags of N and the subtarget allow for
354+
// reassociation, in which case a reg/reg opcode is needed as input to the
355+
// MachineCombiner.
356+
bool shouldSelectForReassoc(SDNode *N) const;
357+
353358
public:
354359
static char ID;
355360

@@ -2044,6 +2049,15 @@ SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) {
20442049
return Result;
20452050
}
20462051

2052+
bool SystemZDAGToDAGISel::shouldSelectForReassoc(SDNode *N) const {
2053+
EVT VT = N->getValueType(0);
2054+
assert(VT.isFloatingPoint() && "Expected FP SDNode");
2055+
return N->getFlags().hasAllowReassociation() &&
2056+
N->getFlags().hasNoSignedZeros() && Subtarget->hasVector() &&
2057+
(VT != MVT::f32 || Subtarget->hasVectorEnhancements1()) &&
2058+
!N->isStrictFPOpcode();
2059+
}
2060+
20472061
void SystemZDAGToDAGISel::PreprocessISelDAG() {
20482062
// If we have conditional immediate loads, we always prefer
20492063
// using those over an IPM sequence.

llvm/lib/Target/SystemZ/SystemZInstrFP.td

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
430430
def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
431431
def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
432432
}
433-
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
434-
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
433+
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, z_any_fadd_noreassoc, FP32,
434+
z_load, 4>;
435+
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, z_any_fadd_noreassoc, FP64,
436+
z_load, 8>;
435437
}
436438

437439
// Subtraction.
@@ -441,8 +443,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
441443
def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
442444
def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
443445

444-
defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_load, 4>;
445-
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_load, 8>;
446+
defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, z_any_fsub_noreassoc, FP32,
447+
z_load, 4>;
448+
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, z_any_fsub_noreassoc, FP64,
449+
z_load, 8>;
446450
}
447451

448452
// Multiplication.
@@ -452,8 +456,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
452456
def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
453457
def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
454458
}
455-
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
456-
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_load, 8>;
459+
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, z_any_fmul_noreassoc, FP32,
460+
z_load, 4>;
461+
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, z_any_fmul_noreassoc, FP64,
462+
z_load, 8>;
457463
}
458464

459465
// f64 multiplication of two FP32 registers.

llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,32 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
610610
.addImm(CCValid).addImm(CCMask);
611611
}
612612

613+
MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
614+
const MachineRegisterInfo *MRI,
615+
Register &FoldAsLoadDefReg,
616+
MachineInstr *&DefMI) const {
617+
// Check whether we can move the DefMI load, and that it only has one use.
618+
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
619+
assert(DefMI);
620+
bool SawStore = false;
621+
if (!DefMI->isSafeToMove(nullptr, SawStore) ||
622+
!MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
623+
return nullptr;
624+
625+
int UseOpIdx =
626+
MI.findRegisterUseOperandIdx(FoldAsLoadDefReg, /*TRI=*/nullptr);
627+
assert(UseOpIdx != -1 && "Expected FoldAsLoadDefReg to be used by MI.");
628+
629+
// Check whether we can fold the load.
630+
if (MachineInstr *FoldMI =
631+
foldMemoryOperand(MI, {((unsigned)UseOpIdx)}, *DefMI)) {
632+
FoldAsLoadDefReg = 0;
633+
return FoldMI;
634+
}
635+
636+
return nullptr;
637+
}
638+
613639
bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
614640
Register Reg,
615641
MachineRegisterInfo *MRI) const {
@@ -1004,6 +1030,67 @@ SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
10041030
return nullptr;
10051031
}
10061032

1033+
bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
1034+
bool Invert) const {
1035+
unsigned Opc = Inst.getOpcode();
1036+
if (Invert) {
1037+
auto InverseOpcode = getInverseOpcode(Opc);
1038+
if (!InverseOpcode)
1039+
return false;
1040+
Opc = *InverseOpcode;
1041+
}
1042+
1043+
switch (Opc) {
1044+
default:
1045+
break;
1046+
// Adds and multiplications.
1047+
case SystemZ::WFADB:
1048+
case SystemZ::WFASB:
1049+
case SystemZ::WFAXB:
1050+
case SystemZ::VFADB:
1051+
case SystemZ::VFASB:
1052+
case SystemZ::WFMDB:
1053+
case SystemZ::WFMSB:
1054+
case SystemZ::WFMXB:
1055+
case SystemZ::VFMDB:
1056+
case SystemZ::VFMSB:
1057+
return (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
1058+
Inst.getFlag(MachineInstr::MIFlag::FmNsz));
1059+
}
1060+
1061+
return false;
1062+
}
1063+
1064+
std::optional<unsigned>
1065+
SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
1066+
// fadd => fsub
1067+
switch (Opcode) {
1068+
case SystemZ::WFADB:
1069+
return SystemZ::WFSDB;
1070+
case SystemZ::WFASB:
1071+
return SystemZ::WFSSB;
1072+
case SystemZ::WFAXB:
1073+
return SystemZ::WFSXB;
1074+
case SystemZ::VFADB:
1075+
return SystemZ::VFSDB;
1076+
case SystemZ::VFASB:
1077+
return SystemZ::VFSSB;
1078+
// fsub => fadd
1079+
case SystemZ::WFSDB:
1080+
return SystemZ::WFADB;
1081+
case SystemZ::WFSSB:
1082+
return SystemZ::WFASB;
1083+
case SystemZ::WFSXB:
1084+
return SystemZ::WFAXB;
1085+
case SystemZ::VFSDB:
1086+
return SystemZ::VFADB;
1087+
case SystemZ::VFSSB:
1088+
return SystemZ::VFASB;
1089+
default:
1090+
return std::nullopt;
1091+
}
1092+
}
1093+
10071094
MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
10081095
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10091096
MachineBasicBlock::iterator InsertPt, int FrameIndex,
@@ -1338,7 +1425,83 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
13381425
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
13391426
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
13401427
LiveIntervals *LIS) const {
1341-
return nullptr;
1428+
MachineRegisterInfo *MRI = &MF.getRegInfo();
1429+
MachineBasicBlock *MBB = MI.getParent();
1430+
1431+
// For reassociable FP operations, any loads have been purposefully left
1432+
// unfolded so that MachineCombiner can do its work on reg/reg
1433+
// opcodes. After that, as many loads as possible are now folded.
1434+
// TODO: This may be beneficial with other opcodes as well, since machine-sink
1435+
// can move loads close to their user in a different MBB, which the isel
1436+
// matcher did not see.
1437+
unsigned LoadOpc = 0;
1438+
unsigned RegMemOpcode = 0;
1439+
const TargetRegisterClass *FPRC = nullptr;
1440+
RegMemOpcode = MI.getOpcode() == SystemZ::WFADB ? SystemZ::ADB
1441+
: MI.getOpcode() == SystemZ::WFSDB ? SystemZ::SDB
1442+
: MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
1443+
: 0;
1444+
if (RegMemOpcode) {
1445+
LoadOpc = SystemZ::VL64;
1446+
FPRC = &SystemZ::FP64BitRegClass;
1447+
} else {
1448+
RegMemOpcode = MI.getOpcode() == SystemZ::WFASB ? SystemZ::AEB
1449+
: MI.getOpcode() == SystemZ::WFSSB ? SystemZ::SEB
1450+
: MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
1451+
: 0;
1452+
if (RegMemOpcode) {
1453+
LoadOpc = SystemZ::VL32;
1454+
FPRC = &SystemZ::FP32BitRegClass;
1455+
}
1456+
}
1457+
if (!RegMemOpcode || LoadMI.getOpcode() != LoadOpc)
1458+
return nullptr;
1459+
1460+
// If RegMemOpcode clobbers CC, first make sure CC is not live at this point.
1461+
if (get(RegMemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)) {
1462+
assert(LoadMI.getParent() == MI.getParent() && "Assuming a local fold.");
1463+
assert(LoadMI != InsertPt && "Assuming InsertPt not to be first in MBB.");
1464+
for (MachineBasicBlock::iterator MII = std::prev(InsertPt);;
1465+
--MII) {
1466+
if (MII->definesRegister(SystemZ::CC, /*TRI=*/nullptr)) {
1467+
if (!MII->registerDefIsDead(SystemZ::CC, /*TRI=*/nullptr))
1468+
return nullptr;
1469+
break;
1470+
}
1471+
if (MII == MBB->begin()) {
1472+
if (MBB->isLiveIn(SystemZ::CC))
1473+
return nullptr;
1474+
break;
1475+
}
1476+
}
1477+
}
1478+
1479+
Register FoldAsLoadDefReg = LoadMI.getOperand(0).getReg();
1480+
if (Ops.size() != 1 || FoldAsLoadDefReg != MI.getOperand(Ops[0]).getReg())
1481+
return nullptr;
1482+
Register DstReg = MI.getOperand(0).getReg();
1483+
MachineOperand LHS = MI.getOperand(1);
1484+
MachineOperand RHS = MI.getOperand(2);
1485+
MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
1486+
if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
1487+
FoldAsLoadDefReg != RHS.getReg())
1488+
return nullptr;
1489+
1490+
MachineOperand &Base = LoadMI.getOperand(1);
1491+
MachineOperand &Disp = LoadMI.getOperand(2);
1492+
MachineOperand &Indx = LoadMI.getOperand(3);
1493+
MachineInstrBuilder MIB =
1494+
BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
1495+
.add(RegMO)
1496+
.add(Base)
1497+
.add(Disp)
1498+
.add(Indx);
1499+
MIB->addRegisterDead(SystemZ::CC, &RI);
1500+
MRI->setRegClass(DstReg, FPRC);
1501+
MRI->setRegClass(RegMO.getReg(), FPRC);
1502+
transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
1503+
1504+
return MIB;
13421505
}
13431506

13441507
bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {

llvm/lib/Target/SystemZ/SystemZInstrInfo.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,13 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
254254
const DebugLoc &DL, Register DstReg,
255255
ArrayRef<MachineOperand> Cond, Register TrueReg,
256256
Register FalseReg) const override;
257+
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
258+
const MachineRegisterInfo *MRI,
259+
Register &FoldAsLoadDefReg,
260+
MachineInstr *&DefMI) const override;
257261
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
258262
MachineRegisterInfo *MRI) const override;
263+
259264
bool isPredicable(const MachineInstr &MI) const override;
260265
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
261266
unsigned ExtraPredCycles,
@@ -285,6 +290,12 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
285290
Register VReg) const override;
286291
MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
287292
LiveIntervals *LIS) const override;
293+
294+
bool useMachineCombiner() const override { return true; }
295+
bool isAssociativeAndCommutative(const MachineInstr &Inst,
296+
bool Invert) const override;
297+
std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
298+
288299
MachineInstr *
289300
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
290301
ArrayRef<unsigned> Ops,

llvm/lib/Target/SystemZ/SystemZInstrVector.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ let Predicates = [FeatureVector] in {
139139
// LEY and LDY offer full 20-bit displacement fields. It's often better
140140
// to use those instructions rather than force a 20-bit displacement
141141
// into a GPR temporary.
142-
let mayLoad = 1 in {
142+
let mayLoad = 1, canFoldAsLoad = 1 in {
143143
def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
144144
def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
145145
}

llvm/lib/Target/SystemZ/SystemZOperators.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,18 @@ def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
829829
// Floating-point negative absolute.
830830
def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
831831

832+
// Floating-point operations which will not participate in reassociation, and
833+
// therefore are candidates for reg/mem folding during isel.
834+
def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
835+
(any_fadd node:$src1, node:$src2),
836+
[{ return !shouldSelectForReassoc(N); }]>;
837+
def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
838+
(any_fsub node:$src1, node:$src2),
839+
[{ return !shouldSelectForReassoc(N); }]>;
840+
def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
841+
(any_fmul node:$src1, node:$src2),
842+
[{ return !shouldSelectForReassoc(N); }]>;
843+
832844
// Strict floating-point fragments.
833845
def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
834846
[(z_strict_fcmp node:$lhs, node:$rhs),

llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@
3030

3131
using namespace llvm;
3232

33+
static cl::opt<bool> EnableMachineCombinerPass(
34+
"systemz-machine-combiner",
35+
cl::desc("Enable the machine combiner pass"),
36+
cl::init(true), cl::Hidden);
37+
3338
// NOLINTNEXTLINE(readability-identifier-naming)
3439
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
3540
// Register the target.
@@ -245,6 +250,10 @@ bool SystemZPassConfig::addInstSelector() {
245250

246251
bool SystemZPassConfig::addILPOpts() {
247252
addPass(&EarlyIfConverterID);
253+
254+
if (EnableMachineCombinerPass)
255+
addPass(&MachineCombinerID);
256+
248257
return true;
249258
}
250259

llvm/test/CodeGen/SystemZ/anyregcc.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -323,37 +323,37 @@ entry:
323323
; CHECK-NEXT: .byte 1
324324
; CHECK-NEXT: .byte 0
325325
; CHECK-NEXT: .short 8
326-
; CHECK-NEXT: .short {{[0-9]+}}
326+
; CHECK-NEXT: .short 13
327327
; CHECK-NEXT: .short 0
328328
; CHECK-NEXT: .long 0
329-
; Loc 9: Register
330-
; CHECK-NEXT: .byte 1
329+
; Loc 9: IndirectMem
330+
; CHECK-NEXT: .byte 3
331331
; CHECK-NEXT: .byte 0
332332
; CHECK-NEXT: .short 8
333333
; CHECK-NEXT: .short {{[0-9]+}}
334334
; CHECK-NEXT: .short 0
335-
; CHECK-NEXT: .long 0
336-
; Loc 10: Register
337-
; CHECK-NEXT: .byte 1
335+
; CHECK-NEXT: .long 344
336+
; Loc 10: IndirectMem
337+
; CHECK-NEXT: .byte 3
338338
; CHECK-NEXT: .byte 0
339339
; CHECK-NEXT: .short 8
340340
; CHECK-NEXT: .short {{[0-9]+}}
341341
; CHECK-NEXT: .short 0
342-
; CHECK-NEXT: .long 0
343-
; Loc 11: Register
344-
; CHECK-NEXT: .byte 1
342+
; CHECK-NEXT: .long 352
343+
; Loc 11: IndirectMem
344+
; CHECK-NEXT: .byte 3
345345
; CHECK-NEXT: .byte 0
346346
; CHECK-NEXT: .short 8
347347
; CHECK-NEXT: .short {{[0-9]+}}
348348
; CHECK-NEXT: .short 0
349-
; CHECK-NEXT: .long 0
350-
; Loc 12: Register
351-
; CHECK-NEXT: .byte 1
349+
; CHECK-NEXT: .long 360
350+
; Loc 12: IndirectMem
351+
; CHECK-NEXT: .byte 3
352352
; CHECK-NEXT: .byte 0
353353
; CHECK-NEXT: .short 8
354354
; CHECK-NEXT: .short {{[0-9]+}}
355355
; CHECK-NEXT: .short 0
356-
; CHECK-NEXT: .long 0
356+
; CHECK-NEXT: .long 368
357357
define i64 @anyreg_test2(ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5, ptr %a6, ptr %a7, ptr %a8, ptr %a9, ptr %a10, ptr %a11, ptr %a12) nounwind ssp uwtable {
358358
entry:
359359
%f = inttoptr i64 12297829382473034410 to ptr

0 commit comments

Comments
 (0)