Skip to content

[SystemZ] Enable MachineCombiner for FP reassociation #83546

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,11 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Try to expand a boolean SELECT_CCMASK using an IPM sequence.
SDValue expandSelectBoolean(SDNode *Node);

// Return true if the flags of N and the subtarget allows for
// reassociation, in which case a reg/reg opcode is needed as input to the
// MachineCombiner.
bool shouldSelectForReassoc(SDNode *N) const;

public:
static char ID;

Expand Down Expand Up @@ -2044,6 +2049,15 @@ SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) {
return Result;
}

bool SystemZDAGToDAGISel::shouldSelectForReassoc(SDNode *N) const {
EVT VT = N->getValueType(0);
assert(VT.isFloatingPoint() && "Expected FP SDNode");
return N->getFlags().hasAllowReassociation() &&
N->getFlags().hasNoSignedZeros() && Subtarget->hasVector() &&
(VT != MVT::f32 || Subtarget->hasVectorEnhancements1()) &&
!N->isStrictFPOpcode();
}

void SystemZDAGToDAGISel::PreprocessISelDAG() {
// If we have conditional immediate loads, we always prefer
// using those over an IPM sequence.
Expand Down
18 changes: 12 additions & 6 deletions llvm/lib/Target/SystemZ/SystemZInstrFP.td
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
}
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, z_any_fadd_noreassoc, FP32,
z_load, 4>;
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, z_any_fadd_noreassoc, FP64,
z_load, 8>;
}

// Subtraction.
Expand All @@ -441,8 +443,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;

defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_load, 4>;
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_load, 8>;
defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, z_any_fsub_noreassoc, FP32,
z_load, 4>;
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, z_any_fsub_noreassoc, FP64,
z_load, 8>;
}

// Multiplication.
Expand All @@ -452,8 +456,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
}
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_load, 8>;
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, z_any_fmul_noreassoc, FP32,
z_load, 4>;
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, z_any_fmul_noreassoc, FP64,
z_load, 8>;
}

// f64 multiplication of two FP32 registers.
Expand Down
165 changes: 164 additions & 1 deletion llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,32 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
.addImm(CCValid).addImm(CCMask);
}

MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
// Check whether we can move the DefMI load, and that it only has one use.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
if (!DefMI->isSafeToMove(nullptr, SawStore) ||
!MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
return nullptr;

int UseOpIdx =
MI.findRegisterUseOperandIdx(FoldAsLoadDefReg, /*TRI=*/nullptr);
assert(UseOpIdx != -1 && "Expected FoldAsLoadDefReg to be used by MI.");

// Check whether we can fold the load.
if (MachineInstr *FoldMI =
foldMemoryOperand(MI, {((unsigned)UseOpIdx)}, *DefMI)) {
FoldAsLoadDefReg = 0;
return FoldMI;
}

return nullptr;
}

bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Register Reg,
MachineRegisterInfo *MRI) const {
Expand Down Expand Up @@ -1004,6 +1030,67 @@ SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
return nullptr;
}

bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const {
unsigned Opc = Inst.getOpcode();
if (Invert) {
auto InverseOpcode = getInverseOpcode(Opc);
if (!InverseOpcode)
return false;
Opc = *InverseOpcode;
}

switch (Opc) {
default:
break;
// Adds and multiplications.
case SystemZ::WFADB:
case SystemZ::WFASB:
case SystemZ::WFAXB:
case SystemZ::VFADB:
case SystemZ::VFASB:
case SystemZ::WFMDB:
case SystemZ::WFMSB:
case SystemZ::WFMXB:
case SystemZ::VFMDB:
case SystemZ::VFMSB:
return (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
Inst.getFlag(MachineInstr::MIFlag::FmNsz));
}

return false;
}

std::optional<unsigned>
SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
// fadd => fsub
switch (Opcode) {
case SystemZ::WFADB:
return SystemZ::WFSDB;
case SystemZ::WFASB:
return SystemZ::WFSSB;
case SystemZ::WFAXB:
return SystemZ::WFSXB;
case SystemZ::VFADB:
return SystemZ::VFSDB;
case SystemZ::VFASB:
return SystemZ::VFSSB;
// fsub => fadd
case SystemZ::WFSDB:
return SystemZ::WFADB;
case SystemZ::WFSSB:
return SystemZ::WFASB;
case SystemZ::WFSXB:
return SystemZ::WFAXB;
case SystemZ::VFSDB:
return SystemZ::VFADB;
case SystemZ::VFSSB:
return SystemZ::VFASB;
default:
return std::nullopt;
}
}

MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
Expand Down Expand Up @@ -1338,7 +1425,83 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
return nullptr;
MachineRegisterInfo *MRI = &MF.getRegInfo();
MachineBasicBlock *MBB = MI.getParent();

// For reassociable FP operations, any loads have been purposefully left
// unfolded so that MachineCombiner can do its work on reg/reg
// opcodes. After that, as many loads as possible are now folded.
// TODO: This may be beneficial with other opcodes as well as machine-sink
// can move loads close to their user in a different MBB, which the isel
// matcher did not see.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this could be more generic, using the instruction maps just like the other foldMemoryOperandImpl. But that can be done later.

unsigned LoadOpc = 0;
unsigned RegMemOpcode = 0;
const TargetRegisterClass *FPRC = nullptr;
RegMemOpcode = MI.getOpcode() == SystemZ::WFADB ? SystemZ::ADB
: MI.getOpcode() == SystemZ::WFSDB ? SystemZ::SDB
: MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
: 0;
if (RegMemOpcode) {
LoadOpc = SystemZ::VL64;
FPRC = &SystemZ::FP64BitRegClass;
} else {
RegMemOpcode = MI.getOpcode() == SystemZ::WFASB ? SystemZ::AEB
: MI.getOpcode() == SystemZ::WFSSB ? SystemZ::SEB
: MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
: 0;
if (RegMemOpcode) {
LoadOpc = SystemZ::VL32;
FPRC = &SystemZ::FP32BitRegClass;
}
}
if (!RegMemOpcode || LoadMI.getOpcode() != LoadOpc)
return nullptr;

// If RegMemOpcode clobbers CC, first make sure CC is not live at this point.
if (get(RegMemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)) {
assert(LoadMI.getParent() == MI.getParent() && "Assuming a local fold.");
assert(LoadMI != InsertPt && "Assuming InsertPt not to be first in MBB.");
for (MachineBasicBlock::iterator MII = std::prev(InsertPt);;
--MII) {
if (MII->definesRegister(SystemZ::CC, /*TRI=*/nullptr)) {
if (!MII->registerDefIsDead(SystemZ::CC, /*TRI=*/nullptr))
return nullptr;
break;
}
if (MII == MBB->begin()) {
if (MBB->isLiveIn(SystemZ::CC))
return nullptr;
break;
}
}
}

Register FoldAsLoadDefReg = LoadMI.getOperand(0).getReg();
if (Ops.size() != 1 || FoldAsLoadDefReg != MI.getOperand(Ops[0]).getReg())
return nullptr;
Register DstReg = MI.getOperand(0).getReg();
MachineOperand LHS = MI.getOperand(1);
MachineOperand RHS = MI.getOperand(2);
MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
FoldAsLoadDefReg != RHS.getReg())
return nullptr;

MachineOperand &Base = LoadMI.getOperand(1);
MachineOperand &Disp = LoadMI.getOperand(2);
MachineOperand &Indx = LoadMI.getOperand(3);
MachineInstrBuilder MIB =
BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
.add(RegMO)
.add(Base)
.add(Disp)
.add(Indx);
MIB->addRegisterDead(SystemZ::CC, &RI);
MRI->setRegClass(DstReg, FPRC);
MRI->setRegClass(RegMO.getReg(), FPRC);
transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);

return MIB;
}

bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,13 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond, Register TrueReg,
Register FalseReg) const override;
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;

bool isPredicable(const MachineInstr &MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
Expand Down Expand Up @@ -285,6 +290,12 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
Register VReg) const override;
MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
LiveIntervals *LIS) const override;

bool useMachineCombiner() const override { return true; }
bool isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const override;
std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;

MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/SystemZ/SystemZInstrVector.td
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ let Predicates = [FeatureVector] in {
// LEY and LDY offer full 20-bit displacement fields. It's often better
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
let mayLoad = 1 in {
let mayLoad = 1, canFoldAsLoad = 1 in {
def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
}
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZOperators.td
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,18 @@ def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
// Floating-point negative absolute.
def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;

// Floating-point operations which will not participate in reassociation, and
// therefore are candidates for reg/mem folding during isel.
def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fadd node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;
def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fsub node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;
def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fmul node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;

// Strict floating-point fragments.
def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(z_strict_fcmp node:$lhs, node:$rhs),
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@

using namespace llvm;

static cl::opt<bool> EnableMachineCombinerPass(
"systemz-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);

// NOLINTNEXTLINE(readability-identifier-naming)
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
// Register the target.
Expand Down Expand Up @@ -245,6 +250,10 @@ bool SystemZPassConfig::addInstSelector() {

bool SystemZPassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);

if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);

return true;
}

Expand Down
26 changes: 13 additions & 13 deletions llvm/test/CodeGen/SystemZ/anyregcc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -323,37 +323,37 @@ entry:
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 13
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 9: Register
; CHECK-NEXT: .byte 1
; Loc 9: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 10: Register
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .long 344
; Loc 10: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 11: Register
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .long 352
; Loc 11: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 12: Register
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .long 360
; Loc 12: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 368
define i64 @anyreg_test2(ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5, ptr %a6, ptr %a7, ptr %a8, ptr %a9, ptr %a10, ptr %a11, ptr %a12) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to ptr
Expand Down
Loading
Loading