Skip to content

Commit 7f10cac

Browse files
arsenmrampitec
authored and committed
AMDGPU: Match and Select BITOP3 on gfx950 (llvm#117843)
Co-authored-by: Stanislav Mekhanoshin <[email protected]>
1 parent e2c4013 commit 7f10cac

File tree

6 files changed

+757
-0
lines changed

6 files changed

+757
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3549,6 +3549,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
35493549
return true;
35503550
}
35513551

3552+
// Match BITOP3 operation and return a number of matched instructions plus
3553+
// truth table.
3554+
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3555+
SmallVectorImpl<SDValue> &Src) {
3556+
unsigned NumOpcodes = 0;
3557+
uint8_t LHSBits, RHSBits;
3558+
3559+
auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3560+
// Define truth table given Src0, Src1, Src2 bits permutations:
3561+
// 0 0 0
3562+
// 0 0 1
3563+
// 0 1 0
3564+
// 0 1 1
3565+
// 1 0 0
3566+
// 1 0 1
3567+
// 1 1 0
3568+
// 1 1 1
3569+
const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3570+
3571+
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3572+
if (C->isAllOnes()) {
3573+
Bits = 0xff;
3574+
return true;
3575+
}
3576+
if (C->isZero()) {
3577+
Bits = 0;
3578+
return true;
3579+
}
3580+
}
3581+
3582+
for (unsigned I = 0; I < Src.size(); ++I) {
3583+
// Try to find existing reused operand
3584+
if (Src[I] == Op) {
3585+
Bits = SrcBits[I];
3586+
return true;
3587+
}
3588+
// Try to replace parent operator
3589+
if (Src[I] == In) {
3590+
Bits = SrcBits[I];
3591+
Src[I] = Op;
3592+
return true;
3593+
}
3594+
}
3595+
3596+
if (Src.size() == 3) {
3597+
// No room left for operands. Try one last time, there can be a 'not' of
3598+
// one of our source operands. In this case we can compute the bits
3599+
// without growing Src vector.
3600+
if (Op.getOpcode() == ISD::XOR) {
3601+
if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3602+
if (C->isAllOnes()) {
3603+
SDValue LHS = Op.getOperand(0);
3604+
for (unsigned I = 0; I < Src.size(); ++I) {
3605+
if (Src[I] == LHS) {
3606+
Bits = ~SrcBits[I];
3607+
return true;
3608+
}
3609+
}
3610+
}
3611+
}
3612+
}
3613+
3614+
return false;
3615+
}
3616+
3617+
Bits = SrcBits[Src.size()];
3618+
Src.push_back(Op);
3619+
return true;
3620+
};
3621+
3622+
switch (In.getOpcode()) {
3623+
case ISD::AND:
3624+
case ISD::OR:
3625+
case ISD::XOR: {
3626+
SDValue LHS = In.getOperand(0);
3627+
SDValue RHS = In.getOperand(1);
3628+
3629+
SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3630+
if (!getOperandBits(LHS, LHSBits) ||
3631+
!getOperandBits(RHS, RHSBits)) {
3632+
Src = Backup;
3633+
return std::make_pair(0, 0);
3634+
}
3635+
3636+
// Recursion is naturally limited by the size of the operand vector.
3637+
auto Op = BitOp3_Op(LHS, Src);
3638+
if (Op.first) {
3639+
NumOpcodes += Op.first;
3640+
LHSBits = Op.second;
3641+
}
3642+
3643+
Op = BitOp3_Op(RHS, Src);
3644+
if (Op.first) {
3645+
NumOpcodes += Op.first;
3646+
RHSBits = Op.second;
3647+
}
3648+
break;
3649+
}
3650+
default:
3651+
return std::make_pair(0, 0);
3652+
}
3653+
3654+
uint8_t TTbl;
3655+
switch (In.getOpcode()) {
3656+
case ISD::AND:
3657+
TTbl = LHSBits & RHSBits;
3658+
break;
3659+
case ISD::OR:
3660+
TTbl = LHSBits | RHSBits;
3661+
break;
3662+
case ISD::XOR:
3663+
TTbl = LHSBits ^ RHSBits;
3664+
break;
3665+
default:
3666+
break;
3667+
}
3668+
3669+
return std::make_pair(NumOpcodes + 1, TTbl);
3670+
}
3671+
3672+
// Attempt to select In as a single BITOP3 operation.
//
// On success returns true and sets Src0, Src1 and Src2 to the three source
// values and Tbl to an i32 target constant holding the 8-bit truth table.
// Returns false, leaving the outputs untouched, when In does not match or
// when using BITOP3 is not considered worthwhile.
bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
                                      SDValue &Src2, SDValue &Tbl) const {
  SmallVector<SDValue, 3> Operands;
  const std::pair<unsigned, uint8_t> Match = BitOp3_Op(In, Operands);
  const unsigned NumOpcodes = Match.first;
  const uint8_t TTbl = Match.second;

  // A single operation is not worth folding. An empty operand list can only
  // arise when every operand is all zero or all ones, which normally shall be
  // optimized out before reaching this point.
  if (NumOpcodes < 2 || Operands.empty())
    return false;

  // For a uniform value the threshold is higher, to account for the moves
  // between VGPRs and SGPRs: one operand has to be in a VGPR (the other two
  // can stay in SGPRs) and a readfirstlane is needed afterwards.
  if (NumOpcodes < 4 && !In->isDivergent())
    return false;

  if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
    // Do not use BITOP3 for OR3, XOR3 and AND_OR. It would not be faster, and
    // leaving them alone keeps the asm more readable. This cannot be modeled
    // with AddedComplexity because the selector does not know how many
    // operations were matched.
    const unsigned RootOpc = In.getOpcode();
    const unsigned LHSOpc = In.getOperand(0).getOpcode();
    const unsigned RHSOpc = In.getOperand(1).getOpcode();

    const bool RootIsOr = RootOpc == ISD::OR;
    const bool RootIsXor = RootOpc == ISD::XOR;

    if ((RootIsXor || RootIsOr) && (LHSOpc == RootOpc || RHSOpc == RootOpc))
      return false;

    if (RootIsOr && (LHSOpc == ISD::AND || RHSOpc == ISD::AND))
      return false;
  }

  // The trailing operand(s) may be irrelevant, turning a ternary operation
  // into a binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b),
  // where 'c' can be replaced with 'a' without changing the answer. In some
  // pathological cases even a single-operand operation is possible if the
  // optimizer did not catch it. Fill any unused slot with the first operand.
  const SDValue Filler = Operands[0];
  for (unsigned Count = Operands.size(); Count < 3; ++Count)
    Operands.push_back(Filler);

  Src0 = Operands[0];
  Src1 = Operands[1];
  Src2 = Operands[2];

  Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
  return true;
}
3721+
35523722
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
35533723
if (In.isUndef())
35543724
return CurDAG->getUNDEF(MVT::i32);

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
247247
SDValue &SrcMods) const;
248248
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
249249

250+
// Attempts to match In as a BITOP3 operation. On success returns true and
// sets Src0, Src1 and Src2 to the three source values and Tbl to an i32
// target constant holding the 8-bit truth table; returns false otherwise.
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
251+
SDValue &Tbl) const;
252+
250253
SDValue getHi16Elt(SDValue In) const;
251254

252255
SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;

0 commit comments

Comments
 (0)