Skip to content

Commit ffac062

Browse files
rampitecarsenm
authored and committed
AMDGPU: Match and Select BITOP3 on gfx950
Co-authored-by: Stanislav Mekhanoshin <[email protected]>
1 parent ff296c1 commit ffac062

File tree

6 files changed

+757
-0
lines changed

6 files changed

+757
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
35523552
return true;
35533553
}
35543554

3555+
// Match BITOP3 operation and return a number of matched instructions plus
3556+
// truth table.
3557+
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3558+
SmallVectorImpl<SDValue> &Src) {
3559+
unsigned NumOpcodes = 0;
3560+
uint8_t LHSBits, RHSBits;
3561+
3562+
auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3563+
// Define truth table given Src0, Src1, Src2 bits permutations:
3564+
// 0 0 0
3565+
// 0 0 1
3566+
// 0 1 0
3567+
// 0 1 1
3568+
// 1 0 0
3569+
// 1 0 1
3570+
// 1 1 0
3571+
// 1 1 1
3572+
const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3573+
3574+
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3575+
if (C->isAllOnes()) {
3576+
Bits = 0xff;
3577+
return true;
3578+
}
3579+
if (C->isZero()) {
3580+
Bits = 0;
3581+
return true;
3582+
}
3583+
}
3584+
3585+
for (unsigned I = 0; I < Src.size(); ++I) {
3586+
// Try to find existing reused operand
3587+
if (Src[I] == Op) {
3588+
Bits = SrcBits[I];
3589+
return true;
3590+
}
3591+
// Try to replace parent operator
3592+
if (Src[I] == In) {
3593+
Bits = SrcBits[I];
3594+
Src[I] = Op;
3595+
return true;
3596+
}
3597+
}
3598+
3599+
if (Src.size() == 3) {
3600+
// No room left for operands. Try one last time, there can be a 'not' of
3601+
// one of our source operands. In this case we can compute the bits
3602+
// without growing Src vector.
3603+
if (Op.getOpcode() == ISD::XOR) {
3604+
if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3605+
if (C->isAllOnes()) {
3606+
SDValue LHS = Op.getOperand(0);
3607+
for (unsigned I = 0; I < Src.size(); ++I) {
3608+
if (Src[I] == LHS) {
3609+
Bits = ~SrcBits[I];
3610+
return true;
3611+
}
3612+
}
3613+
}
3614+
}
3615+
}
3616+
3617+
return false;
3618+
}
3619+
3620+
Bits = SrcBits[Src.size()];
3621+
Src.push_back(Op);
3622+
return true;
3623+
};
3624+
3625+
switch (In.getOpcode()) {
3626+
case ISD::AND:
3627+
case ISD::OR:
3628+
case ISD::XOR: {
3629+
SDValue LHS = In.getOperand(0);
3630+
SDValue RHS = In.getOperand(1);
3631+
3632+
SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3633+
if (!getOperandBits(LHS, LHSBits) ||
3634+
!getOperandBits(RHS, RHSBits)) {
3635+
Src = Backup;
3636+
return std::make_pair(0, 0);
3637+
}
3638+
3639+
// Recursion is naturally limited by the size of the operand vector.
3640+
auto Op = BitOp3_Op(LHS, Src);
3641+
if (Op.first) {
3642+
NumOpcodes += Op.first;
3643+
LHSBits = Op.second;
3644+
}
3645+
3646+
Op = BitOp3_Op(RHS, Src);
3647+
if (Op.first) {
3648+
NumOpcodes += Op.first;
3649+
RHSBits = Op.second;
3650+
}
3651+
break;
3652+
}
3653+
default:
3654+
return std::make_pair(0, 0);
3655+
}
3656+
3657+
uint8_t TTbl;
3658+
switch (In.getOpcode()) {
3659+
case ISD::AND:
3660+
TTbl = LHSBits & RHSBits;
3661+
break;
3662+
case ISD::OR:
3663+
TTbl = LHSBits | RHSBits;
3664+
break;
3665+
case ISD::XOR:
3666+
TTbl = LHSBits ^ RHSBits;
3667+
break;
3668+
default:
3669+
break;
3670+
}
3671+
3672+
return std::make_pair(NumOpcodes + 1, TTbl);
3673+
}
3674+
3675+
// Try to select the logic expression rooted at \p In as one BITOP3.
//
// On success returns true, sets \p Src0, \p Src1 and \p Src2 to the three
// source operands and \p Tbl to an i32 target constant holding the truth
// table. Returns false, leaving the outputs untouched, when BitOp3_Op matches
// too few instructions for the transformation to pay off.
bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
                                      SDValue &Src2, SDValue &Tbl) const {
  SmallVector<SDValue, 3> Operands;
  const std::pair<unsigned, uint8_t> Match = BitOp3_Op(In, Operands);
  const unsigned MatchedOps = Match.first;
  const uint8_t TruthTable = Match.second;

  // A single instruction is not worth replacing. An empty operand list can
  // happen if all operands are all zero or all ones; normally that shall be
  // optimized out before reaching this point.
  if (MatchedOps < 2 || Operands.empty())
    return false;

  // For a uniform case the threshold should be higher to account for moves
  // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
  // be in SGPRs, and a readfirstlane after.
  if (!In->isDivergent() && MatchedOps < 4)
    return false;

  if (MatchedOps == 2 && In.getValueType() == MVT::i32) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // asm more readable. This cannot be modeled with AddedComplexity because
    // the selector does not know how many operations we matched.
    const unsigned RootOpc = In.getOpcode();
    const auto HasOperandWithOpcode = [&In](unsigned Opc) {
      return In.getOperand(0).getOpcode() == Opc ||
             In.getOperand(1).getOpcode() == Opc;
    };

    // OR of OR, XOR of XOR.
    if ((RootOpc == ISD::XOR || RootOpc == ISD::OR) &&
        HasOperandWithOpcode(RootOpc))
      return false;

    // OR of AND.
    if (RootOpc == ISD::OR && HasOperandWithOpcode(ISD::AND))
      return false;
  }

  // Last operand can be ignored, turning a ternary operation into a binary.
  // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
  // 'c' with 'a' here without changing the answer. In some pathological
  // cases it should be possible to get an operation with a single operand
  // too if the optimizer would not catch it.
  while (Operands.size() < 3)
    Operands.push_back(Operands.front());

  Src0 = Operands[0];
  Src1 = Operands[1];
  Src2 = Operands[2];
  Tbl = CurDAG->getTargetConstant(TruthTable, SDLoc(In), MVT::i32);
  return true;
}
3724+
35553725
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
35563726
if (In.isUndef())
35573727
return CurDAG->getUNDEF(MVT::i32);

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
242242
SDValue &SrcMods) const;
243243
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
244244

245+
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
246+
SDValue &Tbl) const;
247+
245248
SDValue getHi16Elt(SDValue In) const;
246249

247250
SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;

0 commit comments

Comments
 (0)