AMDGPU: Match and Select BITOP3 on gfx950 #117843
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Co-authored-by: Stanislav Mekhanoshin <[email protected]>

Patch is 27.53 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/117843.diff

6 Files Affected:
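At the core of both the SelectionDAG and GlobalISel matchers in this patch is a compact truth-table encoding: each of up to three distinct source operands is assigned a fixed 8-bit pattern (0xf0, 0xcc, 0xaa) spelling out that operand's value across the eight possible (src0, src1, src2) bit combinations, and re-evaluating the matched and/or/xor tree on those patterns with ordinary bitwise operators yields the 8-bit immediate for v_bitop3. A minimal standalone sketch of the encoding (illustrative names, not the patch's actual API):

#include <cstdint>
#include <cstdio>

int main() {
  // Fixed per-operand patterns from the patch: bit i of each pattern is the
  // operand's value in row i of the (src0, src1, src2) truth table.
  const uint8_t A = 0xf0; // src0
  const uint8_t B = 0xcc; // src1
  const uint8_t C = 0xaa; // src2

  // Evaluating an expression tree on the patterns gives its truth table.
  // Example expression: (a & b) | ~c.
  uint8_t TTbl = (A & B) | static_cast<uint8_t>(~C);
  printf("bitop3 immediate for (a & b) | ~c: 0x%02x\n", TTbl); // prints 0xd5
  return 0;
}

Passing that immediate as the bitop3 operand makes v_bitop3_b32 compute the same Boolean function of its three inputs, bitwise.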
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7d78e9cd7eab6f..c0e01a020e0eb9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
return true;
}
+// Match a BITOP3 operation and return the number of matched instructions
+// plus the truth table.
+static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
+ SmallVectorImpl<SDValue> &Src) {
+ unsigned NumOpcodes = 0;
+ uint8_t LHSBits, RHSBits;
+
+ auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
+ // Define truth table given Src0, Src1, Src2 bits permutations:
+ // 0 0 0
+ // 0 0 1
+ // 0 1 0
+ // 0 1 1
+ // 1 0 0
+ // 1 0 1
+ // 1 1 0
+ // 1 1 1
+ const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->isAllOnes()) {
+ Bits = 0xff;
+ return true;
+ }
+ if (C->isZero()) {
+ Bits = 0;
+ return true;
+ }
+ }
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ // Try to find existing reused operand
+ if (Src[I] == Op) {
+ Bits = SrcBits[I];
+ return true;
+ }
+ // Try to replace parent operator
+ if (Src[I] == In) {
+ Bits = SrcBits[I];
+ Src[I] = Op;
+ return true;
+ }
+ }
+
+ if (Src.size() == 3) {
+ // No room left for operands. Try one last time: there may be a 'not' of
+ // one of our source operands. In that case we can compute the bits
+ // without growing the Src vector.
+ if (Op.getOpcode() == ISD::XOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (C->isAllOnes()) {
+ SDValue LHS = Op.getOperand(0);
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ if (Src[I] == LHS) {
+ Bits = ~SrcBits[I];
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+ }
+
+ Bits = SrcBits[Src.size()];
+ Src.push_back(Op);
+ return true;
+ };
+
+ switch (In.getOpcode()) {
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue LHS = In.getOperand(0);
+ SDValue RHS = In.getOperand(1);
+
+ SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
+ if (!getOperandBits(LHS, LHSBits) ||
+ !getOperandBits(RHS, RHSBits)) {
+ Src = Backup;
+ return std::make_pair(0, 0);
+ }
+
+ // Recursion is naturally limited by the size of the operand vector.
+ auto Op = BitOp3_Op(LHS, Src);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ LHSBits = Op.second;
+ }
+
+ Op = BitOp3_Op(RHS, Src);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ RHSBits = Op.second;
+ }
+ break;
+ }
+ default:
+ return std::make_pair(0, 0);
+ }
+
+ uint8_t TTbl;
+ switch (In.getOpcode()) {
+ case ISD::AND:
+ TTbl = LHSBits & RHSBits;
+ break;
+ case ISD::OR:
+ TTbl = LHSBits | RHSBits;
+ break;
+ case ISD::XOR:
+ TTbl = LHSBits ^ RHSBits;
+ break;
+ default:
+ break;
+ }
+
+ return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
+ SDValue &Src2, SDValue &Tbl) const {
+ SmallVector<SDValue, 3> Src;
+ uint8_t TTbl;
+ unsigned NumOpcodes;
+
+ std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
+
+ // The Src.empty() case can happen if all operands are constant zero or
+ // all-ones. Normally it should be optimized out before reaching this point.
+ if (NumOpcodes < 2 || Src.empty())
+ return false;
+
+ // For a uniform case the threshold should be higher to account for moves
+ // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two
+ // can be in SGPRs, and a readfirstlane after.
+ if (NumOpcodes < 4 && !In->isDivergent())
+ return false;
+
+ if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
+ // asm more readable. This cannot be modeled with AddedComplexity because
+ // the selector does not know how many operations we matched.
+ if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
+ (In.getOperand(0).getOpcode() == In.getOpcode() ||
+ In.getOperand(1).getOpcode() == In.getOpcode()))
+ return false;
+
+ if (In.getOpcode() == ISD::OR &&
+ (In.getOperand(0).getOpcode() == ISD::AND ||
+ In.getOperand(1).getOpcode() == ISD::AND))
+ return false;
+ }
+
+ // The last operand can be ignored, turning a ternary operation into a
+ // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+ // replace 'c' with 'a' here without changing the answer. In some
+ // pathological cases it is even possible to end up with a single-operand
+ // operation, if the optimizer does not catch it.
+ while (Src.size() < 3)
+ Src.push_back(Src[0]);
+
+ Src0 = Src[0];
+ Src1 = Src[1];
+ Src2 = Src[2];
+
+ Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
+ return true;
+}
+
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
if (In.isUndef())
return CurDAG->getUNDEF(MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5ae0b179d7d0e6..7e61eb470622f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -242,6 +242,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
+ SDValue &Tbl) const;
+
SDValue getHi16Elt(SDValue In) const;
SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7ce7562cdcaa95..71d23f9fe30c49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3643,6 +3643,206 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
return true;
}
+// Match a BITOP3 operation and return the number of matched instructions
+// plus the truth table.
+static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
+ SmallVectorImpl<Register> &Src,
+ const MachineRegisterInfo &MRI) {
+ unsigned NumOpcodes = 0;
+ uint8_t LHSBits, RHSBits;
+
+ auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
+ // Define truth table given Src0, Src1, Src2 bits permutations:
+ // 0 0 0
+ // 0 0 1
+ // 0 1 0
+ // 0 1 1
+ // 1 0 0
+ // 1 0 1
+ // 1 1 0
+ // 1 1 1
+ const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+
+ if (mi_match(Op, MRI, m_AllOnesInt())) {
+ Bits = 0xff;
+ return true;
+ }
+ if (mi_match(Op, MRI, m_ZeroInt())) {
+ Bits = 0;
+ return true;
+ }
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ // Try to find existing reused operand
+ if (Src[I] == Op) {
+ Bits = SrcBits[I];
+ return true;
+ }
+ // Try to replace parent operator
+ if (Src[I] == R) {
+ Bits = SrcBits[I];
+ Src[I] = Op;
+ return true;
+ }
+ }
+
+ if (Src.size() == 3) {
+ // No room left for operands. Try one last time: there may be a 'not' of
+ // one of our source operands. In that case we can compute the bits
+ // without growing the Src vector.
+ Register LHS;
+ if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
+ LHS = getSrcRegIgnoringCopies(LHS, MRI);
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ if (Src[I] == LHS) {
+ Bits = ~SrcBits[I];
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ Bits = SrcBits[Src.size()];
+ Src.push_back(Op);
+ return true;
+ };
+
+ MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
+ Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
+
+ SmallVector<Register, 3> Backup(Src.begin(), Src.end());
+ if (!getOperandBits(LHS, LHSBits) ||
+ !getOperandBits(RHS, RHSBits)) {
+ Src = Backup;
+ return std::make_pair(0, 0);
+ }
+
+ // Recursion is naturally limited by the size of the operand vector.
+ auto Op = BitOp3_Op(LHS, Src, MRI);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ LHSBits = Op.second;
+ }
+
+ Op = BitOp3_Op(RHS, Src, MRI);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ RHSBits = Op.second;
+ }
+ break;
+ }
+ default:
+ return std::make_pair(0, 0);
+ }
+
+ uint8_t TTbl;
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_AND:
+ TTbl = LHSBits & RHSBits;
+ break;
+ case TargetOpcode::G_OR:
+ TTbl = LHSBits | RHSBits;
+ break;
+ case TargetOpcode::G_XOR:
+ TTbl = LHSBits ^ RHSBits;
+ break;
+ default:
+ break;
+ }
+
+ return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
+ if (!Subtarget->hasBitOp3Insts())
+ return false;
+
+ SmallVector<Register, 3> Src;
+ uint8_t TTbl;
+ unsigned NumOpcodes;
+ Register DstReg = MI.getOperand(0).getReg();
+
+ std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
+
+ // The Src.empty() case can happen if all operands are constant zero or
+ // all-ones. Normally it should be optimized out before reaching this point.
+ if (NumOpcodes < 2 || Src.empty())
+ return false;
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
+ // For a uniform case the threshold should be higher to account for moves
+ // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two
+ // can be in SGPRs, and a readfirstlane after.
+ if (NumOpcodes < 4 && !IsVALU)
+ return false;
+
+ bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
+ if (NumOpcodes == 2 && IsB32) {
+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
+ // asm more readable. This cannot be modeled with AddedComplexity because
+ // the selector does not know how many operations we matched.
+ if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
+ mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
+ mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
+ return false;
+ }
+
+ unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+ unsigned CBL = STI.getConstantBusLimit(Opc);
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ continue;
+ if (CBL > 0) {
+ --CBL;
+ continue;
+ }
+ Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
+ .addReg(Src[I]);
+ Src[I] = NewReg;
+ }
+
+ // The last operand can be ignored, turning a ternary operation into a
+ // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+ // replace 'c' with 'a' here without changing the answer. In some
+ // pathological cases it is even possible to end up with a single-operand
+ // operation, if the optimizer does not catch it.
+ while (Src.size() < 3)
+ Src.push_back(Src[0]);
+
+ auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod0
+ MIB.addReg(Src[0]);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod1
+ MIB.addReg(Src[1]);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod2
+ MIB.addReg(Src[2])
+ .addImm(TTbl);
+ if (!IsB32)
+ MIB.addImm(0); // op_sel
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ MI.eraseFromParent();
+
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
Register SrcReg = MI.getOperand(0).getReg();
if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
@@ -3682,6 +3882,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
+ if (selectBITOP3(I))
+ return true;
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index a81f1579fb9f33..d294300be40497 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -147,6 +147,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;
bool selectWaveAddress(MachineInstr &I) const;
+ bool selectBITOP3(MachineInstr &I) const;
bool selectStackRestore(MachineInstr &MI) const;
bool selectNamedBarrierInit(MachineInstr &I, Intrinsic::ID IID) const;
bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34850e42a3d605..c8c36714909adf 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
+def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
+
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
// only VOP instruction that implicitly reads VCC.
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -1275,6 +1278,16 @@ let SubtargetPredicate = HasBitOp3Insts in {
(i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
(i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
>;
+
+ def : GCNPat<
+ (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i8:$bitop3)),
+ (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
} // End SubtargetPredicate = HasBitOp3Insts
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
new file mode 100644
index 00000000000000..dd608ef0e5a53d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -0,0 +1,368 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-GISEL %s
+
+; ========= Single bit functions =========
+
+define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:1
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %nota, %notc
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:2
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:4
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %nota, %notc
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:8
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10
+; GCN-NEXT: ; return to shader part epilog
+ %notb = xor i32 %b, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %a, %notc
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x20
+; GCN-NEXT: ; return to shader part epilog
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %a, %c
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x40
+; GCN-NEXT: ; return to shader part epilog
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %a, %notc
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GCN-NEXT: ; return to shader part epilog
+ %and1 = and i32 %a, %c
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+; ========= Multi bit functions =========
+
+define amdgpu_ps float @test_12(i32 %a, i32 %b) {
+; GCN-LABEL: test_12:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %and1 = and i32 %nota, %b
+ %ret_cast = bitcast i32 %and1 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_63(i32 %a, i32 %b) {
+; GCN-LABEL: test_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %or = or i32 %nota, %notb
+ %ret_cast = bitcast i32 %or to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_59(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_59:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0...
[truncated]
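The immediates in the new bitop3.ll tests line up with this encoding; a quick hand check, using the same per-operand patterns (a = 0xf0, b = 0xcc, c = 0xaa): for test_12, ~a & b evaluates to ~0xf0 & 0xcc = 0x0f & 0xcc = 0x0c, matching bitop3:0xc, and for not_and_not_and_not_and, ~a & ~b & ~c evaluates to 0x0f & 0x33 & 0x55 = 0x01, matching bitop3:1.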
You can test this locally with the following command:

git-clang-format --diff ff296c1db3b1bcf9ed3d9b22b1fefdd74d32f895 ffac062d560a4624a911af7061a7f7b12b4c16f5 --extensions h,cpp -- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

View the diff from clang-format here:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c0e01a020e..fcc6aa4ec5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3569,7 +3569,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
// 1 0 1
// 1 1 0
// 1 1 1
- const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+ const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->isAllOnes()) {
@@ -3630,8 +3630,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
SDValue RHS = In.getOperand(1);
SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
- if (!getOperandBits(LHS, LHSBits) ||
- !getOperandBits(RHS, RHSBits)) {
+ if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
Src = Backup;
return std::make_pair(0, 0);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7e61eb4706..3906ede252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -243,7 +243,7 @@ private:
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
- SDValue &Tbl) const;
+ SDValue &Tbl) const;
SDValue getHi16Elt(SDValue In) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 71d23f9fe3..32e2bd1ac6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3661,7 +3661,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
// 1 0 1
// 1 1 0
// 1 1 1
- const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+ const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
if (mi_match(Op, MRI, m_AllOnesInt())) {
Bits = 0xff;
@@ -3718,8 +3718,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
SmallVector<Register, 3> Backup(Src.begin(), Src.end());
- if (!getOperandBits(LHS, LHSBits) ||
- !getOperandBits(RHS, RHSBits)) {
+ if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
Src = Backup;
return std::make_pair(0, 0);
}
@@ -3809,9 +3808,8 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
--CBL;
continue;
}
- Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
- .addReg(Src[I]);
+ Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg).addReg(Src[I]);
Src[I] = NewReg;
}
@@ -3832,8 +3830,7 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
MIB.addReg(Src[1]);
if (!IsB32)
MIB.addImm(0); // src_mod2
- MIB.addReg(Src[2])
- .addImm(TTbl);
+ MIB.addReg(Src[2]).addImm(TTbl);
if (!IsB32)
MIB.addImm(0); // op_sel
Force-pushed f83b18c to ff296c1
Force-pushed 2758ec2 to ffac062
Force-pushed ff296c1 to 11464f0

Co-authored-by: Stanislav Mekhanoshin <[email protected]>

Force-pushed ffac062 to c609205
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/9699. Here is the relevant piece of the build log for reference.
Hi @arsenm, here is a problem with
@arsenm There are still issues with bitop3.ll on expensive-checks builds - please can you take another look?
@arsenm the problem has not been fixed for 4 days now. Please revert your changes or fix it as soon as possible.