AMDGPU: Match and Select BITOP3 on gfx950 #117843
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Co-authored-by: Stanislav Mekhanoshin <[email protected]>

Patch is 27.53 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/117843.diff

6 Files Affected:
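At the core of both the SelectionDAG and GlobalISel matchers in this patch is a compact truth-table encoding: each of up to three distinct source operands is assigned a fixed 8-bit pattern (0xf0, 0xcc, 0xaa) spelling out that operand's value across the eight possible (src0, src1, src2) bit combinations, and re-evaluating the matched and/or/xor tree on those patterns with ordinary bitwise operators yields the 8-bit immediate for v_bitop3. A minimal standalone sketch of the encoding (illustrative names, not the patch's actual API):

#include <cstdint>
#include <cstdio>

int main() {
  // Fixed per-operand patterns from the patch: bit i of each pattern is the
  // operand's value in row i of the (src0, src1, src2) truth table.
  const uint8_t A = 0xf0; // src0
  const uint8_t B = 0xcc; // src1
  const uint8_t C = 0xaa; // src2

  // Evaluating an expression tree on the patterns gives its truth table.
  // Example expression: (a & b) | ~c.
  uint8_t TTbl = (A & B) | static_cast<uint8_t>(~C);
  printf("bitop3 immediate for (a & b) | ~c: 0x%02x\n", TTbl); // prints 0xd5
  return 0;
}

Passing that immediate as the bitop3 operand makes v_bitop3_b32 compute the same Boolean function of its three inputs, bitwise.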
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7d78e9cd7eab6f..c0e01a020e0eb9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
return true;
}
+// Match a BITOP3 operation and return the number of matched instructions
+// plus the truth table.
+static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
+ SmallVectorImpl<SDValue> &Src) {
+ unsigned NumOpcodes = 0;
+ uint8_t LHSBits, RHSBits;
+
+ auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
+ // Define truth table given Src0, Src1, Src2 bits permutations:
+ // 0 0 0
+ // 0 0 1
+ // 0 1 0
+ // 0 1 1
+ // 1 0 0
+ // 1 0 1
+ // 1 1 0
+ // 1 1 1
+ const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->isAllOnes()) {
+ Bits = 0xff;
+ return true;
+ }
+ if (C->isZero()) {
+ Bits = 0;
+ return true;
+ }
+ }
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ // Try to find existing reused operand
+ if (Src[I] == Op) {
+ Bits = SrcBits[I];
+ return true;
+ }
+ // Try to replace parent operator
+ if (Src[I] == In) {
+ Bits = SrcBits[I];
+ Src[I] = Op;
+ return true;
+ }
+ }
+
+ if (Src.size() == 3) {
+ // No room left for operands. Try one last time: there may be a 'not' of
+ // one of our source operands. In that case we can compute the bits
+ // without growing the Src vector.
+ if (Op.getOpcode() == ISD::XOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (C->isAllOnes()) {
+ SDValue LHS = Op.getOperand(0);
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ if (Src[I] == LHS) {
+ Bits = ~SrcBits[I];
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+ }
+
+ Bits = SrcBits[Src.size()];
+ Src.push_back(Op);
+ return true;
+ };
+
+ switch (In.getOpcode()) {
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue LHS = In.getOperand(0);
+ SDValue RHS = In.getOperand(1);
+
+ SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
+ if (!getOperandBits(LHS, LHSBits) ||
+ !getOperandBits(RHS, RHSBits)) {
+ Src = Backup;
+ return std::make_pair(0, 0);
+ }
+
+ // Recursion is naturally limited by the size of the operand vector.
+ auto Op = BitOp3_Op(LHS, Src);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ LHSBits = Op.second;
+ }
+
+ Op = BitOp3_Op(RHS, Src);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ RHSBits = Op.second;
+ }
+ break;
+ }
+ default:
+ return std::make_pair(0, 0);
+ }
+
+ uint8_t TTbl;
+ switch (In.getOpcode()) {
+ case ISD::AND:
+ TTbl = LHSBits & RHSBits;
+ break;
+ case ISD::OR:
+ TTbl = LHSBits | RHSBits;
+ break;
+ case ISD::XOR:
+ TTbl = LHSBits ^ RHSBits;
+ break;
+ default:
+ break;
+ }
+
+ return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
+ SDValue &Src2, SDValue &Tbl) const {
+ SmallVector<SDValue, 3> Src;
+ uint8_t TTbl;
+ unsigned NumOpcodes;
+
+ std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
+
+ // The Src.empty() case can happen if all operands are constant zero or
+ // all-ones. Normally it should be optimized out before reaching this point.
+ if (NumOpcodes < 2 || Src.empty())
+ return false;
+
+ // For a uniform case the threshold should be higher to account for moves
+ // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two
+ // can be in SGPRs, and a readfirstlane after.
+ if (NumOpcodes < 4 && !In->isDivergent())
+ return false;
+
+ if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
+ // asm more readable. This cannot be modeled with AddedComplexity because
+ // the selector does not know how many operations we matched.
+ if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
+ (In.getOperand(0).getOpcode() == In.getOpcode() ||
+ In.getOperand(1).getOpcode() == In.getOpcode()))
+ return false;
+
+ if (In.getOpcode() == ISD::OR &&
+ (In.getOperand(0).getOpcode() == ISD::AND ||
+ In.getOperand(1).getOpcode() == ISD::AND))
+ return false;
+ }
+
+ // The last operand can be ignored, turning a ternary operation into a
+ // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+ // replace 'c' with 'a' here without changing the answer. In some
+ // pathological cases it is even possible to end up with a single-operand
+ // operation, if the optimizer does not catch it.
+ while (Src.size() < 3)
+ Src.push_back(Src[0]);
+
+ Src0 = Src[0];
+ Src1 = Src[1];
+ Src2 = Src[2];
+
+ Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
+ return true;
+}
+
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
if (In.isUndef())
return CurDAG->getUNDEF(MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5ae0b179d7d0e6..7e61eb470622f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -242,6 +242,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
+ SDValue &Tbl) const;
+
SDValue getHi16Elt(SDValue In) const;
SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7ce7562cdcaa95..71d23f9fe30c49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3643,6 +3643,206 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
return true;
}
+// Match a BITOP3 operation and return the number of matched instructions
+// plus the truth table.
+static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
+ SmallVectorImpl<Register> &Src,
+ const MachineRegisterInfo &MRI) {
+ unsigned NumOpcodes = 0;
+ uint8_t LHSBits, RHSBits;
+
+ auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
+ // Define truth table given Src0, Src1, Src2 bits permutations:
+ // 0 0 0
+ // 0 0 1
+ // 0 1 0
+ // 0 1 1
+ // 1 0 0
+ // 1 0 1
+ // 1 1 0
+ // 1 1 1
+ const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+
+ if (mi_match(Op, MRI, m_AllOnesInt())) {
+ Bits = 0xff;
+ return true;
+ }
+ if (mi_match(Op, MRI, m_ZeroInt())) {
+ Bits = 0;
+ return true;
+ }
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ // Try to find existing reused operand
+ if (Src[I] == Op) {
+ Bits = SrcBits[I];
+ return true;
+ }
+ // Try to replace parent operator
+ if (Src[I] == R) {
+ Bits = SrcBits[I];
+ Src[I] = Op;
+ return true;
+ }
+ }
+
+ if (Src.size() == 3) {
+ // No room left for operands. Try one last time: there may be a 'not' of
+ // one of our source operands. In that case we can compute the bits
+ // without growing the Src vector.
+ Register LHS;
+ if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
+ LHS = getSrcRegIgnoringCopies(LHS, MRI);
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ if (Src[I] == LHS) {
+ Bits = ~SrcBits[I];
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ Bits = SrcBits[Src.size()];
+ Src.push_back(Op);
+ return true;
+ };
+
+ MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
+ Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
+
+ SmallVector<Register, 3> Backup(Src.begin(), Src.end());
+ if (!getOperandBits(LHS, LHSBits) ||
+ !getOperandBits(RHS, RHSBits)) {
+ Src = Backup;
+ return std::make_pair(0, 0);
+ }
+
+ // Recursion is naturally limited by the size of the operand vector.
+ auto Op = BitOp3_Op(LHS, Src, MRI);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ LHSBits = Op.second;
+ }
+
+ Op = BitOp3_Op(RHS, Src, MRI);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ RHSBits = Op.second;
+ }
+ break;
+ }
+ default:
+ return std::make_pair(0, 0);
+ }
+
+ uint8_t TTbl;
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_AND:
+ TTbl = LHSBits & RHSBits;
+ break;
+ case TargetOpcode::G_OR:
+ TTbl = LHSBits | RHSBits;
+ break;
+ case TargetOpcode::G_XOR:
+ TTbl = LHSBits ^ RHSBits;
+ break;
+ default:
+ break;
+ }
+
+ return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
+ if (!Subtarget->hasBitOp3Insts())
+ return false;
+
+ SmallVector<Register, 3> Src;
+ uint8_t TTbl;
+ unsigned NumOpcodes;
+ Register DstReg = MI.getOperand(0).getReg();
+
+ std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
+
+ // The Src.empty() case can happen if all operands are constant zero or
+ // all-ones. Normally it should be optimized out before reaching this point.
+ if (NumOpcodes < 2 || Src.empty())
+ return false;
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
+ // For a uniform case the threshold should be higher to account for moves
+ // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two
+ // can be in SGPRs, and a readfirstlane after.
+ if (NumOpcodes < 4 && !IsVALU)
+ return false;
+
+ bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
+ if (NumOpcodes == 2 && IsB32) {
+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
+ // asm more readable. This cannot be modeled with AddedComplexity because
+ // the selector does not know how many operations we matched.
+ if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
+ mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
+ mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
+ return false;
+ }
+
+ unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+ unsigned CBL = STI.getConstantBusLimit(Opc);
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ continue;
+ if (CBL > 0) {
+ --CBL;
+ continue;
+ }
+ Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
+ .addReg(Src[I]);
+ Src[I] = NewReg;
+ }
+
+ // The last operand can be ignored, turning a ternary operation into a
+ // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+ // replace 'c' with 'a' here without changing the answer. In some
+ // pathological cases it is even possible to end up with a single-operand
+ // operation, if the optimizer does not catch it.
+ while (Src.size() < 3)
+ Src.push_back(Src[0]);
+
+ auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod0
+ MIB.addReg(Src[0]);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod1
+ MIB.addReg(Src[1]);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod2
+ MIB.addReg(Src[2])
+ .addImm(TTbl);
+ if (!IsB32)
+ MIB.addImm(0); // op_sel
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ MI.eraseFromParent();
+
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
Register SrcReg = MI.getOperand(0).getReg();
if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
@@ -3682,6 +3882,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
+ if (selectBITOP3(I))
+ return true;
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index a81f1579fb9f33..d294300be40497 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -147,6 +147,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;
bool selectWaveAddress(MachineInstr &I) const;
+ bool selectBITOP3(MachineInstr &I) const;
bool selectStackRestore(MachineInstr &MI) const;
bool selectNamedBarrierInit(MachineInstr &I, Intrinsic::ID IID) const;
bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34850e42a3d605..c8c36714909adf 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
+def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
+
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
// only VOP instruction that implicitly reads VCC.
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -1275,6 +1278,16 @@ let SubtargetPredicate = HasBitOp3Insts in {
(i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
(i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
>;
+
+ def : GCNPat<
+ (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i8:$bitop3)),
+ (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
} // End SubtargetPredicate = HasBitOp3Insts
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
new file mode 100644
index 00000000000000..dd608ef0e5a53d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -0,0 +1,368 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-GISEL %s
+
+; ========= Single bit functions =========
+
+define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:1
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %nota, %notc
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:2
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:4
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %nota, %notc
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:8
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10
+; GCN-NEXT: ; return to shader part epilog
+ %notb = xor i32 %b, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %a, %notc
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x20
+; GCN-NEXT: ; return to shader part epilog
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %a, %c
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x40
+; GCN-NEXT: ; return to shader part epilog
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %a, %notc
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GCN-NEXT: ; return to shader part epilog
+ %and1 = and i32 %a, %c
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+; ========= Multi bit functions =========
+
+define amdgpu_ps float @test_12(i32 %a, i32 %b) {
+; GCN-LABEL: test_12:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %and1 = and i32 %nota, %b
+ %ret_cast = bitcast i32 %and1 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_63(i32 %a, i32 %b) {
+; GCN-LABEL: test_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %or = or i32 %nota, %notb
+ %ret_cast = bitcast i32 %or to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_59(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_59:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0...
[truncated]
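The immediates in the new bitop3.ll tests line up with this encoding; a quick hand check, using the same per-operand patterns (a = 0xf0, b = 0xcc, c = 0xaa): for test_12, ~a & b evaluates to ~0xf0 & 0xcc = 0x0f & 0xcc = 0x0c, matching bitop3:0xc, and for not_and_not_and_not_and, ~a & ~b & ~c evaluates to 0x0f & 0x33 & 0x55 = 0x01, matching bitop3:1.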
You can test this locally with the following command:

git-clang-format --diff ff296c1db3b1bcf9ed3d9b22b1fefdd74d32f895 ffac062d560a4624a911af7061a7f7b12b4c16f5 --extensions h,cpp -- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

View the diff from clang-format here:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c0e01a020e..fcc6aa4ec5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3569,7 +3569,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
// 1 0 1
// 1 1 0
// 1 1 1
- const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+ const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->isAllOnes()) {
@@ -3630,8 +3630,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
SDValue RHS = In.getOperand(1);
SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
- if (!getOperandBits(LHS, LHSBits) ||
- !getOperandBits(RHS, RHSBits)) {
+ if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
Src = Backup;
return std::make_pair(0, 0);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7e61eb4706..3906ede252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -243,7 +243,7 @@ private:
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
- SDValue &Tbl) const;
+ SDValue &Tbl) const;
SDValue getHi16Elt(SDValue In) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 71d23f9fe3..32e2bd1ac6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3661,7 +3661,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
// 1 0 1
// 1 1 0
// 1 1 1
- const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+ const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
if (mi_match(Op, MRI, m_AllOnesInt())) {
Bits = 0xff;
@@ -3718,8 +3718,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
SmallVector<Register, 3> Backup(Src.begin(), Src.end());
- if (!getOperandBits(LHS, LHSBits) ||
- !getOperandBits(RHS, RHSBits)) {
+ if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
Src = Backup;
return std::make_pair(0, 0);
}
@@ -3809,9 +3808,8 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
--CBL;
continue;
}
- Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
- .addReg(Src[I]);
+ Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg).addReg(Src[I]);
Src[I] = NewReg;
}
@@ -3832,8 +3830,7 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
MIB.addReg(Src[1]);
if (!IsB32)
MIB.addImm(0); // src_mod2
- MIB.addReg(Src[2])
- .addImm(TTbl);
+ MIB.addReg(Src[2]).addImm(TTbl);
if (!IsB32)
MIB.addImm(0); // op_sel
Force-pushed f83b18c to ff296c1
Force-pushed 2758ec2 to ffac062
Force-pushed ff296c1 to 11464f0

Co-authored-by: Stanislav Mekhanoshin <[email protected]>

Force-pushed ffac062 to c609205
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/9699. Here is the relevant piece of the build log for reference.
Hi @arsenm, here is a problem with
@arsenm There are still issues with bitop3.ll on expensive-checks builds - please can you take another look?
@arsenm the problem has not been fixed for 4 days now. Please revert your changes or fix it as soon as possible.