[AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on supported subtargets (llvm#67461)

mbrkusanin · web-flow · commit 2cd2445c21bc · 2023-09-29T11:54:49.000+02:00
In order to avoid duplicating every dpp pseudo opcode that has src1, we
allow it for all opcodes and add manual checks on subtargets that do not
support it.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -472,6 +472,12 @@ def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
   "Support DPP (Data Parallel Primitives) extension in DP ALU"
 >;
 
+def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
+  "HasDPPSrc1SGPR",
+  "true",
+  "Support SGPR for Src1 of DPP instructions"
+>;
+
 def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
   "HasPackedFP32Ops",
   "true",
@@ -1383,11 +1389,13 @@ def FeatureISAVersion11_0_3 : FeatureSet<
 
 def FeatureISAVersion11_5_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
-    [FeatureSALUFloatInsts])>;
+    [FeatureSALUFloatInsts,
+     FeatureDPPSrc1SGPR])>;
 
 def FeatureISAVersion11_5_1 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
+     FeatureDPPSrc1SGPR,
      FeatureGFX11FullVGPRs])>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4231,16 +4231,33 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
                                   const OperandVector &Operands) {
   const unsigned Opc = Inst.getOpcode();
   int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
-  if (DppCtrlIdx < 0)
-    return true;
-  unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+  if (DppCtrlIdx >= 0) {
+    unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+
+    if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
+        AMDGPU::isDPALU_DPP(MII.get(Opc))) {
+      // DP ALU DPP is supported for row_newbcast only on GFX9*
+      SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
+      Error(S, "DP ALU dpp only supports row_newbcast");
+      return false;
+    }
+  }
 
-  if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
-      AMDGPU::isDPALU_DPP(MII.get(Opc))) {
-    // DP ALU DPP is supported for row_newbcast only on GFX9*
-    SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
-    Error(S, "DP ALU dpp only supports row_newbcast");
-    return false;
+  int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8);
+  bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0;
+
+  if (IsDPP && !hasDPPSrc1SGPR(getSTI())) {
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+    if (Src1Idx >= 0) {
+      const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+      const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+      if (Src1.isImm() ||
+          (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
+        AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
+        Error(Op.getStartLoc(), "invalid operand for instruction");
+        return false;
+      }
+    }
   }
 
   return true;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -191,6 +191,16 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
   return &OldOpnd;
 }
 
+static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
+                               MachineRegisterInfo &MRI) {
+  int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
+  if (RegClass == -1)
+    return 0;
+
+  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
+}
+
 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                            MachineInstr &MovMI,
                                            RegSubRegPair CombOldVGPR,
@@ -278,6 +288,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     }
     auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
     assert(Src0);
+    int Src0Idx = NumOperands;
     if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
       LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
       Fail = true;
@@ -301,7 +312,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     }
     auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
     if (Src1) {
-      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+      int OpNum = NumOperands;
+      // If subtarget does not support SGPRs for src1 operand then the
+      // requirements are the same as for src0. We check src0 instead because
+      // pseudos are shared between subtargets and allow SGPR for src1 on all.
+      if (!ST->hasDPPSrc1SGPR()) {
+        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
+                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
+               "Src0 and Src1 operands should have the same size");
+        OpNum = Src0Idx;
+      }
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
         LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
         Fail = true;
         break;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -128,6 +128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasDPP = false;
   bool HasDPP8 = false;
   bool HasDPALU_DPP = false;
+  bool HasDPPSrc1SGPR = false;
   bool HasPackedFP32Ops = false;
   bool HasImageInsts = false;
   bool HasExtendedImageInsts = false;
@@ -916,6 +917,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasDPALU_DPP;
   }
 
+  bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
+
   bool hasPackedFP32Ops() const {
     return HasPackedFP32Ops;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2296,7 +2296,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
   field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
   field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
-  field RegisterOperand Src1VOP3DPP = VRegSrc_32;
+  field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
   field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
   field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
   field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) {
   return STI.hasFeature(AMDGPU::FeatureVOPD);
 }
 
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
+  return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);
+}
+
 unsigned hasKernargPreload(const MCSubtargetInfo &STI) {
   return STI.hasFeature(AMDGPU::FeatureKernargPreload);
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1169,6 +1169,7 @@ bool isGFX940(const MCSubtargetInfo &STI);
 bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
 bool hasMAIInsts(const MCSubtargetInfo &STI);
 bool hasVOPD(const MCSubtargetInfo &STI);
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
 int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
 unsigned hasKernargPreload(const MCSubtargetInfo &STI);
 
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
 
 ; GCN-LABEL: {{^}}dpp_add:
 ; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -1,12 +1,14 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
 
 ---
 
 # GCN-label: name: vop3
 # GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
 # GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
-# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
+# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
 name: vop3
 tracksRegLiveness: true
 body:             |
@@ -28,10 +30,54 @@ body:             |
     %9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
     %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
 
-    ; should not be combined because src1 imm is illegal
+    ; should not be combined on subtargets where src1 imm is illegal
     %11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
     %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
 ...
+---
+
+# GCN-label: name: vop3_sgpr_src1
+# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
+name: vop3_sgpr_src1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:sgpr_32 = COPY $sgpr0
+    %3:sgpr_32 = COPY $sgpr1
+    %4:vgpr_32 = IMPLICIT_DEF
+
+    ; should be combined because src2 allows sgpr
+    %5:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+    %6:vgpr_32 = V_MED3_F32_e64 0, %5, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+
+    ; should be combined only on subtargets that allow sgpr for src1
+    %7:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+    %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+
+    ; should be combined only on subtargets that allow sgpr for src1
+    %9:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+    %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+
+    ; should be combined only on subtargets that allow inlinable constants for src1
+    %11:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+    %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
+
+    ; should not be combined when literal constants are used
+    %13:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+    %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
+...
+---
 
 # Regression test for src_modifiers on base u16 opcode
 # GCN-label: name: vop3_u16
diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
@@ -0,0 +1,19 @@
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s
+
+//
+// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable
+// constant.
+//
+
+v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+
+v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -45,6 +45,15 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
 v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
+v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
 v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1150 %s
+
+# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff
+
+# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff
+
+# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05
+
+# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05

Original file line number	Diff line number	Diff line change
`@@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) {`
`2085`	`2085`	`return STI.hasFeature(AMDGPU::FeatureVOPD);`
`2086`	`2086`	`}`
`2087`	`2087`
	`2088`	`+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {`
	`2089`	`+ return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);`
	`2090`	`+}`
	`2091`	`+`
`2088`	`2092`	`unsigned hasKernargPreload(const MCSubtargetInfo &STI) {`
`2089`	`2093`	`return STI.hasFeature(AMDGPU::FeatureKernargPreload);`
`2090`	`2094`	`}`