Skip to content

Commit 2cd2445

Browse files
authored
[AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on supported subtargets (llvm#67461)
In order to avoid duplicating every DPP pseudo opcode that has src1, we allow an SGPR src1 operand for all opcodes and add manual checks on subtargets that do not support it.
1 parent 74ab493 commit 2cd2445

File tree

12 files changed

+157
-15
lines changed

12 files changed

+157
-15
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,12 @@ def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
472472
"Support DPP (Data Parallel Primitives) extension in DP ALU"
473473
>;
474474

475+
// Subtarget feature "dpp-src1-sgpr": sets the HasDPPSrc1SGPR subtarget field
// to "true" for every feature set that lists it.
def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
476+
"HasDPPSrc1SGPR",
477+
"true",
478+
"Support SGPR for Src1 of DPP instructions"
479+
>;
480+
475481
def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
476482
"HasPackedFP32Ops",
477483
"true",
@@ -1383,11 +1389,13 @@ def FeatureISAVersion11_0_3 : FeatureSet<
13831389

13841390
def FeatureISAVersion11_5_0 : FeatureSet<
13851391
!listconcat(FeatureISAVersion11_Common.Features,
1386-
[FeatureSALUFloatInsts])>;
1392+
[FeatureSALUFloatInsts,
1393+
FeatureDPPSrc1SGPR])>;
13871394

13881395
// Feature set for ISA version 11.5.1: the common GFX11 features concatenated
// with the three extra features listed below, including SGPR src1 for DPP.
def FeatureISAVersion11_5_1 : FeatureSet<
13891396
!listconcat(FeatureISAVersion11_Common.Features,
13901397
[FeatureSALUFloatInsts,
1398+
FeatureDPPSrc1SGPR,
13911399
FeatureGFX11FullVGPRs])>;
13921400

13931401
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4231,16 +4231,33 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
42314231
const OperandVector &Operands) {
42324232
const unsigned Opc = Inst.getOpcode();
42334233
int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
4234-
if (DppCtrlIdx < 0)
4235-
return true;
4236-
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
4234+
if (DppCtrlIdx >= 0) {
4235+
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
4236+
4237+
if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
4238+
AMDGPU::isDPALU_DPP(MII.get(Opc))) {
4239+
// DP ALU DPP is supported for row_newbcast only on GFX9*
4240+
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
4241+
Error(S, "DP ALU dpp only supports row_newbcast");
4242+
return false;
4243+
}
4244+
}
42374245

4238-
if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
4239-
AMDGPU::isDPALU_DPP(MII.get(Opc))) {
4240-
// DP ALU DPP is supported for row_newbcast only on GFX9*
4241-
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
4242-
Error(S, "DP ALU dpp only supports row_newbcast");
4243-
return false;
4246+
int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8);
4247+
bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0;
4248+
4249+
if (IsDPP && !hasDPPSrc1SGPR(getSTI())) {
4250+
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4251+
if (Src1Idx >= 0) {
4252+
const MCOperand &Src1 = Inst.getOperand(Src1Idx);
4253+
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
4254+
if (Src1.isImm() ||
4255+
(Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
4256+
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
4257+
Error(Op.getStartLoc(), "invalid operand for instruction");
4258+
return false;
4259+
}
4260+
}
42444261
}
42454262

42464263
return true;

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,16 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
191191
return &OldOpnd;
192192
}
193193

194+
// Returns the size in bits of the register class that MI's instruction
// descriptor declares for operand number Idx, or 0 when the descriptor
// records no register class for that operand.
//
// MI  - instruction whose descriptor is inspected.
// Idx - index into the descriptor's operand list.
// MRI - used only to reach the TargetRegisterInfo.
static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
195+
MachineRegisterInfo &MRI) {
196+
int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
197+
// A RegClass of -1 is handled as "no register class": report size 0.
if (RegClass == -1)
198+
return 0;
199+
200+
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
201+
// Map the register class ID to its class and ask for the class width.
return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
202+
}
203+
194204
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
195205
MachineInstr &MovMI,
196206
RegSubRegPair CombOldVGPR,
@@ -278,6 +288,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
278288
}
279289
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
280290
assert(Src0);
291+
int Src0Idx = NumOperands;
281292
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
282293
LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
283294
Fail = true;
@@ -301,7 +312,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
301312
}
302313
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
303314
if (Src1) {
304-
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
315+
int OpNum = NumOperands;
316+
// If subtarget does not support SGPRs for src1 operand then the
317+
// requirements are the same as for src0. We check src0 instead because
318+
// pseudos are shared between subtargets and allow SGPR for src1 on all.
319+
if (!ST->hasDPPSrc1SGPR()) {
320+
assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
321+
getOperandSize(*DPPInst, NumOperands, *MRI) &&
322+
"Src0 and Src1 operands should have the same size");
323+
OpNum = Src0Idx;
324+
}
325+
if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
305326
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
306327
Fail = true;
307328
break;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
128128
bool HasDPP = false;
129129
bool HasDPP8 = false;
130130
bool HasDPALU_DPP = false;
131+
bool HasDPPSrc1SGPR = false;
131132
bool HasPackedFP32Ops = false;
132133
bool HasImageInsts = false;
133134
bool HasExtendedImageInsts = false;
@@ -916,6 +917,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
916917
return HasDPALU_DPP;
917918
}
918919

920+
/// \returns the HasDPPSrc1SGPR subtarget flag (false unless a subtarget
/// feature sets it).
bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
921+
919922
/// \returns the HasPackedFP32Ops subtarget flag.
bool hasPackedFP32Ops() const {
920923
return HasPackedFP32Ops;
921924
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2296,7 +2296,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
22962296
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
22972297
field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
22982298
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
2299-
field RegisterOperand Src1VOP3DPP = VRegSrc_32;
2299+
field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
23002300
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
23012301
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
23022302
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) {
20852085
return STI.hasFeature(AMDGPU::FeatureVOPD);
20862086
}
20872087

2088+
/// \returns true if \p STI has the FeatureDPPSrc1SGPR subtarget feature.
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
2089+
return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);
2090+
}
2091+
20882092
/// Reports whether \p STI has the FeatureKernargPreload subtarget feature.
/// NOTE(review): the return type is unsigned, while the neighbouring feature
/// queries (hasVOPD, hasDPPSrc1SGPR) return bool; confirm this is intentional.
unsigned hasKernargPreload(const MCSubtargetInfo &STI) {
20892093
return STI.hasFeature(AMDGPU::FeatureKernargPreload);
20902094
}

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,6 +1169,7 @@ bool isGFX940(const MCSubtargetInfo &STI);
11691169
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
11701170
bool hasMAIInsts(const MCSubtargetInfo &STI);
11711171
bool hasVOPD(const MCSubtargetInfo &STI);
1172+
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
11721173
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
11731174
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
11741175

llvm/test/CodeGen/AMDGPU/dpp_combine.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
22
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
33
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
4+
; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
45

56
; GCN-LABEL: {{^}}dpp_add:
67
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],

llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1-
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
1+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
2+
# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
23

34
---
45

56
# GCN-label: name: vop3
67
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
78
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
89
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
9-
# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
10+
# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
11+
# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
1012
name: vop3
1113
tracksRegLiveness: true
1214
body: |
@@ -28,10 +30,54 @@ body: |
2830
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
2931
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
3032
31-
; should not be combined because src1 imm is illegal
33+
; should not be combined on subtargets where src1 imm is illegal
3234
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
3335
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
3436
...
37+
---
38+
39+
# GCN-label: name: vop3_sgpr_src1
40+
# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
41+
# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
42+
# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
43+
# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
44+
# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
45+
# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
46+
# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
47+
# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
48+
name: vop3_sgpr_src1
49+
tracksRegLiveness: true
50+
body: |
51+
bb.0:
52+
liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1
53+
54+
%0:vgpr_32 = COPY $vgpr0
55+
%1:vgpr_32 = COPY $vgpr1
56+
%2:sgpr_32 = COPY $sgpr0
57+
%3:sgpr_32 = COPY $sgpr1
58+
%4:vgpr_32 = IMPLICIT_DEF
59+
60+
; should be combined because src2 allows sgpr
61+
%5:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
62+
%6:vgpr_32 = V_MED3_F32_e64 0, %5, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
63+
64+
; should be combined only on subtargets that allow sgpr for src1
65+
%7:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
66+
%8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
67+
68+
; should be combined only on subtargets that allow sgpr for src1
69+
%9:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
70+
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
71+
72+
; should be combined only on subtargets that allow inlinable constants for src1
73+
%11:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
74+
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
75+
76+
; should not be combined when literal constants are used
77+
%13:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
78+
%14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
79+
...
80+
---
3581

3682
# Regression test for src_modifiers on base u16 opcode
3783
# GCN-label: name: vop3_u16
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s
2+
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s
3+
4+
//
5+
// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable
6+
// constant.
7+
//
8+
9+
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
10+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
11+
12+
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
13+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
14+
15+
v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
16+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
17+
18+
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
19+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]

llvm/test/MC/AMDGPU/gfx11_asm_err.s

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
4545
v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
4646
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
4747

48+
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
49+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
50+
51+
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
52+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
53+
54+
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
55+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
56+
4857
v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
4958
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
5059

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1150 %s
2+
3+
# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
4+
0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff
5+
6+
# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
7+
0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff
8+
9+
# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
10+
0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05
11+
12+
# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
13+
0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05

0 commit comments

Comments
 (0)