Skip to content

Commit f685b94

Browse files
mariusz-sikora-at-amd, matejaMarjanovic, mbrkusanin
authored and committed
[AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/bf8 instructions (llvm#78414)

Add VOP1, VOP1_DPP8, VOP1_DPP16, VOP3, VOP3_DPP8, VOP3_DPP16 instructions that were supported on GFX940 (MI300):
- V_CVT_F32_FP8
- V_CVT_F32_BF8
- V_CVT_PK_F32_FP8
- V_CVT_PK_F32_BF8
- V_CVT_PK_FP8_F32
- V_CVT_PK_BF8_F32
- V_CVT_SR_FP8_F32
- V_CVT_SR_BF8_F32

---------

Co-authored-by: Mateja Marjanovic <[email protected]>
Co-authored-by: Mirko Brkušanin <[email protected]>

(cherry picked from commit cfddb59)
1 parent 5c6b63f commit f685b94

34 files changed

+1742
-102
lines changed

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@
100100
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
101101
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
102102
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103-
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104-
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103+
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104+
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
105105

106106
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
107107

clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,60 @@
11
// REQUIRES: amdgpu-registered-target
2-
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
34

45
typedef float v2f __attribute__((ext_vector_type(2)));
56

6-
// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
7-
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
7+
// CHECK-LABEL: @test_cvt_f32_bf8
8+
// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
89
void test_cvt_f32_bf8(global int* out, int a)
910
{
1011
*out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
1112
}
1213

13-
// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
14-
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
14+
// CHECK-LABEL: @test_cvt_f32_fp8
15+
// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
1516
void test_cvt_f32_fp8(global int* out, int a)
1617
{
1718
*out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
1819
}
1920

20-
// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
21-
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
21+
// CHECK-LABEL: @test_cvt_pk_f32_bf8
22+
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
2223
void test_cvt_pk_f32_bf8(global v2f* out, int a)
2324
{
2425
*out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
2526
}
2627

27-
// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
28-
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
28+
// CHECK-LABEL: @test_cvt_pk_f32_fp8
29+
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
2930
void test_cvt_pk_f32_fp8(global v2f* out, int a)
3031
{
3132
*out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
3233
}
3334

34-
// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
35-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
35+
// CHECK-LABEL: @test_cvt_pk_bf8_f32
36+
// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
3637
void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
3738
{
3839
*out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
3940
}
4041

41-
// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
42-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
42+
// CHECK-LABEL: @test_cvt_pk_fp8_f32
43+
// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
4344
void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
4445
{
4546
*out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
4647
}
4748

48-
// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
49-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
49+
// CHECK-LABEL: @test_cvt_sr_bf8_f32
50+
// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
5051
void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
5152
{
5253
*out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
5354
}
5455

55-
// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
56-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
56+
// CHECK-LABEL: @test_cvt_sr_fp8_f32
57+
// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
5758
void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
5859
{
5960
*out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,6 +1506,7 @@ def FeatureISAVersion12 : FeatureSet<
15061506
FeatureFlatAtomicFaddF32Inst,
15071507
FeatureImageInsts,
15081508
FeatureExtendedImageInsts,
1509+
FeatureFP8ConversionInsts,
15091510
FeaturePackedTID,
15101511
FeatureVcmpxPermlaneHazard,
15111512
FeatureSALUFloatInsts,

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3529,6 +3529,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
35293529
return !isInlineConstant(Inst, OpIdx);
35303530
} else if (MO.isReg()) {
35313531
auto Reg = MO.getReg();
3532+
if (!Reg) {
3533+
return false;
3534+
}
35323535
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
35333536
auto PReg = mc2PseudoReg(Reg);
35343537
return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
@@ -8363,12 +8366,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
83638366
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
83648367

83658368
if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
8366-
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
8369+
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
8370+
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
8371+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
83678372
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
83688373
Inst.addOperand(Inst.getOperand(0));
83698374
}
83708375

8371-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) {
8376+
// Adding vdst_in operand is already covered for these DPP instructions in
8377+
// cvtVOP3DPP.
8378+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
8379+
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
8380+
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
8381+
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
8382+
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
83728383
assert(!IsPacked);
83738384
Inst.addOperand(Inst.getOperand(0));
83748385
}
@@ -8869,6 +8880,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
88698880
}
88708881
}
88718882

8883+
int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
8884+
if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
8885+
Inst.addOperand(Inst.getOperand(0));
8886+
}
8887+
8888+
bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
8889+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
8890+
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
8891+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
8892+
if (IsVOP3CvtSrDpp) {
8893+
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
8894+
Inst.addOperand(MCOperand::createImm(0));
8895+
Inst.addOperand(MCOperand::createReg(0));
8896+
}
8897+
}
8898+
88728899
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
88738900
MCOI::TIED_TO);
88748901
if (TiedTo != -1) {

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
720720
AMDGPU::OpName::src2_modifiers);
721721
}
722722

723+
if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
724+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
725+
// Insert dummy unused src2_modifiers.
726+
insertNamedMCOperand(MI, MCOperand::createImm(0),
727+
AMDGPU::OpName::src2_modifiers);
728+
}
729+
723730
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
724731
!AMDGPU::hasGDS(STI)) {
725732
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
@@ -950,6 +957,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
950957
// first add optional MI operands to check FI
951958
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
952959
unsigned Opc = MI.getOpcode();
960+
953961
if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
954962
convertVOP3PDPPInst(MI);
955963
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
@@ -959,6 +967,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
959967
if (isMacDPP(MI))
960968
convertMacDPPInst(MI);
961969

970+
int VDstInIdx =
971+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
972+
if (VDstInIdx != -1)
973+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
974+
975+
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
976+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
977+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
978+
962979
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
963980
if (MI.getNumOperands() < DescNumOps &&
964981
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -985,6 +1002,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
9851002
if (isMacDPP(MI))
9861003
convertMacDPPInst(MI);
9871004

1005+
int VDstInIdx =
1006+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
1007+
if (VDstInIdx != -1)
1008+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
1009+
1010+
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
1011+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
1012+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
1013+
9881014
unsigned Opc = MI.getOpcode();
9891015
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
9901016
if (MI.getNumOperands() < DescNumOps &&

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1322,6 +1322,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
13221322
const MCSubtargetInfo &STI,
13231323
raw_ostream &O) {
13241324
unsigned Opc = MI->getOpcode();
1325+
if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
1326+
auto SrcMod =
1327+
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1328+
unsigned Mod = MI->getOperand(SrcMod).getImm();
1329+
unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
1330+
unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
1331+
if (Index0 || Index1)
1332+
O << " op_sel:[" << Index0 << ',' << Index1 << ']';
1333+
return;
1334+
}
13251335
if (isPermlane16(Opc)) {
13261336
auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
13271337
auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,8 +1694,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
16941694
!if(HasOMod,
16951695
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
16961696
clampmod0:$clamp, omod0:$omod),
1697-
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
1698-
clampmod0:$clamp))
1697+
!if (HasClamp,
1698+
(ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp),
1699+
(ins Src0Mod:$src0_modifiers, Src0RC:$src0)))
16991700
/* else */,
17001701
// VOP1 without modifiers
17011702
!if (HasClamp,
@@ -2290,6 +2291,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
22902291
field bit IsWMMA = 0;
22912292
field bit IsSWMMAC = 0;
22922293

2294+
field bit IsFP8 = 0;
2295+
22932296
field bit HasDst = !ne(DstVT.Value, untyped.Value);
22942297
field bit HasDst32 = HasDst;
22952298
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,17 @@ bool isPermlane16(unsigned Opc) {
529529
Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
530530
}
531531

532+
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
533+
return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 ||
534+
Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 ||
535+
Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 ||
536+
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 ||
537+
Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 ||
538+
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 ||
539+
Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 ||
540+
Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12;
541+
}
542+
532543
bool isGenericAtomic(unsigned Opc) {
533544
return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
534545
Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,9 @@ bool isPermlane16(unsigned Opc);
535535
LLVM_READNONE
536536
bool isGenericAtomic(unsigned Opc);
537537

538+
LLVM_READNONE
539+
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc);
540+
538541
namespace VOPD {
539542

540543
enum Component : unsigned {

0 commit comments

Comments
 (0)