Skip to content

Commit e3cf2d7

Browse files
pravinjagtap authored and jrbyrnes committed
AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard
The gfx950 SP changes doc says: no 4-clk forwarding on opcodes that convert from F32/F16->F8 or F32/F16->F4. A NOP, or an instruction writing some other destination VREG, must be inserted after a conversion to F4/F8, since it writes either the low/high half or individual bytes.

Co-authored-by: Pravin Jagtap <[email protected]>
Co-authored-by: Jeffrey Byrnes <[email protected]>
1 parent b4a16a7 commit e3cf2d7

8 files changed

+435 -11 lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -909,17 +909,18 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
909909

910910
// There are three different types of instructions
911911
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
912-
// which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
913-
// CVT_SR_BF8_F32 with op_sel[3:2]
912+
// which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
913+
// (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
914+
// op_sel[3:2]
914915
// != 0
915916
if (SIInstrInfo::isSDWA(MI)) {
916917
// Type 1: SDWA with dst_sel != DWORD
917918
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
918919
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
919920
return nullptr;
920921
} else {
921-
// Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
922-
// CVT_SR_BF8_F32 with op_sel[3:2] != 0)
922+
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
923+
// with op_sel[3:2] != 0)
923924
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
924925
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
925926
SISrcMods::DST_OP_SEL ||
@@ -983,7 +984,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
983984
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
984985
}
985986

986-
if (ST.hasDstSelForwardingHazard()) {
987+
if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
987988
const int Shift16DefWaitstates = 1;
988989

989990
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
@@ -1094,7 +1095,8 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10941095
// problematic thus far.
10951096

10961097
// see checkVALUHazards()
1097-
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
1098+
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1099+
!ST.hasCvtScaleForwardingHazard())
10981100
return 0;
10991101

11001102
const MachineRegisterInfo &MRI = MF.getRegInfo();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1264,6 +1264,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12641264

12651265
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
12661266

1267+
bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
1268+
12671269
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
12681270

12691271
bool requiresCodeObjectV6() const { return RequiresCOV6; }

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -378,6 +378,14 @@ struct VOPTrue16Info {
378378
bool IsTrue16;
379379
};
380380

381+
#define GET_FP8DstByteSelTable_DECL
382+
#define GET_FP8DstByteSelTable_IMPL
383+
384+
struct DPMACCInstructionInfo {
385+
uint16_t Opcode;
386+
bool IsDPMACCInstruction;
387+
};
388+
381389
struct FP8DstByteSelInfo {
382390
uint16_t Opcode;
383391
bool HasFP8DstByteSel;
@@ -418,6 +426,8 @@ struct FP8DstByteSelInfo {
418426
#define GET_getMFMA_F8F6F4_WithSize_DECL
419427
#define GET_getMFMA_F8F6F4_WithSize_IMPL
420428
#define GET_isMFMA_F8F6F4Table_IMPL
429+
#define GET_isCvtScaleF32_F32F16ToF8F4Table_IMPL
430+
421431
#include "AMDGPUGenSearchableTables.inc"
422432

423433
int getMTBUFBaseOpcode(unsigned Opc) {

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ struct MFMA_F8F6F4_Info {
103103
uint8_t NumRegsSrcB;
104104
};
105105

106+
struct CvtScaleF32_F32F16ToF8F4_Info {
107+
unsigned Opcode;
108+
};
109+
106110
#define GET_MIMGBaseOpcode_DECL
107111
#define GET_MIMGDim_DECL
108112
#define GET_MIMGEncoding_DECL
@@ -112,6 +116,7 @@ struct MFMA_F8F6F4_Info {
112116
#define GET_MAIInstInfoTable_DECL
113117
#define GET_MAIInstInfoTable_DECL
114118
#define GET_isMFMA_F8F6F4Table_DECL
119+
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
115120
#include "AMDGPUGenSearchableTables.inc"
116121

117122
namespace IsaInfo {

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -970,11 +970,16 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profil
970970
let HasOMod = 0;
971971
}
972972

973+
class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
974+
let HasFP8DstByteSel = 1;
975+
}
976+
973977
class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
974978
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
975979
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
976980
FP32InputMods:$src2_modifiers, Src2RC64:$src2,
977981
VGPR_32:$vdst_in, op_sel0:$op_sel);
982+
let HasFP8DstByteSel = 1;
978983
}
979984

980985

@@ -992,6 +997,7 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
992997
HasSrc0FloatMods, HasSrc1FloatMods,
993998
HasSrc2FloatMods>.ret);
994999
let HasExtVOP3DPP = 0;
1000+
let HasFP8DstByteSel = 1;
9951001
}
9961002

9971003
class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
@@ -1004,6 +1010,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
10041010
let HasExtVOP3DPP = 0;
10051011
let HasOpSel = 1;
10061012
let HasOMod = 0;
1013+
let HasFP8DstByteSel = 1;
10071014
}
10081015

10091016
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1015,6 +1022,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
10151022
let HasExtVOP3DPP = 0;
10161023
let HasOpSel = 1;
10171024
let HasOMod = 0;
1025+
let HasFP8DstByteSel = 1;
10181026
}
10191027

10201028
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
@@ -1090,7 +1098,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
10901098
let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
10911099
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
10921100
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
1093-
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
1101+
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
10941102
let Constraints = "@earlyclobber $vdst" in {
10951103
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
10961104
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
@@ -2047,6 +2055,7 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
20472055
}
20482056
}
20492057
}
2058+
20502059
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
20512060

20522061
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;

0 commit comments

Comments
 (0)