[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16. #81131
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Ivan Kosarev (kosarev)

Changes:

[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16.

Also add missing v_ceil/floor_f16 tests.

Includes #78446.

Patch is 293.28 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/81131.diff

30 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 225e781588668f..a94da992b33859 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -314,8 +314,9 @@ class AMDGPUOperand : public MCParsedAsmOperand {
return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
- bool isRegOrInlineImmWithFP16InputMods() const {
- return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16);
+ template <bool IsFake16> bool isRegOrInlineImmWithFP16InputMods() const {
+ return isRegOrInline(
+ IsFake16 ? AMDGPU::VS_32RegClassID : AMDGPU::VS_16RegClassID, MVT::f16);
}
bool isRegOrInlineImmWithFP32InputMods() const {
@@ -8151,7 +8152,7 @@ ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) {
// Determines which bit DST_OP_SEL occupies in the op_sel operand according to
// the number of src operands present, then copies that bit into src0_modifiers.
-void cvtVOP3DstOpSelOnly(MCInst &Inst) {
+static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) {
int Opc = Inst.getOpcode();
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
if (OpSelIdx == -1)
@@ -8168,23 +8169,34 @@ void cvtVOP3DstOpSelOnly(MCInst &Inst) {
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
- if ((OpSel & (1 << SrcNum)) != 0) {
- int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
- uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
- Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL);
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DstIdx == -1)
+ return;
+
+ const MCOperand &DstOp = Inst.getOperand(DstIdx);
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+ if (DstOp.isReg() &&
+ MRI.getRegClass(AMDGPU::VGPR_16RegClassID).contains(DstOp.getReg())) {
+ if (AMDGPU::isHi(DstOp.getReg(), MRI))
+ ModVal |= SISrcMods::DST_OP_SEL;
+ } else {
+ if ((OpSel & (1 << SrcNum)) != 0)
+ ModVal |= SISrcMods::DST_OP_SEL;
}
+ Inst.getOperand(ModIdx).setImm(ModVal);
}
void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst,
const OperandVector &Operands) {
cvtVOP3P(Inst, Operands);
- cvtVOP3DstOpSelOnly(Inst);
+ cvtVOP3DstOpSelOnly(Inst, *getMRI());
}
void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx) {
cvtVOP3P(Inst, Operands, OptionalIdx);
- cvtVOP3DstOpSelOnly(Inst);
+ cvtVOP3DstOpSelOnly(Inst, *getMRI());
}
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
@@ -8433,8 +8445,17 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
uint32_t ModVal = 0;
- if ((OpSel & (1 << J)) != 0)
- ModVal |= SISrcMods::OP_SEL_0;
+ const MCOperand &SrcOp = Inst.getOperand(OpIdx);
+ if (SrcOp.isReg() && getMRI()
+ ->getRegClass(AMDGPU::VGPR_16RegClassID)
+ .contains(SrcOp.getReg())) {
+ bool VGPRSuffixIsHi = AMDGPU::isHi(SrcOp.getReg(), *getMRI());
+ if (VGPRSuffixIsHi)
+ ModVal |= SISrcMods::OP_SEL_0;
+ } else {
+ if ((OpSel & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_0;
+ }
if ((OpSelHi & (1 << J)) != 0)
ModVal |= SISrcMods::OP_SEL_1;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fba9eb53c8a8b4..85377d07c52dcb 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -913,6 +913,41 @@ static VOPModifiers collectVOPModifiers(const MCInst &MI,
return Modifiers;
}
+// Instructions decode the op_sel/suffix bits into the src_modifier
+// operands. Copy those bits into the src operands for true16 VGPRs.
+void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ const MCRegisterClass &ConversionRC =
+ MRI.getRegClass(AMDGPU::VGPR_16RegClassID);
+ constexpr std::array<std::tuple<int, int, unsigned>, 4> OpAndOpMods = {
+ {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers,
+ SISrcMods::DST_OP_SEL}}};
+ for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName);
+ if (OpIdx == -1 || OpModsIdx == -1)
+ continue;
+ MCOperand &Op = MI.getOperand(OpIdx);
+ if (!Op.isReg())
+ continue;
+ if (!ConversionRC.contains(Op.getReg()))
+ continue;
+ unsigned OpEnc = MRI.getEncodingValue(Op.getReg());
+ const MCOperand &OpMods = MI.getOperand(OpModsIdx);
+ unsigned ModVal = OpMods.getImm();
+ if (ModVal & OpSelMask) { // isHi
+ unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1));
+ }
+ }
+}
+
// MAC opcodes have special old and src2 operands.
// src2 is tied to dst, while old is not tied (but assumed to be).
bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
@@ -968,6 +1003,7 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+ convertTrue16OpSel(MI);
auto Mods = collectVOPModifiers(MI);
insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
AMDGPU::OpName::op_sel);
@@ -991,6 +1027,8 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);
+ convertTrue16OpSel(MI);
+
int VDstInIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
if (VDstInIdx != -1)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5a89b30f6fb36a..02feaf553c0c45 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -203,6 +203,7 @@ class AMDGPUDisassembler : public MCDisassembler {
DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
void convertMacDPPInst(MCInst &MI) const;
+ void convertTrue16OpSel(MCInst &MI) const;
enum OpWidthTy {
OPW32,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index a812cdc61500cc..8bf05682cbe7ea 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -756,14 +756,14 @@ void SIFoldOperands::foldOperand(
int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
- const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+ const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
- if (!isUseSafeToFold(*UseMI, UseOp))
+ if (!isUseSafeToFold(*UseMI, *UseOp))
return;
// FIXME: Fold operands with subregs.
- if (UseOp.isReg() && OpToFold.isReg() &&
- (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
+ if (UseOp->isReg() && OpToFold.isReg() &&
+ (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
return;
// Special case for REG_SEQUENCE: We can't fold literals into
@@ -859,7 +859,6 @@ void SIFoldOperands::foldOperand(
if (MovOp == AMDGPU::COPY)
return;
- UseMI->setDesc(TII->get(MovOp));
MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
while (ImpOpI != ImpOpE) {
@@ -867,6 +866,19 @@ void SIFoldOperands::foldOperand(
ImpOpI++;
UseMI->removeOperand(UseMI->getOperandNo(Tmp));
}
+ UseMI->setDesc(TII->get(MovOp));
+
+ if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
+ const auto &SrcOp = UseMI->getOperand(UseOpIdx);
+ MachineOperand NewSrcOp(SrcOp);
+ MachineFunction *MF = UseMI->getParent()->getParent();
+ UseMI->removeOperand(1);
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
+ UseMI->addOperand(NewSrcOp); // src0
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
+ UseOpIdx = 2;
+ UseOp = &UseMI->getOperand(UseOpIdx);
+ }
CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
@@ -1027,7 +1039,7 @@ void SIFoldOperands::foldOperand(
// Don't fold into target independent nodes. Target independent opcodes
// don't have defined register classes.
- if (UseDesc.isVariadic() || UseOp.isImplicit() ||
+ if (UseDesc.isVariadic() || UseOp->isImplicit() ||
UseDesc.operands()[UseOpIdx].RegClass == -1)
return;
}
@@ -1062,17 +1074,17 @@ void SIFoldOperands::foldOperand(
TRI->getRegClass(FoldDesc.operands()[0].RegClass);
// Split 64-bit constants into 32-bits for folding.
- if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
- Register UseReg = UseOp.getReg();
+ if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
+ Register UseReg = UseOp->getReg();
const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(*UseRC) != 64)
return;
APInt Imm(64, OpToFold.getImm());
- if (UseOp.getSubReg() == AMDGPU::sub0) {
+ if (UseOp->getSubReg() == AMDGPU::sub0) {
Imm = Imm.getLoBits(32);
} else {
- assert(UseOp.getSubReg() == AMDGPU::sub1);
+ assert(UseOp->getSubReg() == AMDGPU::sub1);
Imm = Imm.getHiBits(32);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7edec5a7a5505b..22599773d562cb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1148,7 +1148,13 @@ def FPT16InputModsMatchClass : FPInputModsMatchClass<16> {
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
-def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>;
+class FP16VCSrcInputModsMatchClass<bit IsFake16>
+ : FPVCSrcInputModsMatchClass<16> {
+ let Name = !if(IsFake16, "RegOrInlineImmWithFPFake16InputMods",
+ "RegOrInlineImmWithFPT16InputMods");
+ let PredicateMethod = "isRegOrInlineImmWithFP16InputMods<" #
+ !if(IsFake16, "true", "false") # ">";
+}
def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>;
class InputMods <AsmOperandClass matchClass> : Operand <i32> {
@@ -1166,7 +1172,8 @@ def FPT16InputMods : FPInputMods<FPT16InputModsMatchClass>;
def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
-def FP16VCSrcInputMods : FPInputMods<FP16VCSrcInputModsMatchClass>;
+class FP16VCSrcInputMods<bit IsFake16>
+ : FPInputMods<FP16VCSrcInputModsMatchClass<IsFake16>>;
def FP32VCSrcInputMods : FPInputMods<FP32VCSrcInputModsMatchClass>;
class IntInputModsMatchClass <int opSize> : AsmOperandClass {
@@ -1653,11 +1660,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
}
// Return type of input modifiers operand for specified input operand for DPP
-class getSrcModVOP3DPP <ValueType VT> {
+class getSrcModVOP3DPP <ValueType VT, bit IsFake16 = 1> {
Operand ret =
!if (VT.isFP,
!if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
- FP16VCSrcInputMods, FP32VCSrcInputMods),
+ FP16VCSrcInputMods<IsFake16>, FP32VCSrcInputMods),
Int32VCSrcInputMods);
}
@@ -2450,6 +2457,10 @@ class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.
class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let IsTrue16 = 1;
let IsRealTrue16 = 1;
+
+ let HasOpSel = 1;
+ let HasModifiers = 1; // All instructions at least have OpSel.
+
// Most DstVT are 16-bit, but not all.
let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT>.ret;
@@ -2461,6 +2472,10 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret;
+ let Src0VOP3DPP = VGPRSrc_16;
+ let Src0ModVOP3DPP = getSrcModVOP3DPP<Src0VT, 0 /*IsFake16*/>.ret;
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 0 /*IsFake16*/>.ret;
+ let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 0 /*IsFake16*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c9dbe02037ef2e..aabb6c29062114 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1235,6 +1235,12 @@ def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
let EncoderMethod = "getMachineOpValueT16Lo128";
}
+// True 16 operands.
+def VGPRSrc_16 : RegisterOperand<VGPR_16> {
+ let DecoderMethod = "DecodeVGPR_16RegisterClass";
+ let EncoderMethod = "getMachineOpValueT16";
+}
+
//===----------------------------------------------------------------------===//
// ASrc_* Operands with an AccVGPR
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 58b67b21e274b5..2dbe4773e9d6b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -954,10 +954,15 @@ defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x050, "v_cvt_f16_
defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x052, "v_cvt_u16_f16">;
defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x053, "v_cvt_i16_f16">;
+defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
+defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
+defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
+defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
+defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index 84da311108ce38..014534ab79fe64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -50,7 +50,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
@@ -88,7 +88,7 @@ body: |
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
;
@@ -127,7 +127,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index 30975a8937db62..dcf9e169f586be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -59,7 +59,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
@@ -97,7 +97,7 @@ body: |
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
;
@@ -136,7 +136,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index 7767aa54c81519..9ae5f559e860af 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -66,7 +66,7 @@ body: |
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[D...
[truncated]
0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00
# GFX11: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00]
# GFX11-REAL16: v_ceil_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00]
# GFX11-FAKE16: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00]
Why do sgpr operands not have .l/.h? VOP3 opsel can be used with sgprs.
No reason other than it's just not supported yet.
I think they are either supported already in the MC layer, or will be when we finish upstreaming the current changeset. The syntax is different though. SGPRs use the opsel:[] syntax.
> SGPRs use the opsel:[] syntax.

That's an odd choice; s0.l/s0.h would be much more readable, and I don't see any downside to displaying sgprs and vgprs the same way.
SP3 uses s0.l/s0.h syntax.
> SGPRs use the opsel:[] syntax.

> That's an odd choice; s0.l/s0.h would be much more readable, and I don't see any downside to displaying sgprs and vgprs the same way.

Yep, I agree the .l/.h syntax would be better. It has not been a priority, since it would only affect asm; we do not have codegen support for 16-bit sgprs coming any time soon. If you want to do any patches toward the .l/.h syntax, they would be welcome.
How would something like v_add_nc_i16 v0.h, v1.h, s0.h be represented after the switch? Do you mean v_add_nc_i16 v0, v1, s0 op_sel:[1,1,1] will be accepted as well?
It would work like this (this is how it works downstream):
v_add_nc_i16 v0.l, v1.h, s0 op_sel:[1,1,0]
v_add_nc_i16 v0.l, v1.h, s0 op_sel:[1,1,0] ; encoding: [0x00,0x18,0x0d,0xd7,0x01,0x01,0x00,0x00]
v_add_nc_i16 v0.l, v1.h, s0 op_sel:[1,0,0]
v_add_nc_i16 v0.l, v1.h, s0 op_sel:[1,0,0] ; encoding: [0x00,0x08,0x0d,0xd7,0x01,0x01,0x00,0x00]
Suffixes on the vgprs are required. If you want to use a hi/lo sgpr, you must pass the op_sel:[] operand, and the bits in that operand corresponding to vgprs must match the suffixes on the vgprs (there is verification they match).
It is a bit awkward, but I think not impossible, to support suffixes on the sgprs. For vgprs there is an actual register number for hi vs. lo, and when you parse src0 you set the register number. For SGPRs you would instead want to set the bit in src0_modifiers, along the lines of the sketch below.
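A minimal sketch of that idea, reusing the MC utilities this patch already touches (AMDGPU::getNamedOperandIdx, SISrcMods); the helper name and its call site are assumptions for illustration, not part of the patch:

```cpp
// Hypothetical helper, not in this patch: fold a parsed ".h" suffix on an
// SGPR source into the src0_modifiers operand. Unlike VGPRs, SGPRs have no
// separate register number for their high 16 bits, so the hi half can only
// be requested via the op_sel modifier bit.
#include "SIDefines.h"            // SISrcMods
#include "Utils/AMDGPUBaseInfo.h" // AMDGPU::getNamedOperandIdx
#include "llvm/MC/MCInst.h"

static void setSGPRHiBit(llvm::MCInst &Inst, bool ParsedHiSuffix) {
  using namespace llvm;
  if (!ParsedHiSuffix)
    return; // ".l" (or no suffix) leaves the modifier bits alone.
  int ModIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                          AMDGPU::OpName::src0_modifiers);
  if (ModIdx == -1)
    return; // This instruction has no source-modifiers operand.
  uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
  // OP_SEL_0 selects the high half of src0; it is the same bit the
  // op_sel:[] syntax sets today.
  Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::OP_SEL_0);
}
```

The printer side would do the inverse: when the source is an SGPR and OP_SEL_0 is set, emit an .h suffix instead of an op_sel:[] operand.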
I see, thanks. Awkward, but OK as a temporary solution.
Needs a rebase.
Should there be vop3 from vop1 dpp asm tests in this patch?
Should there be vop3 from vop1 non-dpp asm and disasm tests in this patch?
This is more of a suggestion for a future patch, but what is the status of the gfx12 versions of these vop1 _t16 instructions? There are no asm or disasm tests for true16 gfx12 instructions. It looks to me like the vop1 versions of these _t16 instructions should work correctly on gfx12, because the encoding for true16 and fake16 is the same. Looking at how the classes are currently defined, I would not expect the VOP3-from-VOP1 forms to be encoded correctly. I only ask because adding gfx12 tests for the _t16 instructions soon might help prevent regressions.
Added vop3 from-vop1 tests. The non-dpp from-vop1 ones need more work. These are still the first steps for True16 upstream, so I wouldn't generally expect us to add all the relevant and happened-to-be-passing tests right away. For GFX12 True16, I'm going to start looking into it this week, so it shouldn't take long to get some coverage there as well.
Thanks, LGTM
[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16.
Also add missing v_ceil/floor_f16 tests.
Includes #80892.