Skip to content

Commit 210036a

Browse files
authored
[AMDGPU][True16][CodeGen] true16 codegen pattern for fma (#127240)
The previous PR #122950 was reverted because it hit a buildbot failure. Another patch was merged while that PR was under review, which left one test out of date. This PR reopens the change and fixes that issue.
1 parent ab3d793 commit 210036a

11 files changed

+913
-256
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ static unsigned macToMad(unsigned Opc) {
203203
return AMDGPU::V_FMA_F32_e64;
204204
case AMDGPU::V_FMAC_F16_e64:
205205
return AMDGPU::V_FMA_F16_gfx9_e64;
206+
case AMDGPU::V_FMAC_F16_t16_e64:
207+
return AMDGPU::V_FMA_F16_gfx9_t16_e64;
206208
case AMDGPU::V_FMAC_F16_fake16_e64:
207209
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
208210
case AMDGPU::V_FMAC_LEGACY_F32_e64:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+89-36
Original file line numberDiff line numberDiff line change
@@ -3461,6 +3461,62 @@ std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
34613461
llvm_unreachable("covered subregister switch");
34623462
}
34633463

3464+
// Maps a MAC/MAD/FMAC/FMA opcode \p Opc to the matching MADAK/FMAAK opcode.
//   - F16 MAC/MAD       -> V_MADAK_F16
//   - F32 MAC/MAD       -> V_MADAK_F32
//   - F32 FMAC/FMA      -> V_FMAAK_F32
//   - F16 FMAC/FMA      -> V_FMAAK_F16{,_t16,_fake16}, chosen from \p ST
// Any opcode not listed in the switch reaches llvm_unreachable, so callers
// must only pass one of the handled opcodes.
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3465+
switch (Opc) {
3466+
case AMDGPU::V_MAC_F16_e32:
3467+
case AMDGPU::V_MAC_F16_e64:
3468+
case AMDGPU::V_MAD_F16_e64:
3469+
return AMDGPU::V_MADAK_F16;
3470+
case AMDGPU::V_MAC_F32_e32:
3471+
case AMDGPU::V_MAC_F32_e64:
3472+
case AMDGPU::V_MAD_F32_e64:
3473+
return AMDGPU::V_MADAK_F32;
3474+
case AMDGPU::V_FMAC_F32_e32:
3475+
case AMDGPU::V_FMAC_F32_e64:
3476+
case AMDGPU::V_FMA_F32_e64:
3477+
return AMDGPU::V_FMAAK_F32;
3478+
case AMDGPU::V_FMAC_F16_e32:
3479+
case AMDGPU::V_FMAC_F16_e64:
3480+
case AMDGPU::V_FMAC_F16_t16_e64:
3481+
case AMDGPU::V_FMAC_F16_fake16_e64:
3482+
case AMDGPU::V_FMA_F16_e64:
// The result depends on the subtarget, not on which F16 input opcode was
// seen: hasTrue16BitInsts() && useRealTrue16Insts() -> _t16,
// hasTrue16BitInsts() only -> _fake16, otherwise the plain F16 opcode.
3483+
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3484+
? AMDGPU::V_FMAAK_F16_t16
3485+
: AMDGPU::V_FMAAK_F16_fake16
3486+
: AMDGPU::V_FMAAK_F16;
3487+
default:
3488+
llvm_unreachable("invalid instruction");
3489+
}
3490+
}
3491+
3492+
// Maps a MAC/MAD/FMAC/FMA opcode \p Opc to the matching MADMK/FMAMK opcode.
//   - F16 MAC/MAD       -> V_MADMK_F16
//   - F32 MAC/MAD       -> V_MADMK_F32
//   - F32 FMAC/FMA      -> V_FMAMK_F32
//   - F16 FMAC/FMA      -> V_FMAMK_F16{,_t16,_fake16}, chosen from \p ST
// Any opcode not listed in the switch reaches llvm_unreachable, so callers
// must only pass one of the handled opcodes.
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3493+
switch (Opc) {
3494+
case AMDGPU::V_MAC_F16_e32:
3495+
case AMDGPU::V_MAC_F16_e64:
3496+
case AMDGPU::V_MAD_F16_e64:
3497+
return AMDGPU::V_MADMK_F16;
3498+
case AMDGPU::V_MAC_F32_e32:
3499+
case AMDGPU::V_MAC_F32_e64:
3500+
case AMDGPU::V_MAD_F32_e64:
3501+
return AMDGPU::V_MADMK_F32;
3502+
case AMDGPU::V_FMAC_F32_e32:
3503+
case AMDGPU::V_FMAC_F32_e64:
3504+
case AMDGPU::V_FMA_F32_e64:
3505+
return AMDGPU::V_FMAMK_F32;
3506+
case AMDGPU::V_FMAC_F16_e32:
3507+
case AMDGPU::V_FMAC_F16_e64:
3508+
case AMDGPU::V_FMAC_F16_t16_e64:
3509+
case AMDGPU::V_FMAC_F16_fake16_e64:
3510+
case AMDGPU::V_FMA_F16_e64:
// The result depends on the subtarget, not on which F16 input opcode was
// seen: hasTrue16BitInsts() && useRealTrue16Insts() -> _t16,
// hasTrue16BitInsts() only -> _fake16, otherwise the plain F16 opcode.
3511+
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3512+
? AMDGPU::V_FMAMK_F16_t16
3513+
: AMDGPU::V_FMAMK_F16_fake16
3514+
: AMDGPU::V_FMAMK_F16;
3515+
default:
3516+
llvm_unreachable("invalid instruction");
3517+
}
3518+
}
3519+
34643520
bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
34653521
Register Reg, MachineRegisterInfo *MRI) const {
34663522
if (!MRI->hasOneNonDBGUse(Reg))
@@ -3533,6 +3589,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35333589
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
35343590
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35353591
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3592+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35363593
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
35373594
// Don't fold if we are using source or output modifiers. The new VOP2
35383595
// instructions don't have them.
@@ -3555,6 +3612,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35553612
bool IsFMA =
35563613
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35573614
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3615+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35583616
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35593617
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
35603618
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3586,18 +3644,15 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35863644
!isInlineConstant(Def->getOperand(1)))
35873645
return false;
35883646

3589-
unsigned NewOpc =
3590-
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3591-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3592-
: AMDGPU::V_FMAMK_F16)
3593-
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3647+
unsigned NewOpc = getNewFMAMKInst(ST, Opc);
35943648
if (pseudoToMCOpcode(NewOpc) == -1)
35953649
return false;
35963650

3597-
// V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3598-
// would also require restricting their register classes. For now
3599-
// just bail out.
3600-
if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3651+
// V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3652+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3653+
// restricting their register classes. For now just bail out.
3654+
if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3655+
NewOpc == AMDGPU::V_FMAMK_F16_fake16)
36013656
return false;
36023657

36033658
const std::optional<int64_t> SubRegImm = extractSubregFromImm(
@@ -3613,7 +3668,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36133668
Src0->setIsKill(RegSrc->isKill());
36143669

36153670
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3616-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3671+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36173672
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36183673
UseMI.untieRegOperand(
36193674
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3666,25 +3721,22 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36663721
}
36673722
}
36683723

3669-
unsigned NewOpc =
3670-
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3671-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3672-
: AMDGPU::V_FMAAK_F16)
3673-
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3724+
unsigned NewOpc = getNewFMAAKInst(ST, Opc);
36743725
if (pseudoToMCOpcode(NewOpc) == -1)
36753726
return false;
36763727

3677-
// V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3678-
// would also require restricting their register classes. For now
3679-
// just bail out.
3680-
if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3728+
// V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3729+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3730+
// restricting their register classes. For now just bail out.
3731+
if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3732+
NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36813733
return false;
36823734

36833735
// FIXME: This would be a lot easier if we could return a new instruction
36843736
// instead of having to modify in place.
36853737

36863738
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3687-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3739+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36883740
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36893741
UseMI.untieRegOperand(
36903742
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3874,8 +3926,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38743926
return AMDGPU::V_FMA_LEGACY_F32_e64;
38753927
case AMDGPU::V_FMAC_F16_e32:
38763928
case AMDGPU::V_FMAC_F16_e64:
3929+
case AMDGPU::V_FMAC_F16_t16_e64:
38773930
case AMDGPU::V_FMAC_F16_fake16_e64:
3878-
return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3931+
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3932+
? AMDGPU::V_FMA_F16_gfx9_t16_e64
3933+
: AMDGPU::V_FMA_F16_gfx9_fake16_e64
38793934
: AMDGPU::V_FMA_F16_gfx9_e64;
38803935
case AMDGPU::V_FMAC_F32_e32:
38813936
case AMDGPU::V_FMAC_F32_e64:
@@ -3941,19 +3996,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39413996
return MIB;
39423997
}
39433998

3944-
assert(
3945-
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3946-
"V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3947-
"pre-RA");
3999+
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4000+
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4001+
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4002+
"present pre-RA");
39484003

39494004
// Handle MAC/FMAC.
39504005
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
39514006
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
4007+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39524008
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39534009
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39544010
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39554011
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39564012
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
4013+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39574014
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39584015
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39594016
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3968,6 +4025,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39684025
return nullptr;
39694026
case AMDGPU::V_MAC_F16_e64:
39704027
case AMDGPU::V_FMAC_F16_e64:
4028+
case AMDGPU::V_FMAC_F16_t16_e64:
39714029
case AMDGPU::V_FMAC_F16_fake16_e64:
39724030
case AMDGPU::V_MAC_F32_e64:
39734031
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4052,11 +4110,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40524110

40534111
int64_t Imm;
40544112
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4055-
unsigned NewOpc =
4056-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
4057-
: AMDGPU::V_FMAAK_F16)
4058-
: AMDGPU::V_FMAAK_F32)
4059-
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
4113+
unsigned NewOpc = getNewFMAAKInst(ST, Opc);
40604114
if (pseudoToMCOpcode(NewOpc) != -1) {
40614115
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
40624116
.add(*Dst)
@@ -4071,11 +4125,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40714125
return MIB;
40724126
}
40734127
}
4074-
unsigned NewOpc =
4075-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
4076-
: AMDGPU::V_FMAMK_F16)
4077-
: AMDGPU::V_FMAMK_F32)
4078-
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4128+
unsigned NewOpc = getNewFMAMKInst(ST, Opc);
40794129
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
40804130
if (pseudoToMCOpcode(NewOpc) != -1) {
40814131
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4513,6 +4563,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
45134563
case AMDGPU::V_MAC_F32_e64:
45144564
case AMDGPU::V_MAC_LEGACY_F32_e64:
45154565
case AMDGPU::V_FMAC_F16_e64:
4566+
case AMDGPU::V_FMAC_F16_t16_e64:
45164567
case AMDGPU::V_FMAC_F16_fake16_e64:
45174568
case AMDGPU::V_FMAC_F32_e64:
45184569
case AMDGPU::V_FMAC_F64_e64:
@@ -5569,7 +5620,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55695620
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55705621
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55715622
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5572-
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5623+
case AMDGPU::S_FMAC_F16:
5624+
return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5625+
: AMDGPU::V_FMAC_F16_fake16_e64;
55735626
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55745627
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55755628
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;

llvm/lib/Target/AMDGPU/SIInstructions.td

+8
Original file line numberDiff line numberDiff line change
@@ -3287,6 +3287,14 @@ def : GCNPat <
32873287
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
32883288
SRCMODS.NONE, $src2)
32893289
>;
3290+
// Under the UseRealTrue16Insts predicate, select an f16 `fma` whose three
// operands all match VOP3NoMods to V_FMAC_F16_t16_e64, passing SRCMODS.NONE
// for every source-modifier operand.
let True16Predicate = UseRealTrue16Insts in
3291+
def : GCNPat <
3292+
(fma (f16 (VOP3NoMods f16:$src0)),
3293+
(f16 (VOP3NoMods f16:$src1)),
3294+
(f16 (VOP3NoMods f16:$src2))),
3295+
(V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
3296+
SRCMODS.NONE, $src2)
3297+
>;
32903298
let True16Predicate = UseFakeTrue16Insts in
32913299
def : GCNPat <
32923300
(fma (f16 (VOP3NoMods f16:$src0)),

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

+13-4
Original file line numberDiff line numberDiff line change
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
455455
break;
456456
case AMDGPU::V_FMA_F16_e64:
457457
case AMDGPU::V_FMA_F16_gfx9_e64:
458+
NewOpcode = AMDGPU::V_FMAAK_F16;
459+
break;
460+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
461+
NewOpcode = AMDGPU::V_FMAAK_F16_t16;
462+
break;
458463
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
459-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
460-
: AMDGPU::V_FMAAK_F16;
464+
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
461465
break;
462466
}
463467
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
485489
break;
486490
case AMDGPU::V_FMA_F16_e64:
487491
case AMDGPU::V_FMA_F16_gfx9_e64:
492+
NewOpcode = AMDGPU::V_FMAMK_F16;
493+
break;
494+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
495+
NewOpcode = AMDGPU::V_FMAMK_F16_t16;
496+
break;
488497
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
489-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
490-
: AMDGPU::V_FMAMK_F16;
498+
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
491499
break;
492500
}
493501
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
959967
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
960968
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
961969
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
970+
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
962971
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
963972
shrinkMadFma(MI);
964973
continue;

0 commit comments

Comments
 (0)