@@ -3486,6 +3486,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
       Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
       Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+      Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
       Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
     // Don't fold if we are using source or output modifiers. The new VOP2
     // instructions don't have them.
@@ -3506,6 +3507,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     bool IsFMA =
         Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
         Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+        Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64;
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3539,16 +3541,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
       unsigned NewOpc =
           IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
-                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
+                   : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                                  ? AMDGPU::V_FMAMK_F16_t16
+                                                  : AMDGPU::V_FMAMK_F16_fake16
                                             : AMDGPU::V_FMAMK_F16)
                 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
 
-      // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-      // would also require restricting their register classes. For now
-      // just bail out.
-      if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+      // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+      // takes VGPR_32_Lo128 operands, so the rewrite would also require
+      // restricting their register classes. For now just bail out.
+      if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+          NewOpc == AMDGPU::V_FMAMK_F16_fake16)
         return false;
 
       const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3563,7 +3568,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       Src0->setIsKill(RegSrc->isKill());
 
       if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-          Opc == AMDGPU::V_FMAC_F32_e64 ||
+          Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
           Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
         UseMI.untieRegOperand(
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3618,23 +3623,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
       unsigned NewOpc =
           IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
-                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
+                   : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                                  ? AMDGPU::V_FMAAK_F16_t16
+                                                  : AMDGPU::V_FMAAK_F16_fake16
                                             : AMDGPU::V_FMAAK_F16)
                 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
 
-      // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-      // would also require restricting their register classes. For now
-      // just bail out.
-      if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+      // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+      // takes VGPR_32_Lo128 operands, so the rewrite would also require
+      // restricting their register classes. For now just bail out.
+      if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+          NewOpc == AMDGPU::V_FMAAK_F16_fake16)
         return false;
 
       // FIXME: This would be a lot easier if we could return a new instruction
       // instead of having to modify in place.
 
       if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-          Opc == AMDGPU::V_FMAC_F32_e64 ||
+          Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
           Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
         UseMI.untieRegOperand(
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3821,8 +3829,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
     return AMDGPU::V_FMA_LEGACY_F32_e64;
   case AMDGPU::V_FMAC_F16_e32:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
-    return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                        ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+                                        : AMDGPU::V_FMA_F16_gfx9_fake16_e64
                                   : AMDGPU::V_FMA_F16_gfx9_e64;
   case AMDGPU::V_FMAC_F32_e32:
   case AMDGPU::V_FMAC_F32_e64:
@@ -3888,19 +3899,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return MIB;
   }
 
-  assert(
-      Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
-      "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
-      "pre-RA");
+  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+         "present "
+         "pre-RA");
 
   // Handle MAC/FMAC.
   bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
               Opc == AMDGPU::V_FMAC_F16_fake16_e64;
   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
                Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
   bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3915,6 +3929,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return nullptr;
   case AMDGPU::V_MAC_F16_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4000,8 +4015,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
   int64_t Imm;
   if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
     unsigned NewOpc =
-        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
-                                                 : AMDGPU::V_FMAAK_F16)
+        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
+                              ? ST.useRealTrue16Insts()
+                                    ? AMDGPU::V_FMAAK_F16_t16
+                                    : AMDGPU::V_FMAAK_F16_fake16
+                              : AMDGPU::V_FMAAK_F16)
                        : AMDGPU::V_FMAAK_F32)
               : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
     if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4018,11 +4036,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
       return MIB;
     }
   }
-  unsigned NewOpc =
-      IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
-                                               : AMDGPU::V_FMAMK_F16)
-              : AMDGPU::V_FMAMK_F32)
-          : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+  unsigned NewOpc = IsFMA
+                        ? (IsF16 ? (ST.hasTrue16BitInsts()
+                                        ? ST.useRealTrue16Insts()
+                                              ? AMDGPU::V_FMAMK_F16_t16
+                                              : AMDGPU::V_FMAMK_F16_fake16
+                                        : AMDGPU::V_FMAMK_F16)
+                                 : AMDGPU::V_FMAMK_F32)
+                        : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
   if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
     if (pseudoToMCOpcode(NewOpc) != -1) {
       MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4468,6 +4489,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_FMAC_F32_e64:
   case AMDGPU::V_FMAC_F64_e64:
@@ -5520,7 +5542,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
   case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
   case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
-  case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+  case AMDGPU::S_FMAC_F16:
+    return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+                                   : AMDGPU::V_FMAC_F16_fake16_e64;
   case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
   case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
   case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;