@@ -3461,6 +3461,62 @@ std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
   llvm_unreachable("covered subregister switch");
 }
 
+static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+  case AMDGPU::V_MAD_F16_e64:
+    return AMDGPU::V_MADAK_F16;
+  case AMDGPU::V_MAC_F32_e32:
+  case AMDGPU::V_MAC_F32_e64:
+  case AMDGPU::V_MAD_F32_e64:
+    return AMDGPU::V_MADAK_F32;
+  case AMDGPU::V_FMAC_F32_e32:
+  case AMDGPU::V_FMAC_F32_e64:
+  case AMDGPU::V_FMA_F32_e64:
+    return AMDGPU::V_FMAAK_F32;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
+  case AMDGPU::V_FMAC_F16_fake16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                        ? AMDGPU::V_FMAAK_F16_t16
+                                        : AMDGPU::V_FMAAK_F16_fake16
+                                  : AMDGPU::V_FMAAK_F16;
+  default:
+    llvm_unreachable("invalid instruction");
+  }
+}
+
+static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+  case AMDGPU::V_MAD_F16_e64:
+    return AMDGPU::V_MADMK_F16;
+  case AMDGPU::V_MAC_F32_e32:
+  case AMDGPU::V_MAC_F32_e64:
+  case AMDGPU::V_MAD_F32_e64:
+    return AMDGPU::V_MADMK_F32;
+  case AMDGPU::V_FMAC_F32_e32:
+  case AMDGPU::V_FMAC_F32_e64:
+  case AMDGPU::V_FMA_F32_e64:
+    return AMDGPU::V_FMAMK_F32;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
+  case AMDGPU::V_FMAC_F16_fake16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                        ? AMDGPU::V_FMAMK_F16_t16
+                                        : AMDGPU::V_FMAMK_F16_fake16
+                                  : AMDGPU::V_FMAMK_F16;
+  default:
+    llvm_unreachable("invalid instruction");
+  }
+}
+
 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                 Register Reg, MachineRegisterInfo *MRI) const {
   if (!MRI->hasOneNonDBGUse(Reg))
@@ -3533,6 +3589,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
       Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
       Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+      Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
       Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
     // Don't fold if we are using source or output modifiers. The new VOP2
     // instructions don't have them.
@@ -3555,6 +3612,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     bool IsFMA =
         Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
         Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+        Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64;
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3586,18 +3644,15 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
           !isInlineConstant(Def->getOperand(1)))
         return false;
 
-      unsigned NewOpc =
-          IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
-                   : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
-                                            : AMDGPU::V_FMAMK_F16)
-                : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
+      unsigned NewOpc = getNewFMAMKInst(ST, Opc);
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
 
-      // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-      // would also require restricting their register classes. For now
-      // just bail out.
-      if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+      // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+      // takes VGPR_32_Lo128 operands, so the rewrite would also require
+      // restricting their register classes. For now just bail out.
+      if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+          NewOpc == AMDGPU::V_FMAMK_F16_fake16)
         return false;
 
       const std::optional<int64_t> SubRegImm = extractSubregFromImm(
@@ -3613,7 +3668,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       Src0->setIsKill(RegSrc->isKill());
 
       if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-          Opc == AMDGPU::V_FMAC_F32_e64 ||
+          Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
           Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
         UseMI.untieRegOperand(
             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3666,25 +3721,22 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       }
     }
 
-    unsigned NewOpc =
-        IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
-                 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
-                                          : AMDGPU::V_FMAAK_F16)
-              : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
+    unsigned NewOpc = getNewFMAAKInst(ST, Opc);
     if (pseudoToMCOpcode(NewOpc) == -1)
       return false;
 
-    // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-    // would also require restricting their register classes. For now
-    // just bail out.
-    if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+    // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+    // takes VGPR_32_Lo128 operands, so the rewrite would also require
+    // restricting their register classes. For now just bail out.
+    if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+        NewOpc == AMDGPU::V_FMAAK_F16_fake16)
       return false;
 
     // FIXME: This would be a lot easier if we could return a new instruction
     // instead of having to modify in place.
 
     if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-        Opc == AMDGPU::V_FMAC_F32_e64 ||
+        Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
       UseMI.untieRegOperand(
           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3874,8 +3926,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
     return AMDGPU::V_FMA_LEGACY_F32_e64;
   case AMDGPU::V_FMAC_F16_e32:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
-    return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                        ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+                                        : AMDGPU::V_FMA_F16_gfx9_fake16_e64
                                   : AMDGPU::V_FMA_F16_gfx9_e64;
   case AMDGPU::V_FMAC_F32_e32:
   case AMDGPU::V_FMAC_F32_e64:
@@ -3941,19 +3996,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return MIB;
   }
 
-  assert(
-      Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
-      "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
-      "pre-RA");
+  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+         "present pre-RA");
 
   // Handle MAC/FMAC.
   bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_fake16_e64;
   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
                Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
   bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3968,6 +4025,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return nullptr;
   case AMDGPU::V_MAC_F16_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4052,11 +4110,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
 
   int64_t Imm;
   if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
-    unsigned NewOpc =
-        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
-                                                 : AMDGPU::V_FMAAK_F16)
-                       : AMDGPU::V_FMAAK_F32)
-              : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
+    unsigned NewOpc = getNewFMAAKInst(ST, Opc);
     if (pseudoToMCOpcode(NewOpc) != -1) {
       MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                 .add(*Dst)
@@ -4071,11 +4125,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
       return MIB;
     }
   }
-  unsigned NewOpc =
-      IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
-                                               : AMDGPU::V_FMAMK_F16)
-                     : AMDGPU::V_FMAMK_F32)
-            : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+  unsigned NewOpc = getNewFMAMKInst(ST, Opc);
   if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
     if (pseudoToMCOpcode(NewOpc) != -1) {
       MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4513,6 +4563,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_FMAC_F32_e64:
   case AMDGPU::V_FMAC_F64_e64:
@@ -5569,7 +5620,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
   case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
   case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
-  case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+  case AMDGPU::S_FMAC_F16:
+    return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+                                   : AMDGPU::V_FMAC_F16_fake16_e64;
   case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
   case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
   case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
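Note: the recurring pattern in this diff is the replacement of nested ternaries with the switch-based getNewFMAAKInst/getNewFMAMKInst helpers, which pick the _t16 variant on targets with real true16 instructions, the _fake16 variant when only the true16 feature bit is set, and the legacy F16 opcode otherwise. Below is a minimal, self-contained sketch of that selection logic only; MockSubtarget and the Opcode enum are stand-ins invented here, not the GCNSubtarget or AMDGPU opcode APIs.

// Standalone sketch of the t16/fake16/legacy opcode split (assumed names).
#include <cassert>
#include <iostream>

enum class Opcode {
  V_FMAAK_F16,        // legacy 16-bit FMA-with-literal encoding
  V_FMAAK_F16_t16,    // real true16: takes VGPR_16_Lo128 operands
  V_FMAAK_F16_fake16, // fake16: takes VGPR_32_Lo128 operands
};

struct MockSubtarget {
  bool HasTrue16BitInsts;
  bool RealTrue16Insts;
  bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
  // Real true16 only makes sense when the feature bit is present.
  bool useRealTrue16Insts() const {
    return HasTrue16BitInsts && RealTrue16Insts;
  }
};

// Mirrors the three-way branch used for the F16 cases in getNewFMAAKInst.
Opcode selectFMAAKF16(const MockSubtarget &ST) {
  return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
                                      ? Opcode::V_FMAAK_F16_t16
                                      : Opcode::V_FMAAK_F16_fake16
                                : Opcode::V_FMAAK_F16;
}

int main() {
  // One case per subtarget configuration the helpers distinguish.
  assert(selectFMAAKF16({false, false}) == Opcode::V_FMAAK_F16);
  assert(selectFMAAKF16({true, false}) == Opcode::V_FMAAK_F16_fake16);
  assert(selectFMAAKF16({true, true}) == Opcode::V_FMAAK_F16_t16);
  std::cout << "t16/fake16/legacy selection behaves as expected\n";
}

Centralizing the mapping in one switch also explains the paired bail-outs in foldImmediate: both _t16 and _fake16 results must be rejected there, since each imposes a register-class restriction (VGPR_16_Lo128 and VGPR_32_Lo128 respectively) that the rewrite does not yet perform.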