Skip to content

Commit d43f8e6

Browse files
committed
true16 code pattern for fma
1 parent c912e98 commit d43f8e6

File tree

8 files changed

+366
-239
lines changed

8 files changed

+366
-239
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ static unsigned macToMad(unsigned Opc) {
198198
return AMDGPU::V_FMA_F32_e64;
199199
case AMDGPU::V_FMAC_F16_e64:
200200
return AMDGPU::V_FMA_F16_gfx9_e64;
201+
case AMDGPU::V_FMAC_F16_t16_e64:
202+
return AMDGPU::V_FMA_F16_gfx9_t16_e64;
201203
case AMDGPU::V_FMAC_F16_fake16_e64:
202204
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
203205
case AMDGPU::V_FMAC_LEGACY_F32_e64:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+49-25
Original file line numberDiff line numberDiff line change
@@ -3486,6 +3486,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
34863486
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
34873487
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
34883488
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3489+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
34893490
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
34903491
// Don't fold if we are using source or output modifiers. The new VOP2
34913492
// instructions don't have them.
@@ -3506,6 +3507,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35063507
bool IsFMA =
35073508
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35083509
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3510+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35093511
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35103512
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
35113513
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3539,16 +3541,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35393541

35403542
unsigned NewOpc =
35413543
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3542-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3544+
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3545+
? AMDGPU::V_FMAMK_F16_t16
3546+
: AMDGPU::V_FMAMK_F16_fake16
35433547
: AMDGPU::V_FMAMK_F16)
35443548
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
35453549
if (pseudoToMCOpcode(NewOpc) == -1)
35463550
return false;
35473551

3548-
// V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3549-
// would also require restricting their register classes. For now
3550-
// just bail out.
3551-
if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3552+
// V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3553+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3554+
// restricting their register classes. For now just bail out.
3555+
if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3556+
NewOpc == AMDGPU::V_FMAMK_F16_fake16)
35523557
return false;
35533558

35543559
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3563,7 +3568,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35633568
Src0->setIsKill(RegSrc->isKill());
35643569

35653570
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3566-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3571+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35673572
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
35683573
UseMI.untieRegOperand(
35693574
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3618,23 +3623,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36183623

36193624
unsigned NewOpc =
36203625
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3621-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3626+
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3627+
? AMDGPU::V_FMAAK_F16_t16
3628+
: AMDGPU::V_FMAAK_F16_fake16
36223629
: AMDGPU::V_FMAAK_F16)
36233630
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
36243631
if (pseudoToMCOpcode(NewOpc) == -1)
36253632
return false;
36263633

3627-
// V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3628-
// would also require restricting their register classes. For now
3629-
// just bail out.
3630-
if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3634+
// V_FMAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3635+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3636+
// restricting their register classes. For now just bail out.
3637+
if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3638+
NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36313639
return false;
36323640

36333641
// FIXME: This would be a lot easier if we could return a new instruction
36343642
// instead of having to modify in place.
36353643

36363644
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3637-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3645+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36383646
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36393647
UseMI.untieRegOperand(
36403648
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3821,8 +3829,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38213829
return AMDGPU::V_FMA_LEGACY_F32_e64;
38223830
case AMDGPU::V_FMAC_F16_e32:
38233831
case AMDGPU::V_FMAC_F16_e64:
3832+
case AMDGPU::V_FMAC_F16_t16_e64:
38243833
case AMDGPU::V_FMAC_F16_fake16_e64:
3825-
return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3834+
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3835+
? AMDGPU::V_FMA_F16_gfx9_t16_e64
3836+
: AMDGPU::V_FMA_F16_gfx9_fake16_e64
38263837
: AMDGPU::V_FMA_F16_gfx9_e64;
38273838
case AMDGPU::V_FMAC_F32_e32:
38283839
case AMDGPU::V_FMAC_F32_e64:
@@ -3888,19 +3899,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
38883899
return MIB;
38893900
}
38903901

3891-
assert(
3892-
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3893-
"V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3894-
"pre-RA");
3902+
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3903+
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3904+
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
3905+
"present "
3906+
"pre-RA");
38953907

38963908
// Handle MAC/FMAC.
38973909
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
38983910
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3911+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
38993912
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39003913
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39013914
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39023915
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39033916
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3917+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39043918
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39053919
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39063920
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3915,6 +3929,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39153929
return nullptr;
39163930
case AMDGPU::V_MAC_F16_e64:
39173931
case AMDGPU::V_FMAC_F16_e64:
3932+
case AMDGPU::V_FMAC_F16_t16_e64:
39183933
case AMDGPU::V_FMAC_F16_fake16_e64:
39193934
case AMDGPU::V_MAC_F32_e64:
39203935
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4000,8 +4015,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40004015
int64_t Imm;
40014016
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
40024017
unsigned NewOpc =
4003-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
4004-
: AMDGPU::V_FMAAK_F16)
4018+
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
4019+
? ST.useRealTrue16Insts()
4020+
? AMDGPU::V_FMAAK_F16_t16
4021+
: AMDGPU::V_FMAAK_F16_fake16
4022+
: AMDGPU::V_FMAAK_F16)
40054023
: AMDGPU::V_FMAAK_F32)
40064024
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
40074025
if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4018,11 +4036,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40184036
return MIB;
40194037
}
40204038
}
4021-
unsigned NewOpc =
4022-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
4023-
: AMDGPU::V_FMAMK_F16)
4024-
: AMDGPU::V_FMAMK_F32)
4025-
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4039+
unsigned NewOpc = IsFMA
4040+
? (IsF16 ? (ST.hasTrue16BitInsts()
4041+
? ST.useRealTrue16Insts()
4042+
? AMDGPU::V_FMAMK_F16_t16
4043+
: AMDGPU::V_FMAMK_F16_fake16
4044+
: AMDGPU::V_FMAMK_F16)
4045+
: AMDGPU::V_FMAMK_F32)
4046+
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
40264047
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
40274048
if (pseudoToMCOpcode(NewOpc) != -1) {
40284049
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4468,6 +4489,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
44684489
case AMDGPU::V_MAC_F32_e64:
44694490
case AMDGPU::V_MAC_LEGACY_F32_e64:
44704491
case AMDGPU::V_FMAC_F16_e64:
4492+
case AMDGPU::V_FMAC_F16_t16_e64:
44714493
case AMDGPU::V_FMAC_F16_fake16_e64:
44724494
case AMDGPU::V_FMAC_F32_e64:
44734495
case AMDGPU::V_FMAC_F64_e64:
@@ -5520,7 +5542,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55205542
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55215543
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55225544
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5523-
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5545+
case AMDGPU::S_FMAC_F16:
5546+
return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5547+
: AMDGPU::V_FMAC_F16_fake16_e64;
55245548
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55255549
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55265550
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;

llvm/lib/Target/AMDGPU/SIInstructions.td

+8
Original file line numberDiff line numberDiff line change
@@ -3237,6 +3237,14 @@ def : GCNPat <
32373237
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
32383238
SRCMODS.NONE, $src2)
32393239
>;
3240+
let True16Predicate = UseRealTrue16Insts in
3241+
def : GCNPat <
3242+
(fma (f16 (VOP3NoMods f16:$src0)),
3243+
(f16 (VOP3NoMods f16:$src1)),
3244+
(f16 (VOP3NoMods f16:$src2))),
3245+
(V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
3246+
SRCMODS.NONE, $src2)
3247+
>;
32403248
let True16Predicate = UseFakeTrue16Insts in
32413249
def : GCNPat <
32423250
(fma (f16 (VOP3NoMods f16:$src0)),

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

+13-4
Original file line numberDiff line numberDiff line change
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
455455
break;
456456
case AMDGPU::V_FMA_F16_e64:
457457
case AMDGPU::V_FMA_F16_gfx9_e64:
458+
NewOpcode = AMDGPU::V_FMAAK_F16;
459+
break;
460+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
461+
NewOpcode = AMDGPU::V_FMAAK_F16_t16;
462+
break;
458463
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
459-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
460-
: AMDGPU::V_FMAAK_F16;
464+
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
461465
break;
462466
}
463467
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
485489
break;
486490
case AMDGPU::V_FMA_F16_e64:
487491
case AMDGPU::V_FMA_F16_gfx9_e64:
492+
NewOpcode = AMDGPU::V_FMAMK_F16;
493+
break;
494+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
495+
NewOpcode = AMDGPU::V_FMAMK_F16_t16;
496+
break;
488497
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
489-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
490-
: AMDGPU::V_FMAMK_F16;
498+
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
491499
break;
492500
}
493501
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
959967
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
960968
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
961969
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
970+
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
962971
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
963972
shrinkMadFma(MI);
964973
continue;

llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll

+47-21
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
44
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
55
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
6+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
78

89
define float @v_fma_f32(float %x, float %y, float %z) {
910
; GFX6-LABEL: v_fma_f32:
@@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
107108
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
108109
; GFX10-NEXT: s_setpc_b64 s[30:31]
109110
;
110-
; GFX11-LABEL: v_fma_f16:
111-
; GFX11: ; %bb.0:
112-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113-
; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
114-
; GFX11-NEXT: s_setpc_b64 s[30:31]
111+
; GFX11-TRUE16-LABEL: v_fma_f16:
112+
; GFX11-TRUE16: ; %bb.0:
113+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114+
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
115+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
116+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
117+
;
118+
; GFX11-FAKE16-LABEL: v_fma_f16:
119+
; GFX11-FAKE16: ; %bb.0:
120+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
122+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
115123
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
116124
ret half %fma
117125
}
@@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
145153
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
146154
; GFX10-NEXT: s_setpc_b64 s[30:31]
147155
;
148-
; GFX11-LABEL: v_fma_f16_fneg_lhs:
149-
; GFX11: ; %bb.0:
150-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151-
; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
152-
; GFX11-NEXT: s_setpc_b64 s[30:31]
156+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
157+
; GFX11-TRUE16: ; %bb.0:
158+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
160+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
161+
;
162+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
163+
; GFX11-FAKE16: ; %bb.0:
164+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
166+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
153167
%neg.x = fneg half %x
154168
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
155169
ret half %fma
@@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
184198
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
185199
; GFX10-NEXT: s_setpc_b64 s[30:31]
186200
;
187-
; GFX11-LABEL: v_fma_f16_fneg_rhs:
188-
; GFX11: ; %bb.0:
189-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190-
; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
191-
; GFX11-NEXT: s_setpc_b64 s[30:31]
201+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
202+
; GFX11-TRUE16: ; %bb.0:
203+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
205+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
206+
;
207+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
208+
; GFX11-FAKE16: ; %bb.0:
209+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
211+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
192212
%neg.y = fneg half %y
193213
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
194214
ret half %fma
@@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
223243
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
224244
; GFX10-NEXT: s_setpc_b64 s[30:31]
225245
;
226-
; GFX11-LABEL: v_fma_f16_fneg_add:
227-
; GFX11: ; %bb.0:
228-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229-
; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
230-
; GFX11-NEXT: s_setpc_b64 s[30:31]
246+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
247+
; GFX11-TRUE16: ; %bb.0:
248+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
250+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
251+
;
252+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
253+
; GFX11-FAKE16: ; %bb.0:
254+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
256+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
231257
%neg.z = fneg half %z
232258
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
233259
ret half %fma

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
2+
# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
3+
# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
34
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
45

56
---

0 commit comments

Comments
 (0)