Skip to content

Commit cf80def

Browse files
authored
[AMDGPU][GFX11] Do not rewrite V_FMA/FMAC_* to V_FMAAK_F16_t16 on operand legalization. (#66202)
V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands whereas the original instructions would have VGPR_32 operands. Switching the opcodes without updating operands' register classes leads to MachineVerifier complaining about the classes not matching instruction definitions. The problem only reveals itself in builds with expensive checks enabled because of missing -verify-machineinstrs in the test. This is the third attempt to update CodeGen/AMDGPU/fma.f16.ll to run for GFX11, following the second attempt in a1e38e0, partially reverted in eaf737a.
1 parent d918b81 commit cf80def

File tree

2 files changed

+101
-4
lines changed

2 files changed

+101
-4
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3346,6 +3346,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
33463346
if (pseudoToMCOpcode(NewOpc) == -1)
33473347
return false;
33483348

3349+
// V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3350+
// would also require restricting their register classes. For now
3351+
// just bail out.
3352+
if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3353+
return false;
3354+
33493355
const int64_t Imm = ImmOp->getImm();
33503356

33513357
// FIXME: This would be a lot easier if we could return a new instruction

llvm/test/CodeGen/AMDGPU/fma.f16.ll

Lines changed: 95 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
3-
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
4-
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
5-
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
4+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
5+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
6+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
7+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
68

79
declare half @llvm.fma.f16(half, half, half)
810
declare half @llvm.maxnum.f16(half, half)
@@ -19,6 +21,12 @@ define half @test_fma(half %x, half %y, half %z) {
1921
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2022
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
2123
; GFX10-NEXT: s_setpc_b64 s[30:31]
24+
;
25+
; GFX11-LABEL: test_fma:
26+
; GFX11: ; %bb.0:
27+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28+
; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
29+
; GFX11-NEXT: s_setpc_b64 s[30:31]
2230
%r = call half @llvm.fma.f16(half %x, half %y, half %z)
2331
ret half %r
2432
}
@@ -36,6 +44,12 @@ define half @test_fmac(half %x, half %y, half %z) {
3644
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3745
; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2
3846
; GFX10-NEXT: s_setpc_b64 s[30:31]
47+
;
48+
; GFX11-LABEL: test_fmac:
49+
; GFX11: ; %bb.0:
50+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51+
; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2
52+
; GFX11-NEXT: s_setpc_b64 s[30:31]
3953
%r = call half @llvm.fma.f16(half %y, half %z, half %x)
4054
ret half %r
4155
}
@@ -61,6 +75,12 @@ define half @test_fmaak(half %x, half %y, half %z) {
6175
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6276
; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
6377
; GFX10-NEXT: s_setpc_b64 s[30:31]
78+
;
79+
; GFX11-LABEL: test_fmaak:
80+
; GFX11: ; %bb.0:
81+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82+
; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
83+
; GFX11-NEXT: s_setpc_b64 s[30:31]
6484
%r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
6585
ret half %r
6686
}
@@ -86,6 +106,12 @@ define half @test_fmamk(half %x, half %y, half %z) {
86106
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87107
; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
88108
; GFX10-NEXT: s_setpc_b64 s[30:31]
109+
;
110+
; GFX11-LABEL: test_fmamk:
111+
; GFX11: ; %bb.0:
112+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113+
; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
114+
; GFX11-NEXT: s_setpc_b64 s[30:31]
89115
%r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
90116
ret half %r
91117
}
@@ -139,6 +165,33 @@ define i32 @test_D139469_f16(half %arg) {
139165
; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
140166
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
141167
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
168+
;
169+
; GFX11-SDAG-LABEL: test_D139469_f16:
170+
; GFX11-SDAG: ; %bb.0: ; %bb
171+
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172+
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
173+
; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
174+
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
175+
; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
176+
; GFX11-SDAG-NEXT: v_min_f16_e32 v0, v2, v1
177+
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
178+
; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
179+
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
180+
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
181+
;
182+
; GFX11-GISEL-LABEL: test_D139469_f16:
183+
; GFX11-GISEL: ; %bb.0: ; %bb
184+
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185+
; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x291e
186+
; GFX11-GISEL-NEXT: v_mul_f16_e32 v1, 0x291e, v0
187+
; GFX11-GISEL-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e
188+
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
189+
; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
190+
; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
191+
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
192+
; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
193+
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
194+
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
142195
bb:
143196
%i = fmul contract half %arg, 0xH291E
144197
%i1 = fcmp olt half %i, 0xH0000
@@ -213,6 +266,44 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
213266
; GFX10-GISEL-NEXT: s_or_b32 s4, s6, s5
214267
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
215268
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
269+
;
270+
; GFX11-SDAG-LABEL: test_D139469_v2f16:
271+
; GFX11-SDAG: ; %bb.0: ; %bb
272+
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
273+
; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e
274+
; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
275+
; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
276+
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
277+
; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v1, v0
278+
; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
279+
; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
280+
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
281+
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
282+
; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
283+
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
284+
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
285+
;
286+
; GFX11-GISEL-LABEL: test_D139469_v2f16:
287+
; GFX11-GISEL: ; %bb.0: ; %bb
288+
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289+
; GFX11-GISEL-NEXT: s_mov_b32 s0, 0x291e291e
290+
; GFX11-GISEL-NEXT: v_pk_mul_f16 v1, v0, 0x291e op_sel_hi:[1,0]
291+
; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v0, s0, 0x211e op_sel_hi:[1,1,0]
292+
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
293+
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v1
294+
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
295+
; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
296+
; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
297+
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
298+
; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v2
299+
; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
300+
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
301+
; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
302+
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
303+
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
304+
; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2
305+
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
306+
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
216307
bb:
217308
%i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
218309
%i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>

0 commit comments

Comments
 (0)