Skip to content

[AMDGPU][MC] Add dpp for V_PK_FMAC_F16 for GFX10 #79598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1615,6 +1615,9 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName,
multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> :
VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>;

multiclass VOP2_V_PK_FMAC_F16_gfx11_gfx12<bits<6> op> :
VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>;

multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;

Expand Down Expand Up @@ -1661,7 +1664,8 @@ defm V_SUBREV_CO_CI_U32 :

defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11_gfx12<0x02f,
"V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">;
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx11_gfx12<0x03c>;

defm V_PK_FMAC_F16 : VOP2_V_PK_FMAC_F16_gfx11_gfx12<0x03c>;

defm V_ADD_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
defm V_ADD_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
Expand Down Expand Up @@ -1945,6 +1949,11 @@ multiclass VOP2e_Real_gfx10<bits<6> op, string opName, string asmName> :
VOP2be_Real_dpp_gfx10<op, opName, asmName>,
VOP2be_Real_dpp8_gfx10<op, opName, asmName>;

multiclass VOP2_FMAC_Real<bits<6> op> :
VOP2_Real_e32_gfx10<op>,
VOP2_Real_dpp_gfx10<op>,
VOP2_Real_dpp8_gfx10<op>;

multiclass VOP2_Real_gfx10<bits<6> op> :
VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>;
Expand Down Expand Up @@ -1988,9 +1997,7 @@ defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;

let IsSingle = 1 in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
}
defm V_PK_FMAC_F16 : VOP2_FMAC_Real<0x03c>;

// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
Expand Down
17 changes: 17 additions & 0 deletions llvm/test/CodeGen/AMDGPU/dpp_combine_gfx10.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN

# GCN-LABEL: name: v_pk_fmac_f16
# GCN: %4:vgpr_32 = IMPLICIT_DEF
# GCN: %3:vgpr_32 = V_PK_FMAC_F16_dpp %4, 0, %1, 0, %1, 1, 15, 15, 1, implicit $mode, implicit $exec
name: v_pk_fmac_f16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1

%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1

%2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec
...
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,22 @@ body: |
%6:vgpr_32 = V_FMAC_F32_e64 2, %4, 2, %1, 2, %2, 1, 2, implicit $mode, implicit $exec
...

# GCN-LABEL: name: v_pk_fmac_f16
# GCN: %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
# GCN: %3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec
name: v_pk_fmac_f16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1

%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1

%2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
%3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec
...

# when the DPP source isn't a src0 operand the operation should be commuted if possible
# GCN-LABEL: name: dpp_commute_shrink
# GCN: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
Expand Down
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
Original file line number Diff line number Diff line change
Expand Up @@ -13185,3 +13185,9 @@ v_pk_fmac_f16 v5, -4.0, v2

v_pk_fmac_f16 v5, v1, v255
// GFX10: encoding: [0x01,0xff,0x0b,0x78]

v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]

v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add gfx10 error tests that DPP16 NEG and ABS modifiers are not supported by this instruction

// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
3 changes: 3 additions & 0 deletions llvm/test/MC/AMDGPU/gfx11_asm_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,6 @@ v_mov_b16 v0.l, ttmp0.h

v_mov_b16 v0.l, a0.h
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported
3 changes: 3 additions & 0 deletions llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,6 @@ v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]

v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please move this test to a different file? It is packed, so not a true16 instruction. There isn't a gfx12_asm_vop2_err.s, so perhaps you can create it.

v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported
2 changes: 1 addition & 1 deletion llvm/test/MC/AMDGPU/literalv216.s
Original file line number Diff line number Diff line change
Expand Up @@ -291,4 +291,4 @@ v_pk_add_u16 v5, v1, 123456.0
// FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid
v_pk_fmac_f16 v5, 0x12345678, v2
// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I have the convention right, as stated in this patch 0f5ebbc, instructions without a VOP3 form should not have _e32. It looks like removing IsSingle caused _e32 to be added to the mnemonic. Please change it back.

38 changes: 22 additions & 16 deletions llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1779,54 +1779,60 @@
# GFX10: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38]
0x6a,0x04,0x0a,0x38

# GFX10: v_pk_fmac_f16 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79]
# GFX10: v_pk_fmac_f16_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79]
0x01,0x05,0xfe,0x79

# GFX10: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78]
0xc1,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78]
0xf7,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78]
0x80,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78]
0xf0,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78]
0x7f,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78]
0x7e,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78]
0x7c,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78]
0x01,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78]
0x67,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78]
0x77,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78]
0x01,0x05,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78]
0x01,0xff,0x0b,0x78

# GFX10: v_pk_fmac_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78]
0xff,0x05,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78]
0x6b,0x04,0x0a,0x78

# GFX10: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78]
# GFX10: v_pk_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78]
0x6a,0x04,0x0a,0x78

#GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff

#GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03

# W32: v_sub_co_ci_u32_e32 v255, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0xfe,0x53]
# W64: v_sub_co_ci_u32_e32 v255, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0xfe,0x53]
0x01,0x05,0xfe,0x53
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,5 +144,5 @@
# Packed VOP2
#===----------------------------------------------------------------------===//

# GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
# GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12