[AMDGPU][MC] Add dpp for V_PK_FMAC_F16 for GFX10 #79598

ankurepa · 2024-01-26T14:26:52Z

Adds dpp for v_pk_fmac_f16 for gfx10 and removes them for gfx11 and gfx12

github-actions · 2024-01-26T14:27:11Z

Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be
notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write
permissions for the repository. In which case you can instead tag reviewers by
name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review
by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate
is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

llvmbot · 2024-01-26T14:27:39Z

@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-amdgpu

Author: None (ankurepa)

Changes

Adds dpp for v_pk_fmac_f16 for gfx10 and removes them for gfx11 and gfx12

Full diff: https://github.com/llvm/llvm-project/pull/79598.diff

5 Files Affected:

(modified) llvm/lib/Target/AMDGPU/VOP2Instructions.td (+11-4)
(modified) llvm/test/MC/AMDGPU/gfx10_asm_vop2.s (+6)
(modified) llvm/test/MC/AMDGPU/literalv216.s (+1-1)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt (+22-16)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt (+1-1)

diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index eba9bf64884ec8f..fa18a09b3831d68 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1615,6 +1615,9 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName,
 multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> :
   VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>;
 
+multiclass VOP2_V_PK_FMAC_F16_gfx11_gfx12<bits<6> op> :
+  VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>;
+
 multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
   VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;
 
@@ -1661,7 +1664,8 @@ defm V_SUBREV_CO_CI_U32 :
 
 defm V_CVT_PK_RTZ_F16_F32  : VOP2_Real_FULL_with_name_gfx11_gfx12<0x02f,
   "V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">;
-defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx11_gfx12<0x03c>;
+
+defm V_PK_FMAC_F16     : VOP2_V_PK_FMAC_F16_gfx11_gfx12<0x03c>;
 
 defm V_ADD_F16_t16         : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
 defm V_ADD_F16_fake16      : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
@@ -1945,6 +1949,11 @@ multiclass VOP2e_Real_gfx10<bits<6> op, string opName, string asmName> :
   VOP2be_Real_dpp_gfx10<op, opName, asmName>,
   VOP2be_Real_dpp8_gfx10<op, opName, asmName>;
 
+multiclass VOP2_FMAC_Real<bits<6> op> :
+  VOP2_Real_e32_gfx10<op>,
+  VOP2_Real_dpp_gfx10<op>,
+  VOP2_Real_dpp8_gfx10<op>;
+
 multiclass VOP2_Real_gfx10<bits<6> op> :
   VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
   VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>;
@@ -1988,9 +1997,7 @@ defm V_MAX_F16         : VOP2_Real_gfx10<0x039>;
 defm V_MIN_F16         : VOP2_Real_gfx10<0x03a>;
 defm V_LDEXP_F16       : VOP2_Real_gfx10<0x03b>;
 
-let IsSingle = 1 in {
-  defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
-}
+defm V_PK_FMAC_F16     : VOP2_FMAC_Real<0x03c>;
 
 // VOP2 no carry-in, carry-out.
 defm V_ADD_NC_U32 :
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
index bf8e18ec1451235..f92b505cbd08354 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
@@ -13185,3 +13185,9 @@ v_pk_fmac_f16 v5, -4.0, v2
 
 v_pk_fmac_f16 v5, v1, v255
 // GFX10: encoding: [0x01,0xff,0x0b,0x78]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
\ No newline at end of file
diff --git a/llvm/test/MC/AMDGPU/literalv216.s b/llvm/test/MC/AMDGPU/literalv216.s
index c695bc3600c3871..f5afaa6bd61813c 100644
--- a/llvm/test/MC/AMDGPU/literalv216.s
+++ b/llvm/test/MC/AMDGPU/literalv216.s
@@ -291,4 +291,4 @@ v_pk_add_u16 v5, v1, 123456.0
 // FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid
 v_pk_fmac_f16 v5, 0x12345678, v2
 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
+// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt
index b759912204db82b..33d89da3b3ae939 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt
@@ -1779,54 +1779,60 @@
 # GFX10: v_or_b32_e32 v5, vcc_lo, v2             ; encoding: [0x6a,0x04,0x0a,0x38]
 0x6a,0x04,0x0a,0x38
 
-# GFX10: v_pk_fmac_f16 v255, v1, v2              ; encoding: [0x01,0x05,0xfe,0x79]
+# GFX10: v_pk_fmac_f16_e32 v255, v1, v2          ; encoding: [0x01,0x05,0xfe,0x79]
 0x01,0x05,0xfe,0x79
 
-# GFX10: v_pk_fmac_f16 v5, -1, v2                ; encoding: [0xc1,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, -1, v2            ; encoding: [0xc1,0x04,0x0a,0x78]
 0xc1,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, -4.0, v2              ; encoding: [0xf7,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, -4.0, v2          ; encoding: [0xf7,0x04,0x0a,0x78]
 0xf7,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, 0, v2                 ; encoding: [0x80,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, 0, v2             ; encoding: [0x80,0x04,0x0a,0x78]
 0x80,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, 0.5, v2               ; encoding: [0xf0,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, 0.5, v2           ; encoding: [0xf0,0x04,0x0a,0x78]
 0xf0,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, exec_hi, v2           ; encoding: [0x7f,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, exec_hi, v2       ; encoding: [0x7f,0x04,0x0a,0x78]
 0x7f,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, exec_lo, v2           ; encoding: [0x7e,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, exec_lo, v2       ; encoding: [0x7e,0x04,0x0a,0x78]
 0x7e,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, m0, v2                ; encoding: [0x7c,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, m0, v2            ; encoding: [0x7c,0x04,0x0a,0x78]
 0x7c,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, s1, v2                ; encoding: [0x01,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, s1, v2            ; encoding: [0x01,0x04,0x0a,0x78]
 0x01,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, s103, v2              ; encoding: [0x67,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, s103, v2          ; encoding: [0x67,0x04,0x0a,0x78]
 0x67,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, ttmp11, v2            ; encoding: [0x77,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, ttmp11, v2        ; encoding: [0x77,0x04,0x0a,0x78]
 0x77,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, v1, v2                ; encoding: [0x01,0x05,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, v1, v2            ; encoding: [0x01,0x05,0x0a,0x78]
 0x01,0x05,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, v1, v255              ; encoding: [0x01,0xff,0x0b,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, v1, v255          ; encoding: [0x01,0xff,0x0b,0x78]
 0x01,0xff,0x0b,0x78
 
-# GFX10: v_pk_fmac_f16 v5, v255, v2              ; encoding: [0xff,0x05,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, v255, v2          ; encoding: [0xff,0x05,0x0a,0x78]
 0xff,0x05,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, vcc_hi, v2            ; encoding: [0x6b,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, vcc_hi, v2        ; encoding: [0x6b,0x04,0x0a,0x78]
 0x6b,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, vcc_lo, v2            ; encoding: [0x6a,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, vcc_lo, v2        ; encoding: [0x6a,0x04,0x0a,0x78]
 0x6a,0x04,0x0a,0x78
 
+#GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+#GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03
+
 # W32: v_sub_co_ci_u32_e32 v255, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0xfe,0x53]
 # W64: v_sub_co_ci_u32_e32 v255, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0xfe,0x53]
 0x01,0x05,0xfe,0x53
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt
index a022c79fe97e605..97c81ed1a629a03 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt
@@ -144,5 +144,5 @@
 # Packed VOP2
 #===----------------------------------------------------------------------===//
 
-# GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
+# GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
 0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12

DadSchoorse · 2024-01-26T16:30:31Z

and removes them for gfx11 and gfx12

Why is this not supported on gfx11? "Table 30. Which instructions support DPP" in the RDNA3 ISA doc lists no exception for v_pk_fmac_f16.

DadSchoorse · 2024-01-26T16:38:33Z

Another question is if v_pk_fmac_f16_dpp16 supports abs/neg modifiers.

arsenm

Needs test for the error in the removed support cases, assuming it's actually correct to remove support

llvm/test/MC/AMDGPU/gfx10_asm_vop2.s

github-actions · 2024-02-07T11:59:26Z

✅ With the latest revision this PR passed the C/C++ code formatter.

arsenm · 2024-02-08T08:17:48Z

Why is this not supported on gfx11? "Table 30. Which instructions support DPP" in the RDNA3 ISA doc lists no exception for v_pk_fmac_f16.

It's there, but not spelled out (it says "V_PK_*")

arsenm

Do we need new codegen tests for making sure we use it for gfx10 and not for 11?

DadSchoorse · 2024-02-08T11:51:43Z

It's there, but not spelled out (it says "V_PK_*")

The "V_PK_*" is in the VOP3P section of the table and v_pk_fmac_f16 is a VOP2 opcode. The VOP2 section only has exceptions for FMAMK/AD_F32/16 and 64bit opcodes. So according to the table, v_pk_fmac_f16 should support DPP.

arsenm · 2024-02-08T12:13:01Z

It's there, but not spelled out (it says "V_PK_*")

The "V_PK_*" is in the VOP3P section of the table and v_pk_fmac_f16 is a VOP2 opcode. The VOP2 section only has exceptions for FMAMK/AD_F32/16 and 64bit opcodes. So according to the table, v_pk_fmac_f16 should support DPP.

In the internal doc I see a nodpp flag on it still

DadSchoorse · 2024-02-08T12:35:26Z

Okay, then I have a bug to fix in our compiler. It would be nice if the public doc could be corrected though.

DadSchoorse · 2024-02-09T10:44:10Z

To conclude the gfx11 discussion, I tested the hw behavior on navi31 and while the instruction executes, it doesn't behave like on gfx10. Only the high half actually uses DPP, the low half reads the current lane's value.

ankurepa · 2024-02-09T11:49:22Z

@Sisyph review please

Public docs are apparently wrong: llvm/llvm-project#79598 (comment) Cc: mesa-stable Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27533>

Public docs are apparently wrong: llvm/llvm-project#79598 (comment) Cc: mesa-stable Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27533> (cherry picked from commit e927c50)

Sisyph

Please remove [MC] from the commit title, this affects codegen.

Sisyph · 2024-02-23T17:03:38Z

llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s

@@ -224,3 +224,6 @@ v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]

 v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+


Can you please move this test to a different file? It is packed, so not a true16 instruction. There isn't a gfx12_asm_vop2_err.s, so perhaps you can create it.

Sisyph · 2024-02-23T17:08:36Z

llvm/test/MC/AMDGPU/literalv216.s

@@ -291,4 +291,4 @@ v_pk_add_u16 v5, v1, 123456.0
 // FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid
 v_pk_fmac_f16 v5, 0x12345678, v2
 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
+// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]


If I have the convention right, as stated in this patch 0f5ebbc, instructions without a VOP3 form should not have _e32. It looks like removing IsSingle caused _e32 to be added to the mnemonic. Please change it back.

Sisyph · 2024-02-23T17:10:02Z

llvm/test/MC/AMDGPU/gfx10_asm_vop2.s

+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3


Please add gfx10 error tests that DPP16 NEG and ABS modifiers are not supported by this instruction

[AMDGPU][MC] Add dpp for V_PK_FMAC_F16 for GFX10

d3e1954

Adds dpp for v_pk_fmac_f16 for gfx10 and removes them for gfx11 and gfx12

llvmbot added backend:AMDGPU mc Machine (object) code labels Jan 26, 2024

arsenm requested changes Feb 6, 2024

View reviewed changes

llvm/test/MC/AMDGPU/gfx10_asm_vop2.s Outdated Show resolved Hide resolved

ankurepa requested review from aaupov, maksfb, rafaelauler, dcci, aartbik, PeimingLiu, yinying-lisa-li, matthias-springer, antiagainst, kuhar, hanhanW, nicolasvasilache, banach-space, dcaballe, JDevlieghere, Endilll, nikic and a team as code owners February 7, 2024 11:55

ankurepa force-pushed the v_pk_fmac_dpp branch from 4e5a7b3 to d3e1954 Compare February 7, 2024 12:00

Fix test

7bf323c

nikic removed request for a team, nikic and yinying-lisa-li February 7, 2024 13:29

Endilll removed their request for review February 7, 2024 13:56

arsenm approved these changes Feb 8, 2024

View reviewed changes

arsenm reviewed Feb 8, 2024

View reviewed changes

codegen dpp combine test

a6998ce

ankurepa added 2 commits February 9, 2024 12:20

fix

9bfd6fa

Added gfx10 dpp combine test

9ee09fe

Sisyph reviewed Feb 23, 2024

View reviewed changes

aaupov removed request for aaupov, maksfb, rafaelauler and dcci March 31, 2024 02:24

kasuga-fj mentioned this pull request Dec 17, 2024

[RISCV] Add software pipeliner support #117546

Merged

		@@ -224,3 +224,6 @@ v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]

		v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
		// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode

[AMDGPU][MC] Add dpp for V_PK_FMAC_F16 for GFX10 #79598

Are you sure you want to change the base?

[AMDGPU][MC] Add dpp for V_PK_FMAC_F16 for GFX10 #79598

Uh oh!

Conversation

ankurepa commented Jan 26, 2024

Uh oh!

github-actions bot commented Jan 26, 2024

Uh oh!

llvmbot commented Jan 26, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

DadSchoorse commented Jan 26, 2024

Uh oh!

DadSchoorse commented Jan 26, 2024

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

github-actions bot commented Feb 7, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

arsenm commented Feb 8, 2024

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

DadSchoorse commented Feb 8, 2024

Uh oh!

arsenm commented Feb 8, 2024

Uh oh!

DadSchoorse commented Feb 8, 2024

Uh oh!

DadSchoorse commented Feb 9, 2024

Uh oh!

ankurepa commented Feb 9, 2024

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

Sisyph Feb 23, 2024

Choose a reason for hiding this comment

Uh oh!

Sisyph Feb 23, 2024

Choose a reason for hiding this comment

Uh oh!

Sisyph Feb 23, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvmbot commented Jan 26, 2024 •

edited

Loading

github-actions bot commented Feb 7, 2024 •

edited

Loading