AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 #116679

arsenm · 2024-11-18T19:01:52Z

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.

arsenm · 2024-11-18T19:02:12Z

This stack of pull requests is managed by Graphite. Learn more about stacking.

arsenm · 2024-11-18T19:02:15Z

Warning

This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
Learn more

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2024-11-18T19:03:46Z

@llvm/pr-subscribers-llvm-analysis
@llvm/pr-subscribers-mc
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-clang

Author: Matt Arsenault (arsenm)

Changes

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.

Patch is 40.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116679.diff

12 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+6)
(modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7)
(modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+4-1)
(modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+2)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
(modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+6)
(modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+8)
(added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+474)
(modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+52-4)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+27)
(modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+7-3)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
 //===----------------------------------------------------------------------===//
 // GFX12+ only builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
 
 
 #ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
   return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
 
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
 }
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 0b3a8e78e1c795..e0fd2aa5c58a02 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -4,9 +4,12 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
-          __global float16* out1, half8 a1, half8 b1, float16 c1) {
+          __global float16* out1, half8 a1, half8 b1, float16 c1,
+          __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
   *out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
   *out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
+  *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ec1234e7bc7d94..15f33cdbf92e6e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3117,6 +3117,8 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
 defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
 def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
 def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
+
+def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7df9be5c6f7a0b..2079b34d0448f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2848,6 +2848,7 @@ def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
 
 def VOP_V4F32_V8F16_V8F16_V4F32   : VOPProfile <[v4f32,  v8f16, v8f16, v4f32]>;
 def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
+def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
 
 
 class Commutable_REV <string revOp, bit isOrig> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 58e26a96ece202..08882e41d863a1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -631,6 +631,9 @@ def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32,
 def VOPProfileMAI_F32_V8F16_X16     : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
 def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
 
+def VOPProfileMAI_F32_V8BF16_X16     : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
+
 class MFMATable <bit is_mac, string Name> {
   bit IsMac = is_mac;
   string FMAOp = Name;
@@ -747,6 +750,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
 let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
 defm V_MFMA_F32_16X16X32_F16   : MAIInst<"v_mfma_f32_16x16x32f16",    "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
 defm V_MFMA_F32_32X32X16_F16   : MAIInst<"v_mfma_f32_32x32x16f16",    "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_32X32X16_BF16  : MAIInst<"v_mfma_f32_32x32x16bf16",   "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
 }
 
 let Predicates = [isGFX90APlus] in {
@@ -1786,6 +1790,8 @@ defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
 defm V_MFMA_F32_16X16X32_F16     : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
 defm V_MFMA_F32_32X32X16_F16     : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_32X32X16_BF16    : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
+
 defm V_MFMA_I32_32X32X16I8       : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
 defm V_MFMA_I32_16X16X32I8       : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
 let SubtargetPredicate = HasXF32Insts in {
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index c457d867af361e..00a3aaf77f9003 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -278,6 +278,14 @@ define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %a
   ret void
 }
 
+; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
new file mode 100644
index 00000000000000..2da602713d72c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; FIXME: bfloat vector arguments are broken in globalisel.
+; https://github.com/llvm/llvm-project/issues/77055
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat>, <8 x bfloat>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.32x32x16.bf16
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NEXT:    v_mov_b32_e32 v9, s17
+; GCN-NEXT:    v_mov_b32_e32 v10, s18
+; GCN-NEXT:    v_mov_b32_e32 v11, s19
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  store volatile <16 x float> %result, ptr addrspace(1) null
+  store volatile <16 x float> %arg2, ptr addrspace(1) null
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NEXT:    v_mov_b32_e32 v9, s17
+; GCN-NEXT:    v_mov_b32_e32 v10, s18
+; GCN-NEXT:    v_mov_b32_e32 v11, s19
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
+  store volatile <16 x float> %result, ptr addrspace(1) null
+  store volatile <16 x float> %arg2, ptr addrspace(1) null
+  ret void
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEX...
[truncated]

llvmbot · 2024-11-18T19:03:46Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.

Patch is 40.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116679.diff

12 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+6)
(modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7)
(modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+4-1)
(modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+2)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
(modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+6)
(modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+8)
(added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+474)
(modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+52-4)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+27)
(modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+7-3)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
 //===----------------------------------------------------------------------===//
 // GFX12+ only builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
 
 
 #ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
   return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
 
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
   *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
 }
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 0b3a8e78e1c795..e0fd2aa5c58a02 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -4,9 +4,12 @@
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
 
 void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
-          __global float16* out1, half8 a1, half8 b1, float16 c1) {
+          __global float16* out1, half8 a1, half8 b1, float16 c1,
+          __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
   *out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
   *out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
+  *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ec1234e7bc7d94..15f33cdbf92e6e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3117,6 +3117,8 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
 defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
 def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
 def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
+
+def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7df9be5c6f7a0b..2079b34d0448f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2848,6 +2848,7 @@ def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
 
 def VOP_V4F32_V8F16_V8F16_V4F32   : VOPProfile <[v4f32,  v8f16, v8f16, v4f32]>;
 def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
+def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
 
 
 class Commutable_REV <string revOp, bit isOrig> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 58e26a96ece202..08882e41d863a1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -631,6 +631,9 @@ def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32,
 def VOPProfileMAI_F32_V8F16_X16     : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
 def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
 
+def VOPProfileMAI_F32_V8BF16_X16     : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
+
 class MFMATable <bit is_mac, string Name> {
   bit IsMac = is_mac;
   string FMAOp = Name;
@@ -747,6 +750,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
 let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
 defm V_MFMA_F32_16X16X32_F16   : MAIInst<"v_mfma_f32_16x16x32f16",    "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
 defm V_MFMA_F32_32X32X16_F16   : MAIInst<"v_mfma_f32_32x32x16f16",    "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_32X32X16_BF16  : MAIInst<"v_mfma_f32_32x32x16bf16",   "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
 }
 
 let Predicates = [isGFX90APlus] in {
@@ -1786,6 +1790,8 @@ defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
 defm V_MFMA_F32_16X16X32_F16     : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
 defm V_MFMA_F32_32X32X16_F16     : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_32X32X16_BF16    : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
+
 defm V_MFMA_I32_32X32X16I8       : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
 defm V_MFMA_I32_16X16X32I8       : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
 let SubtargetPredicate = HasXF32Insts in {
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index c457d867af361e..00a3aaf77f9003 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -278,6 +278,14 @@ define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %a
   ret void
 }
 
+; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
new file mode 100644
index 00000000000000..2da602713d72c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; FIXME: bfloat vector arguments are broken in globalisel.
+; https://github.com/llvm/llvm-project/issues/77055
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat>, <8 x bfloat>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.32x32x16.bf16
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NEXT:    v_mov_b32_e32 v9, s17
+; GCN-NEXT:    v_mov_b32_e32 v10, s18
+; GCN-NEXT:    v_mov_b32_e32 v11, s19
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  store volatile <16 x float> %result, ptr addrspace(1) null
+  store volatile <16 x float> %arg2, ptr addrspace(1) null
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NEXT:    v_mov_b32_e32 v9, s17
+; GCN-NEXT:    v_mov_b32_e32 v10, s18
+; GCN-NEXT:    v_mov_b32_e32 v11, s19
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
+  store volatile <16 x float> %result, ptr addrspace(1) null
+  store volatile <16 x float> %arg2, ptr addrspace(1) null
+  ret void
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEX...
[truncated]

Unlike the existing gfx940 intrinsics using short/i16 in place of bfloat, this uses the natural bfloat type.

This was referenced Nov 18, 2024

AMDGPU: Handle gfx950 global_load_lds_* instructions #116680

Merged

AMDGPU: Handle gfx950 96/128-bit buffer_load_lds #116681

Merged

arsenm added backend:AMDGPU clang Clang issues not falling into any other category clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:analysis llvm:ir mc Machine (object) code labels Nov 18, 2024 — with Graphite App

arsenm requested review from jayfoad, kosarev, pravinjagtap, rampitec, scchan, shiltian and Sisyph November 18, 2024 19:03

arsenm marked this pull request as ready for review November 18, 2024 19:03

arsenm force-pushed the users/arsenm/gfx950/v_cvt_pk_bf16_f32 branch from f3682aa to 1adfc6b Compare November 18, 2024 20:07

arsenm force-pushed the users/arsenm/gfx950/v_mfma_f32_32x32x16_bf16 branch from 82bb6e0 to 330427f Compare November 18, 2024 20:07

arsenm force-pushed the users/arsenm/gfx950/v_cvt_pk_bf16_f32 branch from 1adfc6b to 2558f35 Compare November 18, 2024 21:39

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Feb 3, 2025

AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 (llvm#116679)

1e7b44d

Unlike the existing gfx940 intrinsics using short/i16 in place of bfloat, this uses the natural bfloat type.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 #116679

AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 #116679

Uh oh!

arsenm commented Nov 18, 2024

Uh oh!

arsenm commented Nov 18, 2024 •

edited

Loading

Uh oh!

arsenm commented Nov 18, 2024

Uh oh!

llvmbot commented Nov 18, 2024 •

edited

Loading

Uh oh!

llvmbot commented Nov 18, 2024

Uh oh!

Uh oh!

AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 #116679

AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 #116679

Uh oh!

Conversation

arsenm commented Nov 18, 2024

Uh oh!

arsenm commented Nov 18, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

arsenm commented Nov 18, 2024

Uh oh!

llvmbot commented Nov 18, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Nov 18, 2024

Uh oh!

Uh oh!

arsenm commented Nov 18, 2024 •

edited

Loading

llvmbot commented Nov 18, 2024 •

edited

Loading