Skip to content

AMDGPU: Add v_mfma_f32_16x16x32_bf16 for gfx950 #117053

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2024

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Nov 20, 2024

No description provided.

Copy link
Contributor Author

arsenm commented Nov 20, 2024

This stack of pull requests is managed by Graphite. Learn more about stacking.

@arsenm arsenm added backend:AMDGPU clang Clang issues not falling into any other category clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:analysis llvm:ir mc Machine (object) code labels Nov 20, 2024 — with Graphite App
@arsenm arsenm marked this pull request as ready for review November 20, 2024 21:23
@llvmbot
Copy link
Member

llvmbot commented Nov 20, 2024

@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-llvm-analysis

Author: Matt Arsenault (arsenm)

Changes

Patch is 28.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117053.diff

12 Files Affected:

  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+7)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+1)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2-1)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+5)
  • (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+9)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+198)
  • (modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+56)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+34)
  • (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+7-3)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index b21394b6982631..bfe2901ee962a3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -458,4 +458,11 @@ v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
   return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
+{
+  return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
+}
+
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 9c14c0541ff3b8..acaa20090dfcba 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -55,3 +55,10 @@ void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int
   *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
   *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
 }
+
+void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, float4 c, int X) {
+
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 71a110066342cb..6bf76b3cba0f59 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -33,6 +33,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
   *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
   *out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
   *out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
+  *out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
   *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
   *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b5d5eae0c7cd7e..479120f9c202bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3148,7 +3148,7 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
 def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
 def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
 def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;
-
+def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>;
 def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
 def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
 def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ea9c688c710418..4b7ba4effc6857 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4751,7 +4751,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
     case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
     case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
-    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
       // Default for MAI intrinsics.
       // srcC can also be an immediate which can be folded later.
       // FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6b0c32b62176b6..05568b4f9ad1ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2880,6 +2880,7 @@ def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
 def VOP_V4F32_V8F16_V8F16_V4F32   : VOPProfile <[v4f32,  v8f16, v8f16, v4f32]>;
 def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
 def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8BF16_V8BF16_V4F32 : VOPProfile <[v4f32, v8bf16, v8bf16, v4f32]>;
 def VOP_V4F32_V8I32_V8I32_V4F32   : VOPProfile <[v4f32,  v8i32, v8i32, v4f32]>;
 
 def VOP_V4F32_V8I32_V6I32_V4F32   : VOPProfile <[v4f32,  v8i32, v6i32, v4f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 98d5e3f199cddd..9bd2b124f122dc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -645,6 +645,9 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
 def VOPProfileMAI_F32_V8BF16_X16     : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
 def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
 
+def VOPProfileMAI_F32_V8BF16_X4     : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32,     AISrc_128_f32,  ADst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32,     VISrc_128_f32,  VDst_128, AVSrc_128>;
+
 
 let HasAbid = false in {
 // For f32_16x16x128_f8f6f4 - f8 x f8 case
@@ -952,6 +955,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
 let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
 defm V_MFMA_F32_16X16X32_F16   : MAIInst<"v_mfma_f32_16x16x32f16",    "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
 defm V_MFMA_F32_32X32X16_F16   : MAIInst<"v_mfma_f32_32x32x16f16",    "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_16X16X32_BF16  : MAIInst<"v_mfma_f32_16x16x32bf16",   "F32_V8BF16_X4", int_amdgcn_mfma_f32_16x16x32_bf16>;
 defm V_MFMA_I32_16X16X64_I8    : MAIInst<"v_mfma_i32_16x16x64i8",     "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
 defm V_MFMA_F32_32X32X16_BF16  : MAIInst<"v_mfma_f32_32x32x16bf16",   "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
 defm V_MFMA_I32_32X32X32_I8    : MAIInst<"v_mfma_i32_32x32x32i8",     "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
@@ -2078,6 +2082,7 @@ defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
 defm V_MFMA_F32_16X16X32_F16     : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
 defm V_MFMA_F32_32X32X16_F16     : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_16X16X32_BF16    : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
 defm V_MFMA_I32_16X16X64_I8      : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
 defm V_MFMA_F32_32X32X16_BF16    : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
 defm V_MFMA_I32_32X32X32_I8      : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26021a56790aae..2d7983ebafd653 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -323,6 +323,15 @@ define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1
   ret void
 }
 
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <4 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 29668b2c5bfdbf..d0ae669ffb3d68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
   ret void
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.16x16x32.bf16
+; --------------------------------------------------------------------
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <4 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 4
+; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT:    s_endpgm
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  store <4 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 4
+; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT:    s_endpgm
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
+  store <4 x float> %result, p...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Nov 20, 2024

@llvm/pr-subscribers-mc

Author: Matt Arsenault (arsenm)

Changes

Patch is 28.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117053.diff

12 Files Affected:

  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+7)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+1)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2-1)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+5)
  • (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+9)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+198)
  • (modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+56)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+34)
  • (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+7-3)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index b21394b6982631..bfe2901ee962a3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -458,4 +458,11 @@ v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
   return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
+{
+  return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
+}
+
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 9c14c0541ff3b8..acaa20090dfcba 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -55,3 +55,10 @@ void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int
   *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
   *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
 }
+
+void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, float4 c, int X) {
+
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 71a110066342cb..6bf76b3cba0f59 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -33,6 +33,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
   *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
   *out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
   *out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
+  *out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
   *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
   *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b5d5eae0c7cd7e..479120f9c202bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3148,7 +3148,7 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
 def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
 def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
 def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;
-
+def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>;
 def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
 def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
 def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ea9c688c710418..4b7ba4effc6857 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4751,7 +4751,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
     case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
     case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
-    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
       // Default for MAI intrinsics.
       // srcC can also be an immediate which can be folded later.
       // FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6b0c32b62176b6..05568b4f9ad1ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2880,6 +2880,7 @@ def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
 def VOP_V4F32_V8F16_V8F16_V4F32   : VOPProfile <[v4f32,  v8f16, v8f16, v4f32]>;
 def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
 def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8BF16_V8BF16_V4F32 : VOPProfile <[v4f32, v8bf16, v8bf16, v4f32]>;
 def VOP_V4F32_V8I32_V8I32_V4F32   : VOPProfile <[v4f32,  v8i32, v8i32, v4f32]>;
 
 def VOP_V4F32_V8I32_V6I32_V4F32   : VOPProfile <[v4f32,  v8i32, v6i32, v4f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 98d5e3f199cddd..9bd2b124f122dc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -645,6 +645,9 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
 def VOPProfileMAI_F32_V8BF16_X16     : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
 def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
 
+def VOPProfileMAI_F32_V8BF16_X4     : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32,     AISrc_128_f32,  ADst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32,     VISrc_128_f32,  VDst_128, AVSrc_128>;
+
 
 let HasAbid = false in {
 // For f32_16x16x128_f8f6f4 - f8 x f8 case
@@ -952,6 +955,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
 let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
 defm V_MFMA_F32_16X16X32_F16   : MAIInst<"v_mfma_f32_16x16x32f16",    "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
 defm V_MFMA_F32_32X32X16_F16   : MAIInst<"v_mfma_f32_32x32x16f16",    "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_16X16X32_BF16  : MAIInst<"v_mfma_f32_16x16x32bf16",   "F32_V8BF16_X4", int_amdgcn_mfma_f32_16x16x32_bf16>;
 defm V_MFMA_I32_16X16X64_I8    : MAIInst<"v_mfma_i32_16x16x64i8",     "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
 defm V_MFMA_F32_32X32X16_BF16  : MAIInst<"v_mfma_f32_32x32x16bf16",   "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
 defm V_MFMA_I32_32X32X32_I8    : MAIInst<"v_mfma_i32_32x32x32i8",     "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
@@ -2078,6 +2082,7 @@ defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
 defm V_MFMA_F32_16X16X32_F16     : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
 defm V_MFMA_F32_32X32X16_F16     : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_16X16X32_BF16    : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
 defm V_MFMA_I32_16X16X64_I8      : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
 defm V_MFMA_F32_32X32X16_BF16    : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
 defm V_MFMA_I32_32X32X32_I8      : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26021a56790aae..2d7983ebafd653 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -323,6 +323,15 @@ define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1
   ret void
 }
 
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <4 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 29668b2c5bfdbf..d0ae669ffb3d68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
   ret void
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.16x16x32.bf16
+; --------------------------------------------------------------------
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <4 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 4
+; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT:    s_endpgm
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  store <4 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 4
+; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT:    s_endpgm
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
+  store <4 x float> %result, p...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Nov 20, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Patch is 28.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117053.diff

12 Files Affected:

  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+7)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+1)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2-1)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+5)
  • (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+9)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+198)
  • (modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+56)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+34)
  • (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+7-3)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index b21394b6982631..bfe2901ee962a3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -458,4 +458,11 @@ v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
   return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
+{
+  return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
+}
+
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 9c14c0541ff3b8..acaa20090dfcba 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -55,3 +55,10 @@ void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int
   *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
   *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
 }
+
+void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, float4 c, int X) {
+
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 71a110066342cb..6bf76b3cba0f59 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -33,6 +33,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
   *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
   *out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
   *out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
+  *out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
   *out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
   *out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b5d5eae0c7cd7e..479120f9c202bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3148,7 +3148,7 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
 def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
 def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
 def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;
-
+def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>;
 def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
 def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
 def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ea9c688c710418..4b7ba4effc6857 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4751,7 +4751,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
     case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
     case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
-    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
       // Default for MAI intrinsics.
       // srcC can also be an immediate which can be folded later.
       // FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6b0c32b62176b6..05568b4f9ad1ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2880,6 +2880,7 @@ def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
 def VOP_V4F32_V8F16_V8F16_V4F32   : VOPProfile <[v4f32,  v8f16, v8f16, v4f32]>;
 def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
 def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8BF16_V8BF16_V4F32 : VOPProfile <[v4f32, v8bf16, v8bf16, v4f32]>;
 def VOP_V4F32_V8I32_V8I32_V4F32   : VOPProfile <[v4f32,  v8i32, v8i32, v4f32]>;
 
 def VOP_V4F32_V8I32_V6I32_V4F32   : VOPProfile <[v4f32,  v8i32, v6i32, v4f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 98d5e3f199cddd..9bd2b124f122dc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -645,6 +645,9 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
 def VOPProfileMAI_F32_V8BF16_X16     : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
 def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
 
+def VOPProfileMAI_F32_V8BF16_X4     : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32,     AISrc_128_f32,  ADst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32,     VISrc_128_f32,  VDst_128, AVSrc_128>;
+
 
 let HasAbid = false in {
 // For f32_16x16x128_f8f6f4 - f8 x f8 case
@@ -952,6 +955,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
 let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
 defm V_MFMA_F32_16X16X32_F16   : MAIInst<"v_mfma_f32_16x16x32f16",    "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
 defm V_MFMA_F32_32X32X16_F16   : MAIInst<"v_mfma_f32_32x32x16f16",    "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_16X16X32_BF16  : MAIInst<"v_mfma_f32_16x16x32bf16",   "F32_V8BF16_X4", int_amdgcn_mfma_f32_16x16x32_bf16>;
 defm V_MFMA_I32_16X16X64_I8    : MAIInst<"v_mfma_i32_16x16x64i8",     "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
 defm V_MFMA_F32_32X32X16_BF16  : MAIInst<"v_mfma_f32_32x32x16bf16",   "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
 defm V_MFMA_I32_32X32X32_I8    : MAIInst<"v_mfma_i32_32x32x32i8",     "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
@@ -2078,6 +2082,7 @@ defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
 defm V_MFMA_F32_16X16X32_F16     : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
 defm V_MFMA_F32_32X32X16_F16     : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_16X16X32_BF16    : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
 defm V_MFMA_I32_16X16X64_I8      : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
 defm V_MFMA_F32_32X32X16_BF16    : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
 defm V_MFMA_I32_32X32X32_I8      : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26021a56790aae..2d7983ebafd653 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -323,6 +323,15 @@ define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1
   ret void
 }
 
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <4 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 29668b2c5bfdbf..d0ae669ffb3d68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
   ret void
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.16x16x32.bf16
+; --------------------------------------------------------------------
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT:    s_nop 6
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <4 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 4
+; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT:    s_endpgm
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  store <4 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 4
+; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT:    s_endpgm
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
+  store <4 x float> %result, p...
[truncated]

@arsenm arsenm force-pushed the users/arsenm/gfx950/v_mfma_i32_32x32x32_i8 branch from 522f526 to 535d10e Compare November 21, 2024 17:04
@arsenm arsenm force-pushed the users/arsenm/gfx950/v_mfma_f32_16x16x32_bf16 branch from b9658b7 to 84c3383 Compare November 21, 2024 17:04
Base automatically changed from users/arsenm/gfx950/v_mfma_i32_32x32x32_i8 to main November 21, 2024 17:08
@arsenm arsenm force-pushed the users/arsenm/gfx950/v_mfma_f32_16x16x32_bf16 branch 2 times, most recently from 34fb85f to 00437fb Compare November 21, 2024 17:43
This was referenced Nov 26, 2024
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Feb 3, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category llvm:analysis llvm:ir mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants