-
Notifications
You must be signed in to change notification settings - Fork 13.5k
AMDGPU: Add v_mfma_f32_16x16x32_bf16 for gfx950 #117053
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-llvm-analysis Author: Matt Arsenault (arsenm) ChangesPatch is 28.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117053.diff 12 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index b21394b6982631..bfe2901ee962a3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -458,4 +458,11 @@ v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
}
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
+{
+ return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 9c14c0541ff3b8..acaa20090dfcba 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -55,3 +55,10 @@ void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
}
+
+void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, float4 c, int X) {
+
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 71a110066342cb..6bf76b3cba0f59 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -33,6 +33,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
*out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
*out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
+ *out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b5d5eae0c7cd7e..479120f9c202bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3148,7 +3148,7 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;
-
+def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ea9c688c710418..4b7ba4effc6857 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4751,7 +4751,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
- case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
+ case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6b0c32b62176b6..05568b4f9ad1ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2880,6 +2880,7 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8BF16_V8BF16_V4F32 : VOPProfile <[v4f32, v8bf16, v8bf16, v4f32]>;
def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;
def VOP_V4F32_V8I32_V6I32_V4F32 : VOPProfile <[v4f32, v8i32, v6i32, v4f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 98d5e3f199cddd..9bd2b124f122dc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -645,6 +645,9 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4 : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;
+
let HasAbid = false in {
// For f32_16x16x128_f8f6f4 - f8 x f8 case
@@ -952,6 +955,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_16X16X32_BF16 : MAIInst<"v_mfma_f32_16x16x32bf16", "F32_V8BF16_X4", int_amdgcn_mfma_f32_16x16x32_bf16>;
defm V_MFMA_I32_16X16X64_I8 : MAIInst<"v_mfma_i32_16x16x64i8", "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
defm V_MFMA_I32_32X32X32_I8 : MAIInst<"v_mfma_i32_32x32x32i8", "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
@@ -2078,6 +2082,7 @@ defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_16X16X32_BF16 : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
defm V_MFMA_I32_16X16X64_I8 : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
defm V_MFMA_I32_32X32X32_I8 : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26021a56790aae..2d7983ebafd653 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -323,6 +323,15 @@ define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1
ret void
}
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 29668b2c5bfdbf..d0ae669ffb3d68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
ret void
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.16x16x32.bf16
+; --------------------------------------------------------------------
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT: s_nop 6
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT: s_nop 6
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+ ret <4 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_nop 4
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_nop 4
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
+ store <4 x float> %result, p...
[truncated]
|
@llvm/pr-subscribers-mc Author: Matt Arsenault (arsenm) ChangesPatch is 28.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117053.diff 12 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index b21394b6982631..bfe2901ee962a3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -458,4 +458,11 @@ v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
}
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
+{
+ return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 9c14c0541ff3b8..acaa20090dfcba 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -55,3 +55,10 @@ void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
}
+
+void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, float4 c, int X) {
+
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 71a110066342cb..6bf76b3cba0f59 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -33,6 +33,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
*out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
*out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
+ *out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b5d5eae0c7cd7e..479120f9c202bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3148,7 +3148,7 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;
-
+def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ea9c688c710418..4b7ba4effc6857 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4751,7 +4751,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
- case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
+ case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6b0c32b62176b6..05568b4f9ad1ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2880,6 +2880,7 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8BF16_V8BF16_V4F32 : VOPProfile <[v4f32, v8bf16, v8bf16, v4f32]>;
def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;
def VOP_V4F32_V8I32_V6I32_V4F32 : VOPProfile <[v4f32, v8i32, v6i32, v4f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 98d5e3f199cddd..9bd2b124f122dc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -645,6 +645,9 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4 : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;
+
let HasAbid = false in {
// For f32_16x16x128_f8f6f4 - f8 x f8 case
@@ -952,6 +955,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_16X16X32_BF16 : MAIInst<"v_mfma_f32_16x16x32bf16", "F32_V8BF16_X4", int_amdgcn_mfma_f32_16x16x32_bf16>;
defm V_MFMA_I32_16X16X64_I8 : MAIInst<"v_mfma_i32_16x16x64i8", "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
defm V_MFMA_I32_32X32X32_I8 : MAIInst<"v_mfma_i32_32x32x32i8", "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
@@ -2078,6 +2082,7 @@ defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_16X16X32_BF16 : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
defm V_MFMA_I32_16X16X64_I8 : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
defm V_MFMA_I32_32X32X32_I8 : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26021a56790aae..2d7983ebafd653 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -323,6 +323,15 @@ define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1
ret void
}
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 29668b2c5bfdbf..d0ae669ffb3d68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
ret void
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.16x16x32.bf16
+; --------------------------------------------------------------------
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT: s_nop 6
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT: s_nop 6
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+ ret <4 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_nop 4
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_nop 4
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
+ store <4 x float> %result, p...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesPatch is 28.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117053.diff 12 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index b21394b6982631..bfe2901ee962a3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -458,4 +458,11 @@ v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
}
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
+{
+ return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 9c14c0541ff3b8..acaa20090dfcba 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -55,3 +55,10 @@ void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
}
+
+void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, float4 c, int X) {
+
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 71a110066342cb..6bf76b3cba0f59 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -33,6 +33,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
*out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
*out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
+ *out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b5d5eae0c7cd7e..479120f9c202bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3148,7 +3148,7 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;
-
+def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ea9c688c710418..4b7ba4effc6857 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4751,7 +4751,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
- case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
+ case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6b0c32b62176b6..05568b4f9ad1ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2880,6 +2880,7 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8BF16_V8BF16_V4F32 : VOPProfile <[v4f32, v8bf16, v8bf16, v4f32]>;
def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;
def VOP_V4F32_V8I32_V6I32_V4F32 : VOPProfile <[v4f32, v8i32, v6i32, v4f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 98d5e3f199cddd..9bd2b124f122dc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -645,6 +645,9 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4 : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;
+
let HasAbid = false in {
// For f32_16x16x128_f8f6f4 - f8 x f8 case
@@ -952,6 +955,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_16X16X32_BF16 : MAIInst<"v_mfma_f32_16x16x32bf16", "F32_V8BF16_X4", int_amdgcn_mfma_f32_16x16x32_bf16>;
defm V_MFMA_I32_16X16X64_I8 : MAIInst<"v_mfma_i32_16x16x64i8", "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
defm V_MFMA_I32_32X32X32_I8 : MAIInst<"v_mfma_i32_32x32x32i8", "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
@@ -2078,6 +2082,7 @@ defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_16X16X32_BF16 : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
defm V_MFMA_I32_16X16X64_I8 : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
defm V_MFMA_I32_32X32X32_I8 : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26021a56790aae..2d7983ebafd653 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -323,6 +323,15 @@ define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1
ret void
}
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 29668b2c5bfdbf..d0ae669ffb3d68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
ret void
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.16x16x32.bf16
+; --------------------------------------------------------------------
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT: s_nop 6
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT: s_nop 6
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+ ret <4 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_nop 4
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_nop 4
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GISEL-NEXT: s_endpgm
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
+ store <4 x float> %result, p...
[truncated]
|
522f526
to
535d10e
Compare
b9658b7
to
84c3383
Compare
34fb85f
to
00437fb
Compare
No description provided.