-
Notifications
You must be signed in to change notification settings - Fork 13.5k
AMDGPU: Define v_mfma_f32_32x32x16_bf16 for gfx950 #116679
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-clang Author: Matt Arsenault (arsenm) ChangesUnlike the existing gfx940 intrinsics using short/i16 in place of Patch is 40.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116679.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short v8s __attribute__((ext_vector_type(8)));
typedef short v16s __attribute__((ext_vector_type(16)));
typedef short v32s __attribute__((ext_vector_type(32)));
typedef double v4d __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
#ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
}
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+ return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
}
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, float16 c, int X) {
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 0b3a8e78e1c795..e0fd2aa5c58a02 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -4,9 +4,12 @@
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
- __global float16* out1, half8 a1, half8 b1, float16 c1) {
+ __global float16* out1, half8 a1, half8 b1, float16 c1,
+ __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
+ *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ec1234e7bc7d94..15f33cdbf92e6e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3117,6 +3117,8 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
+
+def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7df9be5c6f7a0b..2079b34d0448f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2848,6 +2848,7 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
+def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
class Commutable_REV <string revOp, bit isOrig> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 58e26a96ece202..08882e41d863a1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -631,6 +631,9 @@ def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32,
def VOPProfileMAI_F32_V8F16_X16 : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+
class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
string FMAOp = Name;
@@ -747,6 +750,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
}
let Predicates = [isGFX90APlus] in {
@@ -1786,6 +1790,8 @@ defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
+
defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
let SubtargetPredicate = HasXF32Insts in {
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index c457d867af361e..00a3aaf77f9003 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -278,6 +278,14 @@ define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %a
ret void
}
+; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+ store <16 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
new file mode 100644
index 00000000000000..2da602713d72c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; FIXME: bfloat vector arguments are broken in globalisel.
+; https://github.com/llvm/llvm-project/issues/77055
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat>, <8 x bfloat>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.32x32x16.bf16
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a1, s9
+; GCN-NEXT: v_accvgpr_write_b32 a2, s10
+; GCN-NEXT: v_accvgpr_write_b32 a3, s11
+; GCN-NEXT: v_accvgpr_write_b32 a4, s12
+; GCN-NEXT: v_accvgpr_write_b32 a5, s13
+; GCN-NEXT: v_accvgpr_write_b32 a6, s14
+; GCN-NEXT: v_accvgpr_write_b32 a7, s15
+; GCN-NEXT: v_accvgpr_write_b32 a8, s16
+; GCN-NEXT: v_accvgpr_write_b32 a9, s17
+; GCN-NEXT: v_accvgpr_write_b32 a10, s18
+; GCN-NEXT: v_accvgpr_write_b32 a11, s19
+; GCN-NEXT: v_accvgpr_write_b32 a12, s20
+; GCN-NEXT: v_accvgpr_write_b32 a13, s21
+; GCN-NEXT: v_accvgpr_write_b32 a14, s22
+; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: v_mov_b32_e32 v9, s17
+; GCN-NEXT: v_mov_b32_e32 v10, s18
+; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_mov_b32_e32 v3, s15
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+ store volatile <16 x float> %result, ptr addrspace(1) null
+ store volatile <16 x float> %arg2, ptr addrspace(1) null
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a1, s9
+; GCN-NEXT: v_accvgpr_write_b32 a2, s10
+; GCN-NEXT: v_accvgpr_write_b32 a3, s11
+; GCN-NEXT: v_accvgpr_write_b32 a4, s12
+; GCN-NEXT: v_accvgpr_write_b32 a5, s13
+; GCN-NEXT: v_accvgpr_write_b32 a6, s14
+; GCN-NEXT: v_accvgpr_write_b32 a7, s15
+; GCN-NEXT: v_accvgpr_write_b32 a8, s16
+; GCN-NEXT: v_accvgpr_write_b32 a9, s17
+; GCN-NEXT: v_accvgpr_write_b32 a10, s18
+; GCN-NEXT: v_accvgpr_write_b32 a11, s19
+; GCN-NEXT: v_accvgpr_write_b32 a12, s20
+; GCN-NEXT: v_accvgpr_write_b32 a13, s21
+; GCN-NEXT: v_accvgpr_write_b32 a14, s22
+; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: v_mov_b32_e32 v9, s17
+; GCN-NEXT: v_mov_b32_e32 v10, s18
+; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_mov_b32_e32 v3, s15
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
+ store volatile <16 x float> %result, ptr addrspace(1) null
+ store volatile <16 x float> %arg2, ptr addrspace(1) null
+ ret void
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEX...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesUnlike the existing gfx940 intrinsics using short/i16 in place of Patch is 40.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116679.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6917d8d1aca69d..7ce8f2c1669d67 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index a644a60f9ec381..841d8fcad0fee0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -24,6 +24,7 @@ typedef short v8s __attribute__((ext_vector_type(8)));
typedef short v16s __attribute__((ext_vector_type(16)));
typedef short v32s __attribute__((ext_vector_type(32)));
typedef double v4d __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
#ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
}
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+ return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4c267e2cac5cad..4af67763c40dd2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -4,6 +4,7 @@
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
}
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, float16 c, int X) {
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 0b3a8e78e1c795..e0fd2aa5c58a02 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -4,9 +4,12 @@
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float16 __attribute__((ext_vector_type(16)));
typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
- __global float16* out1, half8 a1, half8 b1, float16 c1) {
+ __global float16* out1, half8 a1, half8 b1, float16 c1,
+ __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
+ *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ec1234e7bc7d94..15f33cdbf92e6e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3117,6 +3117,8 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
+
+def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7df9be5c6f7a0b..2079b34d0448f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2848,6 +2848,7 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
+def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
class Commutable_REV <string revOp, bit isOrig> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 58e26a96ece202..08882e41d863a1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -631,6 +631,9 @@ def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32,
def VOPProfileMAI_F32_V8F16_X16 : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;
+
class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
string FMAOp = Name;
@@ -747,6 +750,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
}
let Predicates = [isGFX90APlus] in {
@@ -1786,6 +1790,8 @@ defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
+
defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
let SubtargetPredicate = HasXF32Insts in {
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index c457d867af361e..00a3aaf77f9003 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -278,6 +278,14 @@ define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %a
ret void
}
+; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+ store <16 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
new file mode 100644
index 00000000000000..2da602713d72c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; FIXME: bfloat vector arguments are broken in globalisel.
+; https://github.com/llvm/llvm-project/issues/77055
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat>, <8 x bfloat>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.32x32x16.bf16
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a1, s9
+; GCN-NEXT: v_accvgpr_write_b32 a2, s10
+; GCN-NEXT: v_accvgpr_write_b32 a3, s11
+; GCN-NEXT: v_accvgpr_write_b32 a4, s12
+; GCN-NEXT: v_accvgpr_write_b32 a5, s13
+; GCN-NEXT: v_accvgpr_write_b32 a6, s14
+; GCN-NEXT: v_accvgpr_write_b32 a7, s15
+; GCN-NEXT: v_accvgpr_write_b32 a8, s16
+; GCN-NEXT: v_accvgpr_write_b32 a9, s17
+; GCN-NEXT: v_accvgpr_write_b32 a10, s18
+; GCN-NEXT: v_accvgpr_write_b32 a11, s19
+; GCN-NEXT: v_accvgpr_write_b32 a12, s20
+; GCN-NEXT: v_accvgpr_write_b32 a13, s21
+; GCN-NEXT: v_accvgpr_write_b32 a14, s22
+; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: v_mov_b32_e32 v9, s17
+; GCN-NEXT: v_mov_b32_e32 v10, s18
+; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_mov_b32_e32 v3, s15
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+ store volatile <16 x float> %result, ptr addrspace(1) null
+ store volatile <16 x float> %arg2, ptr addrspace(1) null
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a1, s9
+; GCN-NEXT: v_accvgpr_write_b32 a2, s10
+; GCN-NEXT: v_accvgpr_write_b32 a3, s11
+; GCN-NEXT: v_accvgpr_write_b32 a4, s12
+; GCN-NEXT: v_accvgpr_write_b32 a5, s13
+; GCN-NEXT: v_accvgpr_write_b32 a6, s14
+; GCN-NEXT: v_accvgpr_write_b32 a7, s15
+; GCN-NEXT: v_accvgpr_write_b32 a8, s16
+; GCN-NEXT: v_accvgpr_write_b32 a9, s17
+; GCN-NEXT: v_accvgpr_write_b32 a10, s18
+; GCN-NEXT: v_accvgpr_write_b32 a11, s19
+; GCN-NEXT: v_accvgpr_write_b32 a12, s20
+; GCN-NEXT: v_accvgpr_write_b32 a13, s21
+; GCN-NEXT: v_accvgpr_write_b32 a14, s22
+; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: v_mov_b32_e32 v9, s17
+; GCN-NEXT: v_mov_b32_e32 v10, s18
+; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_mov_b32_e32 v3, s15
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
+ store volatile <16 x float> %result, ptr addrspace(1) null
+ store volatile <16 x float> %arg2, ptr addrspace(1) null
+ ret void
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEX...
[truncated]
|
f3682aa
to
1adfc6b
Compare
82bb6e0
to
330427f
Compare
1adfc6b
to
2558f35
Compare
Unlike the existing gfx940 intrinsics using short/i16 in place of bfloat, this uses the natural bfloat type.
Unlike the existing gfx940 intrinsics using short/i16 in place of
bfloat, this uses the natural bfloat type.