-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Add v_smfmac_f32_32x32x32_bf16 for gfx950 #117212
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-clang
Author: Matt Arsenault (arsenm)
Changes
Patch is 40.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117212.diff
12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5a6c3e0d701f09..e93f570a6353b5 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -447,6 +447,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x32_i8, "V16iV4iV4iV16iIiIiIi", "n
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_f16, "V4fV8hV16hV4fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_f16, "V16fV8hV16hV16fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf16, "V4fV8yV16yV4fiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf16, "V16fV8yV16yV16fiIiIi", "nc", "gfx950-insts")
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 833c894252bb61..1977abaec9d575 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -489,4 +489,11 @@ void test_smfmac_f32_16x16x64_bf16(global v4f* out, v8bf16 a, v16bf16 b, v4f c,
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, 0);
}
+// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x32_bf16
+// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_32x32x32_bf16(global v16f* out, v8bf16 a, v16bf16 b, v16f c, int idx)
+{
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, 0);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index b00058a6359c46..a80926c2644851 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -82,3 +82,9 @@ void test_smfmac_f32_16x16x64_bf16(global float4* out, bfloat8 a, bfloat16 b, fl
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf16' must be a constant integer}}
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf16' must be a constant integer}}
}
+
+void test_smfmac_f32_32x32x32_bf16(global float16* out, bfloat8 a, bfloat16 b, float16 c, int idx, int d)
+{
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index afbe132ba6d5df..090cd3348ee837 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -37,6 +37,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out6 = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a6, b6, c6, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x64_f16' needs target feature gfx950-insts}}
*out7 = __builtin_amdgcn_smfmac_f32_32x32x32_f16(a7, b7, c7, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x32_f16' needs target feature gfx950-insts}}
*out8 = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a8, b8, c8, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x64_bf16' needs target feature gfx950-insts}}
+ *out9 = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a9, b9, c9, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x32_bf16' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0e207be4399dbf..a02efac1a86b1b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3155,6 +3155,7 @@ def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v1
def int_amdgcn_smfmac_f32_16x16x64_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>;
def int_amdgcn_smfmac_f32_32x32x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>;
def int_amdgcn_smfmac_f32_16x16x64_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>;
+def int_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 75553790b98e56..c7e724b202003a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1093,6 +1093,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
return selectSMFMACIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
@@ -3494,6 +3495,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
+ break;
default:
llvm_unreachable("unhandled smfmac intrinsic");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 519bbc9935c81d..d7e0c6ee3e0e32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4807,7 +4807,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
- case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: {
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: {
// vdst, srcA, srcB, srcC, idx
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f608c5a93f0e07..82fed425621d43 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2873,6 +2873,7 @@ def VOP_V4F32_V8F16_V16F16_I32 : VOPProfile <[v4f32, v8f16, v16f16, i32]>;
def VOP_V4F32_V8BF16_V16BF16_I32 : VOPProfile <[v4f32, v8bf16, v16bf16, i32]>;
def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
def VOP_V16F32_V8F16_V16F16_I32 : VOPProfile <[v16f32, v8f16, v16f16, i32]>;
+def VOP_V16F32_V8BF16_V16BF16_I32 : VOPProfile <[v16f32, v8bf16, v16bf16, i32]>;
def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 467415c1a47588..b03b3572b5d539 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -632,6 +632,7 @@ def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I
def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
+def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
@@ -1049,6 +1050,7 @@ let SubtargetPredicate = HasGFX950Insts in {
defm V_SMFMAC_F32_16X16X64_F16 : SMFMACInst<"v_smfmac_f32_16x16x64_f16", "F32_16X16X64_F16", int_amdgcn_smfmac_f32_16x16x64_f16>;
defm V_SMFMAC_F32_32X32X32_F16 : SMFMACInst<"v_smfmac_f32_32x32x32_f16", "F32_32X32X32_F16", int_amdgcn_smfmac_f32_32x32x32_f16>;
defm V_SMFMAC_F32_16X16X64_BF16 : SMFMACInst<"v_smfmac_f32_16x16x64_bf16", "F32_16X16X64_BF16", int_amdgcn_smfmac_f32_16x16x64_bf16>;
+defm V_SMFMAC_F32_32X32X32_BF16 : SMFMACInst<"v_smfmac_f32_32x32x32_bf16", "F32_32X32X32_BF16", int_amdgcn_smfmac_f32_32x32x32_bf16>;
}
def MAIInstInfoTable : GenericTable {
@@ -2143,6 +2145,7 @@ defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x3
defm V_SMFMAC_F32_16X16X64_F16 : VOP3P_Real_SMFMAC <0x5a, "v_smfmac_f32_16x16x64f16">;
defm V_SMFMAC_F32_32X32X32_F16 : VOP3P_Real_SMFMAC <0x5b, "v_smfmac_f32_32x32x32f16">;
defm V_SMFMAC_F32_16X16X64_BF16 : VOP3P_Real_SMFMAC <0x39, "v_smfmac_f32_16x16x64bf16">;
+defm V_SMFMAC_F32_32X32X32_BF16 : VOP3P_Real_SMFMAC <0x46, "v_smfmac_f32_32x32x32bf16">;
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index fd878e870a3fa6..dadf2d99e74e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -953,6 +953,534 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
ret <4 x float> %result
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.smfmac.f32.32x32x32.bf16
+; --------------------------------------------------------------------
+
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; SDAG: ; %bb.0: ; %bb
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v28, s16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
+ %in.1 = load <16 x float>, ptr addrspace(1) %gep
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v48
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v49
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v50
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v51
+; GISEL-NEXT: v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v32
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v33
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v34
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v35
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v36
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; GISEL-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, ...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes
Patch is 40.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117212.diff
12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5a6c3e0d701f09..e93f570a6353b5 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -447,6 +447,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x32_i8, "V16iV4iV4iV16iIiIiIi", "n
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_f16, "V4fV8hV16hV4fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_f16, "V16fV8hV16hV16fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf16, "V4fV8yV16yV4fiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf16, "V16fV8yV16yV16fiIiIi", "nc", "gfx950-insts")
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 833c894252bb61..1977abaec9d575 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -489,4 +489,11 @@ void test_smfmac_f32_16x16x64_bf16(global v4f* out, v8bf16 a, v16bf16 b, v4f c,
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, 0);
}
+// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x32_bf16
+// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_32x32x32_bf16(global v16f* out, v8bf16 a, v16bf16 b, v16f c, int idx)
+{
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, 0);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index b00058a6359c46..a80926c2644851 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -82,3 +82,9 @@ void test_smfmac_f32_16x16x64_bf16(global float4* out, bfloat8 a, bfloat16 b, fl
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf16' must be a constant integer}}
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf16' must be a constant integer}}
}
+
+void test_smfmac_f32_32x32x32_bf16(global float16* out, bfloat8 a, bfloat16 b, float16 c, int idx, int d)
+{
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index afbe132ba6d5df..090cd3348ee837 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -37,6 +37,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out6 = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a6, b6, c6, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x64_f16' needs target feature gfx950-insts}}
*out7 = __builtin_amdgcn_smfmac_f32_32x32x32_f16(a7, b7, c7, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x32_f16' needs target feature gfx950-insts}}
*out8 = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a8, b8, c8, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x64_bf16' needs target feature gfx950-insts}}
+ *out9 = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a9, b9, c9, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x32_bf16' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0e207be4399dbf..a02efac1a86b1b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3155,6 +3155,7 @@ def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v1
def int_amdgcn_smfmac_f32_16x16x64_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>;
def int_amdgcn_smfmac_f32_32x32x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>;
def int_amdgcn_smfmac_f32_16x16x64_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>;
+def int_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 75553790b98e56..c7e724b202003a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1093,6 +1093,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
return selectSMFMACIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
@@ -3494,6 +3495,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
+ break;
default:
llvm_unreachable("unhandled smfmac intrinsic");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 519bbc9935c81d..d7e0c6ee3e0e32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4807,7 +4807,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
- case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: {
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: {
// vdst, srcA, srcB, srcC, idx
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f608c5a93f0e07..82fed425621d43 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2873,6 +2873,7 @@ def VOP_V4F32_V8F16_V16F16_I32 : VOPProfile <[v4f32, v8f16, v16f16, i32]>;
def VOP_V4F32_V8BF16_V16BF16_I32 : VOPProfile <[v4f32, v8bf16, v16bf16, i32]>;
def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
def VOP_V16F32_V8F16_V16F16_I32 : VOPProfile <[v16f32, v8f16, v16f16, i32]>;
+def VOP_V16F32_V8BF16_V16BF16_I32 : VOPProfile <[v16f32, v8bf16, v16bf16, i32]>;
def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 467415c1a47588..b03b3572b5d539 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -632,6 +632,7 @@ def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I
def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
+def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
@@ -1049,6 +1050,7 @@ let SubtargetPredicate = HasGFX950Insts in {
defm V_SMFMAC_F32_16X16X64_F16 : SMFMACInst<"v_smfmac_f32_16x16x64_f16", "F32_16X16X64_F16", int_amdgcn_smfmac_f32_16x16x64_f16>;
defm V_SMFMAC_F32_32X32X32_F16 : SMFMACInst<"v_smfmac_f32_32x32x32_f16", "F32_32X32X32_F16", int_amdgcn_smfmac_f32_32x32x32_f16>;
defm V_SMFMAC_F32_16X16X64_BF16 : SMFMACInst<"v_smfmac_f32_16x16x64_bf16", "F32_16X16X64_BF16", int_amdgcn_smfmac_f32_16x16x64_bf16>;
+defm V_SMFMAC_F32_32X32X32_BF16 : SMFMACInst<"v_smfmac_f32_32x32x32_bf16", "F32_32X32X32_BF16", int_amdgcn_smfmac_f32_32x32x32_bf16>;
}
def MAIInstInfoTable : GenericTable {
@@ -2143,6 +2145,7 @@ defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x3
defm V_SMFMAC_F32_16X16X64_F16 : VOP3P_Real_SMFMAC <0x5a, "v_smfmac_f32_16x16x64f16">;
defm V_SMFMAC_F32_32X32X32_F16 : VOP3P_Real_SMFMAC <0x5b, "v_smfmac_f32_32x32x32f16">;
defm V_SMFMAC_F32_16X16X64_BF16 : VOP3P_Real_SMFMAC <0x39, "v_smfmac_f32_16x16x64bf16">;
+defm V_SMFMAC_F32_32X32X32_BF16 : VOP3P_Real_SMFMAC <0x46, "v_smfmac_f32_32x32x32bf16">;
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index fd878e870a3fa6..dadf2d99e74e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -953,6 +953,534 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
ret <4 x float> %result
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.smfmac.f32.32x32x32.bf16
+; --------------------------------------------------------------------
+
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; SDAG: ; %bb.0: ; %bb
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v28, s16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
+ %in.1 = load <16 x float>, ptr addrspace(1) %gep
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v48
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v49
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v50
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v51
+; GISEL-NEXT: v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v32
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v33
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v34
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v35
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v36
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; GISEL-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, ...
[truncated]
|
@llvm/pr-subscribers-llvm-ir

Author: Matt Arsenault (arsenm)

Changes

Patch is 40.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117212.diff

12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5a6c3e0d701f09..e93f570a6353b5 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -447,6 +447,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x32_i8, "V16iV4iV4iV16iIiIiIi", "n
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_f16, "V4fV8hV16hV4fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_f16, "V16fV8hV16hV16fiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf16, "V4fV8yV16yV4fiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf16, "V16fV8yV16yV16fiIiIi", "nc", "gfx950-insts")
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 833c894252bb61..1977abaec9d575 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -489,4 +489,11 @@ void test_smfmac_f32_16x16x64_bf16(global v4f* out, v8bf16 a, v16bf16 b, v4f c,
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, 0);
}
+// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x32_bf16
+// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_32x32x32_bf16(global v16f* out, v8bf16 a, v16bf16 b, v16f c, int idx)
+{
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, 0);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index b00058a6359c46..a80926c2644851 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -82,3 +82,9 @@ void test_smfmac_f32_16x16x64_bf16(global float4* out, bfloat8 a, bfloat16 b, fl
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf16' must be a constant integer}}
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf16' must be a constant integer}}
}
+
+void test_smfmac_f32_32x32x32_bf16(global float16* out, bfloat8 a, bfloat16 b, float16 c, int idx, int d)
+{
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index afbe132ba6d5df..090cd3348ee837 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -37,6 +37,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out6 = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a6, b6, c6, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x64_f16' needs target feature gfx950-insts}}
*out7 = __builtin_amdgcn_smfmac_f32_32x32x32_f16(a7, b7, c7, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x32_f16' needs target feature gfx950-insts}}
*out8 = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a8, b8, c8, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x64_bf16' needs target feature gfx950-insts}}
+ *out9 = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a9, b9, c9, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_32x32x32_bf16' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0e207be4399dbf..a02efac1a86b1b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3155,6 +3155,7 @@ def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v1
def int_amdgcn_smfmac_f32_16x16x64_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>;
def int_amdgcn_smfmac_f32_32x32x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>;
def int_amdgcn_smfmac_f32_16x16x64_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>;
+def int_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty, llvm_v16bf16_ty>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 75553790b98e56..c7e724b202003a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1093,6 +1093,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
return selectSMFMACIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
@@ -3494,6 +3495,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
+ break;
default:
llvm_unreachable("unhandled smfmac intrinsic");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 519bbc9935c81d..d7e0c6ee3e0e32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4807,7 +4807,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
- case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: {
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: {
// vdst, srcA, srcB, srcC, idx
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f608c5a93f0e07..82fed425621d43 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2873,6 +2873,7 @@ def VOP_V4F32_V8F16_V16F16_I32 : VOPProfile <[v4f32, v8f16, v16f16, i32]>;
def VOP_V4F32_V8BF16_V16BF16_I32 : VOPProfile <[v4f32, v8bf16, v16bf16, i32]>;
def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
def VOP_V16F32_V8F16_V16F16_I32 : VOPProfile <[v16f32, v8f16, v16f16, i32]>;
+def VOP_V16F32_V8BF16_V16BF16_I32 : VOPProfile <[v16f32, v8bf16, v16bf16, i32]>;
def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 467415c1a47588..b03b3572b5d539 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -632,6 +632,7 @@ def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I
def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
+def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>;
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
@@ -1049,6 +1050,7 @@ let SubtargetPredicate = HasGFX950Insts in {
defm V_SMFMAC_F32_16X16X64_F16 : SMFMACInst<"v_smfmac_f32_16x16x64_f16", "F32_16X16X64_F16", int_amdgcn_smfmac_f32_16x16x64_f16>;
defm V_SMFMAC_F32_32X32X32_F16 : SMFMACInst<"v_smfmac_f32_32x32x32_f16", "F32_32X32X32_F16", int_amdgcn_smfmac_f32_32x32x32_f16>;
defm V_SMFMAC_F32_16X16X64_BF16 : SMFMACInst<"v_smfmac_f32_16x16x64_bf16", "F32_16X16X64_BF16", int_amdgcn_smfmac_f32_16x16x64_bf16>;
+defm V_SMFMAC_F32_32X32X32_BF16 : SMFMACInst<"v_smfmac_f32_32x32x32_bf16", "F32_32X32X32_BF16", int_amdgcn_smfmac_f32_32x32x32_bf16>;
}
def MAIInstInfoTable : GenericTable {
@@ -2143,6 +2145,7 @@ defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x3
defm V_SMFMAC_F32_16X16X64_F16 : VOP3P_Real_SMFMAC <0x5a, "v_smfmac_f32_16x16x64f16">;
defm V_SMFMAC_F32_32X32X32_F16 : VOP3P_Real_SMFMAC <0x5b, "v_smfmac_f32_32x32x32f16">;
defm V_SMFMAC_F32_16X16X64_BF16 : VOP3P_Real_SMFMAC <0x39, "v_smfmac_f32_16x16x64bf16">;
+defm V_SMFMAC_F32_32X32X32_BF16 : VOP3P_Real_SMFMAC <0x46, "v_smfmac_f32_32x32x32bf16">;
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index fd878e870a3fa6..dadf2d99e74e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -953,6 +953,534 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
ret <4 x float> %result
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.smfmac.f32.32x32x32.bf16
+; --------------------------------------------------------------------
+
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; SDAG: ; %bb.0: ; %bb
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v28, s16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
+ %in.1 = load <16 x float>, ptr addrspace(1) %gep
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v48
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v49
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v50
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v51
+; GISEL-NEXT: v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v32
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v33
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v34
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v35
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v36
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; GISEL-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, ...
[truncated]
|
No description provided.