Skip to content

AMDGPU: Add v_mfma_f32_16x16x32_bf16 for gfx950 #117053

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4, "V4fV8ZiV8ZiV4f
TARGET_BUILTIN(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4, "V16fV8ZiV8ZiV16fIiIiIiiIii", "nc", "gfx950-insts")

// Dense (non-scaled) MFMA builtins gated on the "gfx950-insts" target feature.
// NOTE(review): in the type strings, "V<n><t>" appears to denote a vector of n
// elements (f = float, h = half, y = bfloat16, i = int) and "Ii" an int that
// must be an integer constant expression -- confirm against clang's builtin
// type-encoding documentation.
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_bf16, "V4fV8yV8yV4fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x64_i8, "V4iV4iV4iV4iIiIiIi", "nc", "gfx950-insts")
Expand Down
7 changes: 7 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
Original file line number Diff line number Diff line change
Expand Up @@ -460,4 +460,11 @@ v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
}

// Verifies that the bf16 16x16x32 MFMA builtin is emitted as a call to the
// llvm.amdgcn.mfma.f32.16x16x32.bf16 intrinsic, with the two bf16 vectors,
// the float accumulator and the three trailing immediates (1, 2, 3)
// forwarded in the same order.
// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
{
return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
}

#endif
7 changes: 7 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,10 @@ void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
}

// Each of the three trailing arguments of the bf16 16x16x32 MFMA builtin has
// to be an integer constant. The runtime value X is passed in one of those
// positions at a time, and every such call must be diagnosed.
void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, float4 c, int X) {

*out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
*out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
}
1 change: 1 addition & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
*out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
*out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
*out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3148,7 +3148,7 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;

// bf16 MFMA intrinsics taking v8bf16 inputs.
// NOTE(review): the template arguments look like <result/accumulator type,
// A/B source type> -- confirm against the AMDGPUMfmaIntrinsic class.
def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4751,7 +4751,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2880,6 +2880,7 @@ def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
// VOPProfile records whose names spell out the value types in their list.
// NOTE(review): the list order appears to be [dst, src0, src1, src2] --
// confirm against the VOPProfile class definition.
def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>;
def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
def VOP_V4F32_V8BF16_V8BF16_V4F32 : VOPProfile <[v4f32, v8bf16, v8bf16, v4f32]>;
def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;

def VOP_V4F32_V8I32_V6I32_V4F32 : VOPProfile <[v4f32, v8i32, v6i32, v4f32]>;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,9 @@ def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F3
def VOPProfileMAI_F32_V8BF16_X16 : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, AISrc_512_f32, ADst_512, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32, VISrc_512_f32, VDst_512, AVSrc_128>;

// MAI profiles built on VOP_V4F32_V8BF16_V8BF16_V4F32 (v4f32 result, two
// v8bf16 sources, v4f32 accumulator).
// NOTE(review): the _VCD variant swaps AISrc_128_f32/ADst_128 for
// VISrc_128_f32/VDst_128 -- presumably VGPR instead of AGPR operands for
// srcC and dst; confirm against the operand definitions.
def VOPProfileMAI_F32_V8BF16_X4 : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>;
def VOPProfileMAI_F32_V8BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V8BF16_V8BF16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>;


let HasAbid = false in {
// For f32_16x16x128_f8f6f4 - f8 x f8 case
Expand Down Expand Up @@ -952,6 +955,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
// MAIInst instantiations. Arguments, in order: an opcode name string, a
// profile suffix, and the intrinsic the instruction is associated with.
// NOTE(review): the suffix presumably gets a "VOPProfileMAI_" prefix to pick
// the profile record (e.g. "F32_V8BF16_X4") -- confirm against MAIInst.
defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
defm V_MFMA_F32_16X16X32_BF16 : MAIInst<"v_mfma_f32_16x16x32bf16", "F32_V8BF16_X4", int_amdgcn_mfma_f32_16x16x32_bf16>;
defm V_MFMA_I32_16X16X64_I8 : MAIInst<"v_mfma_i32_16x16x64i8", "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
defm V_MFMA_I32_32X32X32_I8 : MAIInst<"v_mfma_i32_32x32x32i8", "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
Expand Down Expand Up @@ -2078,6 +2082,7 @@ defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;

// gfx950 real-instruction records for the MFMA pseudos. The second argument
// is the assembly mnemonic string.
// NOTE(review): the first (hex) argument is presumably the encoding opcode --
// confirm against VOP3P_Real_MFMA_gfx950 and the ISA opcode table.
defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
defm V_MFMA_F32_16X16X32_BF16 : VOP3P_Real_MFMA_gfx950 <0x35, "v_mfma_f32_16x16x32_bf16">;
defm V_MFMA_I32_16X16X64_I8 : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
defm V_MFMA_I32_32X32X32_I8 : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
Expand Down
9 changes: 9 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,15 @@ define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1
ret void
}

; Uniformity analysis test: the value produced by the bf16 16x16x32 MFMA
; intrinsic must be reported as divergent. The kernel only calls the
; intrinsic on its arguments and stores the result to %out.
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)

; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
store <4 x float> %result, ptr addrspace(1) %out
ret void
}

declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
Expand Down
198 changes: 198 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1315,5 +1315,203 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
ret void
}

; --------------------------------------------------------------------
; llvm.amdgcn.mfma.f32.16x16x32.bf16
; --------------------------------------------------------------------

declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)

; Non-kernel function calling the intrinsic with all three immediates zero.
; In the checked output the accumulator is copied into a[0:3] with
; v_accvgpr_write_b32 before the MFMA and the result is copied back to
; v[0:3] with v_accvgpr_read_b32 afterwards; no modifiers are printed on the
; MFMA itself.
define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}

; Non-kernel function calling the intrinsic with 1 for each of the three
; immediates. The checked output prints them on the MFMA as
; cbsz:1 abid:1 blgp:1; the surrounding AGPR copies are otherwise the same
; shape as in the zero-immediate case.
define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4
; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6
; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
ret <4 x float> %result
}

; Kernel using attribute group #0, with all three immediates zero. In the
; checked output the accumulator input (v[8:11]) and the MFMA result (v[0:3])
; are both VGPR ranges and no v_accvgpr copies appear; the result is stored
; straight to %out.
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 4
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
store <4 x float> %result, ptr addrspace(1) %out
ret void
}

; Kernel using attribute group #0, with distinct immediates 3, 2, 1. The
; checked output prints them as cbsz:3 abid:2 blgp:1, which pins the order
; in which the three immediates map onto the modifiers. Accumulator input and
; result are VGPR ranges here, with no v_accvgpr copies.
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 6
; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 4
; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
store <4 x float> %result, ptr addrspace(1) %out
ret void
}

attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
Loading
Loading