Skip to content

AMDGPU: Add v_mfma_i32_32x32x32_i8 for gfx950 #117052

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x64_i8, "V4iV4iV4iV4iIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x32_i8, "V16iV4iV4iV16iIiIiIi", "nc", "gfx950-insts")

//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
Expand Down
6 changes: 6 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
Original file line number Diff line number Diff line change
Expand Up @@ -452,4 +452,10 @@ v4i test_mfma_i32_16x16x64_i8(v4i a, v4i b, v4i c) {
return __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 1, 2, 3);
}

// Codegen test for __builtin_amdgcn_mfma_i32_32x32x32_i8: the call below passes
// the immediates 1, 2, 3 and is expected to be emitted as a tail call to the
// llvm.amdgcn.mfma.i32.32x32x32.i8 intrinsic, with a and b as <4 x i32> and
// c and the result as <16 x i32>.
// CHECK-GFX950-LABEL: @test_mfma_i32_32x32x32_i8(
// CHECK-GFX950: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 1, i32 2, i32 3)
v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
}

#endif
7 changes: 7 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ typedef half half8 __attribute__((ext_vector_type(8)));
typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
typedef int int4 __attribute__((ext_vector_type(4)));
typedef int int8 __attribute__((ext_vector_type(8)));
typedef int int16 __attribute__((ext_vector_type(16)));


void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
Expand Down Expand Up @@ -48,3 +49,9 @@ void test_mfma_i32_16x16x64_i8(__global int4* out, int4 a, int4 b, int4 c, int X
*out = __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x64_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x64_i8' must be a constant integer}}
}

// Sema test for __builtin_amdgcn_mfma_i32_32x32x32_i8: each of the last three
// arguments must be a constant integer. The runtime value X is substituted
// into one of those positions at a time (4th, then 5th, then 6th argument),
// and every call is required to produce the "must be a constant integer"
// diagnostic.
void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int X) {
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
}
1 change: 1 addition & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
*out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
*out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
1 change: 1 addition & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3147,6 +3147,7 @@ defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;

def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4750,7 +4750,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: {
case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,10 @@ def VOPProfileMAI_F32_V4I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V4I3
def VOPProfileMAI_I32_V4I32_X128 : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32, AISrc_128_f32, ADst_128, AVSrc_128>;
def VOPProfileMAI_I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32, VISrc_128_f32, VDst_128, AVSrc_128>;

// For i32_32x32x32_i8
// Both profiles share the VOP_V16I32_V4I32_V4I32_V16I32 type profile and the
// AVSrc_128 operand class; they differ only in the other two operand classes:
// AISrc_512_b32 / ADst_512 for the base profile versus VISrc_512_b32 /
// VDst_512 for the _VCD profile.
def VOPProfileMAI_I32_V4I32_X16 : VOPProfileMAI<VOP_V16I32_V4I32_V4I32_V16I32, AISrc_512_b32, ADst_512, AVSrc_128>;
def VOPProfileMAI_I32_V4I32_X16_VCD : VOPProfileMAI<VOP_V16I32_V4I32_V4I32_V16I32, VISrc_512_b32, VDst_512, AVSrc_128>;


class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
Expand Down Expand Up @@ -950,6 +954,7 @@ defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
defm V_MFMA_I32_16X16X64_I8 : MAIInst<"v_mfma_i32_16x16x64i8", "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
defm V_MFMA_I32_32X32X32_I8 : MAIInst<"v_mfma_i32_32x32x32i8", "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;

defm V_MFMA_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_16x16x128f8f6f4",
"_X128", mfma_f32_16x16x128_f8f6f4>;
Expand Down Expand Up @@ -2075,6 +2080,7 @@ defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
defm V_MFMA_I32_16X16X64_I8 : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
defm V_MFMA_I32_32X32X32_I8 : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;

defm V_MFMA_LD_SCALE_B32 : VOP3P_Real_vi <0x2c>;
defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mfma_f32_16x16x128_f8f6f4">;
Expand Down
9 changes: 9 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,15 @@ define amdgpu_kernel void @mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1
ret void
}

declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32>, <4 x i32>, <16 x i32>, i32 immarg, i32 immarg, i32 immarg)

; Uniformity test for llvm.amdgcn.mfma.i32.32x32x32.i8: the kernel calls the
; intrinsic on its arguments with all three immediates set to 0 and stores the
; <16 x i32> result to %out. The analysis output is expected to mark the call
; result as divergent.
; CHECK: DIVERGENT: %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) {
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
store <16 x i32> %result, ptr addrspace(1) %out
ret void
}

declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
Expand Down
Loading
Loading