Skip to content

Commit e424fee

Browse files
arsenm authored and pravinjagtap committed
AMDGPU: Add v_smfmac_f32_16x16x128_bf8_fp8 for gfx950 (llvm#117233)
1 parent 0b995ed commit e424fee

File tree

12 files changed

+305
-1
lines changed

12 files changed

+305
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,7 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf16, "V16fV8yV16yV16fiIiIi"
451451
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x128_i8, "V4iV4iV8iV4iiIiIi", "nc", "gfx950-insts")
452452
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x64_i8, "V16iV4iV8iV16iiIiIi", "nc", "gfx950-insts")
453453
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
454+
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
454455

455456
//===----------------------------------------------------------------------===//
456457
// GFX12+ only builtins.

clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,4 +517,11 @@ void test_smfmac_f32_16x16x128_bf8_bf8(global v4f* out, v4i a, v8i b, v4f c, int
517517
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, 0, 0);
518518
}
519519

520+
// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_bf8_fp8
521+
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
522+
void test_smfmac_f32_16x16x128_bf8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
523+
{
524+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, 0, 0);
525+
}
526+
520527
#endif

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,9 @@ void test_smfmac_f32_16x16x128_bf8_bf8(global float4* out, int4 a, int8 b, float
106106
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' must be a constant integer}}
107107
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' must be a constant integer}}
108108
}
109+
110+
void test_smfmac_f32_16x16x128_bf8_fp8(global float4* out, int4 a, int8 b, float4 c, int idx, int d)
111+
{
112+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' must be a constant integer}}
113+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' must be a constant integer}}
114+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
4141
*out10 = __builtin_amdgcn_smfmac_i32_16x16x128_i8(a10, b10, c10, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_i32_16x16x128_i8' needs target feature gfx950-insts}}
4242
*out11 = __builtin_amdgcn_smfmac_i32_32x32x64_i8(a11, b11, c11, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_i32_32x32x64_i8' needs target feature gfx950-insts}}
4343
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' needs target feature gfx950-insts}}
44+
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' needs target feature gfx950-insts}}
4445
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
4546
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
4647
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3186,6 +3186,7 @@ def int_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty,
31863186
def int_amdgcn_smfmac_i32_16x16x128_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31873187
def int_amdgcn_smfmac_i32_32x32x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31883188
def int_amdgcn_smfmac_f32_16x16x128_bf8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
3189+
def int_amdgcn_smfmac_f32_16x16x128_bf8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31893190
}
31903191

31913192
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,6 +1082,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
10821082
case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
10831083
case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
10841084
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1085+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
10851086
return selectSMFMACIntrin(I);
10861087
default:
10871088
return selectImpl(I, *CoverageInfo);
@@ -3521,6 +3522,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
35213522
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
35223523
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
35233524
break;
3525+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3526+
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3527+
break;
35243528
default:
35253529
llvm_unreachable("unhandled smfmac intrinsic");
35263530
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4788,7 +4788,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
47884788
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
47894789
case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
47904790
case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4791-
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: {
4791+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4792+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: {
47924793
// vdst, srcA, srcB, srcC, idx
47934794
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
47944795
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,6 +1066,7 @@ defm V_SMFMAC_F32_32X32X32_BF16 : SMFMACInst<"v_smfmac_f32_32x32x32_bf16",
10661066
defm V_SMFMAC_I32_16X16X128_I8 : SMFMACInst<"v_smfmac_i32_16x16x128_i8", "I32_16X16X128_I8", int_amdgcn_smfmac_i32_16x16x128_i8>;
10671067
defm V_SMFMAC_I32_32X32X64_I8 : SMFMACInst<"v_smfmac_i32_32x32x64_i8", "I32_32X32X64_I8", int_amdgcn_smfmac_i32_32x32x64_i8>;
10681068
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_bf8>;
1069+
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_fp8>;
10691070
}
10701071

10711072
def MAIInstInfoTable : GenericTable {
@@ -2172,6 +2173,7 @@ defm V_SMFMAC_I32_16X16X128_I8 : VOP3P_Real_SMFMAC <0x3a, "v_smfmac_i32_16x1
21722173
defm V_SMFMAC_I32_32X32X64_I8 : VOP3P_Real_SMFMAC <0x47, "v_smfmac_i32_32x32x64i8">;
21732174

21742175
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : VOP3P_Real_SMFMAC <0x3b, "v_smfmac_f32_16x16x128bf8bf8">;
2176+
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : VOP3P_Real_SMFMAC <0x3c, "v_smfmac_f32_16x16x128bf8fp8">;
21752177

21762178
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
21772179
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,15 @@ define amdgpu_kernel void @smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i3
377377
ret void
378378
}
379379

380+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
381+
382+
; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
383+
define amdgpu_kernel void @smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
384+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
385+
store <4 x float> %result, ptr addrspace(1) %out
386+
ret void
387+
}
388+
380389
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
381390
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
382391
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2237,4 +2237,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
22372237
ret <4 x float> %result
22382238
}
22392239

2240+
; --------------------------------------------------------------------
2241+
; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
2242+
; --------------------------------------------------------------------
2243+
2244+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
2245+
2246+
define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2247+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2248+
; SDAG: ; %bb.0: ; %bb
2249+
; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2250+
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2251+
; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2252+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2253+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2254+
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2255+
; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2256+
; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2257+
; SDAG-NEXT: v_mov_b32_e32 v12, s4
2258+
; SDAG-NEXT: v_mov_b32_e32 v13, s5
2259+
; SDAG-NEXT: v_mov_b32_e32 v14, s6
2260+
; SDAG-NEXT: v_mov_b32_e32 v15, s7
2261+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2262+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2263+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2264+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2265+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2266+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2267+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2268+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2269+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2270+
; SDAG-NEXT: v_mov_b32_e32 v17, s16
2271+
; SDAG-NEXT: s_waitcnt vmcnt(0)
2272+
; SDAG-NEXT: s_nop 0
2273+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2274+
; SDAG-NEXT: s_nop 6
2275+
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2276+
; SDAG-NEXT: s_endpgm
2277+
;
2278+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2279+
; GISEL: ; %bb.0: ; %bb
2280+
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2281+
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2282+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2283+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2284+
; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2285+
; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2286+
; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2287+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2288+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2289+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2290+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2291+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2292+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2293+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2294+
; GISEL-NEXT: v_mov_b32_e32 v16, s16
2295+
; GISEL-NEXT: s_waitcnt vmcnt(0)
2296+
; GISEL-NEXT: s_nop 0
2297+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2298+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2299+
; GISEL-NEXT: s_nop 5
2300+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2301+
; GISEL-NEXT: s_endpgm
2302+
bb:
2303+
%id = call i32 @llvm.amdgcn.workitem.id.x()
2304+
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
2305+
%in.1 = load <4 x float>, ptr addrspace(1) %gep
2306+
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
2307+
store <4 x float> %mai.1, ptr addrspace(1) %arg
2308+
ret void
2309+
}
2310+
2311+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2312+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2313+
; SDAG: ; %bb.0:
2314+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2315+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2316+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2317+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2318+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2319+
; SDAG-NEXT: s_nop 1
2320+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
2321+
; SDAG-NEXT: s_nop 6
2322+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2323+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2324+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2325+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2326+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2327+
;
2328+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2329+
; GISEL: ; %bb.0:
2330+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2331+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
2332+
; GISEL-NEXT: s_nop 6
2333+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2334+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2335+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2336+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2337+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2338+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2339+
ret <4 x float> %result
2340+
}
2341+
2342+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2343+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2344+
; SDAG: ; %bb.0:
2345+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2346+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2347+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2348+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2349+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2350+
; SDAG-NEXT: s_nop 1
2351+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2352+
; SDAG-NEXT: s_nop 6
2353+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2354+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2355+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2356+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2357+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2358+
;
2359+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2360+
; GISEL: ; %bb.0:
2361+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2362+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2363+
; GISEL-NEXT: s_nop 6
2364+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2365+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2366+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2367+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2368+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2369+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
2370+
ret <4 x float> %result
2371+
}
2372+
2373+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2374+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2375+
; SDAG: ; %bb.0:
2376+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2377+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2378+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2379+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2380+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2381+
; SDAG-NEXT: s_nop 1
2382+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2383+
; SDAG-NEXT: s_nop 6
2384+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2385+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2386+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2387+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2388+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2389+
;
2390+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2391+
; GISEL: ; %bb.0:
2392+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2393+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2394+
; GISEL-NEXT: s_nop 6
2395+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2396+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2397+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2398+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2399+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2400+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2401+
ret <4 x float> %result
2402+
}
2403+
2404+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
2405+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2406+
; SDAG: ; %bb.0:
2407+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2408+
; SDAG-NEXT: v_mov_b32_e32 v8, s0
2409+
; SDAG-NEXT: v_mov_b32_e32 v9, s1
2410+
; SDAG-NEXT: v_mov_b32_e32 v10, s2
2411+
; SDAG-NEXT: v_mov_b32_e32 v11, s3
2412+
; SDAG-NEXT: v_mov_b32_e32 v0, s4
2413+
; SDAG-NEXT: v_mov_b32_e32 v1, s5
2414+
; SDAG-NEXT: v_mov_b32_e32 v2, s6
2415+
; SDAG-NEXT: v_mov_b32_e32 v3, s7
2416+
; SDAG-NEXT: v_mov_b32_e32 v4, s8
2417+
; SDAG-NEXT: v_mov_b32_e32 v5, s9
2418+
; SDAG-NEXT: v_mov_b32_e32 v6, s10
2419+
; SDAG-NEXT: v_mov_b32_e32 v7, s11
2420+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2421+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2422+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2423+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2424+
; SDAG-NEXT: v_mov_b32_e32 v12, s16
2425+
; SDAG-NEXT: s_nop 1
2426+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
2427+
; SDAG-NEXT: s_nop 6
2428+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2429+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2430+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2431+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2432+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2433+
;
2434+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2435+
; GISEL: ; %bb.0:
2436+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2438+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2439+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2440+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2441+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2442+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2443+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2444+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2445+
; GISEL-NEXT: v_mov_b32_e32 v16, s16
2446+
; GISEL-NEXT: s_nop 1
2447+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
2448+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2449+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2450+
ret <4 x float> %result
2451+
}
2452+
22402453
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }

0 commit comments

Comments
 (0)