Skip to content

Commit d0e6456

Browse files
arsenm and pravinjagtap
authored and committed
AMDGPU: Add v_smfmac_f32_16x16x128_fp8_fp8 for gfx950 (llvm#117235)
1 parent 8211467 commit d0e6456

File tree

12 files changed

+304
-1
lines changed

12 files changed

+304
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x64_i8, "V16iV4iV8iV16iiIiIi", "
453453
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
454454
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
455455
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
456+
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
456457

457458
//===----------------------------------------------------------------------===//
458459
// GFX12+ only builtins.

clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,4 +531,11 @@ void test_smfmac_f32_16x16x128_fp8_bf8(global v4f* out, v4i a, v8i b, v4f c, int
531531
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, 0, 0);
532532
}
533533

534+
// Code generation test for the fp8_fp8 variant of the 16x16x128 smfmac
// builtin. With both trailing constant arguments set to 0, the FileCheck
// lines below require a call to the matching llvm.amdgcn intrinsic that
// receives a, b, c and idx, followed by two i32 0 operands.
// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_fp8_fp8
535+
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
536+
void test_smfmac_f32_16x16x128_fp8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
537+
{
538+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, 0, 0);
539+
}
540+
534541
#endif

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,9 @@ void test_smfmac_f32_16x16x128_fp8_bf8(global float4* out, int4 a, int8 b, float
118118
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8' must be a constant integer}}
119119
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8' must be a constant integer}}
120120
}
121+
122+
// Negative test for the fp8_fp8 smfmac builtin: the run-time value d is
// passed first as the fifth and then as the sixth argument, and each call
// is annotated with the diagnostic saying the argument must be a constant
// integer.
void test_smfmac_f32_16x16x128_fp8_fp8(global float4* out, int4 a, int8 b, float4 c, int idx, int d)
123+
{
124+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8' must be a constant integer}}
125+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8' must be a constant integer}}
126+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
4343
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' needs target feature gfx950-insts}}
4444
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' needs target feature gfx950-insts}}
4545
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8' needs target feature gfx950-insts}}
46+
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8' needs target feature gfx950-insts}}
4647
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
4748
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
4849
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3188,6 +3188,7 @@ def int_amdgcn_smfmac_i32_32x32x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, l
31883188
def int_amdgcn_smfmac_f32_16x16x128_bf8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31893189
def int_amdgcn_smfmac_f32_16x16x128_bf8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31903190
def int_amdgcn_smfmac_f32_16x16x128_fp8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
3191+
def int_amdgcn_smfmac_f32_16x16x128_fp8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31913192
}
31923193

31933194
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,6 +1084,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
10841084
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
10851085
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
10861086
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1087+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
10871088
return selectSMFMACIntrin(I);
10881089
default:
10891090
return selectImpl(I, *CoverageInfo);
@@ -3529,6 +3530,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
35293530
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
35303531
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
35313532
break;
3533+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3534+
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3535+
break;
35323536
default:
35333537
llvm_unreachable("unhandled smfmac intrinsic");
35343538
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4790,7 +4790,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
47904790
case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
47914791
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
47924792
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4793-
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: {
4793+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4794+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: {
47944795
// vdst, srcA, srcB, srcC, idx
47954796
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
47964797
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,7 @@ defm V_SMFMAC_I32_32X32X64_I8 : SMFMACInst<"v_smfmac_i32_32x32x64_i8",
10681068
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_bf8>;
10691069
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_fp8>;
10701070
defm V_SMFMAC_F32_16X16X128_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_fp8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_fp8_bf8>;
1071+
defm V_SMFMAC_F32_16X16X128_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_fp8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_fp8_fp8>;
10711072
}
10721073

10731074
def MAIInstInfoTable : GenericTable {
@@ -2176,6 +2177,7 @@ defm V_SMFMAC_I32_32X32X64_I8 : VOP3P_Real_SMFMAC <0x47, "v_smfmac_i32_32x3
21762177
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : VOP3P_Real_SMFMAC <0x3b, "v_smfmac_f32_16x16x128bf8bf8">;
21772178
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : VOP3P_Real_SMFMAC <0x3c, "v_smfmac_f32_16x16x128bf8fp8">;
21782179
defm V_SMFMAC_F32_16X16X128_FP8_BF8 : VOP3P_Real_SMFMAC <0x3d, "v_smfmac_f32_16x16x128fp8bf8">;
2180+
defm V_SMFMAC_F32_16X16X128_FP8_FP8 : VOP3P_Real_SMFMAC <0x43, "v_smfmac_f32_16x16x128fp8fp8">;
21792181

21802182
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
21812183
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,15 @@ define amdgpu_kernel void @smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i3
395395
ret void
396396
}
397397

398+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
399+
400+
; Uniformity test for the fp8.fp8 smfmac intrinsic: the kernel calls it on
; its own arguments with immediates 1 and 2 and stores the result to %out.
; The check line below expects the call result to be reported as divergent.
; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
401+
define amdgpu_kernel void @smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
402+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
403+
store <4 x float> %result, ptr addrspace(1) %out
404+
ret void
405+
}
406+
398407
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
399408
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
400409
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2663,4 +2663,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
26632663
ret <4 x float> %result
26642664
}
26652665

2666+
; --------------------------------------------------------------------
2667+
; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
2668+
; --------------------------------------------------------------------
2669+
2670+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
2671+
2672+
; Kernel variant: the accumulator operand is loaded from %arg at the index
; given by the workitem id, the intrinsic is called with immediates 1 and 2,
; and the result is stored back through %arg. Both sets of expected assembly
; below show the instruction with cbsz:1 abid:2.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2673+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2674+
; SDAG: ; %bb.0: ; %bb
2675+
; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2676+
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2677+
; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2678+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2679+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2680+
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2681+
; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2682+
; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2683+
; SDAG-NEXT: v_mov_b32_e32 v12, s4
2684+
; SDAG-NEXT: v_mov_b32_e32 v13, s5
2685+
; SDAG-NEXT: v_mov_b32_e32 v14, s6
2686+
; SDAG-NEXT: v_mov_b32_e32 v15, s7
2687+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2688+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2689+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2690+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2691+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2692+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2693+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2694+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2695+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2696+
; SDAG-NEXT: v_mov_b32_e32 v17, s16
2697+
; SDAG-NEXT: s_waitcnt vmcnt(0)
2698+
; SDAG-NEXT: s_nop 0
2699+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2700+
; SDAG-NEXT: s_nop 6
2701+
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2702+
; SDAG-NEXT: s_endpgm
2703+
;
2704+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2705+
; GISEL: ; %bb.0: ; %bb
2706+
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2707+
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2708+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2709+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2710+
; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2711+
; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2712+
; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2713+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2714+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2715+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2716+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2717+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2718+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2719+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2720+
; GISEL-NEXT: v_mov_b32_e32 v16, s16
2721+
; GISEL-NEXT: s_waitcnt vmcnt(0)
2722+
; GISEL-NEXT: s_nop 0
2723+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2724+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2725+
; GISEL-NEXT: s_nop 5
2726+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2727+
; GISEL-NEXT: s_endpgm
2728+
bb:
2729+
%id = call i32 @llvm.amdgcn.workitem.id.x()
2730+
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
2731+
%in.1 = load <4 x float>, ptr addrspace(1) %gep
2732+
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
2733+
store <4 x float> %mai.1, ptr addrspace(1) %arg
2734+
ret void
2735+
}
2736+
2737+
; Baseline variant: a plain function that passes its arguments straight to
; the intrinsic with both immediates set to 0 and returns the result. The
; expected instruction lines below carry no cbsz or abid modifier.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2738+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2739+
; SDAG: ; %bb.0:
2740+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2741+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2742+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2743+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2744+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2745+
; SDAG-NEXT: s_nop 1
2746+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
2747+
; SDAG-NEXT: s_nop 6
2748+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2749+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2750+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2751+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2752+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2753+
;
2754+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2755+
; GISEL: ; %bb.0:
2756+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2757+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
2758+
; GISEL-NEXT: s_nop 6
2759+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2760+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2761+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2762+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2763+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2764+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2765+
ret <4 x float> %result
2766+
}
2767+
2768+
; Modifier variant: same shape as the baseline function, but the intrinsic
; is called with immediates 1 and 3. The expected instruction lines below
; show them as cbsz:1 abid:3.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2769+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2770+
; SDAG: ; %bb.0:
2771+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2772+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2773+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2774+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2775+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2776+
; SDAG-NEXT: s_nop 1
2777+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2778+
; SDAG-NEXT: s_nop 6
2779+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2780+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2781+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2782+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2783+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2784+
;
2785+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2786+
; GISEL: ; %bb.0:
2787+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2788+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2789+
; GISEL-NEXT: s_nop 6
2790+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2791+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2792+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2793+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2794+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2795+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
2796+
ret <4 x float> %result
2797+
}
2798+
2799+
; Modifier variant with the immediates swapped relative to __flags0: the
; intrinsic is called with 3 and 1, and the expected instruction lines below
; show them as cbsz:3 abid:1.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2800+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2801+
; SDAG: ; %bb.0:
2802+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2803+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2804+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2805+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2806+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2807+
; SDAG-NEXT: s_nop 1
2808+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2809+
; SDAG-NEXT: s_nop 6
2810+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2811+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2812+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2813+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2814+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2815+
;
2816+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2817+
; GISEL: ; %bb.0:
2818+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2819+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2820+
; GISEL-NEXT: s_nop 6
2821+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2822+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2823+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2824+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2825+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2826+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2827+
ret <4 x float> %result
2828+
}
2829+
2830+
; inreg variant: every argument is marked inreg and both immediates are 0.
; In the expected assembly below the inputs start in s registers and are
; copied into v registers (plus a registers for the accumulator in the SDAG
; output) before the smfmac instruction is issued.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
2831+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2832+
; SDAG: ; %bb.0:
2833+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2834+
; SDAG-NEXT: v_mov_b32_e32 v8, s0
2835+
; SDAG-NEXT: v_mov_b32_e32 v9, s1
2836+
; SDAG-NEXT: v_mov_b32_e32 v10, s2
2837+
; SDAG-NEXT: v_mov_b32_e32 v11, s3
2838+
; SDAG-NEXT: v_mov_b32_e32 v0, s4
2839+
; SDAG-NEXT: v_mov_b32_e32 v1, s5
2840+
; SDAG-NEXT: v_mov_b32_e32 v2, s6
2841+
; SDAG-NEXT: v_mov_b32_e32 v3, s7
2842+
; SDAG-NEXT: v_mov_b32_e32 v4, s8
2843+
; SDAG-NEXT: v_mov_b32_e32 v5, s9
2844+
; SDAG-NEXT: v_mov_b32_e32 v6, s10
2845+
; SDAG-NEXT: v_mov_b32_e32 v7, s11
2846+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2847+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2848+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2849+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2850+
; SDAG-NEXT: v_mov_b32_e32 v12, s16
2851+
; SDAG-NEXT: s_nop 1
2852+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
2853+
; SDAG-NEXT: s_nop 6
2854+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2855+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2856+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2857+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2858+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2859+
;
2860+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2861+
; GISEL: ; %bb.0:
2862+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2863+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2864+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2865+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2866+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2867+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2868+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2869+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2870+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2871+
; GISEL-NEXT: v_mov_b32_e32 v16, s16
2872+
; GISEL-NEXT: s_nop 1
2873+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
2874+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2875+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2876+
ret <4 x float> %result
2877+
}
2878+
26662879
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }

0 commit comments

Comments
 (0)