Skip to content

Commit 277ede4

Browse files
arsenm and shiltian
authored and committed
AMDGPU: MC support for v_cvt_sr_{f16|bf16}_f32 instructions (llvm#117796)
Co-authored-by: Shilei Tian <[email protected]>
1 parent 3b999be commit 277ede4

File tree

7 files changed

+97
-0
lines changed

7 files changed

+97
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,12 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
420420
"Has f16bf16 to fp6bf6 conversion scale instructions"
421421
>;
422422

423+
// Subtarget feature gating the v_cvt_sr_f16_f32 / v_cvt_sr_bf16_f32
// instructions. Sets the HasF32ToF16BF16ConversionSRInsts subtarget field to
// true when "f32-to-f16bf16-cvt-sr-insts" is enabled. The description names
// stochastic rounding ("sr"); these are not scale conversions.
def FeatureF32ToF16BF16ConversionSRInsts : SubtargetFeature<"f32-to-f16bf16-cvt-sr-insts",
424+
"HasF32ToF16BF16ConversionSRInsts",
425+
"true",
426+
"Has f32 to f16bf16 conversion stochastic rounding instructions"
427+
>;
428+
423429
def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
424430
"HasAshrPkInsts",
425431
"true",
@@ -438,6 +444,7 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
438444
FeatureFP4ConversionScaleInsts,
439445
FeatureFP6BF6ConversionScaleInsts,
440446
FeatureF16BF16ToFP6BF6ConversionScaleInsts,
447+
FeatureF32ToF16BF16ConversionSRInsts,
441448
FeatureMinimum3Maximum3F32,
442449
FeatureMinimum3Maximum3PKF16
443450
]
@@ -2509,6 +2516,9 @@ def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionSca
25092516
def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">,
25102517
AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>;
25112518

2519+
// Predicate for the f32 -> f16/bf16 "cvt_sr" instructions: codegen checks
// Subtarget->hasF32ToF16BF16ConversionSRInsts(), and the assembler requires
// FeatureF32ToF16BF16ConversionSRInsts to be enabled.
def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">,
2520+
AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>;
2521+
25122522
def HasGDS : Predicate<"Subtarget->hasGDS()">;
25132523

25142524
def HasGWS : Predicate<"Subtarget->hasGWS()">;

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class AMDGPUSubtarget {
5555
bool HasFP4ConversionScaleInsts = false;
5656
bool HasFP6BF6ConversionScaleInsts = false;
5757
bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
58+
bool HasF32ToF16BF16ConversionSRInsts = false;
5859
bool EnableRealTrue16Insts = false;
5960
bool HasBF16ConversionInsts = false;
6061
bool HasMadMixInsts = false;
@@ -190,6 +191,10 @@ class AMDGPUSubtarget {
190191

191192
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }
192193

194+
/// \returns the value of the HasF32ToF16BF16ConversionSRInsts subtarget flag
/// (false by default; set by the "f32-to-f16bf16-cvt-sr-insts" feature).
bool hasF32ToF16BF16ConversionSRInsts() const {
195+
return HasF32ToF16BF16ConversionSRInsts;
196+
}
197+
193198
bool hasMadMacF32Insts() const {
194199
return HasMadMacF32Insts || !isGCN();
195200
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2657,6 +2657,8 @@ def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
26572657
def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
26582658
def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
26592659
def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
2660+
// Type profiles used by V_CVT_SR_BF16_F32 / V_CVT_SR_F16_F32.
// NOTE(review): list order is presumably [dst, src0, src1, src2], i.e. a
// bf16/f16 result from an f32 and an i32 source with no third source -
// confirm against the VOPProfile definition.
def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
2661+
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;
26602662

26612663
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
26622664
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1261,6 +1261,12 @@ let SubtargetPredicate = isGFX11Plus in {
12611261
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
12621262
} // End SubtargetPredicate = isGFX11Plus
12631263

1264+
// VOP3 profile for V_CVT_SR_F16_F32 / V_CVT_SR_BF16_F32. It derives from
// VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile and only replaces the op_sel
// input operand list: src0 takes FP32 input modifiers, src1 takes Int32 input
// modifiers, and an extra VGPR_32 $vdst_in operand is carried next to op_sel.
class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
1265+
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
1266+
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
1267+
VGPR_32:$vdst_in, op_sel0:$op_sel);
1268+
}
1269+
12641270
// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 patterns
12651271
// instead of less complex f16. Disable GlobalISel for these for now.
12661272
def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; }]> {
@@ -1285,6 +1291,13 @@ let SubtargetPredicate = HasBF16ConversionInsts in {
12851291
(V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 $src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
12861292
}
12871293

1294+
// VOP3 definitions of the f32 -> f16 / bf16 cvt_sr instructions, available
// only when HasF32ToF16BF16ConversionSRInsts holds.
let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
1295+
// $vdst_in is constrained to the same register as $vdst and is left out of
// the instruction encoding.
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
1296+
defm V_CVT_SR_F16_F32 : VOP3Inst<"v_cvt_sr_f16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_F16_F32_I32>>;
1297+
defm V_CVT_SR_BF16_F32 : VOP3Inst<"v_cvt_sr_bf16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_BF16_F32_I32>>;
1298+
}
1299+
}
1300+
12881301
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
12891302
defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
12901303
defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -2144,6 +2157,11 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
21442157
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
21452158
}
21462159

2160+
// Real (encoded) gfx9 op_sel variants of the cvt_sr instructions, gated on
// HasF32ToF16BF16ConversionSRInsts: opcode 0x2a6 for the f16 result and
// 0x2a7 for the bf16 result.
let OtherPredicates = [HasF32ToF16BF16ConversionSRInsts] in {
2161+
defm V_CVT_SR_F16_F32 : VOP3OpSel_Real_gfx9 <0x2a6>;
2162+
defm V_CVT_SR_BF16_F32: VOP3OpSel_Real_gfx9 <0x2a7>;
2163+
}
2164+
21472165
defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
21482166
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;
21492167

llvm/test/MC/AMDGPU/gfx950_asm_features.s

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,3 +1375,35 @@ v_cvt_scalef32_sr_pk_fp4_f32 v0, |v[2:3]|, v4, v5
13751375
// NOT-GFX950: error: instruction not supported on this GPU
13761376
// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
13771377
v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5|
1378+
1379+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1380+
// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
1381+
v_cvt_sr_f16_f32 v0, v1, v2
1382+
1383+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1384+
// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
1385+
v_cvt_sr_bf16_f32 v0, v1, v2
1386+
1387+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1388+
// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
1389+
v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1]
1390+
1391+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1392+
// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
1393+
v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1]
1394+
1395+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1396+
// GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
1397+
v_cvt_sr_f16_f32 v0, -v1, v2
1398+
1399+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1400+
// GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
1401+
v_cvt_sr_f16_f32 v0, |v1|, v2
1402+
1403+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1404+
// GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
1405+
v_cvt_sr_bf16_f32 v0, -v1, v2
1406+
1407+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1408+
// GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
1409+
v_cvt_sr_bf16_f32 v0, |v1|, v2

llvm/test/MC/AMDGPU/gfx950_err.s

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,3 +392,9 @@ v_pk_minimum3_f16 v0, s1, s2, v3
392392

393393
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions)
394394
v_pk_maximum3_f16 v0, s1, s2, v3
395+
396+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
397+
v_cvt_sr_f16_f32 v1, v2, v3 clamp
398+
399+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
400+
v_cvt_sr_bf16_f32 v1, v2, v3 clamp

llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,3 +1014,27 @@
10141014

10151015
# GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
10161016
0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04
1017+
1018+
# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
1019+
0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00
1020+
1021+
# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
1022+
0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00
1023+
1024+
# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
1025+
0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00
1026+
1027+
# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
1028+
0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00
1029+
1030+
# GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
1031+
0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20
1032+
1033+
# GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
1034+
0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00
1035+
1036+
# GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
1037+
0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20
1038+
1039+
# GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
1040+
0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00

0 commit comments

Comments
 (0)