Skip to content

Commit 8d78c8c

Browse files
arsenm and pravinjagtap committed
AMDGPU: Support v_cvt_scalef32_pk32_{bf|f}6_{bf|fp}16 for gfx950 (llvm#117592)
Co-authored-by: Pravin Jagtap <[email protected]>
1 parent ea3fc0f commit 8d78c8c

File tree

16 files changed

+653
-3
lines changed

16 files changed

+653
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,10 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs",
557557
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
558558

559559
TARGET_BUILTIN(__builtin_amdgcn_prng_b32, "UiUi", "nc", "prng-inst")
560+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16, "V6UiV32hf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
561+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16, "V6UiV32hf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
562+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
563+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
560564

561565
#undef BUILTIN
562566
#undef TARGET_BUILTIN

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9090
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9191
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92-
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92+
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
9393
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9494
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9595
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
typedef unsigned int uint;
66
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
7+
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
8+
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
9+
typedef half __attribute__((ext_vector_type(32))) half32;
710

811
// CHECK-LABEL: @test_prng_b32(
912
// CHECK-NEXT: entry:
@@ -106,3 +109,43 @@ void test_permlane32_swap(global uint2* out, uint old, uint src) {
106109
*out = __builtin_amdgcn_permlane32_swap(old, src, true, false);
107110
*out = __builtin_amdgcn_permlane32_swap(old, src, false, true);
108111
}
112+
113+
// CHECK-LABEL: @test_cvt_scalef32_pk(
114+
// CHECK-NEXT: entry:
115+
// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
116+
// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
117+
// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
118+
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
119+
// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
120+
// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
121+
// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
122+
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
123+
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
124+
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
125+
// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], float [[TMP1]])
126+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
127+
// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 32
128+
// CHECK-NEXT: [[TMP4:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
129+
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
130+
// CHECK-NEXT: [[TMP6:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP4]], float [[TMP5]])
131+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
132+
// CHECK-NEXT: store <6 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 32
133+
// CHECK-NEXT: [[TMP8:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
134+
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
135+
// CHECK-NEXT: [[TMP10:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> [[TMP8]], float [[TMP9]])
136+
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
137+
// CHECK-NEXT: store <6 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 32
138+
// CHECK-NEXT: [[TMP12:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
139+
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
140+
// CHECK-NEXT: [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]])
141+
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
142+
// CHECK-NEXT: store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32
143+
// CHECK-NEXT: ret void
144+
//
145+
void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float scale)
146+
{ // Exercises the four pk32 fp6/bf6 conversion builtins; every result is stored to the same *out6, matching the one-call-one-store pattern in the expectations above.
147+
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(srcbf32, scale); // <32 x bfloat> source; expected to become llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16
148+
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(srch32, scale); // <32 x half> source; expected to become llvm.amdgcn.cvt.scalef32.pk32.bf6.f16
149+
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(srcbf32, scale); // <32 x bfloat> source; expected to become llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16
150+
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(srch32, scale); // <32 x half> source; expected to become llvm.amdgcn.cvt.scalef32.pk32.fp6.f16
151+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,15 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
561561
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
562562
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
563563

564+
class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
565+
[DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] // result types; operand types (Src0Ty value followed by a float); intrinsic attributes
566+
>, ClangBuiltin<"__builtin_amdgcn_"#name>; // Clang builtin name is the fixed "__builtin_amdgcn_" prefix pasted with the `name` argument
567+
568+
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
569+
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
570+
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
571+
def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_bf6_bf16">;
572+
564573
def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
565574
[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
566575
>, ClangBuiltin<"__builtin_amdgcn_prng_b32">;

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,11 +408,23 @@ def FeatureFP6BF6ConversionScaleInsts : SubtargetFeature<"fp6bf6-cvt-scale-insts
408408
"Has fp6 and bf6 conversion scale instructions"
409409
>;
410410

411+
def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts",
412+
"HasF16BF16ToFP6BF6ConversionScaleInsts",
413+
"true",
414+
"Has f16bf16 to fp6bf6 conversion scale instructions"
415+
>;
416+
411417
def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
412418
"GFX950Insts",
413419
"true",
414420
"Additional instructions for GFX950+",
415-
[FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts, FeatureFP6BF6ConversionScaleInsts]
421+
[FeaturePermlane16Swap,
422+
FeaturePermlane32Swap,
423+
FeatureFP8ConversionScaleInsts,
424+
FeatureBF8ConversionScaleInsts,
425+
FeatureFP4ConversionScaleInsts,
426+
FeatureFP6BF6ConversionScaleInsts,
427+
FeatureF16BF16ToFP6BF6ConversionScaleInsts]
416428
>;
417429

418430
def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
@@ -2450,6 +2462,9 @@ def HasFP4ConversionScaleInsts : Predicate<"Subtarget->hasFP4ConversionScaleInst
24502462
def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionScaleInsts()">,
24512463
AssemblerPredicate<(all_of FeatureFP6BF6ConversionScaleInsts)>;
24522464

2465+
def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">,
2466+
AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>;
2467+
24532468
def HasGDS : Predicate<"Subtarget->hasGDS()">;
24542469

24552470
def HasGWS : Predicate<"Subtarget->hasGWS()">;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4519,6 +4519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45194519
case Intrinsic::amdgcn_cvt_pk_bf8_f32:
45204520
case Intrinsic::amdgcn_cvt_sr_fp8_f32:
45214521
case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4522+
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
4523+
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
4524+
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
4525+
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
45224526
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
45234527
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
45244528
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class AMDGPUSubtarget {
5454
bool HasBF8ConversionScaleInsts = false;
5555
bool HasFP4ConversionScaleInsts = false;
5656
bool HasFP6BF6ConversionScaleInsts = false;
57+
bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
5758
bool EnableRealTrue16Insts = false;
5859
bool HasBF16ConversionInsts = false;
5960
bool HasMadMixInsts = false;
@@ -187,6 +188,8 @@ class AMDGPUSubtarget {
187188

188189
bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; }
189190

191+
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; } // Returns the flag that AMDGPU.td ties to the "f16bf16-to-fp6bf6-cvt-scale-insts" subtarget feature.
192+
190193
bool hasMadMacF32Insts() const {
191194
return HasMadMacF32Insts || !isGCN();
192195
}

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ DECODE_OPERAND_REG_8(VGPR_32_Lo128)
280280
DECODE_OPERAND_REG_8(VReg_64)
281281
DECODE_OPERAND_REG_8(VReg_96)
282282
DECODE_OPERAND_REG_8(VReg_128)
283+
DECODE_OPERAND_REG_8(VReg_192)
283284
DECODE_OPERAND_REG_8(VReg_256)
284285
DECODE_OPERAND_REG_8(VReg_288)
285286
DECODE_OPERAND_REG_8(VReg_352)

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
15591559
RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>,
15601560
!eq(VT.Size, 512) : VOPDstOperand<VReg_512>,
15611561
!eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
1562+
!eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
15621563
!eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
15631564
!eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
15641565
!eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
@@ -1614,7 +1615,8 @@ class getSOPSrcForVT<ValueType VT> {
16141615
// Returns the vreg register class to use for source operand given VT
16151616
class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
16161617
RegisterOperand ret =
1617-
!cond(!eq(VT.Size, 192) : RegisterOperand<VReg_192>,
1618+
!cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512>,
1619+
!eq(VT.Size, 192) : RegisterOperand<VReg_192>,
16181620
!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
16191621
!eq(VT.Size, 96) : RegisterOperand<VReg_96>,
16201622
!eq(VT.Size, 64) : RegisterOperand<VReg_64>,
@@ -1648,6 +1650,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
16481650
!eq(VT, v2i16) : VSrc_v2b16,
16491651
!eq(VT, v4f16) : AVSrc_64,
16501652
!eq(VT, v4bf16) : AVSrc_64,
1653+
!eq(VT.Size, 512) : VRegSrc_512,
16511654
!eq(VT.Size, 192) : VRegSrc_192,
16521655
!eq(VT.Size, 128) : VRegSrc_128,
16531656
!eq(VT.Size, 96) : VRegSrc_96,
@@ -2642,6 +2645,8 @@ def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
26422645
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
26432646
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
26442647
def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
2648+
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
2649+
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
26452650

26462651
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
26472652
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1252,6 +1252,7 @@ def VRegSrc_96 : SrcReg9<VReg_96, "OPW96">;
12521252
def VRegSrc_128: SrcReg9<VReg_128, "OPW128">;
12531253
def VRegSrc_192: SrcReg9<VReg_192, "OPW192">;
12541254
def VRegSrc_256: SrcReg9<VReg_256, "OPW256">;
1255+
def VRegSrc_512: SrcReg9<VReg_512, "OPW512">;
12551256
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32, "OPW32">;
12561257

12571258
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -975,6 +975,13 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0
975975
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>>;
976976
}
977977

978+
let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
979+
defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>;
980+
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
981+
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;
982+
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_bf16>;
983+
}
984+
978985
let SubtargetPredicate = isGFX10Plus in {
979986
let isCommutable = 1, isReMaterializable = 1 in {
980987
defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -1904,3 +1911,10 @@ defm V_CVT_SCALEF32_PK32_BF16_FP6 : VOP3_Real_gfx9<0x261, "v_cvt_scalef32_pk32_b
19041911
defm V_CVT_SCALEF32_PK32_F16_BF6 : VOP3_Real_gfx9<0x262, "v_cvt_scalef32_pk32_f16_bf6">;
19051912
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3_Real_gfx9<0x263, "v_cvt_scalef32_pk32_bf16_bf6">;
19061913
}
1914+
1915+
let OtherPredicates = [HasF16BF16ToFP6BF6ConversionScaleInsts] in {
1916+
defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3_Real_gfx9<0x258, "v_cvt_scalef32_pk32_fp6_f16">;
1917+
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3_Real_gfx9<0x259, "v_cvt_scalef32_pk32_fp6_bf16">;
1918+
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_bf6_f16">;
1919+
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
1920+
}

llvm/lib/TargetParser/TargetParser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
470470
Features["gws"] = true;
471471
break;
472472
case GK_GFX950:
473+
Features["f16bf16-to-fp6bf6-cvt-scale-insts"] = true;
473474
Features["prng-inst"] = true;
474475
Features["permlane16-swap"] = true;
475476
Features["permlane32-swap"] = true;

0 commit comments

Comments
 (0)