Skip to content

Commit 13b0656

Browse files
pravinjagtap arsenm
authored and committed
AMDGPU: Builtin & codegen support for v_cvt_scalef32_pk32_{bf|f}16_{bf|fp}6 for gfx950
Co-authored-by: Pravin Jagtap <[email protected]>
1 parent c28a3f3 commit 13b0656

File tree

7 files changed

+316
-7
lines changed

7 files changed

+316
-7
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,10 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f16_fp4, "V2hUifIi", "nc", "fp4-
588588
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf16_fp4, "V2yUifIi", "nc", "fp4-cvt-scale-insts")
589589
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f32_fp6, "V32fV6Uif", "nc", "fp6bf6-cvt-scale-insts")
590590
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f32_bf6, "V32fV6Uif", "nc", "fp6bf6-cvt-scale-insts")
591+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f16_fp6, "V32hV6Uif", "nc", "fp6bf6-cvt-scale-insts")
592+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6, "V32yV6Uif", "nc", "fp6bf6-cvt-scale-insts")
593+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_f16_bf6, "V32hV6Uif", "nc", "fp6bf6-cvt-scale-insts")
594+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6, "V32yV6Uif", "nc", "fp6bf6-cvt-scale-insts")
591595

592596
#undef BUILTIN
593597
#undef TARGET_BUILTIN

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@ typedef float __attribute__((ext_vector_type(2))) float2;
1818
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
1919
typedef float __attribute__((ext_vector_type(32))) float32;
2020
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
21+
typedef half __attribute__((ext_vector_type(32))) half32;
22+
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
2123

2224
void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half2* out_v2f16, global float* out_f32, float scale, global short2* out_v2i16, float src0, float src1,
23-
global float2* out_v2f32, half2 src0_v2f16, bfloat2 src0_v2bf16, global bfloat2* out_v2bf16, global float32* out_v36f32, uint6 src_v6i32) {
25+
global float2* out_v2f32, half2 src0_v2f16, bfloat2 src0_v2bf16, global bfloat2* out_v2bf16, global float32* out_v32f32, uint6 src_v6i32,
26+
global half32 *out_v32f16, global bfloat32 *out_v32bf16) {
2427
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
2528
*out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
2629
*out_v2u32 = __builtin_amdgcn_permlane32_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
@@ -40,6 +43,10 @@ void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half
4043
*out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 3); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_fp4_f32' needs target feature fp4-cvt-scale-insts}}
4144
*out_v2f16 = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(a, scale, 3); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_f16_fp4' needs target feature fp4-cvt-scale-insts}}
4245
*out_v2bf16 = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(a, scale, 3); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_bf16_fp4' needs target feature fp4-cvt-scale-insts}}
43-
*out_v36f32 = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_f32_fp6' needs target feature fp6bf6-cvt-scale-insts}}
44-
*out_v36f32 = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_f32_bf6' needs target feature fp6bf6-cvt-scale-insts}}
46+
*out_v32f32 = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_f32_fp6' needs target feature fp6bf6-cvt-scale-insts}}
47+
*out_v32f32 = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_f32_bf6' needs target feature fp6bf6-cvt-scale-insts}}
48+
*out_v32f16 = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_f16_fp6' needs target feature fp6bf6-cvt-scale-insts}}
49+
*out_v32f16 = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_f16_bf6' needs target feature fp6bf6-cvt-scale-insts}}
50+
*out_v32bf16 = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6' needs target feature fp6bf6-cvt-scale-insts}}
51+
*out_v32bf16 = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(src_v6i32, scale); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6' needs target feature fp6bf6-cvt-scale-insts}}
4552
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -932,3 +932,55 @@ void test_cvt_scalef32_pk_f32_fp6(global float32* out, uint6 src, float scale)
932932
*out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(src, scale);
933933
*out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(src, scale);
934934
}
935+
936+
// CHECK-LABEL: @test_cvt_scalef32_pk32_f16_fpbf6(
937+
// CHECK-NEXT: entry:
938+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
939+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
940+
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
941+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
942+
// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
943+
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
944+
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
945+
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
946+
// CHECK-NEXT: [[TMP2:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
947+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
948+
// CHECK-NEXT: store <32 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
949+
// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
950+
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
951+
// CHECK-NEXT: [[TMP6:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
952+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
953+
// CHECK-NEXT: store <32 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
954+
// CHECK-NEXT: ret void
955+
//
956+
void test_cvt_scalef32_pk32_f16_fpbf6(global half32 *out, uint6 src, float scale)
957+
{
958+
*out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(src, scale);
959+
*out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(src, scale);
960+
}
961+
962+
// CHECK-LABEL: @test_cvt_scalef32_pk32_bf16_fpbf6(
963+
// CHECK-NEXT: entry:
964+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
965+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
966+
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
967+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
968+
// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
969+
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
970+
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
971+
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
972+
// CHECK-NEXT: [[TMP2:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
973+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
974+
// CHECK-NEXT: store <32 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
975+
// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
976+
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
977+
// CHECK-NEXT: [[TMP6:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
978+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
979+
// CHECK-NEXT: store <32 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
980+
// CHECK-NEXT: ret void
981+
//
982+
void test_cvt_scalef32_pk32_bf16_fpbf6(global bfloat32 *out, uint6 src, float scale)
983+
{
984+
*out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(src, scale);
985+
*out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(src, scale);
986+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -700,6 +700,12 @@ def int_amdgcn_cvt_scalef32_pk_bf16_fp4: AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic
700700
def int_amdgcn_cvt_scalef32_pk32_f32_fp6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f32_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f32_fp6">;
701701
def int_amdgcn_cvt_scalef32_pk32_f32_bf6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f32_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f32_bf6">;
702702

703+
// llvm.amdgcn.cvt.scalef32.pk32.{f16|bf16}.{fp6|bf6} v6i32 src, float scale
704+
def int_amdgcn_cvt_scalef32_pk32_f16_bf6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f16_bf6">;
705+
def int_amdgcn_cvt_scalef32_pk32_bf16_bf6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32bf16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_bf16_bf6">;
706+
def int_amdgcn_cvt_scalef32_pk32_f16_fp6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32f16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_f16_fp6">;
707+
def int_amdgcn_cvt_scalef32_pk32_bf16_fp6 : AMDGPUCvtScaleF32Intrinsic<llvm_v32bf16_ty, llvm_v6i32_ty, "cvt_scalef32_pk32_bf16_fp6">;
708+
703709
def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
704710
[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
705711
>, ClangBuiltin<"__builtin_amdgcn_prng_b32">;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4565,6 +4565,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45654565
case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:
45664566
case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:
45674567
case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:
4568+
case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:
4569+
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:
4570+
case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:
4571+
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:
45684572
case Intrinsic::amdgcn_ashr_pk_i8_i32:
45694573
case Intrinsic::amdgcn_ashr_pk_u8_i32:
45704574
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,10 +1066,10 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
10661066
let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
10671067
defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>;
10681068
defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>;
1069-
defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>>;
1070-
defm V_CVT_SCALEF32_PK32_BF16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>>;
1071-
defm V_CVT_SCALEF32_PK32_F16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>>;
1072-
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>>;
1069+
defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>;
1070+
defm V_CVT_SCALEF32_PK32_BF16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_fp6>;
1071+
defm V_CVT_SCALEF32_PK32_F16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_bf6>;
1072+
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>;
10731073
}
10741074

10751075
let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {

0 commit comments

Comments
 (0)