Skip to content

Commit df89015

Browse files
pravinjagtaparsenm
authored and committed
AMDGPU: Builtins & Codegen support for v_cvt_scalef32_pk_f32_{fp8|bf8} for gfx950
OPSEL[0] determines low/high 16 bits of src0 to read. Co-authored-by: Pravin Jagtap <[email protected]>
1 parent 5147e59 commit df89015

10 files changed

+147
-4
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -576,5 +576,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_fp8, "fifIi", "nc", "fp8-cvt-sc
576576
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_bf8, "fifIi", "nc", "bf8-cvt-scale-insts")
577577
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp8_f32, "V2sV2sfffIb", "nc", "fp8-cvt-scale-insts")
578578
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf8_f32, "V2sV2sfffIb", "nc", "bf8-cvt-scale-insts")
579+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f32_fp8, "V2fUifIb", "nc", "fp8-cvt-scale-insts")
580+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_f32_bf8, "V2fUifIb", "nc", "bf8-cvt-scale-insts")
581+
579582
#undef BUILTIN
580583
#undef TARGET_BUILTIN

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@ typedef unsigned int uint;
1414
typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
1515
typedef half __attribute__((ext_vector_type(2))) half2;
1616
typedef short __attribute__((ext_vector_type(2))) short2;
17+
typedef float __attribute__((ext_vector_type(2))) float2;
1718

18-
void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half2* out_v2f16, global float* out_f32, float scale, global short2* out_v2i16, float src0, float src1) {
19+
void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half2* out_v2f16, global float* out_f32, float scale, global short2* out_v2i16, float src0, float src1,
20+
global float2* out_v2f32) {
1921
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
2022
*out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
2123
*out_v2u32 = __builtin_amdgcn_permlane32_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
@@ -25,4 +27,6 @@ void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half
2527
*out_f32 = __builtin_amdgcn_cvt_scalef32_f32_bf8(a, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f32_bf8' needs target feature bf8-cvt-scale-insts}}
2628
*out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out_v2i16, src0, src1, scale, true); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_fp8_f32' needs target feature fp8-cvt-scale-insts}}
2729
*out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out_v2i16, src0, src1, scale, true); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_bf8_f32' needs target feature bf8-cvt-scale-insts}}
30+
*out_v2f32 = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(a, scale, true); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_f32_fp8' needs target feature fp8-cvt-scale-insts}}
31+
*out_v2f32 = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(a, scale, true); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_f32_bf8' needs target feature bf8-cvt-scale-insts}}
2832
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1313
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
1414
typedef float __attribute__((ext_vector_type(16))) float16;
1515
typedef half __attribute__((ext_vector_type(2))) half2;
16+
typedef float __attribute__((ext_vector_type(2))) float2;
1617

1718
// CHECK-LABEL: @test_prng_b32(
1819
// CHECK-NEXT: entry:
@@ -498,7 +499,6 @@ void test_cvt_scalef32_f32_bf8(global float* out, uint src, float scale)
498499
*out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 3);
499500
}
500501

501-
502502
// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f32(
503503
// CHECK-NEXT: entry:
504504
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -533,7 +533,6 @@ void test_cvt_scalef32_pk_fp8_f32(global short2* out, float src0, float src1, fl
533533
*out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, false);
534534
}
535535

536-
537536
// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f32(
538537
// CHECK-NEXT: entry:
539538
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -567,3 +566,56 @@ void test_cvt_scalef32_pk_bf8_f32(global short2* out, float src0, float src1, fl
567566
*out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, true);
568567
*out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, false);
569568
}
569+
570+
571+
// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp8(
572+
// CHECK-NEXT: entry:
573+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
574+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
575+
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
576+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
577+
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
578+
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
579+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
580+
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
581+
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
582+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
583+
// CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
584+
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
585+
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
586+
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
587+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
588+
// CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
589+
// CHECK-NEXT: ret void
590+
//
591+
void test_cvt_scalef32_pk_f32_fp8(global float2* out, unsigned int src, float scale)
592+
{
593+
*out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, true);
594+
*out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, false);
595+
}
596+
597+
// CHECK-LABEL: @test_cvt_scalef32_pk_f32_bf8(
598+
// CHECK-NEXT: entry:
599+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
600+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
601+
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
602+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
603+
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
604+
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
605+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
606+
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
607+
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
608+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
609+
// CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
610+
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
611+
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
612+
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
613+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
614+
// CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
615+
// CHECK-NEXT: ret void
616+
//
617+
void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float scale)
618+
{
619+
*out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, true);
620+
*out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, false);
621+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ typedef int int16 __attribute__((ext_vector_type(16)));
1313
typedef unsigned int uint;
1414
typedef half half2 __attribute__((ext_vector_type(2)));
1515
typedef short short2 __attribute__((ext_vector_type(2)));
16+
typedef float float2 __attribute__((ext_vector_type(2)));
1617

1718
void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
1819

@@ -162,11 +163,13 @@ void test_permlane32_swap(__global int* out, int old, int src, bool X) {
162163
}
163164

164165
void test_cvt_scalef32(global half2* out_v2f16, global float* out_f32, uint src, float scale, int index, bool X,
165-
global short2* out_v2i16, float src0, float src1) {
166+
global short2* out_v2i16, float src0, float src1, global float2* out_v2f32) {
166167
*out_v2f16 = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out_v2f16, src, scale, index, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f16_fp8' must be a constant integer}}
167168
*out_f32 = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, index); // // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f32_fp8' must be a constant integer}}
168169
*out_v2f16 = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out_v2f16, src, scale, index, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f16_bf8' must be a constant integer}}
169170
*out_f32 = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, index); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f32_bf8' must be a constant integer}}
170171
*out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out_v2i16, src0, src1, scale, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_pk_fp8_f32' must be a constant integer}}
171172
*out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out_v2i16, src0, src1, scale, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_pk_bf8_f32' must be a constant integer}}
173+
*out_v2f32 = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_pk_f32_fp8' must be a constant integer}}
174+
*out_v2f32 = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_pk_f32_bf8' must be a constant integer}}
172175
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,14 @@ class AMDGPUCvtScaleFP8BF8ToF32Intrinsic<LLVMType DstTy, string name> : DefaultA
617617
[IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>]
618618
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
619619

620+
// Intrinsic class for the packed FP8/BF8 -> f32 scaled conversions.
// Returns <2 x float>; takes an i32 source, a float scale and an i1 selector
// that must be an immediate (ImmArg on argument index 2). The matching Clang
// builtin is named "__builtin_amdgcn_" followed by `name`.
class AMDGPUCvtScale_pk_FP8BF8ToF32Intrinsic<string name> : DefaultAttrsIntrinsic<
621+
[llvm_v2f32_ty],
622+
[llvm_i32_ty, // src: 32-bit source operand
623+
llvm_float_ty, // scale: f32 scale operand
624+
llvm_i1_ty], // src_lo_hi_sel (immediate): selects which 16-bit half of src is read; presumably false = low, true = high -- TODO confirm
625+
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
626+
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
627+
620628
class AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<string name> : DefaultAttrsIntrinsic<
621629
[llvm_v2i16_ty],
622630
[llvm_v2i16_ty, // old_vdst
@@ -649,6 +657,10 @@ def int_amdgcn_cvt_scalef32_f32_bf8 : AMDGPUCvtScaleFP8BF8ToF32Intrinsic<llvm_f
649657
def int_amdgcn_cvt_scalef32_pk_fp8_f32 : AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<"cvt_scalef32_pk_fp8_f32">;
650658
def int_amdgcn_cvt_scalef32_pk_bf8_f32 : AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<"cvt_scalef32_pk_bf8_f32">;
651659

660+
// llvm.amdgcn.cvt.scalef32.pk.f32.{fp8|bf8} int src, float scale, bool src_lo_hi_sel
661+
def int_amdgcn_cvt_scalef32_pk_f32_fp8 : AMDGPUCvtScale_pk_FP8BF8ToF32Intrinsic<"cvt_scalef32_pk_f32_fp8">;
662+
def int_amdgcn_cvt_scalef32_pk_f32_bf8 : AMDGPUCvtScale_pk_FP8BF8ToF32Intrinsic<"cvt_scalef32_pk_f32_bf8">;
663+
652664
def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
653665
[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
654666
>, ClangBuiltin<"__builtin_amdgcn_prng_b32">;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5853,6 +5853,13 @@ void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
58535853
: 0);
58545854
}
58555855

5856+
/// Custom operand renderer that turns a source-select immediate into an
/// op_sel modifier operand.
///
/// Reads the immediate held in operand \p OpIdx of \p MI and appends one
/// immediate to \p MIB: SISrcMods::OP_SEL_0 when the value is non-zero,
/// 0 otherwise. \p OpIdx must be non-negative (asserted).
void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
5857+
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5858+
assert(OpIdx >= 0 && "expected to match an immediate operand");
5859+
// Non-zero selector -> set OP_SEL_0; zero selector -> no modifier bits.
MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
5860+
: 0);
5861+
}
5862+
58565863
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
58575864
const MachineInstr &MI,
58585865
int OpIdx) const {

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
354354
void renderDstSelToOpSelXForm(MachineInstrBuilder &MIB,
355355
const MachineInstr &MI, int OpIdx) const;
356356

357+
/// Custom operand renderer named by gi_SrcSelToOpSelXForm in
/// VOP3Instructions.td; renders the immediate at operand \p OpIdx of \p MI
/// into \p MIB.
void renderSrcSelToOpSelXForm(MachineInstrBuilder &MIB,
358+
const MachineInstr &MI, int OpIdx) const;
359+
357360
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
358361
int OpIdx) const;
359362

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4553,6 +4553,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45534553
case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
45544554
case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
45554555
case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
4556+
case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:
4557+
case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:
45564558
case Intrinsic::amdgcn_ashr_pk_i8_i32:
45574559
case Intrinsic::amdgcn_ashr_pk_u8_i32:
45584560
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,14 @@ def DstSelToOpSelXForm : SDNodeXForm<timm, [{
877877
def gi_DstSelToOpSelXForm : GICustomOperandRenderer<"renderDstSelToOpSelXForm">,
878878
GISDNodeXFormEquiv<DstSelToOpSelXForm>;
879879

880+
// SelectionDAG transform for a source-select immediate: produces an i32
// target constant equal to SISrcMods::OP_SEL_0 when the immediate is
// non-zero and SISrcMods::NONE otherwise.
def SrcSelToOpSelXForm : SDNodeXForm<timm, [{
881+
return CurDAG->getTargetConstant(
882+
N->getZExtValue() ? SISrcMods::OP_SEL_0 : SISrcMods::NONE,
883+
SDLoc(N), MVT::i32);
884+
}]>;
885+
// GlobalISel equivalent of SrcSelToOpSelXForm, implemented by the
// renderSrcSelToOpSelXForm custom renderer.
def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">,
886+
GISDNodeXFormEquiv<SrcSelToOpSelXForm>;
887+
880888
class PermlanePat<SDPatternOperator permlane,
881889
Instruction inst, ValueType vt> : GCNPat<
882890
(vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
@@ -1100,6 +1108,13 @@ class Cvt_Scale_PK_F32ToFP8BF8_Pat<SDPatternOperator node, VOP3_Pseudo inst> : G
11001108
def : Cvt_Scale_PK_F32ToFP8BF8_Pat<int_amdgcn_cvt_scalef32_pk_fp8_f32, V_CVT_SCALEF32_PK_FP8_F32_e64>;
11011109
def : Cvt_Scale_PK_F32ToFP8BF8_Pat<int_amdgcn_cvt_scalef32_pk_bf8_f32, V_CVT_SCALEF32_PK_BF8_F32_e64>;
11021110

1111+
// Selection pattern for the packed FP8/BF8 -> v2f32 scaled-convert
// intrinsics. Matches (node i32 src0, f32 src1, imm word_sel) and emits
// `inst` with, in order: word_sel converted by SrcSelToOpSelXForm, $src0,
// a literal 0, and $src1.
// NOTE(review): the leading operand and the literal 0 are presumably the
// src0/src1 modifier operands of the VOP3 pseudo -- confirm against the
// instruction profile.
class Cvt_Scale_PK_FP8BF8ToF32_Pat<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
1112+
(v2f32 (node i32:$src0, f32:$src1, timm:$word_sel)),
1113+
(inst (SrcSelToOpSelXForm $word_sel), $src0, 0, $src1)
1114+
>;
1115+
// Instantiate the pattern for the FP8 and BF8 source formats.
def : Cvt_Scale_PK_FP8BF8ToF32_Pat<int_amdgcn_cvt_scalef32_pk_f32_fp8, V_CVT_SCALEF32_PK_F32_FP8_e64>;
1116+
def : Cvt_Scale_PK_FP8BF8ToF32_Pat<int_amdgcn_cvt_scalef32_pk_f32_bf8, V_CVT_SCALEF32_PK_F32_BF8_e64>;
1117+
11031118
let SubtargetPredicate = isGFX10Plus in {
11041119
let isCommutable = 1, isReMaterializable = 1 in {
11051120
defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ declare <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half>, i32, float, i32
1010
declare float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32, float, i32)
1111
declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16>, float, float, float, i1)
1212
declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16>, float, float, float, i1)
13+
declare <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32, float, i1)
14+
declare <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32, float, i1)
1315

1416
define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
1517
; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_vv:
@@ -518,3 +520,43 @@ define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1_fabs_fneg(<2 x i16> %old, f
518520
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %fabs.src0, float %fneg.src1, float %scale, i1 true)
519521
ret <2 x i16> %ret
520522
}
523+
524+
define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word0(i32 %src, float %scale) {
525+
; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word0:
526+
; GCN: ; %bb.0:
527+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528+
; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], v0, v1
529+
; GCN-NEXT: s_setpc_b64 s[30:31]
530+
%ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 false)
531+
ret <2 x float> %ret
532+
}
533+
534+
define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word1(i32 %src, float %scale) {
535+
; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word1:
536+
; GCN: ; %bb.0:
537+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538+
; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], v0, v1 op_sel:[1,0,0]
539+
; GCN-NEXT: s_setpc_b64 s[30:31]
540+
%ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 true)
541+
ret <2 x float> %ret
542+
}
543+
544+
define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word0(i32 %src, float %scale) {
545+
; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word0:
546+
; GCN: ; %bb.0:
547+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548+
; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], v0, v1
549+
; GCN-NEXT: s_setpc_b64 s[30:31]
550+
%ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 false)
551+
ret <2 x float> %ret
552+
}
553+
554+
define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word1(i32 %src, float %scale) {
555+
; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word1:
556+
; GCN: ; %bb.0:
557+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
558+
; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], v0, v1 op_sel:[1,0,0]
559+
; GCN-NEXT: s_setpc_b64 s[30:31]
560+
%ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 true)
561+
ret <2 x float> %ret
562+
}

0 commit comments

Comments
 (0)