Skip to content

AMDGPU: Support v_cvt_scalef32_2xpk16_{bf|fp}6_f32 for gfx950. #117595

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")

// Scaled f32 -> fp6/bf6 conversions that consume two source vectors.
// Prototype "V6UiV16fV16ff": returns a 6 x unsigned int vector and takes two
// 16 x float vectors followed by one scalar float.
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")

//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
Expand Down
22 changes: 21 additions & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
typedef half __attribute__((ext_vector_type(32))) half32;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef float __attribute__((ext_vector_type(16))) float16;

// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
Expand Down Expand Up @@ -115,10 +117,14 @@ void test_permlane32_swap(global uint2* out, uint old, uint src) {
// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SRC0F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
// CHECK-NEXT: [[SRC1F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: store <16 x float> [[SRC0F32:%.*]], ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: store <16 x float> [[SRC1F32:%.*]], ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
Expand All @@ -140,12 +146,26 @@ void test_permlane32_swap(global uint2* out, uint old, uint src) {
// CHECK-NEXT: [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]])
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32
// CHECK-NEXT: [[TMP16:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> [[TMP16]], <16 x float> [[TMP17]], float [[TMP18]])
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 32
// CHECK-NEXT: [[TMP21:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> [[TMP21]], <16 x float> [[TMP22]], float [[TMP23]])
// CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP24]], ptr addrspace(1) [[TMP25]], align 32
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float scale)
void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float16 src0f32, float16 src1f32, float scale)
{
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(srcbf32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(srch32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(srcbf32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(srch32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
}
6 changes: 6 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -598,10 +598,16 @@ class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> :
[DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;

// Intrinsic shape with two typed source operands (Src0Ty, Src1Ty) followed by
// a float operand, returning DstTy. The intrinsic is declared IntrNoMem and
// IntrSpeculatable and is tied to the clang builtin named
// "__builtin_amdgcn_" concatenated with `name`.
class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
[DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;

def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_bf6_bf16">;
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;

def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4546,6 +4546,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2840,6 +2840,7 @@ def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;

def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,11 @@ let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPExcep
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_bf16>;
}

// VOP3 definitions for the two-source f32 -> fp6/bf6 scaled conversions.
// Both are gated on HasGFX950Insts, have mayRaiseFPException cleared, use the
// v6i32 <- (v16f32, v16f32, f32) profile, and are each paired with the
// intrinsic of the same name.
let SubtargetPredicate = HasGFX950Insts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_2xpk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V16F32_V16F32_F32>, int_amdgcn_cvt_scalef32_2xpk16_fp6_f32>;
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_2xpk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V16F32_V16F32_F32>, int_amdgcn_cvt_scalef32_2xpk16_bf6_f32>;
}

let SubtargetPredicate = isGFX10Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
Expand Down Expand Up @@ -1972,3 +1977,6 @@ defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3_Real_gfx9<0x259, "v_cvt_scalef32_pk32_f
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_bf6_f16">;
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
}

// Real (encoded) forms of the 2xpk16 f32 conversions: 0x252 for the fp6
// variant and 0x253 for the bf6 variant.
defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">;
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">;
128 changes: 128 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s

declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)

; 2xpk16 fp6 conversion with a non-inreg vector source and a non-inreg scale.
; The same %src value is passed for both vector operands, so the expected
; instruction uses one register range (v[0:15]) for src0 and src1.
; NOTE(review): because both sources are identical, this test cannot detect a
; swapped src0/src1 operand order.
define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[0:15], v[0:15], v16
; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[0:15], v[0:15], v16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float %scale)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
ret void
}

; 2xpk16 fp6 conversion with an inreg vector source and a literal scale of
; 100.0 (0x42c80000 in the expected output). The expected code copies
; s[0:15] into v[2:17] before the conversion. The SDAG output keeps the
; scale in s16, while the GlobalISel output materializes it in v18.
; The same %src value is passed for both vector operands.
define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], s16
; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], v18
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
ret void
}

; 2xpk16 bf6 conversion with a non-inreg vector source and a non-inreg scale.
; The same %src value is passed for both vector operands, so the expected
; instruction uses one register range (v[0:15]) for src0 and src1.
; NOTE(review): because both sources are identical, this test cannot detect a
; swapped src0/src1 operand order.
define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[0:15], v[0:15], v16
; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[0:15], v[0:15], v16
; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float %scale)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
ret void
}

; 2xpk16 bf6 conversion with an inreg vector source and a literal scale of
; 100.0 (0x42c80000 in the expected output). The expected code copies
; s[0:15] into v[2:17] before the conversion. The SDAG output keeps the
; scale in s16, while the GlobalISel output materializes it in v18.
; The same %src value is passed for both vector operands.
define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], s16
; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], v18
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
; GFX950-GISEL-NEXT: s_endpgm
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
24 changes: 24 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_features.s
Original file line number Diff line number Diff line change
Expand Up @@ -1065,3 +1065,27 @@ v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,0,1]
// NOT-GFX950: error: instruction not supported on this GPU
// GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x04]
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x00]
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], s6

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x00]
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], s6

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], 22 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x5a,0x02]
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], 22

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x2e,0x02]
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11
24 changes: 24 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -317,3 +317,27 @@ v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 div:2

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 clamp div:2

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 clamp

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 mul:2

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 div:2

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 clamp div:2

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 clamp

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 mul:2

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 div:2

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 clamp div:2
18 changes: 18 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -743,3 +743,21 @@

# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20

# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04

# GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x04]
0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x04

# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x00]
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x00

# GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x00]
0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x00

# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], 22 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x5a,0x02]
0x14,0x00,0x52,0xd2,0x0a,0x15,0x5a,0x02

# GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x2e,0x02]
0x14,0x00,0x53,0xd2,0x0a,0x15,0x2e,0x02
Loading