Skip to content

Commit b208cef

Browse files
arsenmpravinjagtap
andcommitted
AMDGPU: Support v_cvt_scalef32_2xpk16_{bf|fp}6_f32 for gfx950. (llvm#117595)
Scale two packed 16-component single-precision float vectors (the first two source inputs) using the exponent provided by the third single-precision float input, then convert the 32 scaled values to a single packed 32-component FP6 or BF6 value. Co-authored-by: Pravin Jagtap <[email protected]>
1 parent 8d6bf9b commit b208cef

File tree

10 files changed

+235
-1
lines changed

10 files changed

+235
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950
467467
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
468468
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
469469

470+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
471+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
472+
470473
//===----------------------------------------------------------------------===//
471474
// GFX12+ only builtins.
472475
//===----------------------------------------------------------------------===//

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
77
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
88
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
99
typedef half __attribute__((ext_vector_type(32))) half32;
10+
typedef short __attribute__((ext_vector_type(2))) short2;
11+
typedef float __attribute__((ext_vector_type(16))) float16;
1012

1113
// CHECK-LABEL: @test_prng_b32(
1214
// CHECK-NEXT: entry:
@@ -115,10 +117,14 @@ void test_permlane32_swap(global uint2* out, uint old, uint src) {
115117
// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
116118
// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
117119
// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
120+
// CHECK-NEXT: [[SRC0F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
121+
// CHECK-NEXT: [[SRC1F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
118122
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
119123
// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
120124
// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
121125
// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
126+
// CHECK-NEXT: store <16 x float> [[SRC0F32:%.*]], ptr addrspace(5) [[SRC0F32_ADDR]], align 64
127+
// CHECK-NEXT: store <16 x float> [[SRC1F32:%.*]], ptr addrspace(5) [[SRC1F32_ADDR]], align 64
122128
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
123129
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
124130
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
@@ -140,12 +146,26 @@ void test_permlane32_swap(global uint2* out, uint old, uint src) {
140146
// CHECK-NEXT: [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]])
141147
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
142148
// CHECK-NEXT: store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32
149+
// CHECK-NEXT: [[TMP16:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
150+
// CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
151+
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
152+
// CHECK-NEXT: [[TMP19:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> [[TMP16]], <16 x float> [[TMP17]], float [[TMP18]])
153+
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
154+
// CHECK-NEXT: store <6 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 32
155+
// CHECK-NEXT: [[TMP21:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
156+
// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
157+
// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
158+
// CHECK-NEXT: [[TMP24:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> [[TMP21]], <16 x float> [[TMP22]], float [[TMP23]])
159+
// CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
160+
// CHECK-NEXT: store <6 x i32> [[TMP24]], ptr addrspace(1) [[TMP25]], align 32
143161
// CHECK-NEXT: ret void
144162
//
145-
void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float scale)
163+
void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float16 src0f32, float16 src1f32, float scale)
146164
{
147165
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(srcbf32, scale);
148166
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(srch32, scale);
149167
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(srcbf32, scale);
150168
*out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(srch32, scale);
169+
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
170+
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
151171
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,10 +565,16 @@ class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> :
565565
[DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
566566
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
567567

568+
class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
569+
[DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
570+
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
571+
568572
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
569573
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
570574
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
571575
def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_bf6_bf16">;
576+
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
577+
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
572578

573579
def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
574580
[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4523,6 +4523,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45234523
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
45244524
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
45254525
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4526+
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
4527+
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
45264528
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
45274529
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
45284530
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2647,6 +2647,7 @@ def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
26472647
def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
26482648
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
26492649
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
2650+
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
26502651

26512652
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
26522653
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,6 +1010,11 @@ let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPExcep
10101010
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_bf16>;
10111011
}
10121012

1013+
let SubtargetPredicate = HasGFX950Insts, mayRaiseFPException = 0 in {
1014+
defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_2xpk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V16F32_V16F32_F32>, int_amdgcn_cvt_scalef32_2xpk16_fp6_f32>;
1015+
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_2xpk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V16F32_V16F32_F32>, int_amdgcn_cvt_scalef32_2xpk16_bf6_f32>;
1016+
}
1017+
10131018
let SubtargetPredicate = isGFX10Plus in {
10141019
let isCommutable = 1, isReMaterializable = 1 in {
10151020
defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -1952,3 +1957,6 @@ defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3_Real_gfx9<0x259, "v_cvt_scalef32_pk32_f
19521957
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_bf6_f16">;
19531958
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
19541959
}
1960+
1961+
defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">;
1962+
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">;
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
4+
5+
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
6+
declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
7+
8+
define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
9+
; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_vv:
10+
; GFX950-SDAG: ; %bb.0:
11+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18
12+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17
13+
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[0:15], v[0:15], v16
14+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
15+
; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
16+
; GFX950-SDAG-NEXT: s_endpgm
17+
;
18+
; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_vv:
19+
; GFX950-GISEL: ; %bb.0:
20+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17
21+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18
22+
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[0:15], v[0:15], v16
23+
; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off
24+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16
25+
; GFX950-GISEL-NEXT: s_endpgm
26+
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float %scale)
27+
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
28+
ret void
29+
}
30+
31+
define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
32+
; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl:
33+
; GFX950-SDAG: ; %bb.0:
34+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
35+
; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
36+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
37+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
38+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
39+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
40+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
41+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
42+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
43+
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], s16
44+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
45+
; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
46+
; GFX950-SDAG-NEXT: s_endpgm
47+
;
48+
; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_sl:
49+
; GFX950-GISEL: ; %bb.0:
50+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
51+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
52+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
53+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
54+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
55+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
56+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
57+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
58+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
59+
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], v18
60+
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
61+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
62+
; GFX950-GISEL-NEXT: s_endpgm
63+
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
64+
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
65+
ret void
66+
}
67+
68+
define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
69+
; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_vv:
70+
; GFX950-SDAG: ; %bb.0:
71+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18
72+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17
73+
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[0:15], v[0:15], v16
74+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
75+
; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
76+
; GFX950-SDAG-NEXT: s_endpgm
77+
;
78+
; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_vv:
79+
; GFX950-GISEL: ; %bb.0:
80+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17
81+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18
82+
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[0:15], v[0:15], v16
83+
; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off
84+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16
85+
; GFX950-GISEL-NEXT: s_endpgm
86+
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float %scale)
87+
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
88+
ret void
89+
}
90+
91+
define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
92+
; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl:
93+
; GFX950-SDAG: ; %bb.0:
94+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
95+
; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
96+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
97+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
98+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
99+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
100+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
101+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
102+
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
103+
; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], s16
104+
; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
105+
; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
106+
; GFX950-SDAG-NEXT: s_endpgm
107+
;
108+
; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_sl:
109+
; GFX950-GISEL: ; %bb.0:
110+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
111+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
112+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
113+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
114+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
115+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
116+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
117+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
118+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
119+
; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], v18
120+
; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
121+
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
122+
; GFX950-GISEL-NEXT: s_endpgm
123+
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
124+
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
125+
ret void
126+
}
127+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
128+
; GCN: {{.*}}

llvm/test/MC/AMDGPU/gfx950_asm_features.s

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,3 +1065,27 @@ v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,0,1]
10651065
// NOT-GFX950: error: instruction not supported on this GPU
10661066
// GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
10671067
v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3
1068+
1069+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1070+
// GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
1071+
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6
1072+
1073+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1074+
// GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x04]
1075+
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6
1076+
1077+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1078+
// GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x00]
1079+
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], s6
1080+
1081+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1082+
// GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x00]
1083+
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], s6
1084+
1085+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1086+
// GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], 22 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x5a,0x02]
1087+
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], 22
1088+
1089+
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
1090+
// GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x2e,0x02]
1091+
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11

llvm/test/MC/AMDGPU/gfx950_err.s

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,3 +317,27 @@ v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 div:2
317317

318318
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
319319
v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 clamp div:2
320+
321+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
322+
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 clamp
323+
324+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
325+
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 mul:2
326+
327+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
328+
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 div:2
329+
330+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
331+
v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 clamp div:2
332+
333+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
334+
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 clamp
335+
336+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
337+
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 mul:2
338+
339+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
340+
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 div:2
341+
342+
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
343+
v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 clamp div:2

llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -743,3 +743,21 @@
743743

744744
# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
745745
0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20
746+
747+
# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
748+
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04
749+
750+
# GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x04]
751+
0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x04
752+
753+
# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x00]
754+
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x00
755+
756+
# GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], s6 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x00]
757+
0x14,0x00,0x53,0xd2,0x0a,0x15,0x1a,0x00
758+
759+
# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], 22 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x5a,0x02]
760+
0x14,0x00,0x52,0xd2,0x0a,0x15,0x5a,0x02
761+
762+
# GFX950: v_cvt_scalef32_2xpk16_bf6_f32 v[20:25], v[10:25], v[10:25], 11 ; encoding: [0x14,0x00,0x53,0xd2,0x0a,0x15,0x2e,0x02]
763+
0x14,0x00,0x53,0xd2,0x0a,0x15,0x2e,0x02

0 commit comments

Comments
 (0)