Skip to content

Commit 476fffe

Browse files
arsenm, srpande
authored and committed
AMDGPU: Add support for v_ashr_pk_i8/u8_i32 instructions for gfx950 (llvm#117596)
This patch adds assembly and builtin support for v_ashr_pk_i8/u8_i32 instructions. Co-authored-by: Sirish Pande <[email protected]>
1 parent b208cef commit 476fffe

File tree

13 files changed

+194
-1
lines changed

13 files changed

+194
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950
467467
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
468468
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
469469

470+
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
471+
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
472+
470473
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
471474
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
472475

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9090
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9191
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92-
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92+
// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
9393
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9494
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9595
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,49 @@ void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, f
169169
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
170170
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
171171
}
172+
173+
// CHECK-LABEL: @test_ashr_pk_i8_i32(
174+
// CHECK-NEXT: entry:
175+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
176+
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
177+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
178+
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
179+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
180+
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
181+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
182+
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
183+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
184+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
185+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
186+
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
187+
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
188+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
189+
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
190+
// CHECK-NEXT: ret void
191+
//
192+
void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
193+
*out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2);
194+
}
195+
196+
// CHECK-LABEL: @test_ashr_pk_u8_i32(
197+
// CHECK-NEXT: entry:
198+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
199+
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
200+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
201+
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
202+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
203+
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
204+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
205+
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
206+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
207+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
208+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
209+
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
210+
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
211+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
212+
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
213+
// CHECK-NEXT: ret void
214+
//
215+
void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
216+
*out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
217+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3229,6 +3229,16 @@ def int_amdgcn_permlane32_swap :
32293229
[IntrNoMem, IntrConvergent, IntrWillReturn,
32303230
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;
32313231

3232+
// llvm.amdgcn.ashr.pk.i8.i32 i16 vdst, i32 src0, i32 src1, i32 src2
3233+
def int_amdgcn_ashr_pk_i8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_i8_i32">,
3234+
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
3235+
[IntrNoMem, IntrSpeculatable]>;
3236+
3237+
// llvm.amdgcn.ashr.pk.u8.i32 i16 vdst, i32 src0, i32 src1, i32 src2
3238+
def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
3239+
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
3240+
[IntrNoMem, IntrSpeculatable]>;
3241+
32323242
//===----------------------------------------------------------------------===//
32333243
// Special Intrinsics for backend internal use only. No frontend
32343244
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,12 +414,19 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
414414
"Has f16bf16 to fp6bf6 conversion scale instructions"
415415
>;
416416

417+
def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
418+
"HasAshrPkInsts",
419+
"true",
420+
"Has Arithmetic Shift Pack instructions"
421+
>;
422+
417423
def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
418424
"GFX950Insts",
419425
"true",
420426
"Additional instructions for GFX950+",
421427
[FeaturePermlane16Swap,
422428
FeaturePermlane32Swap,
429+
FeatureAshrPkInsts,
423430
FeatureFP8ConversionScaleInsts,
424431
FeatureBF8ConversionScaleInsts,
425432
FeatureFP4ConversionScaleInsts,
@@ -2479,6 +2486,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
24792486
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
24802487
AssemblerPredicate<(all_of FeatureXF32Insts)>;
24812488

2489+
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
2490+
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
2491+
24822492
// Include AMDGPU TD files
24832493
include "SISchedule.td"
24842494
include "GCNProcessors.td"

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4523,6 +4523,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45234523
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
45244524
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
45254525
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4526+
case Intrinsic::amdgcn_ashr_pk_i8_i32:
4527+
case Intrinsic::amdgcn_ashr_pk_u8_i32:
45264528
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
45274529
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
45284530
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
246246
bool HasForceStoreSC0SC1 = false;
247247
bool HasRequiredExportPriority = false;
248248
bool HasVmemWriteVgprInOrder = false;
249+
bool HasAshrPkInsts = false;
249250
bool HasMinimum3Maximum3F32 = false;
250251
bool HasMinimum3Maximum3F16 = false;
252+
251253
bool RequiresCOV6 = false;
252254

253255
// Dummy feature to use for assembler in tablegen.
@@ -1323,6 +1325,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13231325

13241326
bool hasPermlane16Swap() const { return HasPermlane16Swap; }
13251327
bool hasPermlane32Swap() const { return HasPermlane32Swap; }
1328+
bool hasAshrPkInsts() const { return HasAshrPkInsts; }
13261329

13271330
bool hasMinimum3Maximum3F32() const {
13281331
return HasMinimum3Maximum3F32;

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2662,6 +2662,7 @@ def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
26622662
def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
26632663
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
26642664
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
2665+
def VOP_I16_I32_I32_I32 : VOPProfile <[i16, i32, i32, i32]>;
26652666

26662667
def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
26672668
def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,6 +1182,11 @@ let SubtargetPredicate = HasPseudoScalarTrans in {
11821182
def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
11831183
}
11841184

1185+
let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in {
1186+
defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_i8_i32>;
1187+
defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_u8_i32>;
1188+
} // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1
1189+
11851190
//===----------------------------------------------------------------------===//
11861191
// Integer Clamp Patterns
11871192
//===----------------------------------------------------------------------===//
@@ -1958,5 +1963,8 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
19581963
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
19591964
}
19601965

1966+
defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
1967+
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;
1968+
19611969
defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">;
19621970
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">;

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,6 +1350,7 @@ def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
13501350
def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
13511351
def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
13521352
def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
1353+
def VOP3_OPSEL_ONLY : VOP3Features<0, 1, 0, 0>;
13531354

13541355
// Packed is misleading, but it enables the appropriate op_sel
13551356
// modifiers.

llvm/lib/TargetParser/TargetParser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
474474
Features["prng-inst"] = true;
475475
Features["permlane16-swap"] = true;
476476
Features["permlane32-swap"] = true;
477+
Features["ashr-pk-insts"] = true;
477478
Features["gfx950-insts"] = true;
478479
[[fallthrough]];
479480
case GK_GFX942:

llvm/test/MC/AMDGPU/gfx950_asm_vop3.s

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,75 @@ v_bitop3_b16 v5, v1, v2, s3 bitop3:161
7474
// GFX940-ERR: error: instruction not supported on this GPU
7575
// GFX950: v_bitop3_b16 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd2,0x01,0x05,0x0e,0x30]
7676
// GFX12-ERR: error: instruction not supported on this GPU
77+
78+
v_ashr_pk_i8_i32 v2, s4, v7, v8
79+
// GFX906-ERR: error: instruction not supported on this GPU
80+
// GFX940-ERR: error: instruction not supported on this GPU
81+
// GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
82+
// GFX12-ERR: error: instruction not supported on this GPU
83+
84+
v_ashr_pk_i8_i32 v2, v4, 0, 1
85+
// GFX906-ERR: error: instruction not supported on this GPU
86+
// GFX940-ERR: error: instruction not supported on this GPU
87+
// GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
88+
// GFX12-ERR: error: instruction not supported on this GPU
89+
90+
v_ashr_pk_i8_i32 v2, v4, 3, s2
91+
// GFX906-ERR: error: instruction not supported on this GPU
92+
// GFX940-ERR: error: instruction not supported on this GPU
93+
// GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
94+
// GFX12-ERR: error: instruction not supported on this GPU
95+
96+
v_ashr_pk_i8_i32 v2, s4, 4, v2
97+
// GFX906-ERR: error: instruction not supported on this GPU
98+
// GFX940-ERR: error: instruction not supported on this GPU
99+
// GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
100+
// GFX12-ERR: error: instruction not supported on this GPU
101+
102+
v_ashr_pk_i8_i32 v2, v4, v7, 0.5
103+
// GFX906-ERR: error: instruction not supported on this GPU
104+
// GFX940-ERR: error: instruction not supported on this GPU
105+
// GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
106+
// GFX12-ERR: error: instruction not supported on this GPU
107+
108+
v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
109+
// GFX906-ERR: error: instruction not supported on this GPU
110+
// GFX940-ERR: error: instruction not supported on this GPU
111+
// GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
112+
// GFX12-ERR: error: instruction not supported on this GPU
113+
114+
v_ashr_pk_u8_i32 v2, s4, v7, v8
115+
// GFX906-ERR: error: instruction not supported on this GPU
116+
// GFX940-ERR: error: instruction not supported on this GPU
117+
// GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
118+
// GFX12-ERR: error: instruction not supported on this GPU
119+
120+
v_ashr_pk_u8_i32 v2, v4, 0, 1
121+
// GFX906-ERR: error: instruction not supported on this GPU
122+
// GFX940-ERR: error: instruction not supported on this GPU
123+
// GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
124+
// GFX12-ERR: error: instruction not supported on this GPU
125+
126+
v_ashr_pk_u8_i32 v2, v4, 3, s2
127+
// GFX906-ERR: error: instruction not supported on this GPU
128+
// GFX940-ERR: error: instruction not supported on this GPU
129+
// GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
130+
// GFX12-ERR: error: instruction not supported on this GPU
131+
132+
v_ashr_pk_u8_i32 v2, s4, 4, v2
133+
// GFX906-ERR: error: instruction not supported on this GPU
134+
// GFX940-ERR: error: instruction not supported on this GPU
135+
// GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
136+
// GFX12-ERR: error: instruction not supported on this GPU
137+
138+
v_ashr_pk_u8_i32 v2, v4, v7, -2.0
139+
// GFX906-ERR: error: instruction not supported on this GPU
140+
// GFX940-ERR: error: instruction not supported on this GPU
141+
// GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
142+
// GFX12-ERR: error: instruction not supported on this GPU
143+
144+
v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
145+
// GFX906-ERR: error: instruction not supported on this GPU
146+
// GFX940-ERR: error: instruction not supported on this GPU
147+
// GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
148+
// GFX12-ERR: error: instruction not supported on this GPU

llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,42 @@
744744
# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
745745
0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20
746746

747+
# GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
748+
0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04
749+
750+
# GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
751+
0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04
752+
753+
# GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
754+
0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02
755+
756+
# GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
757+
0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00
758+
759+
# GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
760+
0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03
761+
762+
# GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
763+
0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04
764+
765+
# GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
766+
0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04
767+
768+
# GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
769+
0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04
770+
771+
# GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
772+
0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02
773+
774+
# GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
775+
0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00
776+
777+
# GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
778+
0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03
779+
780+
# GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
781+
0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04
782+
747783
# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
748784
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04
749785

0 commit comments

Comments
 (0)