Skip to content

AMDGPU: Add support for v_ashr_pk_i8/u8_i32 instructions for gfx950 #117596

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")

TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")

TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")

Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
Expand Down
46 changes: 46 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,49 @@ void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, f
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
}

// CHECK-LABEL: @test_ashr_pk_i8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
*out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2);
}

// CHECK-LABEL: @test_ashr_pk_u8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
*out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
}
10 changes: 10 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3199,6 +3199,16 @@ def int_amdgcn_permlane32_swap :
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ashr_pk_i8_i32 int vdst, int src0, int src1 int src2
def int_amdgcn_ashr_pk_i8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_i8_i32">,
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.ashr_pk_u8_i32 int vdst, int src0, int src1 int src2
def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -414,12 +414,19 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
"Has f16bf16 to fp6bf6 conversion scale instructions"
>;

def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
"HasAshrPkInsts",
"true",
"Has Arithmetic Shift Pack instructions"
>;

def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
"GFX950Insts",
"true",
"Additional instructions for GFX950+",
[FeaturePermlane16Swap,
FeaturePermlane32Swap,
FeatureAshrPkInsts,
FeatureFP8ConversionScaleInsts,
FeatureBF8ConversionScaleInsts,
FeatureFP4ConversionScaleInsts,
Expand Down Expand Up @@ -2474,6 +2481,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;

def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;

// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4546,6 +4546,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
case Intrinsic::amdgcn_ashr_pk_i8_i32:
case Intrinsic::amdgcn_ashr_pk_u8_i32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;
bool HasVmemWriteVgprInOrder = false;
bool HasAshrPkInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;

bool RequiresCOV6 = false;

// Dummy feature to use for assembler in tablegen.
Expand Down Expand Up @@ -1326,6 +1328,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasPermlane16Swap() const { return HasPermlane16Swap; }
bool hasPermlane32Swap() const { return HasPermlane32Swap; }
bool hasAshrPkInsts() const { return HasAshrPkInsts; }

bool hasMinimum3Maximum3F32() const {
return HasMinimum3Maximum3F32;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2856,6 +2856,7 @@ def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
def VOP_I16_I32_I32_I32 : VOPProfile <[i16, i32, i32, i32]>;

def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,11 @@ let SubtargetPredicate = HasPseudoScalarTrans in {
def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
}

let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in {
defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_i8_i32>;
defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_u8_i32>;
} // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1

//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1978,5 +1983,8 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
}

defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;

defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">;
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">;
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1455,6 +1455,7 @@ def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
def VOP3_OPSEL_ONLY : VOP3Features<0, 1, 0, 0>;

// Packed is misleading, but it enables the appropriate op_sel
// modifiers.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["prng-inst"] = true;
Features["permlane16-swap"] = true;
Features["permlane32-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
Expand Down
72 changes: 72 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,75 @@ v_bitop3_b16 v5, v1, v2, s3 bitop3:161
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_bitop3_b16 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd2,0x01,0x05,0x0e,0x30]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, s4, v7, v8
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, v4, 0, 1
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, v4, 3, s2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, s4, 4, v2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, v4, v7, 0.5
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, s4, v7, v8
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, v4, 0, 1
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, v4, 3, s2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, s4, 4, v2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, v4, v7, -2.0
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
// GFX12-ERR: error: instruction not supported on this GPU
36 changes: 36 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,42 @@
# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20

# GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04

# GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04

# GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02

# GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00

# GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03

# GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04

# GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04

# GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04

# GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02

# GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00

# GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03

# GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04

# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04

Expand Down
Loading