Skip to content

AMDGPU: Add support for v_dot2c_f32_bf16 instruction for gfx950 #117598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_bf8, "fUiUif", "nc", "dot11-insts")
TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_fp8, "fUiUif", "nc", "dot11-insts")
TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_fp8, "fUiUif", "nc", "dot11-insts")
TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_bf8, "fUiUif", "nc", "dot11-insts")
TARGET_BUILTIN(__builtin_amdgcn_fdot2c_f32_bf16, "fV2yV2yfIb", "nc", "dot13-insts")

//===----------------------------------------------------------------------===//
// GFX10+ only builtins.
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
Expand Down
5 changes: 5 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ typedef unsigned int uint;
typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void builtins_amdgcn_dl_insts_err(
global float *fOut, global int *siOut, global uint *uiOut,
global short *sOut, global int *iOut, global half *hOut,
half2 v2hA, half2 v2hB, float fC, half hC,
bfloat2 v2bfbfA, bfloat2 v2bfbfB,
short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
int A, int B, int C) {
Expand All @@ -26,6 +28,9 @@ kernel void builtins_amdgcn_dl_insts_err(
fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}
fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}

fOut[3] = __builtin_amdgcn_fdot2c_f32_bf16(v2bfbfA, v2bfbfB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2c_f32_bf16' needs target feature dot13-insts}}
fOut[4] = __builtin_amdgcn_fdot2c_f32_bf16(v2bfbfA, v2bfbfB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2c_f32_bf16' needs target feature dot13-insts}}

siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}

Expand Down
34 changes: 28 additions & 6 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
typedef half __attribute__((ext_vector_type(32))) half32;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
typedef float __attribute__((ext_vector_type(16))) float16;

// CHECK-LABEL: @test_prng_b32(
Expand Down Expand Up @@ -216,17 +217,16 @@ void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
*out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
}

// CHECK-LABEL: define dso_local void @builtins_amdgcn_dl_insts(
// CHECK-SAME: ptr addrspace(1) noundef [[OUT:%.*]], float noundef [[FC:%.*]], <2 x i16> noundef [[V2SSA:%.*]], <2 x i16> noundef [[V2SSB:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-LABEL: @builtins_amdgcn_dl_insts(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[FC]], ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSA]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSB]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat>
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
Expand All @@ -240,3 +240,25 @@ void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) {
// Calls the fdot2 bf16 builtin with short2 operands and the clamp argument
// fixed to false, then stores the float result through `out`.
*out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
}

// CHECK-LABEL: @builtins_amdgcn_dl_dot2c(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: store <2 x bfloat> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: store <2 x bfloat> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.fdot2c.f32.bf16(<2 x bfloat> [[TMP0]], <2 x bfloat> [[TMP1]], float [[TMP2]], i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void builtins_amdgcn_dl_dot2c(global float *out, float fC, bfloat2 v2ssA, bfloat2 v2ssB) {
// Calls the fdot2c bf16 builtin with bfloat2 operands and the clamp argument
// fixed to false; the CHECK lines for this function expect a direct call to
// @llvm.amdgcn.fdot2c.f32.bf16 with `i1 false`, followed by a store to `out`.
*out = __builtin_amdgcn_fdot2c_f32_bf16(v2ssA, v2ssB, fC, false);
}
18 changes: 18 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2820,6 +2820,24 @@ def int_amdgcn_fdot2_f32_bf16 :
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>;

// f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// %clamp is an immediate operand (ImmArg on argument index 3).
// Exposed to Clang as __builtin_amdgcn_fdot2c_f32_bf16.
// TODO: This is similar to the llvm.amdgcn.fdot2 intrinsic, which produces
// v_dot2c_f32_f16 on gfx940. Maybe we can consolidate these.

def int_amdgcn_fdot2c_f32_bf16 :
ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">,
DefaultAttrsIntrinsic<
[llvm_float_ty], // %r
[
llvm_v2bf16_ty, // %a
llvm_v2bf16_ty, // %b
llvm_float_ty, // %c
llvm_i1_ty // %clamp
],
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>;

// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
Expand Down
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,13 @@ def FeatureDot12Insts : SubtargetFeature<"dot12-insts",
"Has v_dot2_f32_bf16 instructions"
>;

// Subtarget feature "dot13-insts": sets HasDot13Insts to "true" and marks the
// subtarget as having the v_dot2c_f32_bf16 instruction.
def FeatureDot13Insts : SubtargetFeature<"dot13-insts",
"HasDot13Insts",
"true",
"Has v_dot2c_f32_bf16 instructions"
>;


def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"HasMAIInsts",
"true",
Expand Down Expand Up @@ -1585,7 +1592,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
FeatureBF8ConversionScaleInsts,
FeatureFP4ConversionScaleInsts,
FeatureFP6BF6ConversionScaleInsts,
FeatureDot12Insts
FeatureDot12Insts,
FeatureDot13Insts
])>;

def FeatureISAVersion9_4_0 : FeatureSet<
Expand Down Expand Up @@ -2373,6 +2381,9 @@ def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">,
def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">,
AssemblerPredicate<(all_of FeatureDot12Insts)>;

// Predicate that holds when Subtarget->hasDot13Insts() is true; the assembler
// form requires FeatureDot13Insts.
def HasDot13Insts : Predicate<"Subtarget->hasDot13Insts()">,
AssemblerPredicate<(all_of FeatureDot13Insts)>;

def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4528,6 +4528,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_fdot2_bf16_bf16:
case Intrinsic::amdgcn_fdot2_f16_f16:
case Intrinsic::amdgcn_fdot2_f32_bf16:
case Intrinsic::amdgcn_fdot2c_f32_bf16:
case Intrinsic::amdgcn_sudot4:
case Intrinsic::amdgcn_sudot8:
case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
break;

if (STI.hasFeature(AMDGPU::FeatureGFX950Insts) &&
tryDecodeInst(DecoderTableGFX95064, MI, QW, Address, CS))
break;

// Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
// v_mad_mixhi_f16 for FMA variants. Try to decode using this special
// table first so we print the correct name.
Expand Down Expand Up @@ -627,6 +631,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (isGFX9() && tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
break;

if (STI.hasFeature(AMDGPU::FeatureGFX950Insts) &&
tryDecodeInst(DecoderTableGFX95032, MI, DW, Address, CS))
break;

if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
break;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasDot10Insts = false;
bool HasDot11Insts = false;
bool HasDot12Insts = false;
bool HasDot13Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
Expand Down Expand Up @@ -830,6 +831,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasDot12Insts;
}

/// \returns the value of the HasDot13Insts subtarget flag (false by default).
bool hasDot13Insts() const {
return HasDot13Insts;
}

bool hasMAIInsts() const {
return HasMAIInsts;
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,7 @@ bool isMAC(unsigned Opc) {
Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx12 ||
Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi ||
Opc == AMDGPU::V_DOT2C_F32_BF16_e64_vi ||
Opc == AMDGPU::V_DOT2C_I32_I16_e64_vi ||
Opc == AMDGPU::V_DOT4C_I32_I8_e64_vi ||
Opc == AMDGPU::V_DOT8C_I32_I4_e64_vi;
Expand Down
20 changes: 20 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,12 @@ def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> {
let HasClamp = 1;
}

// VOP_DOT_ACC profile instantiated with f32 and v2bf16. It overrides the DPP
// source modifiers for src0/src1 with FPVRegInputMods and enables the clamp
// bit, with the same overrides as the v2f16 profile defined just before it.
def VOP_DOT_ACC_F32_V2BF16 : VOP_DOT_ACC<f32, v2bf16> {
let Src0ModDPP = FPVRegInputMods;
let Src1ModDPP = FPVRegInputMods;
let HasClamp = 1;
}

def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32> {
let HasExtVOP3DPP = 0;
let HasSrc0Mods = 1;
Expand Down Expand Up @@ -1182,6 +1188,9 @@ let Constraints = "$vdst = $src2",
defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
let SubtargetPredicate = HasDot3Insts in
defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;

let SubtargetPredicate = HasDot13Insts in
defm V_DOT2C_F32_BF16 : VOP2Inst<"v_dot2c_f32_bf16", VOP_DOT_ACC_F32_V2BF16>;
}

let AddedComplexity = 30 in {
Expand All @@ -1191,6 +1200,12 @@ let AddedComplexity = 30 in {
> {
let SubtargetPredicate = HasDot5Insts;
}
// Select llvm.amdgcn.fdot2.f32.bf16 with clamp disabled (DSTCLAMP.NONE) to
// V_DOT2C_F32_BF16_e32 on subtargets that have dot13 instructions.
// NOTE(review): the source pattern matches int_amdgcn_fdot2_f32_bf16, not the
// int_amdgcn_fdot2c_f32_bf16 intrinsic; confirm whether a selection pattern
// for the fdot2c intrinsic is also intended.
def : GCNPat<
(f32 (int_amdgcn_fdot2_f32_bf16 v2bf16:$src0, v2bf16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))),
(f32 (V_DOT2C_F32_BF16_e32 $src0, $src1, $src2))
> {
let SubtargetPredicate = HasDot13Insts;
}
def : GCNPat<
(i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
(i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2))
Expand Down Expand Up @@ -2670,3 +2685,8 @@ let SubtargetPredicate = HasDot3Insts in {
let DecoderNamespace = "GFX10_B" in
defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx10<0x02>;
}

// Real encoding for V_DOT2C_F32_BF16, guarded by HasDot13Insts and placed in
// the "GFX950" decoder namespace. 0x16 is the value passed to
// VOP2_Real_DOT_ACC_gfx9 (presumably the VOP2 opcode; confirm against the ISA
// documentation).
let OtherPredicates = [HasDot13Insts] in {
let DecoderNamespace = "GFX950" in
defm V_DOT2C_F32_BF16 : VOP2_Real_DOT_ACC_gfx9<0x16>;
}
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane32-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["dot12-insts"] = true;
Features["dot13-insts"] = true;
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
Expand Down
12 changes: 5 additions & 7 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX950-ISEL-NEXT: s_nop 1
; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX950-ISEL-NEXT: s_endpgm

ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
Expand Down Expand Up @@ -93,9 +92,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v1, s0
; GFX950-NEXT: v_mov_b32_e32 v2, s1
; GFX950-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2
; GFX950-NEXT: v_dot2c_f32_bf16_e32 v2, s2, v1
; GFX950-NEXT: s_nop 2
; GFX950-NEXT: global_store_dword v0, v1, s[8:9]
; GFX950-NEXT: global_store_dword v0, v2, s[8:9]
; GFX950-NEXT: s_endpgm
;
; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
Expand All @@ -108,12 +107,11 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v1, s2, v0
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-ISEL-NEXT: s_nop 1
; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9]
; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[8:9]
; GFX950-ISEL-NEXT: s_endpgm

ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
Expand Down
Loading
Loading