Skip to content

Commit 5450c2d

Browse files
arsenmsrpande
authored and committed
AMDGPU: Add support for v_dot2c_f32_bf16 instruction for gfx950 (llvm#117598)
The encoding of the v_dot2c_f32_bf16 opcode is the same as that of v_mac_f32 on gfx90a; both belong to the gfx9 series. This required a new DecoderNamespace, GFX950_DOT. Co-authored-by: Sirish Pande <[email protected]>
1 parent 72b906f commit 5450c2d

File tree

15 files changed

+372
-15
lines changed

15 files changed

+372
-15
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_bf8, "fUiUif", "nc", "dot11-insts")
276276
TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_fp8, "fUiUif", "nc", "dot11-insts")
277277
TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_fp8, "fUiUif", "nc", "dot11-insts")
278278
TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_bf8, "fUiUif", "nc", "dot11-insts")
279+
TARGET_BUILTIN(__builtin_amdgcn_fdot2c_f32_bf16, "fV2yV2yfIb", "nc", "dot13-insts")
279280

280281
//===----------------------------------------------------------------------===//
281282
// GFX10+ only builtins.

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9090
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9191
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92-
// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92+
// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
9393
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9494
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9595
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"

clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@ typedef unsigned int uint;
77
typedef half __attribute__((ext_vector_type(2))) half2;
88
typedef short __attribute__((ext_vector_type(2))) short2;
99
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
10+
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
1011

1112
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1213
kernel void builtins_amdgcn_dl_insts_err(
1314
global float *fOut, global int *siOut, global uint *uiOut,
1415
global short *sOut, global int *iOut, global half *hOut,
1516
half2 v2hA, half2 v2hB, float fC, half hC,
17+
bfloat2 v2bfbfA, bfloat2 v2bfbfB,
1618
short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
1719
ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
1820
int A, int B, int C) {
@@ -26,6 +28,9 @@ kernel void builtins_amdgcn_dl_insts_err(
2628
fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}
2729
fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}
2830

31+
fOut[3] = __builtin_amdgcn_fdot2c_f32_bf16(v2bfbfA, v2bfbfB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2c_f32_bf16' needs target feature dot13-insts}}
32+
fOut[4] = __builtin_amdgcn_fdot2c_f32_bf16(v2bfbfA, v2bfbfB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2c_f32_bf16' needs target feature dot13-insts}}
33+
2934
siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
3035
siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
3136

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
88
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
99
typedef half __attribute__((ext_vector_type(32))) half32;
1010
typedef short __attribute__((ext_vector_type(2))) short2;
11+
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
1112
typedef float __attribute__((ext_vector_type(16))) float16;
1213

1314
// CHECK-LABEL: @test_prng_b32(
@@ -216,17 +217,16 @@ void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
216217
*out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
217218
}
218219

219-
// CHECK-LABEL: define dso_local void @builtins_amdgcn_dl_insts(
220-
// CHECK-SAME: ptr addrspace(1) noundef [[OUT:%.*]], float noundef [[FC:%.*]], <2 x i16> noundef [[V2SSA:%.*]], <2 x i16> noundef [[V2SSB:%.*]]) #[[ATTR0:[0-9]+]] {
220+
// CHECK-LABEL: @builtins_amdgcn_dl_insts(
221221
// CHECK-NEXT: entry:
222222
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
223223
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
224224
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
225225
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
226-
// CHECK-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
227-
// CHECK-NEXT: store float [[FC]], ptr addrspace(5) [[FC_ADDR]], align 4
228-
// CHECK-NEXT: store <2 x i16> [[V2SSA]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
229-
// CHECK-NEXT: store <2 x i16> [[V2SSB]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
226+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
227+
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
228+
// CHECK-NEXT: store <2 x i16> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
229+
// CHECK-NEXT: store <2 x i16> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
230230
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
231231
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat>
232232
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
@@ -240,3 +240,25 @@ void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
240240
void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) {
241241
*out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
242242
}
243+
244+
// CHECK-LABEL: @builtins_amdgcn_dl_dot2c(
245+
// CHECK-NEXT: entry:
246+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
247+
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
248+
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
249+
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
250+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
251+
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
252+
// CHECK-NEXT: store <2 x bfloat> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
253+
// CHECK-NEXT: store <2 x bfloat> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
254+
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
255+
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
256+
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
257+
// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.fdot2c.f32.bf16(<2 x bfloat> [[TMP0]], <2 x bfloat> [[TMP1]], float [[TMP2]], i1 false)
258+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
259+
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
260+
// CHECK-NEXT: ret void
261+
//
262+
void builtins_amdgcn_dl_dot2c(global float *out, float fC, bfloat2 v2ssA, bfloat2 v2ssB) {
263+
*out = __builtin_amdgcn_fdot2c_f32_bf16(v2ssA, v2ssB, fC, false);
264+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2838,6 +2838,24 @@ def int_amdgcn_fdot2_f32_bf16 :
28382838
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
28392839
>;
28402840

2841+
// f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
2842+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
2843+
// TODO: This is similar to the llvm.amdgcn.fdot2 intrinsic, which produces
2844+
// v_dot2c_f32_f16 on gfx940. Maybe we can consolidate these.
2845+
2846+
def int_amdgcn_fdot2c_f32_bf16 :
2847+
ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">,
2848+
DefaultAttrsIntrinsic<
2849+
[llvm_float_ty], // %r
2850+
[
2851+
llvm_v2bf16_ty, // %a
2852+
llvm_v2bf16_ty, // %b
2853+
llvm_float_ty, // %c
2854+
llvm_i1_ty // %clamp
2855+
],
2856+
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
2857+
>;
2858+
28412859
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
28422860
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
28432861
def int_amdgcn_sdot2 :

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,13 @@ def FeatureDot12Insts : SubtargetFeature<"dot12-insts",
753753
"Has v_dot2_f32_bf16 instructions"
754754
>;
755755

756+
def FeatureDot13Insts : SubtargetFeature<"dot13-insts",
757+
"HasDot13Insts",
758+
"true",
759+
"Has v_dot2c_f32_bf16 instructions"
760+
>;
761+
762+
756763
def FeatureMAIInsts : SubtargetFeature<"mai-insts",
757764
"HasMAIInsts",
758765
"true",
@@ -1587,7 +1594,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
15871594
FeatureBF8ConversionScaleInsts,
15881595
FeatureFP4ConversionScaleInsts,
15891596
FeatureFP6BF6ConversionScaleInsts,
1590-
FeatureDot12Insts
1597+
FeatureDot12Insts,
1598+
FeatureDot13Insts
15911599
])>;
15921600

15931601
def FeatureISAVersion9_4_0 : FeatureSet<
@@ -2372,6 +2380,9 @@ def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">,
23722380
def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">,
23732381
AssemblerPredicate<(all_of FeatureDot12Insts)>;
23742382

2383+
def HasDot13Insts : Predicate<"Subtarget->hasDot13Insts()">,
2384+
AssemblerPredicate<(all_of FeatureDot13Insts)>;
2385+
23752386
def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
23762387
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
23772388

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4505,6 +4505,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45054505
case Intrinsic::amdgcn_fdot2_bf16_bf16:
45064506
case Intrinsic::amdgcn_fdot2_f16_f16:
45074507
case Intrinsic::amdgcn_fdot2_f32_bf16:
4508+
case Intrinsic::amdgcn_fdot2c_f32_bf16:
45084509
case Intrinsic::amdgcn_sudot4:
45094510
case Intrinsic::amdgcn_sudot8:
45104511
case Intrinsic::amdgcn_dot4_f32_fp8_bf8:

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
547547
tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
548548
break;
549549

550+
if (STI.hasFeature(AMDGPU::FeatureGFX950Insts) &&
551+
tryDecodeInst(DecoderTableGFX95064, MI, QW, Address, CS))
552+
break;
553+
550554
// Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
551555
// v_mad_mixhi_f16 for FMA variants. Try to decode using this special
552556
// table first so we print the correct name.
@@ -608,6 +612,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
608612
if (isGFX9() && tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
609613
break;
610614

615+
if (STI.hasFeature(AMDGPU::FeatureGFX950Insts) &&
616+
tryDecodeInst(DecoderTableGFX95032, MI, DW, Address, CS))
617+
break;
618+
611619
if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
612620
tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
613621
break;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
157157
bool HasDot10Insts = false;
158158
bool HasDot11Insts = false;
159159
bool HasDot12Insts = false;
160+
bool HasDot13Insts = false;
160161
bool HasMAIInsts = false;
161162
bool HasFP8Insts = false;
162163
bool HasFP8ConversionInsts = false;
@@ -831,6 +832,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
831832
return HasDot12Insts;
832833
}
833834

835+
bool hasDot13Insts() const {
836+
return HasDot13Insts;
837+
}
838+
834839
bool hasMAIInsts() const {
835840
return HasMAIInsts;
836841
}

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,7 @@ bool isMAC(unsigned Opc) {
600600
Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx11 ||
601601
Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx12 ||
602602
Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi ||
603+
Opc == AMDGPU::V_DOT2C_F32_BF16_e64_vi ||
603604
Opc == AMDGPU::V_DOT2C_I32_I16_e64_vi ||
604605
Opc == AMDGPU::V_DOT4C_I32_I8_e64_vi ||
605606
Opc == AMDGPU::V_DOT8C_I32_I4_e64_vi;

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,12 @@ def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> {
527527
let HasClamp = 1;
528528
}
529529

530+
def VOP_DOT_ACC_F32_V2BF16 : VOP_DOT_ACC<f32, v2bf16> {
531+
let Src0ModDPP = FPVRegInputMods;
532+
let Src1ModDPP = FPVRegInputMods;
533+
let HasClamp = 1;
534+
}
535+
530536
def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32> {
531537
let HasExtVOP3DPP = 0;
532538
let HasSrc0Mods = 1;
@@ -1102,6 +1108,9 @@ let Constraints = "$vdst = $src2",
11021108
defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
11031109
let SubtargetPredicate = HasDot3Insts in
11041110
defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
1111+
1112+
let SubtargetPredicate = HasDot13Insts in
1113+
defm V_DOT2C_F32_BF16 : VOP2Inst<"v_dot2c_f32_bf16", VOP_DOT_ACC_F32_V2BF16>;
11051114
}
11061115

11071116
let AddedComplexity = 30 in {
@@ -1111,6 +1120,12 @@ let AddedComplexity = 30 in {
11111120
> {
11121121
let SubtargetPredicate = HasDot5Insts;
11131122
}
1123+
def : GCNPat<
1124+
(f32 (int_amdgcn_fdot2_f32_bf16 v2bf16:$src0, v2bf16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))),
1125+
(f32 (V_DOT2C_F32_BF16_e32 $src0, $src1, $src2))
1126+
> {
1127+
let SubtargetPredicate = HasDot13Insts;
1128+
}
11141129
def : GCNPat<
11151130
(i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
11161131
(i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2))
@@ -2564,3 +2579,8 @@ let SubtargetPredicate = HasDot3Insts in {
25642579
let DecoderNamespace = "GFX10_B" in
25652580
defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx10<0x02>;
25662581
}
2582+
2583+
let OtherPredicates = [HasDot13Insts] in {
2584+
let DecoderNamespace = "GFX950" in
2585+
defm V_DOT2C_F32_BF16 : VOP2_Real_DOT_ACC_gfx9<0x16>;
2586+
}

llvm/lib/TargetParser/TargetParser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
478478
Features["permlane32-swap"] = true;
479479
Features["ashr-pk-insts"] = true;
480480
Features["dot12-insts"] = true;
481+
Features["dot13-insts"] = true;
481482
Features["gfx950-insts"] = true;
482483
[[fallthrough]];
483484
case GK_GFX942:

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
5454
; GFX950-ISEL-NEXT: s_nop 1
5555
; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[0:1]
5656
; GFX950-ISEL-NEXT: s_endpgm
57-
5857
ptr addrspace(1) %r,
5958
ptr addrspace(1) %a,
6059
ptr addrspace(1) %b,
@@ -97,9 +96,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
9796
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
9897
; GFX950-NEXT: v_mov_b32_e32 v1, s8
9998
; GFX950-NEXT: v_mov_b32_e32 v2, s9
100-
; GFX950-NEXT: v_dot2_f32_bf16 v1, s10, v1, v2
99+
; GFX950-NEXT: v_dot2c_f32_bf16_e32 v2, s10, v1
101100
; GFX950-NEXT: s_nop 2
102-
; GFX950-NEXT: global_store_dword v0, v1, s[0:1]
101+
; GFX950-NEXT: global_store_dword v0, v2, s[0:1]
103102
; GFX950-NEXT: s_endpgm
104103
;
105104
; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
@@ -112,12 +111,11 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
112111
; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
113112
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s8
114113
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s9
115-
; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s10, v0, v1
116-
; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0
114+
; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v1, s10, v0
115+
; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
117116
; GFX950-ISEL-NEXT: s_nop 1
118-
; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[0:1]
117+
; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[0:1]
119118
; GFX950-ISEL-NEXT: s_endpgm
120-
121119
ptr addrspace(1) %r,
122120
ptr addrspace(1) %a,
123121
ptr addrspace(1) %b,

0 commit comments

Comments
 (0)