Skip to content

Commit 1fd3d1d

Browse files
jthackray, momchil-velikov, Lukacma, CarolineConcatto, SpencerAbson
authored
[AArch64] Add intrinsics for SME FP8 FDOT LANE instructions (#118492)
Add support for the following SME 8 bit floating-point dot-product intrinsics: * void svdot_lane_za16_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, uint64_t imm_idx, fpm_t fpm); * void svdot_lane_za16_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, uint64_t imm_idx, fpm_t fpm); * void svdot_lane_za32_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, uint64_t imm_idx, fpm_t fpm); * void svdot_lane_za32_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, uint64_t imm_idx, fpm_t fpm); --------- Co-authored-by: Momchil Velikov <[email protected]> Co-authored-by: Marian Lukac <[email protected]> Co-authored-by: Caroline Concatto <[email protected]> Co-authored-by: SpencerAbson <[email protected]>
1 parent ccc8e45 commit 1fd3d1d

File tree

8 files changed

+327
-4
lines changed

8 files changed

+327
-4
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,21 @@ let SMETargetGuard = "sme2" in {
740740
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
741741
}
742742

743+
//
744+
// SME2 FP8 instructions
745+
//
746+
747+
// FDOT
748+
// ZA32 lane forms of the FP8 dot product, guarded by "sme-f8f32".
// Both are streaming, read and write ZA, and set FPMR (see the flag list).
// Argument 3 (the lane index) is immediate-checked against the range 0..3.
let SMETargetGuard = "sme-f8f32" in {
749+
def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
750+
def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
751+
}
752+
753+
// ZA16 lane forms of the FP8 dot product, guarded by "sme-f8f16".
// Same flags as the ZA32 forms; here argument 3 (the lane index) is
// immediate-checked against the wider range 0..7.
let SMETargetGuard = "sme-f8f16" in {
754+
def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
755+
def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
756+
}
757+
743758
////////////////////////////////////////////////////////////////////////////////
744759
// SME2p1 - FMOPA, FMOPS (non-widening)
745760
let SMETargetGuard = "sme-b16b16" in {

clang/include/clang/Basic/arm_sve_sme_incl.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ include "arm_immcheck_incl.td"
5252
// h: half-float
5353
// d: double
5454
// b: bfloat
55+
// m: mfloat8
5556

5657
// Typespec modifiers
5758
// ------------------
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// REQUIRES: aarch64-registered-target
3+
#include <arm_sme.h>
4+
5+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
6+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
7+
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
8+
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
9+
// Also make sure the file compiles all the way to assembly without diagnostics.
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
10+
#include <arm_sme.h>
11+
12+
// SVE_ACLE_FUNC pastes its pieces together into an intrinsic name.
// When SVE_OVERLOADED_FORMS is defined the middle piece (the type suffix) is
// dropped, so the overloaded spelling is called; otherwise all three pieces
// are joined to give the fully-suffixed name.
#ifdef SVE_OVERLOADED_FORMS
13+
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
14+
#else
15+
#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
16+
#endif
17+
18+
// CHECK-LABEL: define dso_local void @test_svdot_lane_za32_f8_vg1x2(
19+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
20+
// CHECK-NEXT: [[ENTRY:.*:]]
21+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
22+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
23+
// CHECK-NEXT: ret void
24+
//
25+
// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za32_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
26+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
27+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
28+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
29+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
30+
// CPP-CHECK-NEXT: ret void
31+
//
32+
void test_svdot_lane_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
33+
svmfloat8_t zm, fpm_t fpmr)
34+
__arm_streaming __arm_inout("za") {
35+
// ZA32, two-vector group; lane index 3 is the top of the 0..3 range accepted for za32.
SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr);
36+
}
37+
38+
// CHECK-LABEL: define dso_local void @test_svdot_lane_za32_f8_vg1x4(
39+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
40+
// CHECK-NEXT: [[ENTRY:.*:]]
41+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
42+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
43+
// CHECK-NEXT: ret void
44+
//
45+
// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za32_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm(
46+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
47+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
48+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
49+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
50+
// CPP-CHECK-NEXT: ret void
51+
//
52+
void test_svdot_lane_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
53+
svmfloat8_t zm, fpm_t fpmr)
54+
__arm_streaming __arm_inout("za") {
55+
// ZA32, four-vector group; lane index 3 is the top of the 0..3 range accepted for za32.
SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr);
56+
}
57+
58+
// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x2(
59+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
60+
// CHECK-NEXT: [[ENTRY:.*:]]
61+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
62+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
63+
// CHECK-NEXT: ret void
64+
//
65+
// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
66+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
67+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
68+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
69+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
70+
// CPP-CHECK-NEXT: ret void
71+
//
72+
void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
73+
svmfloat8_t zm, fpm_t fpmr)
74+
__arm_streaming __arm_inout("za") {
75+
// ZA16, two-vector group; lane index 3 lies inside the 0..7 range accepted for za16.
SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr);
76+
}
77+
78+
// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x4(
79+
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
80+
// CHECK-NEXT: [[ENTRY:.*:]]
81+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
82+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
83+
// CHECK-NEXT: ret void
84+
//
85+
// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm(
86+
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
87+
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
88+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
89+
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
90+
// CPP-CHECK-NEXT: ret void
91+
//
92+
void test_svdot_lane_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
93+
svmfloat8_t zm, fpm_t fpmr)
94+
__arm_streaming __arm_inout("za") {
95+
// ZA16, four-vector group; lane index 3 lies inside the 0..7 range accepted for za16.
SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr);
96+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -verify -emit-llvm -o - %s
2+
3+
// REQUIRES: aarch64-registered-target
4+
5+
#include <arm_sme.h>
6+
7+
// Every call in this function is expected to be rejected: the compile command
// at the top of this file enables only +sme and +sme2, so the sme-f8f32 (za32)
// and sme-f8f16 (za16) requirements of these intrinsics are not met.
void test_features(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2,
8+
svmfloat8x4_t f8x4, uint64_t fpmr) __arm_streaming __arm_inout("za") {
9+
// expected-error@+1 {{'svdot_lane_za32_mf8_vg1x2_fpm' needs target feature sme,sme-f8f32}}
10+
svdot_lane_za32_mf8_vg1x2_fpm(slice, f8x2, f8, 3, fpmr);
11+
// expected-error@+1 {{'svdot_lane_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}}
12+
svdot_lane_za32_mf8_vg1x4_fpm(slice, f8x4, f8, 3, fpmr);
13+
// expected-error@+1 {{'svdot_lane_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}}
14+
svdot_lane_za16_mf8_vg1x2_fpm(slice, f8x2, f8, 3, fpmr);
15+
// expected-error@+1 {{'svdot_lane_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}}
16+
svdot_lane_za16_mf8_vg1x4_fpm(slice, f8x4, f8, 3, fpmr);
17+
}
18+
19+
// Lane-index range checks. Each intrinsic is called once with -1 (which the
// diagnostic reports as 18446744073709551615) and once with the first value
// above its upper bound: 4 for the za32 forms, 8 for the za16 forms.
void test_imm(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2,
20+
svmfloat8x4_t f8x4, uint64_t fpmr) __arm_streaming __arm_inout("za") {
21+
// expected-error@+1{{argument value 18446744073709551615 is outside the valid range [0, 3]}}
22+
svdot_lane_za32_mf8_vg1x2_fpm(slice, f8x2, f8, -1, fpmr);
23+
// expected-error@+1{{argument value 18446744073709551615 is outside the valid range [0, 3]}}
24+
svdot_lane_za32_mf8_vg1x4_fpm(slice, f8x4, f8, -1, fpmr);
25+
// expected-error@+1{{argument value 18446744073709551615 is outside the valid range [0, 7]}}
26+
svdot_lane_za16_mf8_vg1x2_fpm(slice, f8x2, f8, -1, fpmr);
27+
// expected-error@+1{{argument value 18446744073709551615 is outside the valid range [0, 7]}}
28+
svdot_lane_za16_mf8_vg1x4_fpm(slice, f8x4, f8, -1, fpmr);
29+
30+
// expected-error@+1{{argument value 4 is outside the valid range [0, 3]}}
31+
svdot_lane_za32_mf8_vg1x2_fpm(slice, f8x2, f8, 4, fpmr);
32+
// expected-error@+1{{argument value 4 is outside the valid range [0, 3]}}
33+
svdot_lane_za32_mf8_vg1x4_fpm(slice, f8x4, f8, 4, fpmr);
34+
// expected-error@+1{{argument value 8 is outside the valid range [0, 7]}}
35+
svdot_lane_za16_mf8_vg1x2_fpm(slice, f8x2, f8, 8, fpmr);
36+
// expected-error@+1{{argument value 8 is outside the valid range [0, 7]}}
37+
svdot_lane_za16_mf8_vg1x4_fpm(slice, f8x4, f8, 8, fpmr);
38+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3856,6 +3856,31 @@ def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic;
38563856
def int_aarch64_neon_famax : AdvSIMD_2VectorArg_Intrinsic;
38573857
def int_aarch64_neon_famin : AdvSIMD_2VectorArg_Intrinsic;
38583858

3859+
3860+
// SME FP8 FDOT intrinsics
3861+
// Intrinsic signatures for the SME FP8 FDOT lane forms. Both classes return
// nothing and are marked IntrInaccessibleMemOnly + IntrHasSideEffects.
let TargetPrefix = "aarch64" in {
3862+
3863+
// Two-vector form. Operands: i32, two nxv16i8, one nxv16i8, i32.
// The trailing i32 (operand 4) must be an immediate.
class SME2_FP8_FDOT_LANE_VG1x2 :
3864+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3865+
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3866+
llvm_nxv16i8_ty,
3867+
llvm_i32_ty],
3868+
[IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;
3869+
3870+
// Four-vector form. Operands: i32, four nxv16i8, one nxv16i8, i32.
// The trailing i32 (operand 6) must be an immediate.
class SME2_FP8_FDOT_LANE_VG1x4 :
3871+
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
3872+
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
3873+
llvm_nxv16i8_ty,
3874+
llvm_i32_ty],
3875+
[IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
3876+
3877+
// The za16 and za32 variants share the same signatures.
def int_aarch64_sme_fp8_fdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
3878+
def int_aarch64_sme_fp8_fdot_lane_za16_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;
3879+
3880+
def int_aarch64_sme_fp8_fdot_lane_za32_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
3881+
def int_aarch64_sme_fp8_fdot_lane_za32_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;
3882+
}
3883+
38593884
//
38603885
// FP8 Intrinsics
38613886
//

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -986,8 +986,8 @@ def LUTI4_S_4ZZT2Z : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
986986

987987
let Predicates = [HasSMEF8F16] in {
988988
defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
989-
defm FDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot", 0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
990-
defm FDOT_VG4_M4ZZI_BtoH : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot", 0b100, ZZZZ_b_mul_r, ZPR4b8>;
989+
defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
990+
defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>;
991991
defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
992992
defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
993993

@@ -1008,9 +1008,9 @@ defm FMOPA_MPPZZ_BtoH : sme2_fp8_fmopa_za16<"fmopa", int_aarch64_sme_fp8_fmopa_z
10081008
} //[HasSMEF8F16]
10091009

10101010
let Predicates = [HasSMEF8F32] in {
1011+
defm FDOT_VG2_M2ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x2>;
1012+
defm FDOT_VG4_M4ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x4>;
10111013

1012-
defm FDOT_VG2_M2ZZI_BtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b01, 0b0111, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
1013-
defm FDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b0001, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
10141014
defm FDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010011, MatrixOp32, ZZ_b, ZPR4b8>;
10151015
defm FDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110011, MatrixOp32, ZZZZ_b, ZPR4b8>;
10161016

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType ou
244244
: Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)))),
245245
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset)>;
246246

247+
247248
//===----------------------------------------------------------------------===//
248249
// SME pattern match helpers.
249250
//===----------------------------------------------------------------------===//
@@ -5793,3 +5794,91 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic> {
57935794
// Multiple vectors
57945795
def _M2Z2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>;
57955796
}
5797+
5798+
// FP8 SME FDOT instructions
5799+
5800+
// Indexed FP8 FDOT into ZA16 from a two-vector group (ZZ_b_mul_r).
// Produces four records: the instruction itself, an assembly alias, a
// _PSEUDO, and the pattern that selects `intrinsic` to the pseudo/instruction.
multiclass sme2_fp8_fdot_index_za16_vg1x2<string mnemonic,
5801+
SDPatternOperator intrinsic> {
5802+
def NAME : sme2_multi_vec_array_vg2_index<0b11, {0b0,?,?,0b10,?}, MatrixOp16,
5803+
ZZ_b_mul_r, ZPR4b8,
5804+
VectorIndexH32b_timm, mnemonic>,
5805+
SMEPseudo2Instr<NAME, 1>{
5806+
// The instruction is declared as reading FPMR and FPCR.
let Uses=[FPMR, FPCR];
5807+
5808+
// 3-bit lane index: bits 2..1 are encoded in Inst{11-10}, bit 0 in Inst{3}.
bits<3> i;
5809+
let Inst{11-10} = i{2-1};
5810+
let Inst{3} = i{0};
5811+
}
5812+
5813+
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
5814+
(!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
5815+
ZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;
5816+
5817+
5818+
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, ZZ_b_mul_r, ZPR4b8, VectorIndexH32b_timm, SMEMatrixArray>;
5819+
5820+
def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexH32b_timm, tileslice16>;
5821+
}
5822+
5823+
// Indexed FP8 FDOT into ZA16 from a four-vector group (ZZZZ_b_mul_r).
// Produces four records: the instruction itself, an assembly alias, a
// _PSEUDO, and the pattern that selects `intrinsic` to the pseudo/instruction.
multiclass sme2_fp8_fdot_index_za16_vg1x4<string mnemonic,
5824+
SDPatternOperator intrinsic> {
5825+
def NAME : sme2_multi_vec_array_vg4_index<0b0, {0b1,?,?,0b100,?}, MatrixOp16,
5826+
ZZZZ_b_mul_r, ZPR4b8,
5827+
VectorIndexH32b_timm, mnemonic>,
5828+
SMEPseudo2Instr<NAME, 1> {
5829+
// The instruction is declared as reading FPMR and FPCR.
let Uses=[FPMR, FPCR];
5830+
5831+
// 3-bit lane index: bits 2..1 are encoded in Inst{11-10}, bit 0 in Inst{3}.
bits<3> i;
5832+
let Inst{11-10} = i{2-1};
5833+
let Inst{3} = i{0};
5834+
}
5835+
5836+
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
5837+
(!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
5838+
sme_elm_idx0_7:$imm3, ZZZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;
5839+
5840+
5841+
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b_mul_r, ZPR4b8, VectorIndexH32b_timm, SMEMatrixArray>;
5842+
5843+
def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexH32b_timm, tileslice16>;
5844+
}
5845+
5846+
// Indexed FP8 FDOT into ZA32 from a two-vector group (ZZ_b_mul_r).
// Produces four records: the instruction itself, an assembly alias, a
// _PSEUDO, and the pattern that selects `intrinsic` to the pseudo/instruction.
multiclass sme2_fp8_fdot_index_za32_vg1x2<string mnemonic,
5847+
SDPatternOperator intrinsic> {
5848+
def NAME : sme2_multi_vec_array_vg2_index<0b01, {0b0,?,?,0b111}, MatrixOp32, ZZ_b_mul_r, ZPR4b8,
5849+
VectorIndexS32b_timm, mnemonic>,
5850+
SMEPseudo2Instr<NAME, 1> {
5851+
// The instruction is declared as reading FPMR and FPCR.
let Uses=[FPMR, FPCR];
5852+
5853+
// 2-bit lane index, encoded whole in Inst{11-10}.
bits<2> i;
5854+
let Inst{11-10} = i;
5855+
}
5856+
5857+
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
5858+
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
5859+
ZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexS32b_timm:$i), 0>;
5860+
5861+
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, ZZ_b_mul_r, ZPR4b8, VectorIndexS32b_timm, SMEMatrixArray>;
5862+
5863+
def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexS32b_timm, tileslice16>;
5864+
}
5865+
5866+
// Indexed FP8 FDOT into ZA32 from a four-vector group (ZZZZ_b_mul_r).
// Produces four records: the instruction itself, an assembly alias, a
// _PSEUDO, and the pattern that selects `intrinsic` to the pseudo/instruction.
multiclass sme2_fp8_fdot_index_za32_vg1x4<string mnemonic,
5867+
SDPatternOperator intrinsic> {
5868+
def NAME : sme2_multi_vec_array_vg4_index<0b1, {0b0,?,?,0b0,0b001}, MatrixOp32, ZZZZ_b_mul_r,
5869+
ZPR4b8, VectorIndexS32b_timm, mnemonic>,
5870+
SMEPseudo2Instr<NAME, 1> {
5871+
// The instruction is declared as reading FPMR and FPCR.
let Uses=[FPMR, FPCR];
5872+
5873+
// 2-bit lane index, encoded whole in Inst{11-10}.
bits<2> i;
5874+
let Inst{11-10} = i;
5875+
}
5876+
5877+
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
5878+
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
5879+
ZZZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexS32b_timm:$i), 0>;
5880+
5881+
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b_mul_r, ZPR4b8, VectorIndexS32b_timm, SMEMatrixArray>;
5882+
5883+
def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexS32b_timm, tileslice16>;
5884+
}

0 commit comments

Comments
 (0)