
[Clang][llvm] Implement fp8 FMOP4A intrinsics #130127


Merged: 4 commits, Apr 23, 2025
18 changes: 18 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
@@ -321,6 +321,24 @@ let SMETargetGuard = "sme2,sme-mop4,sme-b16b16" in {
defm SVBMOP4S_H : MOP4<"s", "_za16", "b", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_1>]>;
}

////////////////////////////////////////////////////////////////////////////////
// SME2 - FP8 FMOP4A, FMOP4S

multiclass MOP4_FP8<string za, list<ImmCheck> checks> {
def _1x1 : Inst<"svmop4a" # "[_1x1]" # za # "[_{d}_{d}]", "vidd>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x1", [IsInOutZA, IsStreaming], checks>;
def _1x2 : Inst<"svmop4a" # "[_1x2]" # za # "[_{d}_{d}]", "vid2>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x2", [IsInOutZA, IsStreaming], checks>;
def _2x1 : Inst<"svmop4a" # "[_2x1]" # za # "[_{d}_{d}]", "vi2d>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x1", [IsInOutZA, IsStreaming], checks>;
def _2x2 : Inst<"svmop4a" # "[_2x2]" # za # "[_{d}_{d}]", "vi22>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x2", [IsInOutZA, IsStreaming], checks>;
}

let SMETargetGuard = "sme2,sme-mop4,sme-f8f32" in {
defm SVMOP4A_FP8_ZA32 : MOP4_FP8<"_za32", [ImmCheck<0, ImmCheck0_3>]>;
}

let SMETargetGuard = "sme2,sme-mop4,sme-f8f16" in {
defm SVMOP4A_FP8_ZA16 : MOP4_FP8<"_za16", [ImmCheck<0, ImmCheck0_1>]>;
}
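
For reference, a rough sketch (not part of this diff; inferred from the "vidd>"/"vid2>"/"vi2d>"/"vi22>" prototype strings above and the tests below) of the C intrinsics these definitions generate, shown for the ZA32 variants:

// Illustrative declarations only; the real ones come from <arm_sme.h>.
// The leading immediate selects the ZA tile: 0-3 for ZA32, 0-1 for ZA16.
void svmop4a_1x1_za32_mf8_mf8_fpm(uint64_t imm, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svmop4a_1x2_za32_mf8_mf8_fpm(uint64_t imm, svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svmop4a_2x1_za32_mf8_mf8_fpm(uint64_t imm, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");
void svmop4a_2x2_za32_mf8_mf8_fpm(uint64_t imm, svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpm) __arm_streaming __arm_inout("za");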

////////////////////////////////////////////////////////////////////////////////
// SME2 - SMOP4A, SMOP4S, UMOP4A, UMOP4S

160 changes: 160 additions & 0 deletions clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c
@@ -0,0 +1,160 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s


#include <arm_sme.h>

#ifdef SME_OVERLOADED_FORMS
#define SME_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
#else
#define SME_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
#endif
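
As an aside, the call sites below expand like this under the two configurations (mechanical expansion of the macros above, shown for the first test):

// Without SME_OVERLOADED_FORMS (fully-qualified name):
//   SME_ACLE_FUNC(svmop4a,_1x1,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr)
//     -> svmop4a_1x1_za16_mf8_mf8_fpm(1, zn, zm, fpmr)
// With SME_OVERLOADED_FORMS (A2 and A4 dropped):
//     -> svmop4a_za16_fpm(1, zn, zm, fpmr)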
Comment from the PR author (Contributor):

I think we should change the tests to be like:

#ifdef SME_OVERLOADED_FORMS
#define SME_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
#else
#define SME_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
#endif

So the C tests are: (svmop4a,_2x1_za32,_s8_s8)


// CHECK-LABEL: define dso_local void @test_svmop4a_1x1_za16_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x1_za16_mf8_mf8_fpmu13__SVMfloat8_tS_m(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_1x1,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svmop4a_1x2_za16_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x2_za16_mf8_mf8_fpmu13__SVMfloat8_t13svmfloat8x2_tm(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_1x2,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svmop4a_2x1_za16_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x1_za16_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x1_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_2x1,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svmop4a_2x2_za16_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x2_za16_mf8_mf8_fpm13svmfloat8x2_tS_m(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x2_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_2x2,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svmop4a_1x1_za32_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x1_za32_mf8_mf8_fpmu13__SVMfloat8_tS_m(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_1x1,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svmop4a_1x2_za32_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x2_za32_mf8_mf8_fpmu13__SVMfloat8_t13svmfloat8x2_tm(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_1x2,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svmop4a_2x1_za32_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x1_za32_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x1_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_2x1,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svmop4a_2x2_za32_mf8_mf8_fpm(
// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x2_za32_mf8_mf8_fpm13svmfloat8x2_tS_m(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x2.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x2_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmop4a,_2x2,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
31 changes: 31 additions & 0 deletions clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_fp8_imm.cpp
@@ -0,0 +1,31 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
// RUN: -target-feature +sme -target-feature +sme2p2 -target-feature +sme-mop4 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -fsyntax-only -verify %s

// REQUIRES: aarch64-registered-target

#include <arm_sme.h>

void tests_mop4_imm_1x1(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
svmop4a_1x1_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
svmop4a_1x1_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
return;
}

void tests_mop4_imm_1x2(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
svmop4a_1x2_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
svmop4a_1x2_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
return;
}

void tests_mop4_imm_2x1(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
svmop4a_2x1_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
svmop4a_2x1_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
return;
}

void tests_mop4_imm_2x2(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
svmop4a_2x2_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
svmop4a_2x2_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
return;
}
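
For contrast, a minimal example (not part of this patch; hypothetical function name) of calls that satisfy the immediate checks exercised above, where the tile index ranges are 0-1 for ZA16 and 0-3 for ZA32:

void tests_mop4_imm_in_range(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
  svmop4a_1x1_za16_mf8_mf8_fpm(1, zn, zm, fpmr); // OK: 1 is within [0, 1]
  svmop4a_1x1_za32_mf8_mf8_fpm(3, zn, zm, fpmr); // OK: 3 is within [0, 3]
}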
14 changes: 11 additions & 3 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3068,22 +3068,22 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty,
- LLVMMatchType<0>], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+ LLVMMatchType<0>], [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;

class SME_OuterProduct_QuarterTile_Single_Multi
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty,
LLVMMatchType<0>,
- LLVMMatchType<0>], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+ LLVMMatchType<0>], [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;

class SME_OuterProduct_QuarterTile_Multi_Multi
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty,
LLVMMatchType<0>,
LLVMMatchType<0>,
- LLVMMatchType<0>], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+ LLVMMatchType<0>], [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;

// 2-way and 4-way multi-vector signed/unsigned Quarter Tile Quarter Product A/S
foreach mode = ["s", "a"] in {
@@ -3125,6 +3125,14 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_sutmopa_za32 : SME_OuterProduct_TMOP_Intrinsic;
def int_aarch64_sme_ustmopa_za32 : SME_OuterProduct_TMOP_Intrinsic;

+ // 16 and 32 bit multi-vector floating point 8 Quarter Tile Quarter Product
+ foreach za = ["za16", "za32"] in {
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single;
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_1x2" : SME_OuterProduct_QuarterTile_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_2x1" : SME_OuterProduct_QuarterTile_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_2x2" : SME_OuterProduct_QuarterTile_Multi_Multi;
+ }

class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
8 changes: 3 additions & 5 deletions llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1091,8 +1091,8 @@ let Predicates = [HasSME2p2] in {

} // [HasSME2p2]

- let Predicates = [HasSME_MOP4, HasSMEF8F16], Uses = [FPMR, FPCR] in {
- defm FMOP4A : sme2_fmop4a_fp8_fp16_2way<"fmop4a">;
+ let Predicates = [HasSME_MOP4, HasSMEF8F16] in {
+ defm FMOP4A : sme2_fmop4a_fp8_fp16_2way<"fmop4a", "int_aarch64_sme_fp8_fmop4a_za16">;
}

let Predicates = [HasSME_MOP4, HasSMEF16F16] in {
@@ -1105,10 +1105,8 @@ let Predicates = [HasSME2, HasSVEBFSCALE] in {
defm BFMUL : sme2_bfmul_multi<"bfmul">;
}

- let Uses = [FPMR, FPCR] in {
let Predicates = [HasSME_MOP4, HasSMEF8F32] in {
- defm FMOP4A : sme2_fmop4a_fp8_fp32_4way<"fmop4a">;
- }
+ defm FMOP4A : sme2_fmop4a_fp8_fp32_4way<"fmop4a", "int_aarch64_sme_fp8_fmop4a_za32">;
}

let Predicates = [HasSME_MOP4, HasSMEB16B16] in {