Skip to content

[AArch64] Add intrinsics for SME FP8 FDOT LANE instructions #118492

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 13, 2024
Merged
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,11 @@ let SMETargetGuard = "sme2" in {
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
}

// FDOT
let SMETargetGuard = "sme2,sme-f8f16" in {
def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
}
////////////////////////////////////////////////////////////////////////////////
// SME2p1 - FMOPA, FMOPS (non-widening)
let SMETargetGuard = "sme-b16b16" in {
Expand Down
1 change: 1 addition & 0 deletions clang/include/clang/Basic/arm_sve_sme_incl.td
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ include "arm_immcheck_incl.td"
// h: half-float
// d: double
// b: bfloat
// m: mfloat8

// Typespec modifiers
// ------------------
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10183,6 +10183,8 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
case SVETypeFlags::EltTyInt64:
return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);

case SVETypeFlags::EltTyMFloat8:
return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
case SVETypeFlags::EltTyFloat16:
return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
case SVETypeFlags::EltTyBFloat16:
Expand Down Expand Up @@ -11234,6 +11236,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);

// Emit set FPMR for intrinsics that require it
if (TypeFlags.setsFPMR())
Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
Ops.pop_back_val());
// Handle builtins which require their multi-vector operands to be swapped
swapCommutativeSMEOperands(BuiltinID, Ops);

Expand Down
57 changes: 57 additions & 0 deletions clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: aarch64-registered-target
#include <arm_sme.h>

// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sme.h>

#ifdef SVE_OVERLOADED_FORMS
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
#else
#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
#endif

// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x2(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
// CPP-CHECK-NEXT: ret void
//
void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm, fpm_t fpmr)
__arm_streaming __arm_inout("za") {
SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr);
}


// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x4(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
// CPP-CHECK-NEXT: ret void
//
void test_svdot_lane_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
svmfloat8_t zm, fpm_t fpmr)
__arm_streaming __arm_inout("za") {
SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr);
}
22 changes: 22 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3864,3 +3864,25 @@ def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic;
// Neon absolute maximum and minimum
def int_aarch64_neon_famax : AdvSIMD_2VectorArg_Intrinsic;
def int_aarch64_neon_famin : AdvSIMD_2VectorArg_Intrinsic;

// SME FDOT instructions
let TargetPrefix = "aarch64" in {


class SME2_FP8_FDOT_LANE_VG1x2 :
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;

class SME2_FP8_FDOT_LANE_VG1x4 :
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;

def int_aarch64_sme_fp8_fdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
def int_aarch64_sme_fp8_fdot_lane_za16_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;
}
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -986,8 +986,8 @@ def LUTI4_S_4ZZT2Z : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;

let Predicates = [HasSMEF8F16] in {
defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
defm FDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot", 0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
defm FDOT_VG4_M4ZZI_BtoH : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot", 0b100, ZZZZ_b_mul_r, ZPR4b8>;
defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", 0b11, 0b010, ZZ_b_mul_r, int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", 0b100, ZZZZ_b_mul_r, int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>;
defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
// TODO: Replace nxv16i8 by nxv16f8
Expand Down
141 changes: 141 additions & 0 deletions llvm/lib/Target/AArch64/SMEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,37 @@ class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType ou
: Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)))),
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset)>;


// FP8 SME FDOT instructions

// Selection DAG patterns - map to first level of pseudo-instructions (xxx_PSEUDO)
class SME2_FP8_FMLA_FDOT_Index_VG1x2_Pat<string name, SDPatternOperator intrinsic,
ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
ValueType vt = nxv16i8>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
ZPR4b8:$Zm, imm_ty:$i)>;

class SME2_FP8_FMLA_FDOT_Index_VG1x4_Pat<string name, SDPatternOperator intrinsic,
ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
ValueType vt = nxv16i8>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4,
vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
ZPR4b8:$Zm, imm_ty:$i)>;

class sme2_fp8_fmla_fdot_index_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty, Operand imm_ty>
: SMEPseudo2Instr<name, 0>,
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i), []> {
let mayLoad = 1;
let SMEMatrixType = SMEMatrixArray;
let usesCustomInserter = 1;
}

//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -5737,3 +5768,113 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic> {
// Multiple vectors
def _M2Z2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>;
}

// FP8 SME FDOT instructions

// Selection DAG patterns - map to first level of pseudo-instructions (xxx_PSEUDO)

class SME2_FP8_FDOT_Index_VG1x2_Pat<string name, SDPatternOperator intrinsic,
ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
ValueType vt = nxv16i8>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;

class SME2_FP8_FDOT_Index_VG1x4_Pat<string name, SDPatternOperator intrinsic,
ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
ValueType vt = nxv16i8>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4,
vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;

// First level pseudo-instructions (xxx_PSEUDO) - transformed to second level pseudo-instructions (xxx_FPMR_PSEUDO)
// during instruction selection.
class sme2_fp8_fdot_index_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty, Operand imm_ty>
: SMEPseudo2Instr<name, 0>,
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []> {
let SMEMatrixType = SMEMatrixArray;
let usesCustomInserter = 1;
}

class sme2_fp8_fdot_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty>
: SMEPseudo2Instr<name, 0>,
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []> {
let SMEMatrixType = SMEMatrixArray;
let usesCustomInserter = 1;
}

// Second level pseudo-instruction - expanded to real instruction by the AArch64 pseudo instruction expansion pass
class sme2_fp8_fdot_index_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
RegisterOperand src1_ty, RegisterOperand src2_ty,
Operand imm_ty>
: Pseudo<(outs matrix_ty:$ZAda),
(ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []>,
SMEPseudo2Instr<name, 1> {
let hasNoSchedulingInfo = 1;
let Constraints = "$ZAda = $_ZAda";
}

class sme2_fp8_fdot_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
RegisterOperand src1_ty, RegisterOperand src2_ty>
: Pseudo<(outs matrix_ty:$ZAda),
(ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []>,
SMEPseudo2Instr<name, 1> {
let hasNoSchedulingInfo = 1;
let Constraints = "$ZAda = $_ZAda";
}

// FDOT instructions
multiclass sme2_fp8_fdot_index_za16_vg1x2<string mnemonic, bits<2> sz, bits<3> op,
RegisterOperand multi_vector_ty, SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
multi_vector_ty, ZPR4b8,
VectorIndexH32b_timm, mnemonic>,
SMEPseudo2Instr<NAME, 1>{
let Uses=[FPMR, FPCR];
let mayLoad = 1;

bits<3> i;
let Inst{11-10} = i{2-1};
let Inst{3} = i{0};
}

def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;


def _PSEUDO : sme2_fp8_fmla_fdot_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, ZPR4b8, VectorIndexH32b_timm>;

def : SME2_FP8_FMLA_FDOT_Index_VG1x2_Pat<NAME, intrinsic, tileslice16, sme_elm_idx0_7, VectorIndexH32b_timm>;
}

multiclass sme2_fp8_fdot_index_za16_vg1x4<string mnemonic, bits<3> op,
RegisterOperand multi_vector_ty, SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
multi_vector_ty, ZPR4b8,
VectorIndexH32b_timm, mnemonic>,
SMEPseudo2Instr<NAME, 1> {
let Uses=[FPMR, FPCR];
let mayLoad = 1;

bits<3> i;
let Inst{11-10} = i{2-1};
let Inst{3} = i{0};
}

def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;


def _PSEUDO : sme2_fp8_fmla_fdot_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, ZPR4b8, VectorIndexH32b_timm>;

def : SME2_FP8_FMLA_FDOT_Index_VG1x4_Pat<NAME, intrinsic, tileslice16, sme_elm_idx0_7, VectorIndexH32b_timm>;
}
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "^[ \t]*//.*$" --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-f8f16,+sme-f8f32 -verify-machineinstrs -force-streaming < %s | FileCheck %s

target triple = "aarch64-linux"

define void @test_fdot16_1x2_indexed(i32 %slice.0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
; CHECK-LABEL: test_fdot16_1x2_indexed:
; CHECK: mov w8, w0
; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, z2.b[1]
; CHECK: ret
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm, i32 1)
ret void
}

define void @test_fdot16_1x4_indexed(i32 %slice.0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
; CHECK-LABEL: test_fdot16_1x4_indexed:
; CHECK: mov w8, w0
; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, z4.b[1]
; CHECK: ret
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm, i32 1)
ret void
}


attributes #0 = { "target-features" = "+sme,+sme-f8f32,+sme-f8f16" }
Loading