Skip to content

Commit e93799f

Browse files
[SME] Add intrinsics for FCVT(wid.) and FCVTL (#93202)
According to the specification in ARM-software/acle#309 this adds the intrinsics ``` svfloat32x2_t svcvt_f32[_f16_x2](svfloat16_t zn) __arm_streaming; svfloat32x2_t svcvtl_f32[_f16_x2](svfloat16_t zn) __arm_streaming; ``` These are available only if __ARM_FEATURE_SME_F16F16 is enabled. --------- Co-authored-by: Caroline Concatto <[email protected]>
1 parent 23a09b9 commit e93799f

File tree

7 files changed

+113
-2
lines changed

7 files changed

+113
-2
lines changed

clang/include/clang/Basic/arm_sve.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2270,6 +2270,10 @@ let TargetGuard = "sme2" in {
22702270
def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i", MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>;
22712271
}
22722272

2273+
let TargetGuard = "sme-f16f16" in {
2274+
def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>;
2275+
}
2276+
22732277
//
22742278
// Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16
22752279
//
@@ -2278,6 +2282,13 @@ let TargetGuard = "sme2" in {
22782282
def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>;
22792283
}
22802284

2285+
//
2286+
//Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
2287+
//
2288+
let TargetGuard = "sme-f16f16" in {
2289+
def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>;
2290+
}
2291+
22812292
//
22822293
// Multi-vector saturating extract narrow
22832294
//

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,3 +497,25 @@ svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) __arm_streaming {
497497
svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) __arm_streaming {
498498
return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn);
499499
}
500+
501+
// CHECK-LABEL: @test_cvt_f32_x2(
502+
// CHECK-NEXT: entry:
503+
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
504+
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
505+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
506+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
507+
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
508+
// CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
509+
//
510+
// CPP-CHECK-LABEL: @_Z15test_cvt_f32_x2u13__SVFloat16_t(
511+
// CPP-CHECK-NEXT: entry:
512+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
513+
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
514+
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
515+
// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
516+
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
517+
// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
518+
//
519+
__attribute__((target("sme-f16f16"))) svfloat32x2_t test_cvt_f32_x2(svfloat16_t zn) __arm_streaming {
520+
return SVE_ACLE_FUNC(svcvt_f32,_f16_x2,,)(zn);
521+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
3+
// REQUIRES: aarch64-registered-target
4+
5+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
6+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
7+
// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
8+
// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
9+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
10+
11+
#include <arm_sme.h>
12+
13+
#ifdef SVE_OVERLOADED_FORMS
14+
// A simple used,unused... macro, long enough to represent any SVE builtin.
15+
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
16+
#else
17+
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
18+
#endif
19+
20+
// CHECK-LABEL: @test_cvtl_f32_x2(
21+
// CHECK-NEXT: entry:
22+
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
23+
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
24+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
25+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
26+
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
27+
// CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
28+
//
29+
// CPP-CHECK-LABEL: @_Z16test_cvtl_f32_x2u13__SVFloat16_t(
30+
// CPP-CHECK-NEXT: entry:
31+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
32+
// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
33+
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
34+
// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
35+
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
36+
// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
37+
//
38+
svfloat32x2_t test_cvtl_f32_x2(svfloat16_t zn) __arm_streaming {
39+
return SVE_ACLE_FUNC(svcvtl_f32,_f16_x2,,)(zn);
40+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3121,6 +3121,11 @@ let TargetPrefix = "aarch64" in {
31213121
: DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
31223122
[llvm_nxv4f32_ty, llvm_nxv4f32_ty],
31233123
[IntrNoMem]>;
3124+
3125+
class SME2_CVT_WIDENING_VG2_Intrinsic
3126+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
3127+
[LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
3128+
31243129

31253130
class SME2_CVT_VG4_SINGLE_Intrinsic
31263131
: DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
@@ -3412,6 +3417,13 @@ let TargetPrefix = "aarch64" in {
34123417
def int_aarch64_sme_suvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
34133418
def int_aarch64_sme_usvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
34143419

3420+
3421+
//
3422+
//Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
3423+
//
3424+
3425+
def int_aarch64_sve_fcvtl_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
3426+
34153427
//
34163428
// Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
34173429
//
@@ -3431,7 +3443,7 @@ let TargetPrefix = "aarch64" in {
34313443
def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
34323444
def int_aarch64_sve_scvtf_x4 : SME2_CVT_X4_Intrinsic;
34333445
def int_aarch64_sve_ucvtf_x4 : SME2_CVT_X4_Intrinsic;
3434-
3446+
def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
34353447
//
34363448
// Multi-vector saturating extract narrow
34373449
//

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5717,6 +5717,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
57175717
case Intrinsic::aarch64_sve_ucvtf_x4:
57185718
SelectCVTIntrinsic(Node, 4, AArch64::UCVTF_4Z4Z_StoS);
57195719
return;
5720+
case Intrinsic::aarch64_sve_fcvt_widen_x2:
5721+
SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVT_2ZZ_H_S);
5722+
return;
5723+
case Intrinsic::aarch64_sve_fcvtl_widen_x2:
5724+
SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVTL_2ZZ_H_S);
5725+
return;
57205726
case Intrinsic::aarch64_sve_sclamp_single_x2:
57215727
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
57225728
Node->getValueType(0),

llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
33

44
;
55
; FCVT
@@ -139,6 +139,15 @@ define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale
139139
ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res
140140
}
141141

142+
define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_widen_x2_f16(<vscale x 8 x half> %zn0) {
143+
; CHECK-LABEL: multi_vector_cvt_widen_x2_f16:
144+
; CHECK: // %bb.0:
145+
; CHECK-NEXT: fcvt { z0.s, z1.s }, z0.h
146+
; CHECK-NEXT: ret
147+
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
148+
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
149+
}
150+
142151
declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
143152
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, <vscale x 4 x float>)
144153
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
3+
4+
define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvtl_widen_x2_f16(<vscale x 8 x half> %zn0) {
5+
; CHECK-LABEL: multi_vector_cvtl_widen_x2_f16:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: fcvtl { z0.s, z1.s }, z0.h
8+
; CHECK-NEXT: ret
9+
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
10+
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
11+
}

0 commit comments

Comments
 (0)