[AArch64][SME2] Add multi-vector frint intrinsics

kmclaughlin-arm · kmclaughlin-arm · commit f3188b98d05d · 2023-02-07T11:34:07.000Z
Adds x2 and x4 vector intrinsics for the following instructions: - frinta - frintm - frintn - frintp NOTE: These intrinsics are still in development and are subject to future changes. Reviewed By: david-arm Differential Revision: https://reviews.llvm.org/D143058
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3044,6 +3044,13 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_sqdmulh_vgx2 : SME2_VG2_Multi_Multi_Intrinsic;
   def int_aarch64_sve_sqdmulh_vgx4 : SME2_VG4_Multi_Multi_Intrinsic;
 
+  // Multi-vector floating-point round to integral value
+
+  foreach inst = ["a", "m", "n", "p"] in {
+    def int_aarch64_sve_frint # inst # _x2 : SVE2_VG2_ZipUzp_Intrinsic;
+    def int_aarch64_sve_frint # inst # _x4 : SVE2_VG4_ZipUzp_Intrinsic;
+  }
+
   //
   // Multi-vector min/max
   //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -367,6 +367,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode);
   void SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs,
                                  bool IsTupleInput, unsigned Opc);
+  void SelectFrintFromVT(SDNode *N, unsigned NumVecs, unsigned Opcode);
 
   template <unsigned MaxIdx, unsigned Scale>
   void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg,
@@ -1874,6 +1875,13 @@ void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
   CurDAG->RemoveDeadNode(N);
 }
 
+void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
+                                            unsigned Opcode) {
+  if (N->getValueType(0) != MVT::nxv4f32)
+    return;
+  SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
+}
+
 void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs,
                                       unsigned Op) {
   SDLoc DL(N);
@@ -5380,6 +5388,30 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
       SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true,
                                 AArch64::UZP_VG4_4Z4Z_Q);
       return;
+    case Intrinsic::aarch64_sve_frinta_x2:
+      SelectFrintFromVT(Node, 2, AArch64::FRINTA_2Z2Z_S);
+      return;
+    case Intrinsic::aarch64_sve_frinta_x4:
+      SelectFrintFromVT(Node, 4, AArch64::FRINTA_4Z4Z_S);
+      return;
+    case Intrinsic::aarch64_sve_frintm_x2:
+      SelectFrintFromVT(Node, 2, AArch64::FRINTM_2Z2Z_S);
+      return;
+    case Intrinsic::aarch64_sve_frintm_x4:
+      SelectFrintFromVT(Node, 4, AArch64::FRINTM_4Z4Z_S);
+      return;
+    case Intrinsic::aarch64_sve_frintn_x2:
+      SelectFrintFromVT(Node, 2, AArch64::FRINTN_2Z2Z_S);
+      return;
+    case Intrinsic::aarch64_sve_frintn_x4:
+      SelectFrintFromVT(Node, 4, AArch64::FRINTN_4Z4Z_S);
+      return;
+    case Intrinsic::aarch64_sve_frintp_x2:
+      SelectFrintFromVT(Node, 2, AArch64::FRINTP_2Z2Z_S);
+      return;
+    case Intrinsic::aarch64_sve_frintp_x4:
+      SelectFrintFromVT(Node, 4, AArch64::FRINTP_4Z4Z_S);
+      return;
     }
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-frint.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-frint.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+; FRINTA
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frinta_x2_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
+; CHECK-LABEL: multi_vec_frinta_x2_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    frinta { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x2.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
+  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frinta_x4_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
+; CHECK-LABEL: multi_vec_frinta_x4_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    frinta { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x4.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
+  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+; FRINTM
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frintm_x2_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
+; CHECK-LABEL: multi_vec_frintm_x2_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    frintm { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x2.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
+  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frintm_x4_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
+; CHECK-LABEL: multi_vec_frintm_x4_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    frintm { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x4.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
+  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+; FRINTN
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frintn_x2_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
+; CHECK-LABEL: multi_vec_frintn_x2_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    frintn { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x2.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
+  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frintn_x4_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
+; CHECK-LABEL: multi_vec_frintn_x4_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    frintn { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x4.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
+  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+; FRINTP
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frintp_x2_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
+; CHECK-LABEL: multi_vec_frintp_x2_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    frintp { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x2.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
+  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_frintp_x4_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
+; CHECK-LABEL: multi_vec_frintp_x4_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    frintp { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x4.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
+  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
+}
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)