Skip to content

Commit 99ee374

Browse files
committed
[AArch64] Lower aarch64_neon_saddlv via SADDLV nodes.
This mirrors what GISel already does, extending the existing lowering of aarch64_neon_uaddlv to UADDLV nodes so that it also covers aarch64_neon_saddlv (via SADDLV nodes) and the i64-result forms. This allows us to remove some TableGen patterns and gives slightly nicer codegen in places, because the nodes correctly represent the result as living in a vector register.
1 parent 277ca48 commit 99ee374

File tree

4 files changed

+41
-136
lines changed

4 files changed

+41
-136
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6089,20 +6089,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
60896089
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
60906090
DAG.getVectorIdxConstant(0, dl));
60916091
}
6092+
case Intrinsic::aarch64_neon_saddlv:
60926093
case Intrinsic::aarch64_neon_uaddlv: {
60936094
EVT OpVT = Op.getOperand(1).getValueType();
60946095
EVT ResVT = Op.getValueType();
6095-
if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6096-
OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
6097-
// In order to avoid insert_subvector, used v4i32 than v2i32.
6098-
SDValue UADDLV =
6099-
DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
6100-
SDValue EXTRACT_VEC_ELT =
6101-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
6102-
DAG.getConstant(0, dl, MVT::i64));
6103-
return EXTRACT_VEC_ELT;
6104-
}
6105-
return SDValue();
6096+
assert(
6097+
((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6098+
OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6099+
(ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6100+
"Unexpected aarch64_neon_u/saddlv type");
6101+
// In order to avoid insert_subvector, used v4i32 than v2i32.
6102+
SDValue ADDLV = DAG.getNode(
6103+
IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6104+
: AArch64ISD::SADDLV,
6105+
dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6106+
SDValue EXTRACT_VEC_ELT = DAG.getNode(
6107+
ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6108+
ADDLV, DAG.getConstant(0, dl, MVT::i64));
6109+
return EXTRACT_VEC_ELT;
61066110
}
61076111
case Intrinsic::experimental_cttz_elts: {
61086112
SDValue CttzOp = Op.getOperand(1);

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 6 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -7174,17 +7174,6 @@ multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator ad
71747174
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
71757175
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
71767176

7177-
// Patterns for uaddlv(uaddlp(x)) ==> uaddlv
7178-
def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
7179-
(i64 (EXTRACT_SUBREG
7180-
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub)),
7181-
dsub))>;
7182-
7183-
def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
7184-
(i32 (EXTRACT_SUBREG
7185-
(v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
7186-
ssub))>;
7187-
71887177
def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
71897178
(v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
71907179

@@ -7405,82 +7394,12 @@ defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>;
74057394
def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))),
74067395
(i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>;
74077396

7408-
multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
7409-
def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
7410-
(i32 (SMOVvi16to32
7411-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7412-
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
7413-
(i64 0)))>;
7414-
def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
7415-
(i32 (SMOVvi16to32
7416-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7417-
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
7418-
(i64 0)))>;
7419-
7420-
def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
7421-
(i32 (EXTRACT_SUBREG
7422-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7423-
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
7424-
ssub))>;
7425-
def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
7426-
(i32 (EXTRACT_SUBREG
7427-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7428-
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
7429-
ssub))>;
7430-
7431-
def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
7432-
(i64 (EXTRACT_SUBREG
7433-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7434-
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
7435-
dsub))>;
7436-
}
7437-
7438-
multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
7439-
Intrinsic intOp> {
7440-
def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
7441-
(i32 (EXTRACT_SUBREG
7442-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7443-
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
7444-
ssub))>;
7445-
def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
7446-
(i32 (EXTRACT_SUBREG
7447-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7448-
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
7449-
ssub))>;
7450-
7451-
def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
7452-
(i32 (EXTRACT_SUBREG
7453-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7454-
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
7455-
ssub))>;
7456-
def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
7457-
(i32 (EXTRACT_SUBREG
7458-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7459-
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
7460-
ssub))>;
7461-
7462-
def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
7463-
(i64 (EXTRACT_SUBREG
7464-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7465-
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
7466-
dsub))>;
7467-
}
7468-
7469-
defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
7470-
defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
7471-
7472-
// The vaddlv_s32 intrinsic gets mapped to SADDLP.
7473-
def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
7474-
(i64 (EXTRACT_SUBREG
7475-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7476-
(SADDLPv2i32_v1i64 V64:$Rn), dsub),
7477-
dsub))>;
7478-
// The vaddlv_u32 intrinsic gets mapped to UADDLP.
7479-
def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
7480-
(i64 (EXTRACT_SUBREG
7481-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7482-
(UADDLPv2i32_v1i64 V64:$Rn), dsub),
7483-
dsub))>;
7397+
// The SADDLV v2i32 gets mapped to SADDLP.
7398+
def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))),
7399+
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (SADDLPv2i32_v1i64 V64:$Rn), dsub))>;
7400+
// The UADDLV v2i32 gets mapped to UADDLP.
7401+
def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))),
7402+
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>;
74847403

74857404
//------------------------------------------------------------------------------
74867405
// AdvSIMD modified immediate instructions

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -146,11 +146,11 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
146146
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
147147
; CHECK: ; %bb.0: ; %entry
148148
; CHECK-NEXT: movi.2d v0, #0000000000000000
149-
; CHECK-NEXT: movi.2d v2, #0000000000000000
150149
; CHECK-NEXT: uaddlv.4s d1, v0
151-
; CHECK-NEXT: str d2, [x0, #16]
152150
; CHECK-NEXT: mov.d v0[0], v1[0]
151+
; CHECK-NEXT: movi.2d v1, #0000000000000000
153152
; CHECK-NEXT: ucvtf.2d v0, v0
153+
; CHECK-NEXT: str d1, [x0, #16]
154154
; CHECK-NEXT: fcvtn v0.2s, v0.2d
155155
; CHECK-NEXT: str q0, [x0]
156156
; CHECK-NEXT: ret
@@ -491,9 +491,8 @@ define void @store_saddlv_v8i8(ptr %H, <8 x i8> %sum_h, i32 %idx) {
491491
; CHECK: ; %bb.0: ; %entry
492492
; CHECK-NEXT: saddlv.8b h0, v0
493493
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
494-
; CHECK-NEXT: sbfiz x9, x1, #3, #32
495-
; CHECK-NEXT: smov.h w8, v0[0]
496-
; CHECK-NEXT: str w8, [x0, x9]
494+
; CHECK-NEXT: sbfiz x8, x1, #3, #32
495+
; CHECK-NEXT: str s0, [x0, x8]
497496
; CHECK-NEXT: ret
498497
entry:
499498
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %sum_h)
@@ -508,9 +507,8 @@ define void @store_saddlv_v16i8(ptr %H, <16 x i8> %sum_h, i32 %idx) {
508507
; CHECK: ; %bb.0: ; %entry
509508
; CHECK-NEXT: saddlv.16b h0, v0
510509
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
511-
; CHECK-NEXT: sbfiz x9, x1, #3, #32
512-
; CHECK-NEXT: smov.h w8, v0[0]
513-
; CHECK-NEXT: str w8, [x0, x9]
510+
; CHECK-NEXT: sbfiz x8, x1, #3, #32
511+
; CHECK-NEXT: str s0, [x0, x8]
514512
; CHECK-NEXT: ret
515513
entry:
516514
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %sum_h)
@@ -526,8 +524,7 @@ define void @store_saddlv_v4i16(ptr %H, <4 x i16> %sum_h, i32 %idx) {
526524
; CHECK-NEXT: saddlv.4h s0, v0
527525
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
528526
; CHECK-NEXT: sbfiz x8, x1, #3, #32
529-
; CHECK-NEXT: fmov w9, s0
530-
; CHECK-NEXT: str w9, [x0, x8]
527+
; CHECK-NEXT: str s0, [x0, x8]
531528
; CHECK-NEXT: ret
532529
entry:
533530
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h)
@@ -543,8 +540,7 @@ define void @store_saddlv_v8i16(ptr %H, <8 x i16> %sum_h, i32 %idx) {
543540
; CHECK-NEXT: saddlv.8h s0, v0
544541
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
545542
; CHECK-NEXT: sbfiz x8, x1, #3, #32
546-
; CHECK-NEXT: fmov w9, s0
547-
; CHECK-NEXT: str w9, [x0, x8]
543+
; CHECK-NEXT: str s0, [x0, x8]
548544
; CHECK-NEXT: ret
549545
entry:
550546
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %sum_h)
@@ -558,8 +554,7 @@ define void @store_saddlv_v2i32(ptr %H, <2 x i32> %sum_h, i32 %idx) {
558554
; CHECK-LABEL: store_saddlv_v2i32:
559555
; CHECK: ; %bb.0: ; %entry
560556
; CHECK-NEXT: saddlp.1d v0, v0
561-
; CHECK-NEXT: fmov x8, d0
562-
; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
557+
; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
563558
; CHECK-NEXT: ret
564559
entry:
565560
%vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %sum_h)
@@ -573,8 +568,7 @@ define void @store_saddlv_v4i32(ptr %H, <4 x i32> %sum_h, i32 %idx) {
573568
; CHECK-LABEL: store_saddlv_v4i32:
574569
; CHECK: ; %bb.0: ; %entry
575570
; CHECK-NEXT: saddlv.4s d0, v0
576-
; CHECK-NEXT: fmov x8, d0
577-
; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
571+
; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
578572
; CHECK-NEXT: ret
579573
entry:
580574
%vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %sum_h)

llvm/test/CodeGen/AArch64/arm64-neon-across.ll

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,11 @@ declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
4343
declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
4444

4545
define i16 @test_vaddlv_s8(<8 x i8> %a) {
46-
; CHECK-SD-LABEL: test_vaddlv_s8:
47-
; CHECK-SD: // %bb.0: // %entry
48-
; CHECK-SD-NEXT: saddlv h0, v0.8b
49-
; CHECK-SD-NEXT: smov w0, v0.h[0]
50-
; CHECK-SD-NEXT: ret
51-
;
52-
; CHECK-GI-LABEL: test_vaddlv_s8:
53-
; CHECK-GI: // %bb.0: // %entry
54-
; CHECK-GI-NEXT: saddlv h0, v0.8b
55-
; CHECK-GI-NEXT: fmov w0, s0
56-
; CHECK-GI-NEXT: ret
46+
; CHECK-LABEL: test_vaddlv_s8:
47+
; CHECK: // %bb.0: // %entry
48+
; CHECK-NEXT: saddlv h0, v0.8b
49+
; CHECK-NEXT: fmov w0, s0
50+
; CHECK-NEXT: ret
5751
entry:
5852
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
5953
%0 = trunc i32 %saddlvv.i to i16
@@ -95,17 +89,11 @@ entry:
9589
}
9690

9791
define i16 @test_vaddlvq_s8(<16 x i8> %a) {
98-
; CHECK-SD-LABEL: test_vaddlvq_s8:
99-
; CHECK-SD: // %bb.0: // %entry
100-
; CHECK-SD-NEXT: saddlv h0, v0.16b
101-
; CHECK-SD-NEXT: smov w0, v0.h[0]
102-
; CHECK-SD-NEXT: ret
103-
;
104-
; CHECK-GI-LABEL: test_vaddlvq_s8:
105-
; CHECK-GI: // %bb.0: // %entry
106-
; CHECK-GI-NEXT: saddlv h0, v0.16b
107-
; CHECK-GI-NEXT: fmov w0, s0
108-
; CHECK-GI-NEXT: ret
92+
; CHECK-LABEL: test_vaddlvq_s8:
93+
; CHECK: // %bb.0: // %entry
94+
; CHECK-NEXT: saddlv h0, v0.16b
95+
; CHECK-NEXT: fmov w0, s0
96+
; CHECK-NEXT: ret
10997
entry:
11098
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
11199
%0 = trunc i32 %saddlvv.i to i16

0 commit comments

Comments
 (0)