Skip to content

Commit 8ab6140

Browse files
authored
[AArch64] Lower aarch64_neon_saddlv via SADDLV nodes. (#103307)
This mirrors what GISel already does, extending the existing lowering of aarch64_neon_saddlv/aarch64_neon_uaddlv to SADDLV/UADDLV. This allows us to remove some tablegen patterns, and gives slightly nicer codegen in places, because the nodes correctly represent the result as being in a vector register.
1 parent b21756f commit 8ab6140

File tree

4 files changed

+41
-136
lines changed

4 files changed

+41
-136
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6097,20 +6097,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
60976097
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
60986098
DAG.getVectorIdxConstant(0, dl));
60996099
}
6100+
case Intrinsic::aarch64_neon_saddlv:
61006101
case Intrinsic::aarch64_neon_uaddlv: {
61016102
EVT OpVT = Op.getOperand(1).getValueType();
61026103
EVT ResVT = Op.getValueType();
6103-
if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6104-
OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
6105-
// In order to avoid insert_subvector, used v4i32 than v2i32.
6106-
SDValue UADDLV =
6107-
DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
6108-
SDValue EXTRACT_VEC_ELT =
6109-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
6110-
DAG.getConstant(0, dl, MVT::i64));
6111-
return EXTRACT_VEC_ELT;
6112-
}
6113-
return SDValue();
6104+
assert(
6105+
((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6106+
OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6107+
(ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6108+
"Unexpected aarch64_neon_u/saddlv type");
6109+
// In order to avoid insert_subvector, use v4i32 rather than v2i32.
6110+
SDValue ADDLV = DAG.getNode(
6111+
IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6112+
: AArch64ISD::SADDLV,
6113+
dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6114+
SDValue EXTRACT_VEC_ELT = DAG.getNode(
6115+
ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6116+
ADDLV, DAG.getConstant(0, dl, MVT::i64));
6117+
return EXTRACT_VEC_ELT;
61146118
}
61156119
case Intrinsic::experimental_cttz_elts: {
61166120
SDValue CttzOp = Op.getOperand(1);

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 6 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -7196,17 +7196,6 @@ multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator ad
71967196
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
71977197
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
71987198

7199-
// Patterns for uaddlv(uaddlp(x)) ==> uaddlv
7200-
def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
7201-
(i64 (EXTRACT_SUBREG
7202-
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub)),
7203-
dsub))>;
7204-
7205-
def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
7206-
(i32 (EXTRACT_SUBREG
7207-
(v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
7208-
ssub))>;
7209-
72107199
def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
72117200
(v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
72127201

@@ -7427,82 +7416,12 @@ defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>;
74277416
def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))),
74287417
(i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>;
74297418

7430-
multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
7431-
def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
7432-
(i32 (SMOVvi16to32
7433-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7434-
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
7435-
(i64 0)))>;
7436-
def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
7437-
(i32 (SMOVvi16to32
7438-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7439-
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
7440-
(i64 0)))>;
7441-
7442-
def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
7443-
(i32 (EXTRACT_SUBREG
7444-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7445-
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
7446-
ssub))>;
7447-
def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
7448-
(i32 (EXTRACT_SUBREG
7449-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7450-
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
7451-
ssub))>;
7452-
7453-
def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
7454-
(i64 (EXTRACT_SUBREG
7455-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7456-
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
7457-
dsub))>;
7458-
}
7459-
7460-
multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
7461-
Intrinsic intOp> {
7462-
def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
7463-
(i32 (EXTRACT_SUBREG
7464-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7465-
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
7466-
ssub))>;
7467-
def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
7468-
(i32 (EXTRACT_SUBREG
7469-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7470-
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
7471-
ssub))>;
7472-
7473-
def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
7474-
(i32 (EXTRACT_SUBREG
7475-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7476-
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
7477-
ssub))>;
7478-
def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
7479-
(i32 (EXTRACT_SUBREG
7480-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7481-
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
7482-
ssub))>;
7483-
7484-
def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
7485-
(i64 (EXTRACT_SUBREG
7486-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7487-
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
7488-
dsub))>;
7489-
}
7490-
7491-
defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
7492-
defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
7493-
7494-
// The vaddlv_s32 intrinsic gets mapped to SADDLP.
7495-
def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
7496-
(i64 (EXTRACT_SUBREG
7497-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7498-
(SADDLPv2i32_v1i64 V64:$Rn), dsub),
7499-
dsub))>;
7500-
// The vaddlv_u32 intrinsic gets mapped to UADDLP.
7501-
def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
7502-
(i64 (EXTRACT_SUBREG
7503-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
7504-
(UADDLPv2i32_v1i64 V64:$Rn), dsub),
7505-
dsub))>;
7419+
// The SADDLV v2i32 gets mapped to SADDLP.
7420+
def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))),
7421+
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (SADDLPv2i32_v1i64 V64:$Rn), dsub))>;
7422+
// The UADDLV v2i32 gets mapped to UADDLP.
7423+
def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))),
7424+
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>;
75067425

75077426
//------------------------------------------------------------------------------
75087427
// AdvSIMD modified immediate instructions

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -146,11 +146,11 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
146146
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
147147
; CHECK: ; %bb.0: ; %entry
148148
; CHECK-NEXT: movi.2d v0, #0000000000000000
149-
; CHECK-NEXT: movi.2d v2, #0000000000000000
150149
; CHECK-NEXT: uaddlv.4s d1, v0
151-
; CHECK-NEXT: str d2, [x0, #16]
152150
; CHECK-NEXT: mov.d v0[0], v1[0]
151+
; CHECK-NEXT: movi.2d v1, #0000000000000000
153152
; CHECK-NEXT: ucvtf.2d v0, v0
153+
; CHECK-NEXT: str d1, [x0, #16]
154154
; CHECK-NEXT: fcvtn v0.2s, v0.2d
155155
; CHECK-NEXT: str q0, [x0]
156156
; CHECK-NEXT: ret
@@ -491,9 +491,8 @@ define void @store_saddlv_v8i8(ptr %H, <8 x i8> %sum_h, i32 %idx) {
491491
; CHECK: ; %bb.0: ; %entry
492492
; CHECK-NEXT: saddlv.8b h0, v0
493493
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
494-
; CHECK-NEXT: sbfiz x9, x1, #3, #32
495-
; CHECK-NEXT: smov.h w8, v0[0]
496-
; CHECK-NEXT: str w8, [x0, x9]
494+
; CHECK-NEXT: sbfiz x8, x1, #3, #32
495+
; CHECK-NEXT: str s0, [x0, x8]
497496
; CHECK-NEXT: ret
498497
entry:
499498
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %sum_h)
@@ -508,9 +507,8 @@ define void @store_saddlv_v16i8(ptr %H, <16 x i8> %sum_h, i32 %idx) {
508507
; CHECK: ; %bb.0: ; %entry
509508
; CHECK-NEXT: saddlv.16b h0, v0
510509
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
511-
; CHECK-NEXT: sbfiz x9, x1, #3, #32
512-
; CHECK-NEXT: smov.h w8, v0[0]
513-
; CHECK-NEXT: str w8, [x0, x9]
510+
; CHECK-NEXT: sbfiz x8, x1, #3, #32
511+
; CHECK-NEXT: str s0, [x0, x8]
514512
; CHECK-NEXT: ret
515513
entry:
516514
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %sum_h)
@@ -526,8 +524,7 @@ define void @store_saddlv_v4i16(ptr %H, <4 x i16> %sum_h, i32 %idx) {
526524
; CHECK-NEXT: saddlv.4h s0, v0
527525
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
528526
; CHECK-NEXT: sbfiz x8, x1, #3, #32
529-
; CHECK-NEXT: fmov w9, s0
530-
; CHECK-NEXT: str w9, [x0, x8]
527+
; CHECK-NEXT: str s0, [x0, x8]
531528
; CHECK-NEXT: ret
532529
entry:
533530
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h)
@@ -543,8 +540,7 @@ define void @store_saddlv_v8i16(ptr %H, <8 x i16> %sum_h, i32 %idx) {
543540
; CHECK-NEXT: saddlv.8h s0, v0
544541
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
545542
; CHECK-NEXT: sbfiz x8, x1, #3, #32
546-
; CHECK-NEXT: fmov w9, s0
547-
; CHECK-NEXT: str w9, [x0, x8]
543+
; CHECK-NEXT: str s0, [x0, x8]
548544
; CHECK-NEXT: ret
549545
entry:
550546
%vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %sum_h)
@@ -558,8 +554,7 @@ define void @store_saddlv_v2i32(ptr %H, <2 x i32> %sum_h, i32 %idx) {
558554
; CHECK-LABEL: store_saddlv_v2i32:
559555
; CHECK: ; %bb.0: ; %entry
560556
; CHECK-NEXT: saddlp.1d v0, v0
561-
; CHECK-NEXT: fmov x8, d0
562-
; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
557+
; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
563558
; CHECK-NEXT: ret
564559
entry:
565560
%vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %sum_h)
@@ -573,8 +568,7 @@ define void @store_saddlv_v4i32(ptr %H, <4 x i32> %sum_h, i32 %idx) {
573568
; CHECK-LABEL: store_saddlv_v4i32:
574569
; CHECK: ; %bb.0: ; %entry
575570
; CHECK-NEXT: saddlv.4s d0, v0
576-
; CHECK-NEXT: fmov x8, d0
577-
; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
571+
; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
578572
; CHECK-NEXT: ret
579573
entry:
580574
%vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %sum_h)

llvm/test/CodeGen/AArch64/arm64-neon-across.ll

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,11 @@ declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
4343
declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
4444

4545
define i16 @test_vaddlv_s8(<8 x i8> %a) {
46-
; CHECK-SD-LABEL: test_vaddlv_s8:
47-
; CHECK-SD: // %bb.0: // %entry
48-
; CHECK-SD-NEXT: saddlv h0, v0.8b
49-
; CHECK-SD-NEXT: smov w0, v0.h[0]
50-
; CHECK-SD-NEXT: ret
51-
;
52-
; CHECK-GI-LABEL: test_vaddlv_s8:
53-
; CHECK-GI: // %bb.0: // %entry
54-
; CHECK-GI-NEXT: saddlv h0, v0.8b
55-
; CHECK-GI-NEXT: fmov w0, s0
56-
; CHECK-GI-NEXT: ret
46+
; CHECK-LABEL: test_vaddlv_s8:
47+
; CHECK: // %bb.0: // %entry
48+
; CHECK-NEXT: saddlv h0, v0.8b
49+
; CHECK-NEXT: fmov w0, s0
50+
; CHECK-NEXT: ret
5751
entry:
5852
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
5953
%0 = trunc i32 %saddlvv.i to i16
@@ -95,17 +89,11 @@ entry:
9589
}
9690

9791
define i16 @test_vaddlvq_s8(<16 x i8> %a) {
98-
; CHECK-SD-LABEL: test_vaddlvq_s8:
99-
; CHECK-SD: // %bb.0: // %entry
100-
; CHECK-SD-NEXT: saddlv h0, v0.16b
101-
; CHECK-SD-NEXT: smov w0, v0.h[0]
102-
; CHECK-SD-NEXT: ret
103-
;
104-
; CHECK-GI-LABEL: test_vaddlvq_s8:
105-
; CHECK-GI: // %bb.0: // %entry
106-
; CHECK-GI-NEXT: saddlv h0, v0.16b
107-
; CHECK-GI-NEXT: fmov w0, s0
108-
; CHECK-GI-NEXT: ret
92+
; CHECK-LABEL: test_vaddlvq_s8:
93+
; CHECK: // %bb.0: // %entry
94+
; CHECK-NEXT: saddlv h0, v0.16b
95+
; CHECK-NEXT: fmov w0, s0
96+
; CHECK-NEXT: ret
10997
entry:
11098
%saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
11199
%0 = trunc i32 %saddlvv.i to i16

0 commit comments

Comments
 (0)