
[AArch64] Lower aarch64_neon_saddlv via SADDLV nodes. #103307

Merged
merged 1 commit into from Aug 22, 2024
26 changes: 15 additions & 11 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6097,20 +6097,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
                        DAG.getVectorIdxConstant(0, dl));
   }
+  case Intrinsic::aarch64_neon_saddlv:
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
     EVT ResVT = Op.getValueType();
-    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
-                              OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
-      // In order to avoid insert_subvector, used v4i32 than v2i32.
-      SDValue UADDLV =
-          DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
-      SDValue EXTRACT_VEC_ELT =
-          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
-                      DAG.getConstant(0, dl, MVT::i64));
-      return EXTRACT_VEC_ELT;
-    }
-    return SDValue();
+    assert(
+        ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
+                                OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
+         (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
+        "Unexpected aarch64_neon_u/saddlv type");
+    // In order to avoid insert_subvector, use v4i32 rather than v2i32.
+    SDValue ADDLV = DAG.getNode(
+        IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
+                                                : AArch64ISD::SADDLV,
+        dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
+    SDValue EXTRACT_VEC_ELT = DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
+        ADDLV, DAG.getConstant(0, dl, MVT::i64));
+    return EXTRACT_VEC_ELT;
   }
   case Intrinsic::experimental_cttz_elts: {
     SDValue CttzOp = Op.getOperand(1);
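As a sketch of what the new lowering path does (the function below is illustrative, not part of the patch; the expected assembly is taken from the updated checks in arm64-neon-across.ll further down), the signed intrinsic now goes through a generic AArch64ISD::SADDLV node whose scalar result lives in lane 0 of a vector register, so SelectionDAG reads it with fmov instead of smov:

```llvm
; Illustrative reproducer. With this change the SelectionDAG output
; matches what GlobalISel already produced:
;   saddlv h0, v0.8b
;   fmov   w0, s0
define i32 @saddlv_v8i8(<8 x i8> %a) {
entry:
  %r = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
  ret i32 %r
}

declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
```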
93 changes: 6 additions & 87 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7196,17 +7196,6 @@ multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator ad
 defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
 defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
 
-// Patterns for uaddlv(uaddlp(x)) ==> uaddlv
-def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
-          (i64 (EXTRACT_SUBREG
-            (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub)),
-            dsub))>;
-
-def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
-          (i32 (EXTRACT_SUBREG
-            (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
-            ssub))>;
-
 def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
           (v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
 
@@ -7427,82 +7416,12 @@ defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>;
 def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))),
           (i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>;
 
-multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-            (i32 (SMOVvi16to32
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
-              (i64 0)))>;
-  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-            (i32 (SMOVvi16to32
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
-              (i64 0)))>;
-
-  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-            (i32 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
-              ssub))>;
-  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-            (i32 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
-              ssub))>;
-
-  def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
-            (i64 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
-              dsub))>;
-}
-
-multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
-                                                Intrinsic intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-            (i32 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
-              ssub))>;
-  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-            (i32 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
-              ssub))>;
-
-  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-            (i32 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
-              ssub))>;
-  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-            (i32 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
-              ssub))>;
-
-  def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
-            (i64 (EXTRACT_SUBREG
-              (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
-              dsub))>;
-}
-
-defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
-defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
-
-// The vaddlv_s32 intrinsic gets mapped to SADDLP.
-def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
-          (i64 (EXTRACT_SUBREG
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (SADDLPv2i32_v1i64 V64:$Rn), dsub),
-            dsub))>;
-// The vaddlv_u32 intrinsic gets mapped to UADDLP.
-def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
-          (i64 (EXTRACT_SUBREG
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (UADDLPv2i32_v1i64 V64:$Rn), dsub),
-            dsub))>;
+// The SADDLV v2i32 gets mapped to SADDLP.
+def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))),
+          (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (SADDLPv2i32_v1i64 V64:$Rn), dsub))>;
+// The UADDLV v2i32 gets mapped to UADDLP.
+def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))),
+          (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>;
 
 //------------------------------------------------------------------------------
 // AdvSIMD modified immediate instructions
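The replacement patterns above match on the AArch64saddlv/AArch64uaddlv nodes produced by the lowering change rather than on the raw intrinsics. A minimal sketch of the v2i32 case they cover (illustrative function name; the saddlp expectation comes from the store_saddlv_v2i32 test below):

```llvm
; There is no across-lanes SADDLV for a two-element vector, so the
; i64 result is produced with a pairwise long add instead:
;   saddlp v0.1d, v0.2s
define i64 @saddlv_v2i32(<2 x i32> %a) {
entry:
  %r = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a)
  ret i64 %r
}

declare i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32>)
```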
26 changes: 10 additions & 16 deletions llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -146,11 +146,11 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
 ; CHECK-NEXT: uaddlv.4s d1, v0
-; CHECK-NEXT: str d2, [x0, #16]
 ; CHECK-NEXT: mov.d v0[0], v1[0]
+; CHECK-NEXT: movi.2d v1, #0000000000000000
 ; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: str d1, [x0, #16]
 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -491,9 +491,8 @@ define void @store_saddlv_v8i8(ptr %H, <8 x i8> %sum_h, i32 %idx) {
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: saddlv.8b h0, v0
 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sbfiz x9, x1, #3, #32
-; CHECK-NEXT: smov.h w8, v0[0]
-; CHECK-NEXT: str w8, [x0, x9]
+; CHECK-NEXT: sbfiz x8, x1, #3, #32
+; CHECK-NEXT: str s0, [x0, x8]
 ; CHECK-NEXT: ret
 entry:
   %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %sum_h)
@@ -508,9 +507,8 @@ define void @store_saddlv_v16i8(ptr %H, <16 x i8> %sum_h, i32 %idx) {
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: saddlv.16b h0, v0
 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sbfiz x9, x1, #3, #32
-; CHECK-NEXT: smov.h w8, v0[0]
-; CHECK-NEXT: str w8, [x0, x9]
+; CHECK-NEXT: sbfiz x8, x1, #3, #32
+; CHECK-NEXT: str s0, [x0, x8]
 ; CHECK-NEXT: ret
 entry:
   %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %sum_h)
@@ -526,8 +524,7 @@ define void @store_saddlv_v4i16(ptr %H, <4 x i16> %sum_h, i32 %idx) {
 ; CHECK-NEXT: saddlv.4h s0, v0
 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT: sbfiz x8, x1, #3, #32
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: str w9, [x0, x8]
+; CHECK-NEXT: str s0, [x0, x8]
 ; CHECK-NEXT: ret
 entry:
   %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h)
@@ -543,8 +540,7 @@ define void @store_saddlv_v8i16(ptr %H, <8 x i16> %sum_h, i32 %idx) {
 ; CHECK-NEXT: saddlv.8h s0, v0
 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT: sbfiz x8, x1, #3, #32
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: str w9, [x0, x8]
+; CHECK-NEXT: str s0, [x0, x8]
 ; CHECK-NEXT: ret
 entry:
   %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %sum_h)
@@ -558,8 +554,7 @@ define void @store_saddlv_v2i32(ptr %H, <2 x i32> %sum_h, i32 %idx) {
 ; CHECK-LABEL: store_saddlv_v2i32:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: saddlp.1d v0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
+; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
 ; CHECK-NEXT: ret
 entry:
   %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %sum_h)
@@ -573,8 +568,7 @@ define void @store_saddlv_v4i32(ptr %H, <4 x i32> %sum_h, i32 %idx) {
 ; CHECK-LABEL: store_saddlv_v4i32:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: saddlv.4s d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: str x8, [x0, w1, sxtw #3]
+; CHECK-NEXT: str d0, [x0, w1, sxtw #3]
 ; CHECK-NEXT: ret
 entry:
   %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %sum_h)
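The store_saddlv_* diffs above all follow the same shape: the reduction result now stays in a SIMD register and is stored directly (str s0 / str d0) instead of being moved to a GPR with smov/fmov first. A stripped-down sketch of that pattern (hypothetical function, simplified to a plain store without the indexed addressing the tests use):

```llvm
; The i32 saddlv result is read from lane 0 of the SADDLV node and
; fed straight into the store, so no GPR round-trip is needed.
define void @store_saddlv_sketch(ptr %H, <4 x i16> %sum_h) {
entry:
  %r = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h)
  store i32 %r, ptr %H, align 4
  ret void
}

declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
```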
32 changes: 10 additions & 22 deletions llvm/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -43,17 +43,11 @@ declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
 declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
 
 define i16 @test_vaddlv_s8(<8 x i8> %a) {
-; CHECK-SD-LABEL: test_vaddlv_s8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: saddlv h0, v0.8b
-; CHECK-SD-NEXT: smov w0, v0.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vaddlv_s8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: saddlv h0, v0.8b
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vaddlv_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: saddlv h0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
 entry:
   %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
   %0 = trunc i32 %saddlvv.i to i16
@@ -95,17 +89,11 @@ entry:
 }
 
 define i16 @test_vaddlvq_s8(<16 x i8> %a) {
-; CHECK-SD-LABEL: test_vaddlvq_s8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: saddlv h0, v0.16b
-; CHECK-SD-NEXT: smov w0, v0.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vaddlvq_s8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: saddlv h0, v0.16b
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vaddlvq_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: saddlv h0, v0.16b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
 entry:
   %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
   %0 = trunc i32 %saddlvv.i to i16