Skip to content

Commit 59c3dca

Browse files
authored
[AArch64] Remove copy instruction between uaddlv with v4i16/v8i16 and dup (#66508)
If copy instructions exist between uaddlv with v4i16/v8i16 and dup to transfer the value from GPR to FPR, try to remove them by using duplane. This is a follow-up patch to https://reviews.llvm.org/D159267
1 parent ec7baca commit 59c3dca

6 files changed

+98
-21
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5335,7 +5335,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
53355335
case Intrinsic::aarch64_neon_uaddlv: {
53365336
EVT OpVT = Op.getOperand(1).getValueType();
53375337
EVT ResVT = Op.getValueType();
5338-
if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
5338+
if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5339+
OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
53395340
// In order to avoid insert_subvector, use v4i32 rather than v2i32.
53405341
SDValue UADDLV =
53415342
DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
@@ -22273,21 +22274,40 @@ static SDValue performSelectCombine(SDNode *N,
2227322274
static SDValue performDUPCombine(SDNode *N,
2227422275
TargetLowering::DAGCombinerInfo &DCI) {
2227522276
EVT VT = N->getValueType(0);
22277+
SDLoc DL(N);
2227622278
// If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
2227722279
// 128bit vector version.
2227822280
if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
2227922281
EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
2228022282
SmallVector<SDValue> Ops(N->ops());
2228122283
if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
2228222284
DCI.DAG.getVTList(LVT), Ops)) {
22283-
SDLoc DL(N);
2228422285
return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
2228522286
DCI.DAG.getConstant(0, DL, MVT::i64));
2228622287
}
2228722288
}
2228822289

22289-
if (N->getOpcode() == AArch64ISD::DUP)
22290+
if (N->getOpcode() == AArch64ISD::DUP) {
22291+
if (DCI.isAfterLegalizeDAG()) {
22292+
// If scalar dup's operand is extract_vector_elt, try to combine them into
22293+
// duplane. For example,
22294+
//
22295+
// t21: i32 = extract_vector_elt t19, Constant:i64<0>
22296+
// t18: v4i32 = AArch64ISD::DUP t21
22297+
// ==>
22298+
// t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
22299+
SDValue EXTRACT_VEC_ELT = N->getOperand(0);
22300+
if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22301+
if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
22302+
unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
22303+
return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
22304+
EXTRACT_VEC_ELT.getOperand(1));
22305+
}
22306+
}
22307+
}
22308+
2229022309
return performPostLD1Combine(N, DCI, false);
22310+
}
2229122311

2229222312
return SDValue();
2229322313
}

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6472,12 +6472,24 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
64726472
(v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
64736473
ssub))>;
64746474

6475+
def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
6476+
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
6477+
6478+
def : Pat<(v4i32 (AArch64uaddlv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
6479+
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub))>;
6480+
64756481
def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
64766482
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
64776483

6484+
def : Pat<(v4i32 (AArch64uaddlv (v4i16 V64:$Rn))),
6485+
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$Rn), ssub))>;
6486+
64786487
def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
64796488
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
64806489

6490+
def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
6491+
(v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
6492+
64816493
// Patterns for across-vector intrinsics, that have a node equivalent, that
64826494
// returns a vector (with only the low lane defined) instead of a scalar.
64836495
// In effect, opNode is the same as (scalar_to_vector (IntNode)).

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
1414
; CHECK-NEXT: movi.2d v1, #0000000000000000
1515
; CHECK-NEXT: uaddlv.8h s0, v0
1616
; CHECK-NEXT: mov.s v1[0], v0[0]
17-
; CHECK-NEXT: ucvtf.2s v1, v1
18-
; CHECK-NEXT: str d1, [x0]
17+
; CHECK-NEXT: ucvtf.2s v0, v1
18+
; CHECK-NEXT: str d0, [x0]
1919
; CHECK-NEXT: ret
2020

2121
entry:
@@ -52,8 +52,8 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
5252
; CHECK-NEXT: uaddlv.8h s1, v0
5353
; CHECK-NEXT: stp q0, q0, [x0, #32]
5454
; CHECK-NEXT: mov.s v2[0], v1[0]
55-
; CHECK-NEXT: ucvtf.4s v2, v2
56-
; CHECK-NEXT: stp q2, q0, [x0]
55+
; CHECK-NEXT: ucvtf.4s v1, v2
56+
; CHECK-NEXT: stp q1, q0, [x0]
5757
; CHECK-NEXT: ret
5858

5959
entry:
@@ -76,8 +76,8 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
7676
; CHECK-NEXT: st1.s { v0 }[2], [x8]
7777
; CHECK-NEXT: str d0, [x0, #80]
7878
; CHECK-NEXT: mov.s v2[0], v1[0]
79-
; CHECK-NEXT: ucvtf.4s v2, v2
80-
; CHECK-NEXT: str q2, [x0]
79+
; CHECK-NEXT: ucvtf.4s v1, v2
80+
; CHECK-NEXT: str q1, [x0]
8181
; CHECK-NEXT: ret
8282

8383
entry:
@@ -256,9 +256,9 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
256256
; CHECK-NEXT: uaddlv.4h s1, v0
257257
; CHECK-NEXT: stp q0, q0, [x0, #32]
258258
; CHECK-NEXT: mov.s v2[0], v1[0]
259-
; CHECK-NEXT: ucvtf.2d v2, v2
260-
; CHECK-NEXT: fcvtn v2.2s, v2.2d
261-
; CHECK-NEXT: stp q2, q0, [x0]
259+
; CHECK-NEXT: ucvtf.2d v1, v2
260+
; CHECK-NEXT: fcvtn v1.2s, v1.2d
261+
; CHECK-NEXT: stp q1, q0, [x0]
262262
; CHECK-NEXT: ret
263263

264264
entry:

llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,15 @@ define i32 @widget(i64 %arg, <8 x i16> %arg1) {
99
; CHECK: // %bb.0: // %bb
1010
; CHECK-NEXT: sub sp, sp, #16
1111
; CHECK-NEXT: .cfi_def_cfa_offset 16
12-
; CHECK-NEXT: umov w9, v0.h[0]
13-
; CHECK-NEXT: movi v0.2d, #0000000000000000
14-
; CHECK-NEXT: mov x10, sp
15-
; CHECK-NEXT: bfi x10, x0, #1, #3
12+
; CHECK-NEXT: movi v1.2d, #0000000000000000
13+
; CHECK-NEXT: mov x9, sp
14+
; CHECK-NEXT: dup v0.8h, v0.h[0]
15+
; CHECK-NEXT: bfi x9, x0, #1, #3
1616
; CHECK-NEXT: mov x8, x0
1717
; CHECK-NEXT: mov w0, wzr
18-
; CHECK-NEXT: dup v1.8h, w9
19-
; CHECK-NEXT: str q0, [sp]
20-
; CHECK-NEXT: ld1 { v1.h }[1], [x10]
21-
; CHECK-NEXT: str q1, [x8]
18+
; CHECK-NEXT: str q1, [sp]
19+
; CHECK-NEXT: ld1 { v0.h }[1], [x9]
20+
; CHECK-NEXT: str q0, [x8]
2221
; CHECK-NEXT: add sp, sp, #16
2322
; CHECK-NEXT: ret
2423
bb:

llvm/test/CodeGen/AArch64/neon-addlv.ll

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,6 @@ entry:
195195
}
196196

197197
declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
198-
199198
declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
200199

201200
define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) {
@@ -215,3 +214,36 @@ entry:
215214
%vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
216215
ret <8 x i8> %vecinit7.i
217216
}
217+
218+
define <4 x i32> @uaddlv_dup_v4i16(<4 x i16> %a) {
219+
; CHECK-LABEL: uaddlv_dup_v4i16:
220+
; CHECK: // %bb.0: // %entry
221+
; CHECK-NEXT: uaddlv s0, v0.4h
222+
; CHECK-NEXT: dup v0.4s, v0.s[0]
223+
; CHECK-NEXT: ushr v0.4s, v0.4s, #3
224+
; CHECK-NEXT: ret
225+
entry:
226+
%vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a)
227+
%vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0
228+
%vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer
229+
%vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3>
230+
ret <4 x i32> %vshr_n
231+
}
232+
233+
define <4 x i32> @uaddlv_dup_v8i16(<8 x i16> %a) {
234+
; CHECK-LABEL: uaddlv_dup_v8i16:
235+
; CHECK: // %bb.0: // %entry
236+
; CHECK-NEXT: uaddlv s0, v0.8h
237+
; CHECK-NEXT: dup v0.4s, v0.s[0]
238+
; CHECK-NEXT: ushr v0.4s, v0.4s, #3
239+
; CHECK-NEXT: ret
240+
entry:
241+
%vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
242+
%vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0
243+
%vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer
244+
%vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3>
245+
ret <4 x i32> %vshr_n
246+
}
247+
248+
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
249+
declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)

llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,21 @@ define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) {
2626
ret i16 %4
2727
}
2828

29+
define i16 @uaddlv_uaddlp_v8i8(<8 x i8> %0) {
30+
; CHECK-LABEL: uaddlv_uaddlp_v8i8:
31+
; CHECK: // %bb.0:
32+
; CHECK-NEXT: uaddlv h0, v0.8b
33+
; CHECK-NEXT: fmov w0, s0
34+
; CHECK-NEXT: ret
35+
%2 = tail call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %0)
36+
%3 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %2)
37+
%4 = trunc i32 %3 to i16
38+
ret i16 %4
39+
}
40+
2941
declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)
3042
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
43+
declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
3144
declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>)
3245
declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>)
46+
declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>)

0 commit comments

Comments
 (0)