Skip to content

Commit 57b991a

Browse files
authored
[AArch64] Improve lowering of truncating uzp1 (llvm#82457)
There were two existing patterns: `concat_vectors(trunc(x), trunc(y)) -> uzp1(x, y)` `concat_vectors(assertzext(trunc(x)), assertzext(trunc(y))) -> uzp1(x, y)` Move them into a class and add the following `assertsext` pattern to it: `concat_vectors(assertsext(trunc(x)), assertsext(trunc(y))) -> uzp1(x, y)` Add the following transform for v8i8 and v4i16 result types to help with pattern matching: `truncating uzp1(x, y) -> trunc(concat(x, y))` And a pattern to go with it: `trunc(concat_vectors(x, y)) -> uzp1 (x, y)` Add another isel pattern for v8i8 and v4i16 result vector types, similar to the existing concat pattern, but with a trunc node at the beginning: `trunc(concat_vectors(assertext_trunc(x), assertext_trunc(y))) -> xtn(uzp1(x, y))`
1 parent f15a790 commit 57b991a

17 files changed

+209
-284
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21423,12 +21423,8 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
2142321423
}
2142421424
}
2142521425

21426-
// uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21427-
// Only implemented on little-endian subtargets.
21428-
bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
21429-
21430-
// This optimization only works on little endian.
21431-
if (!IsLittleEndian)
21426+
// These optimizations only work on little endian.
21427+
if (!DAG.getDataLayout().isLittleEndian())
2143221428
return SDValue();
2143321429

2143421430
// uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
@@ -21447,21 +21443,28 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
2144721443
if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
2144821444
return SDValue();
2144921445

21450-
auto getSourceOp = [](SDValue Operand) -> SDValue {
21451-
const unsigned Opcode = Operand.getOpcode();
21452-
if (Opcode == ISD::TRUNCATE)
21453-
return Operand->getOperand(0);
21454-
if (Opcode == ISD::BITCAST &&
21455-
Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
21456-
return Operand->getOperand(0)->getOperand(0);
21457-
return SDValue();
21458-
};
21446+
SDValue SourceOp0 = peekThroughBitcasts(Op0);
21447+
SDValue SourceOp1 = peekThroughBitcasts(Op1);
2145921448

21460-
SDValue SourceOp0 = getSourceOp(Op0);
21461-
SDValue SourceOp1 = getSourceOp(Op1);
21449+
// truncating uzp1(x, y) -> xtn(concat (x, y))
21450+
if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21451+
EVT Op0Ty = SourceOp0.getValueType();
21452+
if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21453+
(ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21454+
SDValue Concat =
21455+
DAG.getNode(ISD::CONCAT_VECTORS, DL,
21456+
Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21457+
SourceOp0, SourceOp1);
21458+
return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21459+
}
21460+
}
2146221461

21463-
if (!SourceOp0 || !SourceOp1)
21462+
// uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21463+
if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21464+
SourceOp1.getOpcode() != ISD::TRUNCATE)
2146421465
return SDValue();
21466+
SourceOp0 = SourceOp0.getOperand(0);
21467+
SourceOp1 = SourceOp1.getOperand(0);
2146521468

2146621469
if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
2146721470
!SourceOp0.getValueType().isSimple())

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6153,26 +6153,39 @@ defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
61536153
defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
61546154
defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
61556155

6156-
def : Pat<(v16i8 (concat_vectors (v8i8 (trunc (v8i16 V128:$Vn))),
6157-
(v8i8 (trunc (v8i16 V128:$Vm))))),
6158-
(UZP1v16i8 V128:$Vn, V128:$Vm)>;
6159-
def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))),
6160-
(v4i16 (trunc (v4i32 V128:$Vm))))),
6161-
(UZP1v8i16 V128:$Vn, V128:$Vm)>;
6162-
def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
6163-
(v2i32 (trunc (v2i64 V128:$Vm))))),
6164-
(UZP1v4i32 V128:$Vn, V128:$Vm)>;
6165-
// These are the same as above, with an optional assertzext node that can be
6166-
// generated from fptoi lowering.
6167-
def : Pat<(v16i8 (concat_vectors (v8i8 (assertzext (trunc (v8i16 V128:$Vn)))),
6168-
(v8i8 (assertzext (trunc (v8i16 V128:$Vm)))))),
6169-
(UZP1v16i8 V128:$Vn, V128:$Vm)>;
6170-
def : Pat<(v8i16 (concat_vectors (v4i16 (assertzext (trunc (v4i32 V128:$Vn)))),
6171-
(v4i16 (assertzext (trunc (v4i32 V128:$Vm)))))),
6172-
(UZP1v8i16 V128:$Vn, V128:$Vm)>;
6173-
def : Pat<(v4i32 (concat_vectors (v2i32 (assertzext (trunc (v2i64 V128:$Vn)))),
6174-
(v2i32 (assertzext (trunc (v2i64 V128:$Vm)))))),
6175-
(UZP1v4i32 V128:$Vn, V128:$Vm)>;
6156+
def trunc_optional_assert_ext : PatFrags<(ops node:$op0),
6157+
[(trunc node:$op0),
6158+
(assertzext (trunc node:$op0)),
6159+
(assertsext (trunc node:$op0))]>;
6160+
6161+
// concat_vectors(trunc(x), trunc(y)) -> uzp1(x, y)
6162+
// concat_vectors(assertzext(trunc(x)), assertzext(trunc(y))) -> uzp1(x, y)
6163+
// concat_vectors(assertsext(trunc(x)), assertsext(trunc(y))) -> uzp1(x, y)
6164+
class concat_trunc_to_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy>
6165+
: Pat<(ConcatTy (concat_vectors (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
6166+
(TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))),
6167+
(!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm)>;
6168+
def : concat_trunc_to_uzp1_pat<v8i16, v8i8, v16i8>;
6169+
def : concat_trunc_to_uzp1_pat<v4i32, v4i16, v8i16>;
6170+
def : concat_trunc_to_uzp1_pat<v2i64, v2i32, v4i32>;
6171+
6172+
// trunc(concat_vectors(trunc(x), trunc(y))) -> xtn(uzp1(x, y))
6173+
// trunc(concat_vectors(assertzext(trunc(x)), assertzext(trunc(y)))) -> xtn(uzp1(x, y))
6174+
// trunc(concat_vectors(assertsext(trunc(x)), assertsext(trunc(y)))) -> xtn(uzp1(x, y))
6175+
class trunc_concat_trunc_to_xtn_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy,
6176+
ValueType Ty>
6177+
: Pat<(Ty (trunc_optional_assert_ext
6178+
(ConcatTy (concat_vectors
6179+
(TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
6180+
(TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))))),
6181+
(!cast<Instruction>("XTN"#Ty) (!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm))>;
6182+
def : trunc_concat_trunc_to_xtn_uzp1_pat<v4i32, v4i16, v8i16, v8i8>;
6183+
def : trunc_concat_trunc_to_xtn_uzp1_pat<v2i64, v2i32, v4i32, v4i16>;
6184+
6185+
def : Pat<(v8i8 (trunc (concat_vectors (v4i16 V64:$Vn), (v4i16 V64:$Vm)))),
6186+
(UZP1v8i8 V64:$Vn, V64:$Vm)>;
6187+
def : Pat<(v4i16 (trunc (concat_vectors (v2i32 V64:$Vn), (v2i32 V64:$Vm)))),
6188+
(UZP1v4i16 V64:$Vn, V64:$Vm)>;
61766189

61776190
def : Pat<(v16i8 (concat_vectors
61786191
(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),

llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
88
; CHECK-NEXT: ldp q0, q1, [x0]
99
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
1010
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
11-
; CHECK-NEXT: xtn v1.2s, v1.2d
12-
; CHECK-NEXT: xtn v0.2s, v0.2d
13-
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
11+
; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
12+
; CHECK-NEXT: xtn v0.4h, v0.4s
1413
; CHECK-NEXT: ret
1514
%tmp1 = load <4 x double>, ptr %ptr
1615
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
2625
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
2726
; CHECK-NEXT: fcvtzs v3.2d, v3.2d
2827
; CHECK-NEXT: fcvtzs v2.2d, v2.2d
29-
; CHECK-NEXT: xtn v0.2s, v0.2d
30-
; CHECK-NEXT: xtn v1.2s, v1.2d
31-
; CHECK-NEXT: xtn v3.2s, v3.2d
32-
; CHECK-NEXT: xtn v2.2s, v2.2d
33-
; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
34-
; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h
35-
; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b
28+
; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
29+
; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s
30+
; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
31+
; CHECK-NEXT: xtn v0.8b, v0.8h
3632
; CHECK-NEXT: ret
3733
%tmp1 = load <8 x double>, ptr %ptr
3834
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -96,9 +92,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
9692
; CHECK-NEXT: ldp q0, q1, [x0]
9793
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
9894
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
99-
; CHECK-NEXT: xtn v1.2s, v1.2d
100-
; CHECK-NEXT: xtn v0.2s, v0.2d
101-
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
95+
; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
96+
; CHECK-NEXT: xtn v0.4h, v0.4s
10297
; CHECK-NEXT: ret
10398
%tmp1 = load <4 x double>, ptr %ptr
10499
%tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>

llvm/test/CodeGen/AArch64/extbinopload.ll

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -650,7 +650,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
650650
; CHECK-NEXT: add x11, x3, #12
651651
; CHECK-NEXT: str s1, [x4]
652652
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
653-
; CHECK-NEXT: ldp s0, s5, [x2]
653+
; CHECK-NEXT: ldp s0, s4, [x2]
654654
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
655655
; CHECK-NEXT: umov w9, v2.h[0]
656656
; CHECK-NEXT: umov w10, v2.h[1]
@@ -662,24 +662,25 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
662662
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
663663
; CHECK-NEXT: mov v0.b[10], w9
664664
; CHECK-NEXT: add x9, x1, #4
665-
; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b
665+
; CHECK-NEXT: mov v1.d[1], v2.d[0]
666666
; CHECK-NEXT: mov v0.b[11], w10
667667
; CHECK-NEXT: add x10, x1, #12
668+
; CHECK-NEXT: bic v1.8h, #255, lsl #8
668669
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
669-
; CHECK-NEXT: ldr s4, [x0, #12]
670-
; CHECK-NEXT: ldp s3, s16, [x0, #4]
671-
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
672-
; CHECK-NEXT: ldp s6, s7, [x2, #8]
673-
; CHECK-NEXT: ld1 { v4.s }[1], [x10]
674-
; CHECK-NEXT: ld1 { v3.s }[1], [x9]
675-
; CHECK-NEXT: ld1 { v6.s }[1], [x8]
676-
; CHECK-NEXT: ld1 { v7.s }[1], [x11]
670+
; CHECK-NEXT: ldr s3, [x0, #12]
671+
; CHECK-NEXT: ldp s2, s7, [x0, #4]
672+
; CHECK-NEXT: ld1 { v4.s }[1], [x3]
673+
; CHECK-NEXT: ldp s5, s6, [x2, #8]
674+
; CHECK-NEXT: ld1 { v3.s }[1], [x10]
675+
; CHECK-NEXT: ld1 { v2.s }[1], [x9]
676+
; CHECK-NEXT: ld1 { v5.s }[1], [x8]
677+
; CHECK-NEXT: ld1 { v6.s }[1], [x11]
677678
; CHECK-NEXT: add x8, x1, #8
678-
; CHECK-NEXT: ld1 { v16.s }[1], [x8]
679-
; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b
680-
; CHECK-NEXT: ushll v3.8h, v6.8b, #0
681-
; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b
682-
; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b
679+
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
680+
; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
681+
; CHECK-NEXT: ushll v3.8h, v5.8b, #0
682+
; CHECK-NEXT: uaddl v4.8h, v4.8b, v6.8b
683+
; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b
683684
; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b
684685
; CHECK-NEXT: ushll v0.4s, v2.4h, #3
685686
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3

llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
7373
; CHECK-NEXT: ldp q0, q1, [x0]
7474
; CHECK-NEXT: fcvtzs.4s v1, v1
7575
; CHECK-NEXT: fcvtzs.4s v0, v0
76-
; CHECK-NEXT: xtn.4h v1, v1
77-
; CHECK-NEXT: xtn.4h v0, v0
78-
; CHECK-NEXT: uzp1.8b v0, v0, v1
76+
; CHECK-NEXT: uzp1.8h v0, v0, v1
77+
; CHECK-NEXT: xtn.8b v0, v0
7978
; CHECK-NEXT: str d0, [x1]
8079
; CHECK-NEXT: ret
8180
entry:

0 commit comments

Comments
 (0)