Skip to content

Commit c8dc571

Browse files
committed
[AArch64] Prefer zip over ushll for anyext.
On many CPUs, ZIP instructions have higher throughput than USHLL. This adds TableGen patterns that select ZIP1/ZIP2 instead of USHLL when a single-use anyext feeds an extract_subvector.
1 parent fd0785e commit c8dc571

File tree

10 files changed

+45
-26
lines changed

10 files changed

+45
-26
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors
67516751
(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
67526752
(UZP2v4i32 V128:$Vn, V128:$Vm)>;
67536753

6754+
// extract_subvector(anyext) can use zip. Only match when the anyext has one use;
6755+
// otherwise the extend is kept for its other uses and the extract_subvector can be free.
6756+
let HasOneUse = 1 in
6757+
def anyext_oneuse: PatFrag<(ops node:$src0), (anyext $src0)>;
6758+
def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 0))),
6759+
(ZIP1v8i8 V64:$Vn, V64:$Vn)>;
6760+
def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 0))),
6761+
(ZIP1v4i16 V64:$Vn, V64:$Vn)>;
6762+
def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 0))),
6763+
(ZIP1v2i32 V64:$Vn, V64:$Vn)>;
6764+
def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 4))),
6765+
(ZIP2v8i8 V64:$Vn, V64:$Vn)>;
6766+
def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 2))),
6767+
(ZIP2v4i16 V64:$Vn, V64:$Vn)>;
6768+
def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 1))),
6769+
(ZIP2v2i32 V64:$Vn, V64:$Vn)>;
6770+
67546771
//----------------------------------------------------------------------------
67556772
// AdvSIMD TBL/TBX instructions
67566773
//----------------------------------------------------------------------------

llvm/test/CodeGen/AArch64/andorxor.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -433,8 +433,8 @@ define void @and_v4i8(ptr %p1, ptr %p2) {
433433
; CHECK-SD: // %bb.0: // %entry
434434
; CHECK-SD-NEXT: ldr s0, [x0]
435435
; CHECK-SD-NEXT: ldr s1, [x1]
436-
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
437-
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
436+
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
437+
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
438438
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
439439
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
440440
; CHECK-SD-NEXT: str s0, [x0]
@@ -482,8 +482,8 @@ define void @or_v4i8(ptr %p1, ptr %p2) {
482482
; CHECK-SD: // %bb.0: // %entry
483483
; CHECK-SD-NEXT: ldr s0, [x0]
484484
; CHECK-SD-NEXT: ldr s1, [x1]
485-
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
486-
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
485+
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
486+
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
487487
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
488488
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
489489
; CHECK-SD-NEXT: str s0, [x0]
@@ -531,8 +531,8 @@ define void @xor_v4i8(ptr %p1, ptr %p2) {
531531
; CHECK-SD: // %bb.0: // %entry
532532
; CHECK-SD-NEXT: ldr s0, [x0]
533533
; CHECK-SD-NEXT: ldr s1, [x1]
534-
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
535-
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
534+
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
535+
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
536536
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
537537
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
538538
; CHECK-SD-NEXT: str s0, [x0]

llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
77
; CHECK-LABEL: bitcast_v2i16_v2f16:
88
; CHECK: // %bb.0:
9-
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
10-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
9+
; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h
1110
; CHECK-NEXT: ret
1211
%y = bitcast <2 x half> %x to <2 x i16>
1312
ret <2 x i16> %y

llvm/test/CodeGen/AArch64/bitcast.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,7 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){
125125
; CHECK-SD: // %bb.0:
126126
; CHECK-SD-NEXT: add w8, w0, w1
127127
; CHECK-SD-NEXT: fmov s0, w8
128-
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
129-
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
128+
; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
130129
; CHECK-SD-NEXT: ret
131130
;
132131
; CHECK-GI-LABEL: bitcast_i32_v2i16:

llvm/test/CodeGen/AArch64/extbinopload.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
649649
; CHECK-NEXT: add x8, x3, #8
650650
; CHECK-NEXT: add x11, x1, #12
651651
; CHECK-NEXT: str s1, [x4]
652-
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
652+
; CHECK-NEXT: zip1 v1.8b, v1.8b, v1.8b
653653
; CHECK-NEXT: ldr s0, [x2]
654654
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
655655
; CHECK-NEXT: umov w9, v2.h[0]
@@ -659,7 +659,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
659659
; CHECK-NEXT: mov v0.b[9], w10
660660
; CHECK-NEXT: umov w10, v2.h[3]
661661
; CHECK-NEXT: ldr s2, [x1]
662-
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
662+
; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b
663663
; CHECK-NEXT: mov v0.b[10], w9
664664
; CHECK-NEXT: add x9, x1, #4
665665
; CHECK-NEXT: mov v1.d[1], v2.d[0]

llvm/test/CodeGen/AArch64/extract-subvec-combine.ll

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,19 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind {
104104

105105
; Negative test, combine should not fire if sign extension is for a different width.
106106
define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind {
107-
; CHECK-LABEL: sext_extract_zext_idx0_negtest:
108-
; CHECK: // %bb.0:
109-
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
110-
; CHECK-NEXT: shl v0.2s, v0.2s, #17
111-
; CHECK-NEXT: sshr v0.2s, v0.2s, #17
112-
; CHECK-NEXT: ret
107+
; CHECK-SD-LABEL: sext_extract_zext_idx0_negtest:
108+
; CHECK-SD: // %bb.0:
109+
; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
110+
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #17
111+
; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #17
112+
; CHECK-SD-NEXT: ret
113+
;
114+
; CHECK-GI-LABEL: sext_extract_zext_idx0_negtest:
115+
; CHECK-GI: // %bb.0:
116+
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
117+
; CHECK-GI-NEXT: shl v0.2s, v0.2s, #17
118+
; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #17
119+
; CHECK-GI-NEXT: ret
113120
%zext = zext <4 x i16> %vec to <4 x i32>
114121
%extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0)
115122
%sext_inreg_step0 = shl <2 x i32> %extract, <i32 17, i32 17>

llvm/test/CodeGen/AArch64/neon-bitcast.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -518,15 +518,14 @@ define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) {
518518
; CHECK-LE-LABEL: bitcast_i32_to_v2i16:
519519
; CHECK-LE: // %bb.0:
520520
; CHECK-LE-NEXT: fmov s0, w0
521-
; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
522-
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
521+
; CHECK-LE-NEXT: zip1 v0.4h, v0.4h, v0.4h
523522
; CHECK-LE-NEXT: ret
524523
;
525524
; CHECK-BE-LABEL: bitcast_i32_to_v2i16:
526525
; CHECK-BE: // %bb.0:
527526
; CHECK-BE-NEXT: fmov s0, w0
528527
; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
529-
; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
528+
; CHECK-BE-NEXT: zip1 v0.4h, v0.4h, v0.4h
530529
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
531530
; CHECK-BE-NEXT: ret
532531
%ret = bitcast i32 %word to <2 x i16>

llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,7 @@ define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
8888
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
8989
; CHECK-LABEL: extract_subvector_v4i16:
9090
; CHECK: // %bb.0:
91-
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
92-
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
93-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
91+
; CHECK-NEXT: zip2 v0.4h, v0.4h, v0.4h
9492
; CHECK-NEXT: ret
9593
%ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
9694
ret <2 x i16> %ret

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
303303
; BE-NEXT: add x8, x0, #2
304304
; BE-NEXT: ldr s0, [sp, #12]
305305
; BE-NEXT: rev32 v0.8b, v0.8b
306-
; BE-NEXT: ushll v0.8h, v0.8b, #0
306+
; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
307307
; BE-NEXT: ld1 { v0.b }[4], [x8]
308308
; BE-NEXT: ushll v0.4s, v0.4h, #0
309309
; BE-NEXT: and v0.16b, v0.16b, v1.16b

llvm/test/CodeGen/AArch64/zext.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
447447
; CHECK-SD-NEXT: mov w8, #1023 // =0x3ff
448448
; CHECK-SD-NEXT: dup v2.2d, x8
449449
; CHECK-SD-NEXT: mov v0.s[1], w1
450-
; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0
450+
; CHECK-SD-NEXT: zip1 v3.2s, v1.2s, v1.2s
451451
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
452452
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
453453
; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b

0 commit comments

Comments
 (0)