Skip to content

[AArch64] Prefer zip over ushll for anyext. #133433

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors
(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
(UZP2v4i32 V128:$Vn, V128:$Vm)>;

// extract_subvector(anyext) can use zip. Require the anyext to have a single
// use: if it has other uses the extend (ushll) is emitted anyway, and the
// extract_subvector at index 0 is then free.
let HasOneUse = 1 in
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A question on this, as I am not sure I remember seeing it before: what is this HasOneUse variable doing?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was added relatively recently in #91578. In this case it makes sure that we don't generate a zip1 in addition to the original ushll: if we already have the ushll, then the sub-vector extract at index 0 is free.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, thanks, nice!

// PatFrag wrapping a plain anyext of its single operand. It is declared under
// the preceding `let HasOneUse = 1 in`, so it only matches an anyext that has
// one use.
def anyext_oneuse: PatFrag<(ops node:$src0), (anyext $src0)>;
// Subvector starting at index 0 of the any-extended vector: select ZIP1 of the
// 64-bit source register with itself (v8i8, v4i16 and v2i32 sources).
def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 0))),
(ZIP1v8i8 V64:$Vn, V64:$Vn)>;
def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 0))),
(ZIP1v4i16 V64:$Vn, V64:$Vn)>;
def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 1 - 1))),
(ZIP1v2i32 V64:$Vn, V64:$Vn)>;
// Subvector starting at the midpoint of the any-extended vector (index 4, 2
// and 1 respectively): select ZIP2 of the source register with itself.
def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 4))),
(ZIP2v8i8 V64:$Vn, V64:$Vn)>;
def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 2))),
(ZIP2v4i16 V64:$Vn, V64:$Vn)>;
def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 1))),
(ZIP2v2i32 V64:$Vn, V64:$Vn)>;

//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX instructions
//----------------------------------------------------------------------------
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/andorxor.ll
Original file line number Diff line number Diff line change
Expand Up @@ -433,8 +433,8 @@ define void @and_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
Expand Down Expand Up @@ -482,8 +482,8 @@ define void @or_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
Expand Down Expand Up @@ -531,8 +531,8 @@ define void @xor_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
; CHECK-LABEL: bitcast_v2i16_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ret
%y = bitcast <2 x half> %x to <2 x i16>
ret <2 x i16> %y
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AArch64/bitcast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add w8, w0, w1
; CHECK-SD-NEXT: fmov s0, w8
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bitcast_i32_v2i16:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/extbinopload.ll
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: str s1, [x4]
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-NEXT: ldr s0, [x2]
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: umov w9, v2.h[0]
Expand All @@ -659,7 +659,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: mov v0.b[9], w10
; CHECK-NEXT: umov w10, v2.h[3]
; CHECK-NEXT: ldr s2, [x1]
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b
; CHECK-NEXT: mov v0.b[10], w9
; CHECK-NEXT: add x9, x1, #4
; CHECK-NEXT: mov v1.d[1], v2.d[0]
Expand Down
19 changes: 13 additions & 6 deletions llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,19 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind {

; Negative test, combine should not fire if sign extension is for a different width.
define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind {
; CHECK-LABEL: sext_extract_zext_idx0_negtest:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.2s, v0.2s, #17
; CHECK-NEXT: sshr v0.2s, v0.2s, #17
; CHECK-NEXT: ret
; CHECK-SD-LABEL: sext_extract_zext_idx0_negtest:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #17
; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #17
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sext_extract_zext_idx0_negtest:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: shl v0.2s, v0.2s, #17
; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #17
; CHECK-GI-NEXT: ret
%zext = zext <4 x i16> %vec to <4 x i32>
%extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0)
%sext_inreg_step0 = shl <2 x i32> %extract, <i32 17, i32 17>
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AArch64/neon-bitcast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -518,15 +518,14 @@ define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) {
; CHECK-LE-LABEL: bitcast_i32_to_v2i16:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: fmov s0, w0
; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: bitcast_i32_to_v2i16:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: fmov s0, w0
; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%ret = bitcast i32 %word to <2 x i16>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,7 @@ define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: zip2 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ret
%ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
ret <2 x i16> %ret
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/zext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
; CHECK-SD-NEXT: mov w8, #1023 // =0x3ff
; CHECK-SD-NEXT: dup v2.2d, x8
; CHECK-SD-NEXT: mov v0.s[1], w1
; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-SD-NEXT: zip1 v3.2s, v1.2s, v1.2s
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b
Expand Down
Loading