-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AArch64] Prefer zip over ushll for anyext. #133433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) ChangesMany CPUs have a higher throughput of ZIP instructions vs USHLL. This adds some tablegen patterns for preferring zip in anyext patterns. Full diff: https://github.com/llvm/llvm-project/pull/133433.diff 10 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c61e3a613f6f..f291589e04c6b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors
(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
(UZP2v4i32 V128:$Vn, V128:$Vm)>;
+// extract_subvec(anyext) can use zip. Check for one use on the anyext, otherwise
+// the extract_subvector can be free.
+let HasOneUse = 1 in
+def anyext_oneuse: PatFrag<(ops node:$src0), (anyext $src0)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 0))),
+ (ZIP1v8i8 V64:$Vn, V64:$Vn)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 0))),
+ (ZIP1v4i16 V64:$Vn, V64:$Vn)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 0))),
+ (ZIP1v2i32 V64:$Vn, V64:$Vn)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 4))),
+ (ZIP2v8i8 V64:$Vn, V64:$Vn)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 2))),
+ (ZIP2v4i16 V64:$Vn, V64:$Vn)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 1))),
+ (ZIP2v2i32 V64:$Vn, V64:$Vn)>;
+
//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX instructions
//----------------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 24f2549cce785..0384848082caa 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -433,8 +433,8 @@ define void @and_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
@@ -482,8 +482,8 @@ define void @or_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
@@ -531,8 +531,8 @@ define void @xor_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
index 864ddc2967c18..90fa294505c84 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
@@ -6,8 +6,7 @@
define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
; CHECK-LABEL: bitcast_v2i16_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ret
%y = bitcast <2 x half> %x to <2 x i16>
ret <2 x i16> %y
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index d9199ce2c79de..d54cc4adb81b3 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -125,8 +125,7 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add w8, w0, w1
; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bitcast_i32_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 72f4d58a425e7..82114d60c4a93 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -649,7 +649,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: str s1, [x4]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-NEXT: ldr s0, [x2]
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: umov w9, v2.h[0]
@@ -659,7 +659,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: mov v0.b[9], w10
; CHECK-NEXT: umov w10, v2.h[3]
; CHECK-NEXT: ldr s2, [x1]
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b
; CHECK-NEXT: mov v0.b[10], w9
; CHECK-NEXT: add x9, x1, #4
; CHECK-NEXT: mov v1.d[1], v2.d[0]
diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
index 75d55773b3681..368103bf2f2fe 100644
--- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
+++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
@@ -104,12 +104,19 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind {
; Negative test, combine should not fire if sign extension is for a different width.
define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind {
-; CHECK-LABEL: sext_extract_zext_idx0_negtest:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: shl v0.2s, v0.2s, #17
-; CHECK-NEXT: sshr v0.2s, v0.2s, #17
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sext_extract_zext_idx0_negtest:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #17
+; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sext_extract_zext_idx0_negtest:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: shl v0.2s, v0.2s, #17
+; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-GI-NEXT: ret
%zext = zext <4 x i16> %vec to <4 x i32>
%extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0)
%sext_inreg_step0 = shl <2 x i32> %extract, <i32 17, i32 17>
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
index d06612e2332e6..07772b716ec58 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -518,15 +518,14 @@ define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) {
; CHECK-LE-LABEL: bitcast_i32_to_v2i16:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: fmov s0, w0
-; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: bitcast_i32_to_v2i16:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: fmov s0, w0
; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
-; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%ret = bitcast i32 %word to <2 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index 8fac0e1067684..bda7ff9115e09 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -88,9 +88,7 @@ define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: zip2 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ret
%ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index b52cbfe08156b..45b7a2759b0b3 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -303,7 +303,7 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index e40b9cb5c8482..962486afa3bb8 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -447,7 +447,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
; CHECK-SD-NEXT: mov w8, #1023 // =0x3ff
; CHECK-SD-NEXT: dup v2.2d, x8
; CHECK-SD-NEXT: mov v0.s[1], w1
-; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0
+; CHECK-SD-NEXT: zip1 v3.2s, v1.2s, v1.2s
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b
|
Many CPUs have a higher throughput of ZIP instructions vs USHLL. This adds some tablegen patterns for preferring zip in anyext patterns.
f61ef9c
to
c8dc571
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks really familiar, I thought we had done this somewhere already. But anyway, looks like a good idea, just one quick question inlined.
@@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors | |||
(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), | |||
(UZP2v4i32 V128:$Vn, V128:$Vm)>; | |||
|
|||
// extract_subvec(anyext) can use zip. Check for one use on the anyext, otherwise | |||
// the extract_subvector can be free. | |||
let HasOneUse = 1 in |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Question on this as I am not sure I remember seeing this before, what is this HasOneUse variable doing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was added relatively recently in #91578. In this case it makes sure that we don't generate zip1 as well as the original ushll, as if we have the ushll then the sub-vector extract at 0 is free.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, thanks, nice!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
@@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors | |||
(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), | |||
(UZP2v4i32 V128:$Vn, V128:$Vm)>; | |||
|
|||
// extract_subvec(anyext) can use zip. Check for one use on the anyext, otherwise | |||
// the extract_subvector can be free. | |||
let HasOneUse = 1 in |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, thanks, nice!
Many CPUs have a higher throughput of ZIP instructions vs USHLL. This adds some tablegen patterns for preferring zip in anyext patterns.
Many CPUs have a higher throughput of ZIP instructions vs USHLL. This adds some tablegen patterns for preferring zip in anyext patterns.