Skip to content

Commit f67cb05

Browse files
committed
[AArch64][SVE] Fold ADD+CNTB to INCB/DECB
Currently, given:

```cpp
uint64_t incb(uint64_t x) {
  return x+svcntb();
}
```

LLVM generates:

```gas
incb:
        addvl   x0, x0, #1
        ret
```

Which is functionally equivalent to:

```gas
incb:
        incb    x0
        ret
```

However, on microarchitectures like the Neoverse V2 and Neoverse V3, the second form (with INCB) can have significantly better latency and throughput. On the Neoverse V2, for example, ADDVL has a latency and throughput of 2, whereas INCB has a latency of 1 and a throughput of 4 (and similarly for the Neoverse V3, though in this case the throughput is further increased to 8). The same applies to DECB.

This patch adds patterns to prefer the INCB/DECB forms over ADDVL where applicable.
1 parent a545cf5 commit f67cb05

File tree

9 files changed

+81
-64
lines changed

9 files changed

+81
-64
lines changed

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

+22-8
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,13 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
142142

143143
// SVE CNT/INC/RDVL
144144
def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
145+
def sve_cntb_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 16>">;
145146
def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
146147
def sve_cntw_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 4>">;
147148
def sve_cntd_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 2>">;
148149

149150
// SVE DEC
151+
def sve_cntb_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -16>">;
150152
def sve_cnth_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -8>">;
151153
def sve_cntw_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -4>">;
152154
def sve_cntd_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -2>">;
@@ -2680,28 +2682,31 @@ let Predicates = [HasSVEorSME] in {
26802682
}
26812683

26822684
let Predicates = [HasSVEorSME, UseScalarIncVL], AddedComplexity = 5 in {
2683-
def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
2684-
(ADDVL_XXI GPR64:$op, $imm)>;
2685-
2686-
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
2687-
(EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
2688-
GPR32:$op, sub_32), $imm),
2689-
sub_32)>;
2690-
2685+
def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm i32:$imm))),
2686+
(INCB_XPiI GPR64:$op, 31, $imm)>;
26912687
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
26922688
(INCH_XPiI GPR64:$op, 31, $imm)>;
26932689
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
26942690
(INCW_XPiI GPR64:$op, 31, $imm)>;
26952691
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm i32:$imm))),
26962692
(INCD_XPiI GPR64:$op, 31, $imm)>;
26972693

2694+
def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm_neg i32:$imm))),
2695+
(DECB_XPiI GPR64:$op, 31, $imm)>;
26982696
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
26992697
(DECH_XPiI GPR64:$op, 31, $imm)>;
27002698
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
27012699
(DECW_XPiI GPR64:$op, 31, $imm)>;
27022700
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
27032701
(DECD_XPiI GPR64:$op, 31, $imm)>;
27042702

2703+
def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
2704+
(ADDVL_XXI GPR64:$op, $imm)>;
2705+
2706+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm i32:$imm))))),
2707+
(EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
2708+
GPR32:$op, sub_32), 31, $imm),
2709+
sub_32)>;
27052710
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
27062711
(EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
27072712
GPR32:$op, sub_32), 31, $imm),
@@ -2715,6 +2720,10 @@ let Predicates = [HasSVEorSME] in {
27152720
GPR32:$op, sub_32), 31, $imm),
27162721
sub_32)>;
27172722

2723+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm_neg i32:$imm))))),
2724+
(EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
2725+
GPR32:$op, sub_32), 31, $imm),
2726+
sub_32)>;
27182727
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
27192728
(EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
27202729
GPR32:$op, sub_32), 31, $imm),
@@ -2727,6 +2736,11 @@ let Predicates = [HasSVEorSME] in {
27272736
(EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
27282737
GPR32:$op, sub_32), 31, $imm),
27292738
sub_32)>;
2739+
2740+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
2741+
(EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
2742+
GPR32:$op, sub_32), $imm),
2743+
sub_32)>;
27302744
}
27312745

27322746
// For big endian, only BITCASTs involving same sized vector types with same

llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll

+4-2
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ define void @quux() #1 {
6464
; CHECK-NEXT: subs x9, x9, x14
6565
; CHECK-NEXT: mov sp, x9
6666
; CHECK-NEXT: str x9, [x19, #32] // 8-byte Folded Spill
67-
; CHECK-NEXT: addvl x9, x8, #1
67+
; CHECK-NEXT: mov x9, x8
68+
; CHECK-NEXT: incb x9
6869
; CHECK-NEXT: mov w0, w9
6970
; CHECK-NEXT: // implicit-def: $x9
7071
; CHECK-NEXT: mov w9, w0
@@ -147,7 +148,8 @@ define void @quux() #1 {
147148
; CHECK-NEXT: mov x9, sp
148149
; CHECK-NEXT: subs x9, x9, #16
149150
; CHECK-NEXT: mov sp, x9
150-
; CHECK-NEXT: addvl x9, x8, #2
151+
; CHECK-NEXT: mov x9, x8
152+
; CHECK-NEXT: incb x9, all, mul #2
151153
; CHECK-NEXT: mov w0, w9
152154
; CHECK-NEXT: // implicit-def: $x9
153155
; CHECK-NEXT: mov w9, w0

llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,9 @@ define void @ldr_with_off_15(ptr %ptr) {
271271
define void @ldr_with_off_15mulvl(ptr %ptr) {
272272
; CHECK-LABEL: ldr_with_off_15mulvl:
273273
; CHECK: // %bb.0:
274+
; CHECK-NEXT: incb x0, all, mul #15
274275
; CHECK-NEXT: mov w12, #15 // =0xf
275-
; CHECK-NEXT: addvl x8, x0, #15
276-
; CHECK-NEXT: ldr za[w12, 0], [x8]
276+
; CHECK-NEXT: ldr za[w12, 0], [x0]
277277
; CHECK-NEXT: ret
278278
%vscale = call i64 @llvm.vscale.i64()
279279
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
285285
define void @ldr_with_off_16mulvl(ptr %ptr) {
286286
; CHECK-LABEL: ldr_with_off_16mulvl:
287287
; CHECK: // %bb.0:
288+
; CHECK-NEXT: incb x0, all, mul #16
288289
; CHECK-NEXT: mov w12, #16 // =0x10
289-
; CHECK-NEXT: addvl x8, x0, #16
290-
; CHECK-NEXT: ldr za[w12, 0], [x8]
290+
; CHECK-NEXT: ldr za[w12, 0], [x0]
291291
; CHECK-NEXT: ret
292292
%vscale = call i64 @llvm.vscale.i64()
293293
%mulvl = mul i64 %vscale, 256

llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,9 @@ define void @str_with_off_15(ptr %ptr) {
271271
define void @str_with_off_15mulvl(ptr %ptr) {
272272
; CHECK-LABEL: str_with_off_15mulvl:
273273
; CHECK: // %bb.0:
274+
; CHECK-NEXT: incb x0, all, mul #15
274275
; CHECK-NEXT: mov w12, #15 // =0xf
275-
; CHECK-NEXT: addvl x8, x0, #15
276-
; CHECK-NEXT: str za[w12, 0], [x8]
276+
; CHECK-NEXT: str za[w12, 0], [x0]
277277
; CHECK-NEXT: ret
278278
%vscale = call i64 @llvm.vscale.i64()
279279
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @str_with_off_15mulvl(ptr %ptr) {
285285
define void @str_with_off_16mulvl(ptr %ptr) {
286286
; CHECK-LABEL: str_with_off_16mulvl:
287287
; CHECK: // %bb.0:
288+
; CHECK-NEXT: incb x0, all, mul #16
288289
; CHECK-NEXT: mov w12, #16 // =0x10
289-
; CHECK-NEXT: addvl x8, x0, #16
290-
; CHECK-NEXT: str za[w12, 0], [x8]
290+
; CHECK-NEXT: str za[w12, 0], [x0]
291291
; CHECK-NEXT: ret
292292
%vscale = call i64 @llvm.vscale.i64()
293293
%mulvl = mul i64 %vscale, 256

llvm/test/CodeGen/AArch64/sve-lsrchain.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
8585
; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #3, mul vl]
8686
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
8787
; CHECK-NEXT: st1h { z4.h }, p0, [x16, #3, mul vl]
88-
; CHECK-NEXT: addvl x16, x16, #4
88+
; CHECK-NEXT: incb x16, all, mul #4
8989
; CHECK-NEXT: cmp x16, x11
9090
; CHECK-NEXT: b.lo .LBB0_4
9191
; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us

llvm/test/CodeGen/AArch64/sve-vl-arith.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ define i64 @incb_scalar_i64(i64 %a) {
123123
;
124124
; CHECK-LABEL: incb_scalar_i64:
125125
; CHECK: // %bb.0:
126-
; CHECK-NEXT: addvl x0, x0, #1
126+
; CHECK-NEXT: incb x0
127127
; CHECK-NEXT: ret
128128
%vscale = call i64 @llvm.vscale.i64()
129129
%mul = mul i64 %vscale, 16
@@ -193,7 +193,7 @@ define i64 @decb_scalar_i64(i64 %a) {
193193
;
194194
; CHECK-LABEL: decb_scalar_i64:
195195
; CHECK: // %bb.0:
196-
; CHECK-NEXT: addvl x0, x0, #-2
196+
; CHECK-NEXT: decb x0, all, mul #2
197197
; CHECK-NEXT: ret
198198
%vscale = call i64 @llvm.vscale.i64()
199199
%mul = mul i64 %vscale, 32
@@ -264,7 +264,7 @@ define i32 @incb_scalar_i32(i32 %a) {
264264
; CHECK-LABEL: incb_scalar_i32:
265265
; CHECK: // %bb.0:
266266
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
267-
; CHECK-NEXT: addvl x0, x0, #3
267+
; CHECK-NEXT: incb x0, all, mul #3
268268
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
269269
; CHECK-NEXT: ret
270270

@@ -350,7 +350,7 @@ define i32 @decb_scalar_i32(i32 %a) {
350350
; CHECK-LABEL: decb_scalar_i32:
351351
; CHECK: // %bb.0:
352352
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
353-
; CHECK-NEXT: addvl x0, x0, #-4
353+
; CHECK-NEXT: decb x0, all, mul #4
354354
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
355355
; CHECK-NEXT: ret
356356

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ define <vscale x 4 x i32> @test_svld1uwq_i32_si(<vscale x 1 x i1> %pred, ptr %ba
3333
define <vscale x 4 x i32> @test_svld1uwq_i32_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
3434
; CHECK-LABEL: test_svld1uwq_i32_out_of_bound:
3535
; CHECK: // %bb.0:
36-
; CHECK-NEXT: addvl x8, x0, #2
37-
; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8]
36+
; CHECK-NEXT: incb x0, all, mul #2
37+
; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0]
3838
; CHECK-NEXT: ret
3939
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
4040
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep)
@@ -101,8 +101,8 @@ define <vscale x 2 x i64> @test_svld1udq_i64_si(<vscale x 1 x i1> %pred, ptr %ba
101101
define <vscale x 2 x i64> @test_svld1udq_i64_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
102102
; CHECK-LABEL: test_svld1udq_i64_out_of_bound:
103103
; CHECK: // %bb.0:
104-
; CHECK-NEXT: addvl x8, x0, #-5
105-
; CHECK-NEXT: ld1d { z0.q }, p0/z, [x8]
104+
; CHECK-NEXT: decb x0, all, mul #5
105+
; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0]
106106
; CHECK-NEXT: ret
107107
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
108108
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep)

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ define void @test_svst1wq_i32_si(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred
3030
define void @test_svst1wq_i32_out_of_bound(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %base) {
3131
; CHECK-LABEL: test_svst1wq_i32_out_of_bound:
3232
; CHECK: // %bb.0:
33-
; CHECK-NEXT: addvl x8, x0, #2
34-
; CHECK-NEXT: st1w { z0.q }, p0, [x8]
33+
; CHECK-NEXT: incb x0, all, mul #2
34+
; CHECK-NEXT: st1w { z0.q }, p0, [x0]
3535
; CHECK-NEXT: ret
3636
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
3737
call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep)
@@ -91,8 +91,8 @@ define void @test_svst1dq_i64_si(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred
9191
define void @test_svst1dq_i64_out_of_bound(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %base) {
9292
; CHECK-LABEL: test_svst1dq_i64_out_of_bound:
9393
; CHECK: // %bb.0:
94-
; CHECK-NEXT: addvl x8, x0, #-5
95-
; CHECK-NEXT: st1d { z0.q }, p0, [x8]
94+
; CHECK-NEXT: decb x0, all, mul #5
95+
; CHECK-NEXT: st1d { z0.q }, p0, [x0]
9696
; CHECK-NEXT: ret
9797
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
9898
call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep)

0 commit comments

Comments
 (0)