Commit 08cb1a6

[AArch64][SVE] Add intrinsics to assembly mapping for svpmov (#81861)
This patch enables translation of the svpmov intrinsics to the corresponding assembly instructions, instead of a function call.
1 parent 3b232f0 commit 08cb1a6
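
As an illustration of the effect, mirroring the updated tests below (the function name and tool invocation here are illustrative, not taken from the commit), an IR call such as the following is now selected to a single pmov instruction:

; Minimal sketch; assumes an invocation along the lines of: llc -mtriple=aarch64 -mattr=+sve2p1 < pmov.ll
; Before this patch the call below was emitted as `bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8`;
; with this patch it selects to `pmov p0.b, z0`.
define <vscale x 16 x i1> @pmov_example(<vscale x 16 x i8> %zn) {
entry:
  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)
  ret <vscale x 16 x i1> %res
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8>, i32)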

File tree

3 files changed: +47 -126 lines changed

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 27 additions & 19 deletions
@@ -1367,6 +1367,27 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
                  llvm_i32_ty,
                  llvm_i32_ty],
                 [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+
+  class SVE2_1VectorArg_Pred_Intrinsic
+    : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [llvm_anyvector_ty],
+                            [IntrNoMem]>;
+
+  class SVE2_1VectorArgIndexed_Pred_Intrinsic
+    : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [llvm_anyvector_ty, llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
+  class SVE2_Pred_1VectorArgIndexed_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+  class SVE2_Pred_1VectorArg_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                            [IntrNoMem]>;
 
 // NOTE: There is no relationship between these intrinsics beyond an attempt
 // to reuse currently identical class definitions.

@@ -3610,23 +3631,10 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic;
 //
 // SVE2.1 - Move predicate to/from vector
 //
-def int_aarch64_sve_pmov_to_pred_lane :
-  DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                        [llvm_anyvector_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-
-def int_aarch64_sve_pmov_to_pred_lane_zero :
-  DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                        [llvm_anyvector_ty],
-                        [IntrNoMem]>;
-
-def int_aarch64_sve_pmov_to_vector_lane_merging :
-  DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                        [LLVMMatchType<0>,
-                         LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
+
+def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
 
-def int_aarch64_sve_pmov_to_vector_lane_zeroing :
-  DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                        [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                        [IntrNoMem]>;
+def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
+
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll

Lines changed: 13 additions & 69 deletions
@@ -4,12 +4,7 @@
 define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov p0.b, z0
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)

@@ -19,27 +14,10 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
 define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: pmov p1.h, z0[0]
+; CHECK-NEXT: pmov p2.h, z0[1]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
 ; CHECK-NEXT: ret
 entry:
   %res1 = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 0)

@@ -52,27 +30,10 @@ define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
 define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: pmov p1.s, z0[0]
+; CHECK-NEXT: pmov p2.s, z0[3]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
 ; CHECK-NEXT: ret
 entry:
   %res1 = call <vscale x 4 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(<vscale x 4 x i32> %zn, i32 0)

@@ -85,27 +46,10 @@ define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
 define <vscale x 2 x i1> @test_pmov_to_pred_i64(<vscale x 2 x i64> %zn) {
 ; CHECK-LABEL: test_pmov_to_pred_i64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: pmov p1.d, z0[0]
+; CHECK-NEXT: pmov p2.d, z0[7]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
 ; CHECK-NEXT: ret
 entry:
   %res1 = call <vscale x 2 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(<vscale x 2 x i64> %zn, i32 0)

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll

Lines changed: 7 additions & 38 deletions
@@ -6,12 +6,7 @@
 define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[1], p0.h
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn, i32 1)

@@ -21,12 +16,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vsca
 define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[3], p0.s
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn, i32 3)

@@ -36,12 +26,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vsca
 define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_i64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[7], p0.d
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn, i32 7)

@@ -54,11 +39,7 @@ define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vsca
 define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0, p0.b
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 16 x i8> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8(<vscale x 16 x i1> %pn)

@@ -68,11 +49,7 @@ define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
 define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.h
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16(<vscale x 8 x i1> %pn)

@@ -82,11 +59,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
 define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.s
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32(<vscale x 4 x i1> %pn)

@@ -96,11 +69,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
 define <vscale x 2 x i64> @test_pmov_to_vector_zero_i64(<vscale x 2 x i1> %pn) {
 ; CHECK-LABEL: test_pmov_to_vector_zero_i64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.d
 ; CHECK-NEXT: ret
 entry:
   %res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64(<vscale x 2 x i1> %pn)
