Skip to content

Commit c7e55d4

Browse files
committed
[ARM] MVE predicate register support
This adds support code for building and shuffling i1 predicate registers. It generally uses two basic principles, either converting the predicate into a scalar (through a PREDICATE_CAST) and doing scalar operations on it there, or converting the register to a full vector register and back. Some of the code here is not super efficient but will hopefully cover most cases of moving i1 vectors around and can be improved in subsequent patches. Some code by David Sherwood. Differential Revision: https://reviews.llvm.org/D65052 llvm-svn: 366890
1 parent b09bc8a commit c7e55d4

9 files changed

+1758
-13
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 307 additions & 13 deletions
Large diffs are not rendered by default.

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ class VectorType;
129129
LOOP_DEC, // Really a part of LE, performs the sub
130130
LE, // Low-overhead loops, Loop End
131131

132+
PREDICATE_CAST, // Predicate cast for MVE i1 types
133+
132134
VCEQ, // Vector compare equal.
133135
VCEQZ, // Vector compare equal to zero.
134136
VCNE, // Vector compare not equal (MVE)

llvm/lib/Target/ARM/ARMInstrMVE.td

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3016,6 +3016,26 @@ let Predicates = [HasMVEInt] in {
30163016
defm MVE_VCGEU : unpred_vcmp_r<ARMvcgeu, "u", 2>;
30173017
}
30183018

3019+
// Occasionally we need to cast between an i32 and a boolean vector, for
3020+
// example when moving between rGPR and VPR.P0 as part of predicate vector
3021+
// shuffles. We also sometimes need to cast between different predicate
3022+
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
3023+
3024+
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
3025+
3026+
let Predicates = [HasMVEInt] in {
3027+
foreach VT = [ v4i1, v8i1, v16i1 ] in {
3028+
def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
3029+
(i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>;
3030+
def : Pat<(VT (predicate_cast (i32 VCCR:$src))),
3031+
(VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>;
3032+
3033+
foreach VT2 = [ v4i1, v8i1, v16i1 ] in
3034+
def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
3035+
(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
3036+
}
3037+
}
3038+
30193039
// end of MVE compares
30203040

30213041
// start of MVE_qDest_qSrc
@@ -4410,6 +4430,37 @@ let Predicates = [HasMVEInt] in {
44104430
(v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
44114431
def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
44124432
(v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
4433+
4434+
def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
4435+
(v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
4436+
(MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>;
4437+
def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
4438+
(v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
4439+
(MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>;
4440+
def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
4441+
(v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
4442+
(MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>;
4443+
4444+
def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))),
4445+
(v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
4446+
def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))),
4447+
(v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
4448+
def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))),
4449+
(v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
4450+
4451+
def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))),
4452+
(v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
4453+
def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))),
4454+
(v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
4455+
def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))),
4456+
(v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
4457+
4458+
def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))),
4459+
(v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
4460+
def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))),
4461+
(v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
4462+
def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))),
4463+
(v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
44134464
}
44144465

44154466
def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary,
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
3+
4+
5+
define arm_aapcs_vfpcc <4 x i32> @build_true_v4i1(<4 x i32> %a, <4 x i32> %b) {
6+
; CHECK-LABEL: build_true_v4i1:
7+
; CHECK: @ %bb.0: @ %entry
8+
; CHECK-NEXT: bx lr
9+
entry:
10+
%s = select <4 x i1> <i1 1, i1 1, i1 1, i1 1>, <4 x i32> %a, <4 x i32> %b
11+
ret <4 x i32> %s
12+
}
13+
14+
define arm_aapcs_vfpcc <4 x i32> @build_false_v4i1(<4 x i32> %a, <4 x i32> %b) {
15+
; CHECK-LABEL: build_false_v4i1:
16+
; CHECK: @ %bb.0: @ %entry
17+
; CHECK-NEXT: vmov q0, q1
18+
; CHECK-NEXT: bx lr
19+
entry:
20+
%s = select <4 x i1> <i1 0, i1 0, i1 0, i1 0>, <4 x i32> %a, <4 x i32> %b
21+
ret <4 x i32> %s
22+
}
23+
24+
define arm_aapcs_vfpcc <4 x i32> @build_upper_v4i1(<4 x i32> %a, <4 x i32> %b) {
25+
; CHECK-LABEL: build_upper_v4i1:
26+
; CHECK: @ %bb.0: @ %entry
27+
; CHECK-NEXT: mov.w r0, #65280
28+
; CHECK-NEXT: vmsr p0, r0
29+
; CHECK-NEXT: vpsel q0, q0, q1
30+
; CHECK-NEXT: bx lr
31+
entry:
32+
%s = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i32> %a, <4 x i32> %b
33+
ret <4 x i32> %s
34+
}
35+
36+
define arm_aapcs_vfpcc <4 x i32> @build_lower_v4i1(<4 x i32> %a, <4 x i32> %b) {
37+
; CHECK-LABEL: build_lower_v4i1:
38+
; CHECK: @ %bb.0: @ %entry
39+
; CHECK-NEXT: movs r0, #255
40+
; CHECK-NEXT: vmsr p0, r0
41+
; CHECK-NEXT: vpsel q0, q0, q1
42+
; CHECK-NEXT: bx lr
43+
entry:
44+
%s = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i32> %a, <4 x i32> %b
45+
ret <4 x i32> %s
46+
}
47+
48+
49+
define arm_aapcs_vfpcc <8 x i16> @build_true_v8i1(<8 x i16> %a, <8 x i16> %b) {
50+
; CHECK-LABEL: build_true_v8i1:
51+
; CHECK: @ %bb.0: @ %entry
52+
; CHECK-NEXT: bx lr
53+
entry:
54+
%s = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i16> %a, <8 x i16> %b
55+
ret <8 x i16> %s
56+
}
57+
58+
define arm_aapcs_vfpcc <8 x i16> @build_false_v8i1(<8 x i16> %a, <8 x i16> %b) {
59+
; CHECK-LABEL: build_false_v8i1:
60+
; CHECK: @ %bb.0: @ %entry
61+
; CHECK-NEXT: vmov q0, q1
62+
; CHECK-NEXT: bx lr
63+
entry:
64+
%s = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i16> %a, <8 x i16> %b
65+
ret <8 x i16> %s
66+
}
67+
68+
define arm_aapcs_vfpcc <8 x i16> @build_upper_v8i1(<8 x i16> %a, <8 x i16> %b) {
69+
; CHECK-LABEL: build_upper_v8i1:
70+
; CHECK: @ %bb.0: @ %entry
71+
; CHECK-NEXT: mov.w r0, #65280
72+
; CHECK-NEXT: vmsr p0, r0
73+
; CHECK-NEXT: vpsel q0, q0, q1
74+
; CHECK-NEXT: bx lr
75+
entry:
76+
%s = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i16> %a, <8 x i16> %b
77+
ret <8 x i16> %s
78+
}
79+
80+
define arm_aapcs_vfpcc <8 x i16> @build_lower_v8i1(<8 x i16> %a, <8 x i16> %b) {
81+
; CHECK-LABEL: build_lower_v8i1:
82+
; CHECK: @ %bb.0: @ %entry
83+
; CHECK-NEXT: movs r0, #255
84+
; CHECK-NEXT: vmsr p0, r0
85+
; CHECK-NEXT: vpsel q0, q0, q1
86+
; CHECK-NEXT: bx lr
87+
entry:
88+
%s = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i16> %a, <8 x i16> %b
89+
ret <8 x i16> %s
90+
}
91+
92+
93+
define arm_aapcs_vfpcc <16 x i8> @build_true_v16i1(<16 x i8> %a, <16 x i8> %b) {
94+
; CHECK-LABEL: build_true_v16i1:
95+
; CHECK: @ %bb.0: @ %entry
96+
; CHECK-NEXT: bx lr
97+
entry:
98+
%s = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <16 x i8> %a, <16 x i8> %b
99+
ret <16 x i8> %s
100+
}
101+
102+
define arm_aapcs_vfpcc <16 x i8> @build_false_v16i1(<16 x i8> %a, <16 x i8> %b) {
103+
; CHECK-LABEL: build_false_v16i1:
104+
; CHECK: @ %bb.0: @ %entry
105+
; CHECK-NEXT: vmov q0, q1
106+
; CHECK-NEXT: bx lr
107+
entry:
108+
%s = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x i8> %a, <16 x i8> %b
109+
ret <16 x i8> %s
110+
}
111+
112+
define arm_aapcs_vfpcc <16 x i8> @build_upper_v16i1(<16 x i8> %a, <16 x i8> %b) {
113+
; CHECK-LABEL: build_upper_v16i1:
114+
; CHECK: @ %bb.0: @ %entry
115+
; CHECK-NEXT: mov.w r0, #65280
116+
; CHECK-NEXT: vmsr p0, r0
117+
; CHECK-NEXT: vpsel q0, q0, q1
118+
; CHECK-NEXT: bx lr
119+
entry:
120+
%s = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <16 x i8> %a, <16 x i8> %b
121+
ret <16 x i8> %s
122+
}
123+
124+
define arm_aapcs_vfpcc <16 x i8> @build_lower_v16i1(<16 x i8> %a, <16 x i8> %b) {
125+
; CHECK-LABEL: build_lower_v16i1:
126+
; CHECK: @ %bb.0: @ %entry
127+
; CHECK-NEXT: movs r0, #255
128+
; CHECK-NEXT: vmsr p0, r0
129+
; CHECK-NEXT: vpsel q0, q0, q1
130+
; CHECK-NEXT: bx lr
131+
entry:
132+
%s = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x i8> %a, <16 x i8> %b
133+
ret <16 x i8> %s
134+
}
135+
136+
137+
define arm_aapcs_vfpcc <2 x i64> @build_true_v2i1(<2 x i64> %a, <2 x i64> %b) {
138+
; CHECK-LABEL: build_true_v2i1:
139+
; CHECK: @ %bb.0: @ %entry
140+
; CHECK-NEXT: bx lr
141+
entry:
142+
%s = select <2 x i1> <i1 1, i1 1>, <2 x i64> %a, <2 x i64> %b
143+
ret <2 x i64> %s
144+
}
145+
146+
define arm_aapcs_vfpcc <2 x i64> @build_false_v2i1(<2 x i64> %a, <2 x i64> %b) {
147+
; CHECK-LABEL: build_false_v2i1:
148+
; CHECK: @ %bb.0: @ %entry
149+
; CHECK-NEXT: vmov q0, q1
150+
; CHECK-NEXT: bx lr
151+
entry:
152+
%s = select <2 x i1> <i1 0, i1 0>, <2 x i64> %a, <2 x i64> %b
153+
ret <2 x i64> %s
154+
}
155+
156+
define arm_aapcs_vfpcc <2 x i64> @build_upper_v2i1(<2 x i64> %a, <2 x i64> %b) {
157+
; CHECK-LABEL: build_upper_v2i1:
158+
; CHECK: @ %bb.0: @ %entry
159+
; CHECK-NEXT: adr r0, .LCPI14_0
160+
; CHECK-NEXT: vldrw.u32 q2, [r0]
161+
; CHECK-NEXT: vbic q1, q1, q2
162+
; CHECK-NEXT: vand q0, q0, q2
163+
; CHECK-NEXT: vorr q0, q0, q1
164+
; CHECK-NEXT: bx lr
165+
; CHECK-NEXT: .p2align 4
166+
; CHECK-NEXT: @ %bb.1:
167+
; CHECK-NEXT: .LCPI14_0:
168+
; CHECK-NEXT: .long 0 @ 0x0
169+
; CHECK-NEXT: .long 0 @ 0x0
170+
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
171+
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
172+
entry:
173+
%s = select <2 x i1> <i1 0, i1 1>, <2 x i64> %a, <2 x i64> %b
174+
ret <2 x i64> %s
175+
}
176+
177+
define arm_aapcs_vfpcc <2 x i64> @build_lower_v2i1(<2 x i64> %a, <2 x i64> %b) {
178+
; CHECK-LABEL: build_lower_v2i1:
179+
; CHECK: @ %bb.0: @ %entry
180+
; CHECK-NEXT: adr r0, .LCPI15_0
181+
; CHECK-NEXT: vldrw.u32 q2, [r0]
182+
; CHECK-NEXT: vbic q1, q1, q2
183+
; CHECK-NEXT: vand q0, q0, q2
184+
; CHECK-NEXT: vorr q0, q0, q1
185+
; CHECK-NEXT: bx lr
186+
; CHECK-NEXT: .p2align 4
187+
; CHECK-NEXT: @ %bb.1:
188+
; CHECK-NEXT: .LCPI15_0:
189+
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
190+
; CHECK-NEXT: .long 4294967295 @ 0xffffffff
191+
; CHECK-NEXT: .long 0 @ 0x0
192+
; CHECK-NEXT: .long 0 @ 0x0
193+
entry:
194+
%s = select <2 x i1> <i1 1, i1 0>, <2 x i64> %a, <2 x i64> %b
195+
ret <2 x i64> %s
196+
}

0 commit comments

Comments
 (0)