Skip to content

Commit 734d113

Browse files
committed
[ARM] Remove reduce(shuffle) if all the lanes are used
This looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all the lanes are used once. Due to the reduction being commutative the shuffle can be removed. Differential Revision: https://reviews.llvm.org/D143382
1 parent 1ae4bd8 commit 734d113

File tree

2 files changed

+55
-126
lines changed

2 files changed

+55
-126
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17124,6 +17124,42 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
1712417124
return SDValue();
1712517125
}
1712617126

17127+
// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17128+
// the lanes are used. Due to the reduction being commutative the shuffle can be
17129+
// removed.
17130+
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17131+
unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17132+
auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17133+
if (!Shuf || !Shuf->getOperand(1).isUndef())
17134+
return SDValue();
17135+
17136+
// Check all elements are used once in the mask.
17137+
ArrayRef<int> Mask = Shuf->getMask();
17138+
APInt SetElts(Mask.size(), 0);
17139+
for (int E : Mask) {
17140+
if (E < 0 || E >= (int)Mask.size())
17141+
return SDValue();
17142+
SetElts |= 1 << E;
17143+
}
17144+
if (!SetElts.isAllOnes())
17145+
return SDValue();
17146+
17147+
if (N->getNumOperands() != VecOp + 1) {
17148+
auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17149+
if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17150+
return SDValue();
17151+
}
17152+
17153+
SmallVector<SDValue> Ops;
17154+
for (SDValue Op : N->ops()) {
17155+
if (Op.getValueType().isVector())
17156+
Ops.push_back(Op.getOperand(0));
17157+
else
17158+
Ops.push_back(Op);
17159+
}
17160+
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17161+
}
17162+
1712717163
static SDValue PerformVMOVNCombine(SDNode *N,
1712817164
TargetLowering::DAGCombinerInfo &DCI) {
1712917165
SDValue Op0 = N->getOperand(0);
@@ -18724,6 +18760,19 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
1872418760
return PerformVCMPCombine(N, DCI.DAG, Subtarget);
1872518761
case ISD::VECREDUCE_ADD:
1872618762
return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18763+
case ARMISD::VADDVs:
18764+
case ARMISD::VADDVu:
18765+
case ARMISD::VADDLVs:
18766+
case ARMISD::VADDLVu:
18767+
case ARMISD::VADDLVAs:
18768+
case ARMISD::VADDLVAu:
18769+
case ARMISD::VMLAVs:
18770+
case ARMISD::VMLAVu:
18771+
case ARMISD::VMLALVs:
18772+
case ARMISD::VMLALVu:
18773+
case ARMISD::VMLALVAs:
18774+
case ARMISD::VMLALVAu:
18775+
return PerformReduceShuffleCombine(N, DCI.DAG);
1872718776
case ARMISD::VMOVN:
1872818777
return PerformVMOVNCombine(N, DCI);
1872918778
case ARMISD::VQMOVNs:

llvm/test/CodeGen/Thumb2/mve-vecreduce-add-combine.ll

Lines changed: 6 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -103,39 +103,7 @@ entry:
103103
define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8(<16 x i8> %s0) {
104104
; CHECK-LABEL: vaddv_shuffle_v16i8:
105105
; CHECK: @ %bb.0: @ %entry
106-
; CHECK-NEXT: vmov.u8 r0, q0[0]
107-
; CHECK-NEXT: vmov.8 q1[0], r0
108-
; CHECK-NEXT: vmov.u8 r0, q0[2]
109-
; CHECK-NEXT: vmov.8 q1[1], r0
110-
; CHECK-NEXT: vmov.u8 r0, q0[4]
111-
; CHECK-NEXT: vmov.8 q1[2], r0
112-
; CHECK-NEXT: vmov.u8 r0, q0[6]
113-
; CHECK-NEXT: vmov.8 q1[3], r0
114-
; CHECK-NEXT: vmov.u8 r0, q0[8]
115-
; CHECK-NEXT: vmov.8 q1[4], r0
116-
; CHECK-NEXT: vmov.u8 r0, q0[10]
117-
; CHECK-NEXT: vmov.8 q1[5], r0
118-
; CHECK-NEXT: vmov.u8 r0, q0[12]
119-
; CHECK-NEXT: vmov.8 q1[6], r0
120-
; CHECK-NEXT: vmov.u8 r0, q0[14]
121-
; CHECK-NEXT: vmov.8 q1[7], r0
122-
; CHECK-NEXT: vmov.u8 r0, q0[1]
123-
; CHECK-NEXT: vmov.8 q1[8], r0
124-
; CHECK-NEXT: vmov.u8 r0, q0[3]
125-
; CHECK-NEXT: vmov.8 q1[9], r0
126-
; CHECK-NEXT: vmov.u8 r0, q0[5]
127-
; CHECK-NEXT: vmov.8 q1[10], r0
128-
; CHECK-NEXT: vmov.u8 r0, q0[7]
129-
; CHECK-NEXT: vmov.8 q1[11], r0
130-
; CHECK-NEXT: vmov.u8 r0, q0[9]
131-
; CHECK-NEXT: vmov.8 q1[12], r0
132-
; CHECK-NEXT: vmov.u8 r0, q0[11]
133-
; CHECK-NEXT: vmov.8 q1[13], r0
134-
; CHECK-NEXT: vmov.u8 r0, q0[13]
135-
; CHECK-NEXT: vmov.8 q1[14], r0
136-
; CHECK-NEXT: vmov.u8 r0, q0[15]
137-
; CHECK-NEXT: vmov.8 q1[15], r0
138-
; CHECK-NEXT: vaddv.u8 r0, q1
106+
; CHECK-NEXT: vaddv.u8 r0, q0
139107
; CHECK-NEXT: bx lr
140108
entry:
141109
%s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -232,11 +200,7 @@ entry:
232200
define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long(<4 x i32> %s0) {
233201
; CHECK-LABEL: vaddv_shuffle_v4i32_long:
234202
; CHECK: @ %bb.0: @ %entry
235-
; CHECK-NEXT: vmov.f32 s4, s3
236-
; CHECK-NEXT: vmov.f32 s5, s2
237-
; CHECK-NEXT: vmov.f32 s6, s1
238-
; CHECK-NEXT: vmov.f32 s7, s0
239-
; CHECK-NEXT: vaddlv.u32 r0, r1, q1
203+
; CHECK-NEXT: vaddlv.u32 r0, r1, q0
240204
; CHECK-NEXT: bx lr
241205
entry:
242206
%s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -248,11 +212,7 @@ entry:
248212
define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long_a(<4 x i32> %s0, i64 %a) {
249213
; CHECK-LABEL: vaddv_shuffle_v4i32_long_a:
250214
; CHECK: @ %bb.0: @ %entry
251-
; CHECK-NEXT: vmov.f32 s4, s3
252-
; CHECK-NEXT: vmov.f32 s5, s2
253-
; CHECK-NEXT: vmov.f32 s6, s1
254-
; CHECK-NEXT: vmov.f32 s7, s0
255-
; CHECK-NEXT: vaddlva.u32 r0, r1, q1
215+
; CHECK-NEXT: vaddlva.u32 r0, r1, q0
256216
; CHECK-NEXT: bx lr
257217
entry:
258218
%s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -265,71 +225,7 @@ entry:
265225
define arm_aapcs_vfpcc i16 @vmla_shuffle_v16i8(<16 x i8> %s0, <16 x i8> %s0b) {
266226
; CHECK-LABEL: vmla_shuffle_v16i8:
267227
; CHECK: @ %bb.0: @ %entry
268-
; CHECK-NEXT: vmov.u8 r0, q1[0]
269-
; CHECK-NEXT: vmov.8 q2[0], r0
270-
; CHECK-NEXT: vmov.u8 r0, q1[2]
271-
; CHECK-NEXT: vmov.8 q2[1], r0
272-
; CHECK-NEXT: vmov.u8 r0, q1[4]
273-
; CHECK-NEXT: vmov.8 q2[2], r0
274-
; CHECK-NEXT: vmov.u8 r0, q1[6]
275-
; CHECK-NEXT: vmov.8 q2[3], r0
276-
; CHECK-NEXT: vmov.u8 r0, q1[8]
277-
; CHECK-NEXT: vmov.8 q2[4], r0
278-
; CHECK-NEXT: vmov.u8 r0, q1[10]
279-
; CHECK-NEXT: vmov.8 q2[5], r0
280-
; CHECK-NEXT: vmov.u8 r0, q1[12]
281-
; CHECK-NEXT: vmov.8 q2[6], r0
282-
; CHECK-NEXT: vmov.u8 r0, q1[14]
283-
; CHECK-NEXT: vmov.8 q2[7], r0
284-
; CHECK-NEXT: vmov.u8 r0, q1[1]
285-
; CHECK-NEXT: vmov.8 q2[8], r0
286-
; CHECK-NEXT: vmov.u8 r0, q1[3]
287-
; CHECK-NEXT: vmov.8 q2[9], r0
288-
; CHECK-NEXT: vmov.u8 r0, q1[5]
289-
; CHECK-NEXT: vmov.8 q2[10], r0
290-
; CHECK-NEXT: vmov.u8 r0, q1[7]
291-
; CHECK-NEXT: vmov.8 q2[11], r0
292-
; CHECK-NEXT: vmov.u8 r0, q1[9]
293-
; CHECK-NEXT: vmov.8 q2[12], r0
294-
; CHECK-NEXT: vmov.u8 r0, q1[11]
295-
; CHECK-NEXT: vmov.8 q2[13], r0
296-
; CHECK-NEXT: vmov.u8 r0, q1[13]
297-
; CHECK-NEXT: vmov.8 q2[14], r0
298-
; CHECK-NEXT: vmov.u8 r0, q1[15]
299-
; CHECK-NEXT: vmov.8 q2[15], r0
300-
; CHECK-NEXT: vmov.u8 r0, q0[0]
301-
; CHECK-NEXT: vmov.8 q1[0], r0
302-
; CHECK-NEXT: vmov.u8 r0, q0[2]
303-
; CHECK-NEXT: vmov.8 q1[1], r0
304-
; CHECK-NEXT: vmov.u8 r0, q0[4]
305-
; CHECK-NEXT: vmov.8 q1[2], r0
306-
; CHECK-NEXT: vmov.u8 r0, q0[6]
307-
; CHECK-NEXT: vmov.8 q1[3], r0
308-
; CHECK-NEXT: vmov.u8 r0, q0[8]
309-
; CHECK-NEXT: vmov.8 q1[4], r0
310-
; CHECK-NEXT: vmov.u8 r0, q0[10]
311-
; CHECK-NEXT: vmov.8 q1[5], r0
312-
; CHECK-NEXT: vmov.u8 r0, q0[12]
313-
; CHECK-NEXT: vmov.8 q1[6], r0
314-
; CHECK-NEXT: vmov.u8 r0, q0[14]
315-
; CHECK-NEXT: vmov.8 q1[7], r0
316-
; CHECK-NEXT: vmov.u8 r0, q0[1]
317-
; CHECK-NEXT: vmov.8 q1[8], r0
318-
; CHECK-NEXT: vmov.u8 r0, q0[3]
319-
; CHECK-NEXT: vmov.8 q1[9], r0
320-
; CHECK-NEXT: vmov.u8 r0, q0[5]
321-
; CHECK-NEXT: vmov.8 q1[10], r0
322-
; CHECK-NEXT: vmov.u8 r0, q0[7]
323-
; CHECK-NEXT: vmov.8 q1[11], r0
324-
; CHECK-NEXT: vmov.u8 r0, q0[9]
325-
; CHECK-NEXT: vmov.8 q1[12], r0
326-
; CHECK-NEXT: vmov.u8 r0, q0[11]
327-
; CHECK-NEXT: vmov.8 q1[13], r0
328-
; CHECK-NEXT: vmov.u8 r0, q0[13]
329-
; CHECK-NEXT: vmov.8 q1[14], r0
330-
; CHECK-NEXT: vmov.u8 r0, q0[15]
331-
; CHECK-NEXT: vmov.8 q1[15], r0
332-
; CHECK-NEXT: vmlav.s8 r0, q1, q2
228+
; CHECK-NEXT: vmlav.s8 r0, q0, q1
333229
; CHECK-NEXT: bx lr
334230
entry:
335231
%s2a = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -423,15 +319,7 @@ entry:
423319
define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long(<4 x i32> %s0, <4 x i32> %s0b) {
424320
; CHECK-LABEL: vmla_shuffle_v4i32_long:
425321
; CHECK: @ %bb.0: @ %entry
426-
; CHECK-NEXT: vmov.f32 s8, s7
427-
; CHECK-NEXT: vmov.f32 s9, s6
428-
; CHECK-NEXT: vmov.f32 s10, s5
429-
; CHECK-NEXT: vmov.f32 s11, s4
430-
; CHECK-NEXT: vmov.f32 s4, s3
431-
; CHECK-NEXT: vmov.f32 s5, s2
432-
; CHECK-NEXT: vmov.f32 s6, s1
433-
; CHECK-NEXT: vmov.f32 s7, s0
434-
; CHECK-NEXT: vmlalv.u32 r0, r1, q1, q2
322+
; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
435323
; CHECK-NEXT: bx lr
436324
entry:
437325
%s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -446,15 +334,7 @@ entry:
446334
define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long_a(<4 x i32> %s0, <4 x i32> %s0b, i64 %a) {
447335
; CHECK-LABEL: vmla_shuffle_v4i32_long_a:
448336
; CHECK: @ %bb.0: @ %entry
449-
; CHECK-NEXT: vmov.f32 s8, s7
450-
; CHECK-NEXT: vmov.f32 s9, s6
451-
; CHECK-NEXT: vmov.f32 s10, s5
452-
; CHECK-NEXT: vmov.f32 s11, s4
453-
; CHECK-NEXT: vmov.f32 s4, s3
454-
; CHECK-NEXT: vmov.f32 s5, s2
455-
; CHECK-NEXT: vmov.f32 s6, s1
456-
; CHECK-NEXT: vmov.f32 s7, s0
457-
; CHECK-NEXT: vmlalva.u32 r0, r1, q1, q2
337+
; CHECK-NEXT: vmlalva.u32 r0, r1, q0, q1
458338
; CHECK-NEXT: bx lr
459339
entry:
460340
%s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>

0 commit comments

Comments
 (0)