@@ -222,88 +222,88 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no
222
222
; CHECK-NEXT: vldrw.u32 q1, [r4]
223
223
; CHECK-NEXT: .LBB1_4: @ %vector.body
224
224
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
225
- ; CHECK-NEXT: vldrw.u32 q4 , [r5], #16
226
- ; CHECK-NEXT: vldrw.u32 q3 , [r0], #16
225
+ ; CHECK-NEXT: vldrw.u32 q3 , [r5], #16
226
+ ; CHECK-NEXT: vldrw.u32 q2 , [r0], #16
227
227
; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
228
228
; CHECK-NEXT: mov.w r2, #-1
229
- ; CHECK-NEXT: vmov.f32 s8, s14
229
+ ; CHECK-NEXT: vmov.f32 s16, s10
230
230
; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
231
- ; CHECK-NEXT: vmov.f32 s20, s18
231
+ ; CHECK-NEXT: vmov.f32 s20, s14
232
+ ; CHECK-NEXT: vmov.f32 s18, s11
233
+ ; CHECK-NEXT: vmov.f32 s22, s15
232
234
; CHECK-NEXT: mov.w r8, #0
233
- ; CHECK-NEXT: vmov.f32 s10, s15
234
- ; CHECK-NEXT: vmov.f32 s22, s19
235
- ; CHECK-NEXT: vmullb.s32 q6, q5, q2
236
- ; CHECK-NEXT: vmov.f32 s18, s17
235
+ ; CHECK-NEXT: vmullb.s32 q6, q5, q4
236
+ ; CHECK-NEXT: vmov.f32 s14, s13
237
237
; CHECK-NEXT: vmov r4, r7, d12
238
238
; CHECK-NEXT: asrl r4, r7, #31
239
- ; CHECK-NEXT: vmov.f32 s14, s13
239
+ ; CHECK-NEXT: vmov.f32 s10, s9
240
240
; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
241
241
; CHECK-NEXT: sbcs.w r5, r2, r7
242
242
; CHECK-NEXT: csetm r5, lt
243
243
; CHECK-NEXT: bfi r8, r5, #0, #8
244
244
; CHECK-NEXT: vmov r10, r5, d13
245
245
; CHECK-NEXT: asrl r10, r5, #31
246
- ; CHECK-NEXT: vmov r6, s18
246
+ ; CHECK-NEXT: vmov r6, s14
247
247
; CHECK-NEXT: rsbs.w r3, r10, #-2147483648
248
- ; CHECK-NEXT: vmov q2 [2], q2 [0], r4, r10
248
+ ; CHECK-NEXT: vmov q4 [2], q4 [0], r4, r10
249
249
; CHECK-NEXT: sbcs.w r3, r2, r5
250
- ; CHECK-NEXT: vmov q2 [3], q2 [1], r7, r5
250
+ ; CHECK-NEXT: vmov q4 [3], q4 [1], r7, r5
251
251
; CHECK-NEXT: csetm r3, lt
252
252
; CHECK-NEXT: bfi r8, r3, #8, #8
253
253
; CHECK-NEXT: vmsr p0, r8
254
254
; CHECK-NEXT: mvn r8, #-2147483648
255
- ; CHECK-NEXT: vpsel q2, q2 , q0
256
- ; CHECK-NEXT: vmov r3, r4, d4
255
+ ; CHECK-NEXT: vpsel q4, q4 , q0
256
+ ; CHECK-NEXT: vmov r3, r4, d8
257
257
; CHECK-NEXT: subs.w r3, r3, r8
258
258
; CHECK-NEXT: sbcs r3, r4, #0
259
259
; CHECK-NEXT: mov.w r4, #0
260
260
; CHECK-NEXT: csetm r3, lt
261
261
; CHECK-NEXT: bfi r4, r3, #0, #8
262
- ; CHECK-NEXT: vmov r3, r5, d5
262
+ ; CHECK-NEXT: vmov r3, r5, d9
263
263
; CHECK-NEXT: subs.w r3, r3, r8
264
264
; CHECK-NEXT: sbcs r3, r5, #0
265
265
; CHECK-NEXT: mov.w r5, #0
266
266
; CHECK-NEXT: csetm r3, lt
267
267
; CHECK-NEXT: bfi r4, r3, #8, #8
268
- ; CHECK-NEXT: vmov r3, s12
268
+ ; CHECK-NEXT: vmov r3, s8
269
269
; CHECK-NEXT: vmsr p0, r4
270
- ; CHECK-NEXT: vmov r4, s16
271
- ; CHECK-NEXT: vpsel q2, q2 , q1
270
+ ; CHECK-NEXT: vmov r4, s12
271
+ ; CHECK-NEXT: vpsel q4, q4 , q1
272
272
; CHECK-NEXT: smull r4, r7, r4, r3
273
273
; CHECK-NEXT: asrl r4, r7, #31
274
274
; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
275
275
; CHECK-NEXT: sbcs.w r3, r2, r7
276
276
; CHECK-NEXT: csetm r3, lt
277
277
; CHECK-NEXT: bfi r5, r3, #0, #8
278
- ; CHECK-NEXT: vmov r3, s14
278
+ ; CHECK-NEXT: vmov r3, s10
279
279
; CHECK-NEXT: smull r6, r3, r6, r3
280
280
; CHECK-NEXT: asrl r6, r3, #31
281
281
; CHECK-NEXT: rsbs.w r1, r6, #-2147483648
282
- ; CHECK-NEXT: vmov q3 [2], q3 [0], r4, r6
282
+ ; CHECK-NEXT: vmov q2 [2], q2 [0], r4, r6
283
283
; CHECK-NEXT: sbcs.w r1, r2, r3
284
- ; CHECK-NEXT: vmov q3 [3], q3 [1], r7, r3
284
+ ; CHECK-NEXT: vmov q2 [3], q2 [1], r7, r3
285
285
; CHECK-NEXT: csetm r1, lt
286
286
; CHECK-NEXT: bfi r5, r1, #8, #8
287
287
; CHECK-NEXT: vmsr p0, r5
288
288
; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload
289
- ; CHECK-NEXT: vpsel q3, q3 , q0
290
- ; CHECK-NEXT: vmov r1, r3, d6
289
+ ; CHECK-NEXT: vpsel q2, q2 , q0
290
+ ; CHECK-NEXT: vmov r1, r3, d4
291
291
; CHECK-NEXT: subs.w r1, r1, r8
292
292
; CHECK-NEXT: sbcs r1, r3, #0
293
293
; CHECK-NEXT: mov.w r3, #0
294
294
; CHECK-NEXT: csetm r1, lt
295
295
; CHECK-NEXT: bfi r3, r1, #0, #8
296
- ; CHECK-NEXT: vmov r1, r4, d7
296
+ ; CHECK-NEXT: vmov r1, r4, d5
297
297
; CHECK-NEXT: subs.w r1, r1, r8
298
298
; CHECK-NEXT: sbcs r1, r4, #0
299
299
; CHECK-NEXT: csetm r1, lt
300
300
; CHECK-NEXT: bfi r3, r1, #8, #8
301
301
; CHECK-NEXT: vmsr p0, r3
302
- ; CHECK-NEXT: vpsel q3, q3 , q1
303
- ; CHECK-NEXT: vmov.f32 s13, s14
304
- ; CHECK-NEXT: vmov.f32 s14, s8
305
- ; CHECK-NEXT: vmov.f32 s15, s10
306
- ; CHECK-NEXT: vstrb.8 q3 , [r2], #16
302
+ ; CHECK-NEXT: vpsel q2, q2 , q1
303
+ ; CHECK-NEXT: vmov.f32 s9, s10
304
+ ; CHECK-NEXT: vmov.f32 s10, s16
305
+ ; CHECK-NEXT: vmov.f32 s11, s18
306
+ ; CHECK-NEXT: vstrb.8 q2 , [r2], #16
307
307
; CHECK-NEXT: le lr, .LBB1_4
308
308
; CHECK-NEXT: @ %bb.5: @ %middle.block
309
309
; CHECK-NEXT: ldrd r1, r3, [sp] @ 8-byte Folded Reload
@@ -462,14 +462,14 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n
462
462
; CHECK-NEXT: vcmp.u32 cs, q1, q4
463
463
; CHECK-NEXT: vstr p0, [sp, #20] @ 4-byte Spill
464
464
; CHECK-NEXT: vpstt
465
- ; CHECK-NEXT: vldrwt.u32 q5 , [r0], #16
466
- ; CHECK-NEXT: vldrwt.u32 q6 , [r1], #16
467
- ; CHECK-NEXT: vmov.f32 s16, s22
468
- ; CHECK-NEXT: vmov.f32 s18, s23
469
- ; CHECK-NEXT: vmov.f32 s28, s26
470
- ; CHECK-NEXT: vmov.f32 s30, s27
471
- ; CHECK-NEXT: vmullb.s32 q0, q7, q4
472
- ; CHECK-NEXT: vmov.f32 s22, s25
465
+ ; CHECK-NEXT: vldrwt.u32 q4 , [r0], #16
466
+ ; CHECK-NEXT: vldrwt.u32 q5 , [r1], #16
467
+ ; CHECK-NEXT: vmov.f32 s24, s18
468
+ ; CHECK-NEXT: vmov.f32 s26, s19
469
+ ; CHECK-NEXT: vmov.f32 s28, s22
470
+ ; CHECK-NEXT: vmov.f32 s30, s23
471
+ ; CHECK-NEXT: vmullb.s32 q0, q7, q6
472
+ ; CHECK-NEXT: vmov.f32 s18, s21
473
473
; CHECK-NEXT: vmov r10, r5, d0
474
474
; CHECK-NEXT: asrl r10, r5, #31
475
475
; CHECK-NEXT: rsbs.w r7, r10, #-2147483648
@@ -483,7 +483,7 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n
483
483
; CHECK-NEXT: sbcs.w r3, r12, r7
484
484
; CHECK-NEXT: vmov q0[3], q0[1], r5, r7
485
485
; CHECK-NEXT: csetm r3, lt
486
- ; CHECK-NEXT: vmov r7, s22
486
+ ; CHECK-NEXT: vmov r7, s18
487
487
; CHECK-NEXT: bfi r4, r3, #8, #8
488
488
; CHECK-NEXT: vmsr p0, r4
489
489
; CHECK-NEXT: vpsel q0, q0, q2
@@ -498,11 +498,11 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n
498
498
; CHECK-NEXT: sbcs r3, r5, #0
499
499
; CHECK-NEXT: csetm r3, lt
500
500
; CHECK-NEXT: bfi r4, r3, #8, #8
501
- ; CHECK-NEXT: vmov r3, s20
501
+ ; CHECK-NEXT: vmov r3, s16
502
502
; CHECK-NEXT: vmsr p0, r4
503
- ; CHECK-NEXT: vmov r4, s24
504
- ; CHECK-NEXT: vpsel q4 , q0, q3
505
- ; CHECK-NEXT: vmov.f32 s2, s21
503
+ ; CHECK-NEXT: vmov r4, s20
504
+ ; CHECK-NEXT: vpsel q6 , q0, q3
505
+ ; CHECK-NEXT: vmov.f32 s2, s17
506
506
; CHECK-NEXT: smull r10, r5, r4, r3
507
507
; CHECK-NEXT: movs r4, #0
508
508
; CHECK-NEXT: asrl r10, r5, #31
@@ -536,8 +536,8 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n
536
536
; CHECK-NEXT: vpsel q0, q0, q3
537
537
; CHECK-NEXT: vldr p0, [sp, #20] @ 4-byte Reload
538
538
; CHECK-NEXT: vmov.f32 s1, s2
539
- ; CHECK-NEXT: vmov.f32 s2, s16
540
- ; CHECK-NEXT: vmov.f32 s3, s18
539
+ ; CHECK-NEXT: vmov.f32 s2, s24
540
+ ; CHECK-NEXT: vmov.f32 s3, s26
541
541
; CHECK-NEXT: vpst
542
542
; CHECK-NEXT: vstrwt.32 q0, [r2], #16
543
543
; CHECK-NEXT: le lr, .LBB2_2
@@ -778,34 +778,34 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no
778
778
; CHECK-NEXT: .LBB4_4: @ %vector.body
779
779
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
780
780
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
781
- ; CHECK-NEXT: vldrw.u32 q3 , [r1], #16
782
- ; CHECK-NEXT: vmov.f32 s8 , s6
783
- ; CHECK-NEXT: vmov.f32 s16, s14
784
- ; CHECK-NEXT: vmov.f32 s10 , s7
785
- ; CHECK-NEXT: vmov.f32 s18, s15
786
- ; CHECK-NEXT: vmullb.u32 q5, q4, q2
781
+ ; CHECK-NEXT: vldrw.u32 q2 , [r1], #16
782
+ ; CHECK-NEXT: vmov.f32 s12 , s6
783
+ ; CHECK-NEXT: vmov.f32 s16, s10
784
+ ; CHECK-NEXT: vmov.f32 s14 , s7
785
+ ; CHECK-NEXT: vmov.f32 s18, s11
786
+ ; CHECK-NEXT: vmullb.u32 q5, q4, q3
787
787
; CHECK-NEXT: vmov.f32 s6, s5
788
788
; CHECK-NEXT: vmov r10, r5, d10
789
789
; CHECK-NEXT: lsrl r10, r5, #31
790
- ; CHECK-NEXT: vmov.f32 s14, s13
790
+ ; CHECK-NEXT: vmov.f32 s10, s9
791
791
; CHECK-NEXT: subs.w r6, r10, #-1
792
- ; CHECK-NEXT: vmullb.u32 q4, q3, q1
793
792
; CHECK-NEXT: sbcs r5, r5, #0
794
793
; CHECK-NEXT: mov.w r6, #0
795
794
; CHECK-NEXT: csetm r5, lo
796
795
; CHECK-NEXT: bfi r6, r5, #0, #8
797
796
; CHECK-NEXT: vmov r4, r5, d11
798
797
; CHECK-NEXT: lsrl r4, r5, #31
799
798
; CHECK-NEXT: subs.w r7, r4, #-1
800
- ; CHECK-NEXT: vmov q2 [2], q2 [0], r10, r4
799
+ ; CHECK-NEXT: vmov q3 [2], q3 [0], r10, r4
801
800
; CHECK-NEXT: sbcs r5, r5, #0
802
801
; CHECK-NEXT: csetm r5, lo
803
802
; CHECK-NEXT: bfi r6, r5, #8, #8
803
+ ; CHECK-NEXT: vmsr p0, r6
804
+ ; CHECK-NEXT: vpsel q3, q3, q0
805
+ ; CHECK-NEXT: vmullb.u32 q4, q2, q1
804
806
; CHECK-NEXT: vmov r10, r5, d8
805
807
; CHECK-NEXT: lsrl r10, r5, #31
806
- ; CHECK-NEXT: vmsr p0, r6
807
808
; CHECK-NEXT: subs.w r6, r10, #-1
808
- ; CHECK-NEXT: vpsel q2, q2, q0
809
809
; CHECK-NEXT: sbcs r5, r5, #0
810
810
; CHECK-NEXT: mov.w r6, #0
811
811
; CHECK-NEXT: csetm r5, lo
@@ -820,8 +820,8 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no
820
820
; CHECK-NEXT: vmsr p0, r6
821
821
; CHECK-NEXT: vpsel q1, q1, q0
822
822
; CHECK-NEXT: vmov.f32 s5, s6
823
- ; CHECK-NEXT: vmov.f32 s6, s8
824
- ; CHECK-NEXT: vmov.f32 s7, s10
823
+ ; CHECK-NEXT: vmov.f32 s6, s12
824
+ ; CHECK-NEXT: vmov.f32 s7, s14
825
825
; CHECK-NEXT: vstrb.8 q1, [r2], #16
826
826
; CHECK-NEXT: le lr, .LBB4_4
827
827
; CHECK-NEXT: @ %bb.5: @ %middle.block
0 commit comments