@@ -87,6 +87,77 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
87
87
ret <4 x i32 > %e
88
88
}
89
89
90
; Loads a <3 x i8> that is only 2-byte aligned, moves its three lanes into the
; low lanes of a <4 x i8> and zero-extends the result to <4 x i32>.
; The assertions below expect the load to be split into a halfword load of
; bytes 0-1 (via a stack slot) and a separate byte load at offset 2.
; The unused fourth shuffle lane is spelled `poison` rather than the
; deprecated `undef`; both produce the same shuffle mask.
+ define <4 x i32 > @load_v3i8_to_4xi32_align_2 (ptr %src ) {
91
+ ; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
92
+ ; CHECK: ; %bb.0:
93
+ ; CHECK-NEXT: sub sp, sp, #16
94
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
95
+ ; CHECK-NEXT: ldrh w8, [x0]
96
+ ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
97
+ ; CHECK-NEXT: strh w8, [sp, #12]
98
+ ; CHECK-NEXT: ldr s0, [sp, #12]
99
+ ; CHECK-NEXT: ldrsb w8, [x0, #2]
100
+ ; CHECK-NEXT: ushll.8h v0, v0, #0
101
+ ; CHECK-NEXT: mov.h v0[1], v0[1]
102
+ ; CHECK-NEXT: mov.h v0[2], w8
103
+ ; CHECK-NEXT: ushll.4s v0, v0, #0
104
+ ; CHECK-NEXT: and.16b v0, v0, v1
105
+ ; CHECK-NEXT: add sp, sp, #16
106
+ ; CHECK-NEXT: ret
107
+ ;
108
+ ; BE-LABEL: load_v3i8_to_4xi32_align_2:
109
+ ; BE: // %bb.0:
110
+ ; BE-NEXT: sub sp, sp, #16
111
+ ; BE-NEXT: .cfi_def_cfa_offset 16
112
+ ; BE-NEXT: ldrh w8, [x0]
113
+ ; BE-NEXT: movi v1.2d, #0x0000ff000000ff
114
+ ; BE-NEXT: strh w8, [sp, #12]
115
+ ; BE-NEXT: ldr s0, [sp, #12]
116
+ ; BE-NEXT: ldrsb w8, [x0, #2]
117
+ ; BE-NEXT: rev32 v0.8b, v0.8b
118
+ ; BE-NEXT: ushll v0.8h, v0.8b, #0
119
+ ; BE-NEXT: mov v0.h[1], v0.h[1]
120
+ ; BE-NEXT: mov v0.h[2], w8
121
+ ; BE-NEXT: ushll v0.4s, v0.4h, #0
122
+ ; BE-NEXT: and v0.16b, v0.16b, v1.16b
123
+ ; BE-NEXT: rev64 v0.4s, v0.4s
124
+ ; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
125
+ ; BE-NEXT: add sp, sp, #16
126
+ ; BE-NEXT: ret
127
+ %l = load <3 x i8 >, ptr %src , align 2 ; only 2-byte alignment is promised
128
+ %s = shufflevector <3 x i8 > poison, <3 x i8 > %l , <4 x i32 > <i32 3 , i32 4 , i32 5 , i32 poison > ; indices 3-5 select lanes 0-2 of %l; lane 3 is unspecified
129
+ %e = zext <4 x i8 > %s to <4 x i32 >
130
+ ret <4 x i32 > %e
131
+ }
132
+
133
; Loads a <3 x i8> that is 4-byte aligned, moves its three lanes into the low
; lanes of a <4 x i8> and zero-extends the result to <4 x i32>.
; The assertions below expect a single 32-bit `ldr s0, [x0]` for the load,
; with no stack slot.
; The unused fourth shuffle lane is spelled `poison` rather than the
; deprecated `undef`; both produce the same shuffle mask.
+ define <4 x i32 > @load_v3i8_to_4xi32_align_4 (ptr %src ) {
134
+ ; CHECK-LABEL: load_v3i8_to_4xi32_align_4:
135
+ ; CHECK: ; %bb.0:
136
+ ; CHECK-NEXT: ldr s0, [x0]
137
+ ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
138
+ ; CHECK-NEXT: zip1.8b v0, v0, v0
139
+ ; CHECK-NEXT: ushll.4s v0, v0, #0
140
+ ; CHECK-NEXT: and.16b v0, v0, v1
141
+ ; CHECK-NEXT: ret
142
+ ;
143
+ ; BE-LABEL: load_v3i8_to_4xi32_align_4:
144
+ ; BE: // %bb.0:
145
+ ; BE-NEXT: ldr s0, [x0]
146
+ ; BE-NEXT: movi v1.2d, #0x0000ff000000ff
147
+ ; BE-NEXT: rev32 v0.8b, v0.8b
148
+ ; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
149
+ ; BE-NEXT: rev16 v0.8b, v0.8b
150
+ ; BE-NEXT: ushll v0.4s, v0.4h, #0
151
+ ; BE-NEXT: and v0.16b, v0.16b, v1.16b
152
+ ; BE-NEXT: rev64 v0.4s, v0.4s
153
+ ; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
154
+ ; BE-NEXT: ret
155
+ %l = load <3 x i8 >, ptr %src , align 4 ; 4-byte alignment is promised
156
+ %s = shufflevector <3 x i8 > poison, <3 x i8 > %l , <4 x i32 > <i32 3 , i32 4 , i32 5 , i32 poison > ; indices 3-5 select lanes 0-2 of %l; lane 3 is unspecified
157
+ %e = zext <4 x i8 > %s to <4 x i32 >
158
+ ret <4 x i32 > %e
159
+ }
160
+
90
161
define <4 x i32 > @load_v3i8_to_4xi32_const_offset_1 (ptr %src ) {
91
162
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
92
163
; CHECK: ; %bb.0:
@@ -176,6 +247,42 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
176
247
}
177
248
178
249
define <4 x i32 > @volatile_load_v3i8_to_4xi32 (ptr %src ) {
179
286
; CHECK-LABEL: volatile_load_v3i8_to_4xi32:
180
287
; CHECK: ; %bb.0:
181
288
; CHECK-NEXT: sub sp, sp, #16
@@ -221,8 +328,8 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
221
328
define <3 x i32 > @load_v3i32 (ptr %src ) {
222
329
; CHECK-LABEL: load_v3i32:
223
330
; CHECK: ; %bb.0:
224
- ; CHECK-NEXT: ldr d0, [x0]
225
331
; CHECK-NEXT: add x8, x0, #8
332
+ ; CHECK-NEXT: ldr d0, [x0]
226
333
; CHECK-NEXT: ld1.s { v0 }[2], [x8]
227
334
; CHECK-NEXT: ret
228
335
;
@@ -283,13 +390,13 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
283
390
; CHECK: ; %bb.0: ; %entry
284
391
; CHECK-NEXT: sub sp, sp, #16
285
392
; CHECK-NEXT: .cfi_def_cfa_offset 16
393
+ ; CHECK-NEXT: add x8, x0, #4
286
394
; CHECK-NEXT: ldr s0, [x0]
287
- ; CHECK-NEXT: add x9, x0, #4
288
395
; CHECK-NEXT: Lloh0:
289
- ; CHECK-NEXT: adrp x8, lCPI7_0@PAGE
396
+ ; CHECK-NEXT: adrp x9, lCPI9_0@PAGE
397
+ ; CHECK-NEXT: ld1.h { v0 }[2], [x8]
290
398
; CHECK-NEXT: Lloh1:
291
- ; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF]
292
- ; CHECK-NEXT: ld1.h { v0 }[2], [x9]
399
+ ; CHECK-NEXT: ldr d1, [x9, lCPI9_0@PAGEOFF]
293
400
; CHECK-NEXT: add.4h v0, v0, v1
294
401
; CHECK-NEXT: xtn.8b v1, v0
295
402
; CHECK-NEXT: umov.h w8, v0[2]
@@ -307,11 +414,11 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
307
414
; BE-NEXT: .cfi_def_cfa_offset 16
308
415
; BE-NEXT: ldr s0, [x0]
309
416
; BE-NEXT: add x8, x0, #4
417
+ ; BE-NEXT: adrp x9, .LCPI9_0
418
+ ; BE-NEXT: add x9, x9, :lo12:.LCPI9_0
310
419
; BE-NEXT: rev32 v0.4h, v0.4h
420
+ ; BE-NEXT: ld1 { v1.4h }, [x9]
311
421
; BE-NEXT: ld1 { v0.h }[2], [x8]
312
- ; BE-NEXT: adrp x8, .LCPI7_0
313
- ; BE-NEXT: add x8, x8, :lo12:.LCPI7_0
314
- ; BE-NEXT: ld1 { v1.4h }, [x8]
315
422
; BE-NEXT: add v0.4h, v0.4h, v1.4h
316
423
; BE-NEXT: xtn v1.8b, v0.8h
317
424
; BE-NEXT: umov w8, v0.h[2]
@@ -373,22 +480,82 @@ entry:
373
480
ret void
374
481
}
375
482
483
; Zero-extends a <3 x i8> loaded from %src to <3 x i16> and stores it to %dst.
; The load carries no explicit alignment; the store is marked align 1.
; The assertions below expect one 32-bit `ldr s0` for the load, and the result
; written as a 32-bit `str` at %dst plus a lane-2 halfword `st1` at %dst + 4.
+ define void @load_ext_to_64bits_default_align (ptr %src , ptr %dst ) {
484
+ ; CHECK-LABEL: load_ext_to_64bits_default_align:
485
+ ; CHECK: ; %bb.0: ; %entry
486
+ ; CHECK-NEXT: ldr s0, [x0]
487
+ ; CHECK-NEXT: add x8, x1, #4
488
+ ; CHECK-NEXT: zip1.8b v0, v0, v0
489
+ ; CHECK-NEXT: bic.4h v0, #255, lsl #8
490
+ ; CHECK-NEXT: st1.h { v0 }[2], [x8]
491
+ ; CHECK-NEXT: str s0, [x1]
492
+ ; CHECK-NEXT: ret
493
+ ;
494
+ ; BE-LABEL: load_ext_to_64bits_default_align:
495
+ ; BE: // %bb.0: // %entry
496
+ ; BE-NEXT: ldr s0, [x0]
497
+ ; BE-NEXT: add x8, x1, #4
498
+ ; BE-NEXT: rev32 v0.8b, v0.8b
499
+ ; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
500
+ ; BE-NEXT: rev16 v0.8b, v0.8b
501
+ ; BE-NEXT: bic v0.4h, #255, lsl #8
502
+ ; BE-NEXT: rev32 v1.8h, v0.8h
503
+ ; BE-NEXT: st1 { v0.h }[2], [x8]
504
+ ; BE-NEXT: str s1, [x1]
505
+ ; BE-NEXT: ret
506
+ entry:
507
+ %l = load <3 x i8 >, ptr %src ; no explicit alignment on the load
508
+ %e = zext <3 x i8 > %l to <3 x i16 >
509
+ store <3 x i16 > %e , ptr %dst , align 1 ; destination is only byte-aligned
510
+ ret void
511
+ }
512
+
513
; Zero-extends a <3 x i8> loaded from %src to <3 x i16> and stores it to %dst.
; The load is marked align 4; the store is marked align 1.
; The assertions below expect one 32-bit `ldr s0` for the load, and the result
; written as a 32-bit `str` at %dst plus a lane-2 halfword `st1` at %dst + 4.
+ define void @load_ext_to_64bits_align_4 (ptr %src , ptr %dst ) {
514
+ ; CHECK-LABEL: load_ext_to_64bits_align_4:
515
+ ; CHECK: ; %bb.0: ; %entry
516
+ ; CHECK-NEXT: ldr s0, [x0]
517
+ ; CHECK-NEXT: add x8, x1, #4
518
+ ; CHECK-NEXT: zip1.8b v0, v0, v0
519
+ ; CHECK-NEXT: bic.4h v0, #255, lsl #8
520
+ ; CHECK-NEXT: st1.h { v0 }[2], [x8]
521
+ ; CHECK-NEXT: str s0, [x1]
522
+ ; CHECK-NEXT: ret
523
+ ;
524
+ ; BE-LABEL: load_ext_to_64bits_align_4:
525
+ ; BE: // %bb.0: // %entry
526
+ ; BE-NEXT: ldr s0, [x0]
527
+ ; BE-NEXT: add x8, x1, #4
528
+ ; BE-NEXT: rev32 v0.8b, v0.8b
529
+ ; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
530
+ ; BE-NEXT: rev16 v0.8b, v0.8b
531
+ ; BE-NEXT: bic v0.4h, #255, lsl #8
532
+ ; BE-NEXT: rev32 v1.8h, v0.8h
533
+ ; BE-NEXT: st1 { v0.h }[2], [x8]
534
+ ; BE-NEXT: str s1, [x1]
535
+ ; BE-NEXT: ret
536
+ entry:
537
+ %l = load <3 x i8 >, ptr %src , align 4 ; 4-byte alignment is promised
538
+ %e = zext <3 x i8 > %l to <3 x i16 >
539
+ store <3 x i16 > %e , ptr %dst , align 1 ; destination is only byte-aligned
540
+ ret void
541
+ }
542
+
376
543
define void @load_ext_add_to_64bits (ptr %src , ptr %dst ) {
377
544
; CHECK-LABEL: load_ext_add_to_64bits:
378
545
; CHECK: ; %bb.0: ; %entry
379
546
; CHECK-NEXT: sub sp, sp, #16
380
547
; CHECK-NEXT: .cfi_def_cfa_offset 16
381
- ; CHECK-NEXT: ldrh w9 , [x0]
548
+ ; CHECK-NEXT: ldrh w8 , [x0]
382
549
; CHECK-NEXT: Lloh2:
383
- ; CHECK-NEXT: adrp x8, lCPI9_0@PAGE
384
- ; CHECK-NEXT: Lloh3:
385
- ; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF]
386
- ; CHECK-NEXT: add x8, x1, #4
387
- ; CHECK-NEXT: strh w9, [sp, #12]
388
- ; CHECK-NEXT: add x9, x0, #2
550
+ ; CHECK-NEXT: adrp x9, lCPI13_0@PAGE
551
+ ; CHECK-NEXT: strh w8, [sp, #12]
552
+ ; CHECK-NEXT: add x8, x0, #2
389
553
; CHECK-NEXT: ldr s0, [sp, #12]
554
+ ; CHECK-NEXT: Lloh3:
555
+ ; CHECK-NEXT: ldr d1, [x9, lCPI13_0@PAGEOFF]
390
556
; CHECK-NEXT: ushll.8h v0, v0, #0
391
- ; CHECK-NEXT: ld1.b { v0 }[4], [x9]
557
+ ; CHECK-NEXT: ld1.b { v0 }[4], [x8]
558
+ ; CHECK-NEXT: add x8, x1, #4
392
559
; CHECK-NEXT: bic.4h v0, #255, lsl #8
393
560
; CHECK-NEXT: add.4h v0, v0, v1
394
561
; CHECK-NEXT: st1.h { v0 }[2], [x8]
@@ -408,11 +575,11 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
408
575
; BE-NEXT: rev32 v0.8b, v0.8b
409
576
; BE-NEXT: ushll v0.8h, v0.8b, #0
410
577
; BE-NEXT: ld1 { v0.b }[4], [x8]
411
- ; BE-NEXT: adrp x8, .LCPI9_0
412
- ; BE-NEXT: add x8, x8, :lo12:.LCPI9_0
578
+ ; BE-NEXT: adrp x8, .LCPI13_0
579
+ ; BE-NEXT: add x8, x8, :lo12:.LCPI13_0
413
580
; BE-NEXT: ld1 { v1.4h }, [x8]
414
- ; BE-NEXT: add x8, x1, #4
415
581
; BE-NEXT: bic v0.4h, #255, lsl #8
582
+ ; BE-NEXT: add x8, x1, #4
416
583
; BE-NEXT: add v0.4h, v0.4h, v1.4h
417
584
; BE-NEXT: rev32 v1.8h, v0.8h
418
585
; BE-NEXT: st1 { v0.h }[2], [x8]
@@ -465,6 +632,82 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
465
632
ret void
466
633
}
467
634
635
; Shifts every lane of a <3 x i32> right by 16, truncates to <3 x i8> and
; stores the result to %dst. Neither the load nor the store carries an
; explicit alignment.
; The assertions below expect the narrowed vector to go through a 16-byte
; stack slot and to be written as a halfword at %dst plus a byte at %dst + 2.
+ define void @shift_trunc_store_default_align (ptr %src , ptr %dst ) {
636
+ ; CHECK-LABEL: shift_trunc_store_default_align:
637
+ ; CHECK: ; %bb.0:
638
+ ; CHECK-NEXT: sub sp, sp, #16
639
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
640
+ ; CHECK-NEXT: ldr q0, [x0]
641
+ ; CHECK-NEXT: shrn.4h v0, v0, #16
642
+ ; CHECK-NEXT: xtn.8b v1, v0
643
+ ; CHECK-NEXT: umov.h w8, v0[2]
644
+ ; CHECK-NEXT: str s1, [sp, #12]
645
+ ; CHECK-NEXT: ldrh w9, [sp, #12]
646
+ ; CHECK-NEXT: strb w8, [x1, #2]
647
+ ; CHECK-NEXT: strh w9, [x1]
648
+ ; CHECK-NEXT: add sp, sp, #16
649
+ ; CHECK-NEXT: ret
650
+ ;
651
+ ; BE-LABEL: shift_trunc_store_default_align:
652
+ ; BE: // %bb.0:
653
+ ; BE-NEXT: sub sp, sp, #16
654
+ ; BE-NEXT: .cfi_def_cfa_offset 16
655
+ ; BE-NEXT: ld1 { v0.4s }, [x0]
656
+ ; BE-NEXT: shrn v0.4h, v0.4s, #16
657
+ ; BE-NEXT: xtn v1.8b, v0.8h
658
+ ; BE-NEXT: umov w8, v0.h[2]
659
+ ; BE-NEXT: rev32 v1.16b, v1.16b
660
+ ; BE-NEXT: str s1, [sp, #12]
661
+ ; BE-NEXT: ldrh w9, [sp, #12]
662
+ ; BE-NEXT: strb w8, [x1, #2]
663
+ ; BE-NEXT: strh w9, [x1]
664
+ ; BE-NEXT: add sp, sp, #16
665
+ ; BE-NEXT: ret
666
+ %l = load <3 x i32 >, ptr %src
667
+ %s = lshr <3 x i32 > %l , <i32 16 , i32 16 , i32 16 >
668
+ %t = trunc <3 x i32 > %s to <3 x i8 >
669
+ store <3 x i8 > %t , ptr %dst ; no explicit alignment on the store
670
+ ret void
671
+ }
672
+
673
; Shifts every lane of a <3 x i32> right by 16, truncates to <3 x i8> and
; stores the result to %dst with align 4. The load carries no explicit
; alignment; only the store is annotated.
; The assertions below expect the narrowed vector to go through a 16-byte
; stack slot and to be written as a halfword at %dst plus a byte at %dst + 2.
+ define void @shift_trunc_store_align_4 (ptr %src , ptr %dst ) {
674
+ ; CHECK-LABEL: shift_trunc_store_align_4:
675
+ ; CHECK: ; %bb.0:
676
+ ; CHECK-NEXT: sub sp, sp, #16
677
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
678
+ ; CHECK-NEXT: ldr q0, [x0]
679
+ ; CHECK-NEXT: shrn.4h v0, v0, #16
680
+ ; CHECK-NEXT: xtn.8b v1, v0
681
+ ; CHECK-NEXT: umov.h w8, v0[2]
682
+ ; CHECK-NEXT: str s1, [sp, #12]
683
+ ; CHECK-NEXT: ldrh w9, [sp, #12]
684
+ ; CHECK-NEXT: strb w8, [x1, #2]
685
+ ; CHECK-NEXT: strh w9, [x1]
686
+ ; CHECK-NEXT: add sp, sp, #16
687
+ ; CHECK-NEXT: ret
688
+ ;
689
+ ; BE-LABEL: shift_trunc_store_align_4:
690
+ ; BE: // %bb.0:
691
+ ; BE-NEXT: sub sp, sp, #16
692
+ ; BE-NEXT: .cfi_def_cfa_offset 16
693
+ ; BE-NEXT: ld1 { v0.4s }, [x0]
694
+ ; BE-NEXT: shrn v0.4h, v0.4s, #16
695
+ ; BE-NEXT: xtn v1.8b, v0.8h
696
+ ; BE-NEXT: umov w8, v0.h[2]
697
+ ; BE-NEXT: rev32 v1.16b, v1.16b
698
+ ; BE-NEXT: str s1, [sp, #12]
699
+ ; BE-NEXT: ldrh w9, [sp, #12]
700
+ ; BE-NEXT: strb w8, [x1, #2]
701
+ ; BE-NEXT: strh w9, [x1]
702
+ ; BE-NEXT: add sp, sp, #16
703
+ ; BE-NEXT: ret
704
+ %l = load <3 x i32 >, ptr %src
705
+ %s = lshr <3 x i32 > %l , <i32 16 , i32 16 , i32 16 >
706
+ %t = trunc <3 x i32 > %s to <3 x i8 >
707
+ store <3 x i8 > %t , ptr %dst , align 4 ; 4-byte alignment is promised for the destination
708
+ ret void
709
+ }
710
+
468
711
define void @shift_trunc_store_const_offset_1 (ptr %src , ptr %dst ) {
469
712
; CHECK-LABEL: shift_trunc_store_const_offset_1:
470
713
; CHECK: ; %bb.0:
0 commit comments