@@ -14,24 +14,22 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
14
14
; CHECK-NEXT: // %bb.2: // %for.body.us.preheader
15
15
; CHECK-NEXT: ptrue p0.h
16
16
; CHECK-NEXT: add x11, x2, x11, lsl #1
17
- ; CHECK-NEXT: mov x12, #-16 // =0xfffffffffffffff0
18
- ; CHECK-NEXT: ptrue p1.b
19
17
; CHECK-NEXT: mov w8, wzr
18
+ ; CHECK-NEXT: ptrue p1.b
20
19
; CHECK-NEXT: mov x9, xzr
21
20
; CHECK-NEXT: mov w10, wzr
22
- ; CHECK-NEXT: addvl x12, x12, #1
23
- ; CHECK-NEXT: mov x13, #4 // =0x4
24
- ; CHECK-NEXT: mov x14, #8 // =0x8
21
+ ; CHECK-NEXT: mov x12, #4 // =0x4
22
+ ; CHECK-NEXT: mov x13, #8 // =0x8
25
23
; CHECK-NEXT: .LBB0_3: // %for.body.us
26
24
; CHECK-NEXT: // =>This Loop Header: Depth=1
27
25
; CHECK-NEXT: // Child Loop BB0_4 Depth 2
28
- ; CHECK-NEXT: add x15 , x0, x9, lsl #2
29
- ; CHECK-NEXT: sbfiz x16 , x8, #1, #32
30
- ; CHECK-NEXT: mov x17 , x2
31
- ; CHECK-NEXT: ldp s0, s1, [x15 ]
32
- ; CHECK-NEXT: add x16, x16 , #8
33
- ; CHECK-NEXT: ldp s2, s3, [x15 , #8]
34
- ; CHECK-NEXT: ubfiz x15 , x8, #1, #32
26
+ ; CHECK-NEXT: add x14 , x0, x9, lsl #2
27
+ ; CHECK-NEXT: sbfiz x15 , x8, #1, #32
28
+ ; CHECK-NEXT: mov x16 , x2
29
+ ; CHECK-NEXT: ldp s0, s1, [x14 ]
30
+ ; CHECK-NEXT: add x15, x15 , #8
31
+ ; CHECK-NEXT: ldp s2, s3, [x14 , #8]
32
+ ; CHECK-NEXT: ubfiz x14 , x8, #1, #32
35
33
; CHECK-NEXT: fcvt h0, s0
36
34
; CHECK-NEXT: fcvt h1, s1
37
35
; CHECK-NEXT: fcvt h2, s2
@@ -43,56 +41,52 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
43
41
; CHECK-NEXT: .LBB0_4: // %for.cond.i.preheader.us
44
42
; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
45
43
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
46
- ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x17, x15]
47
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17]
48
- ; CHECK-NEXT: add x18, x17, x16
49
- ; CHECK-NEXT: add x3, x17, x15
44
+ ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x16, x14]
45
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16]
46
+ ; CHECK-NEXT: add x17, x16, x15
47
+ ; CHECK-NEXT: add x18, x16, x14
48
+ ; CHECK-NEXT: add x3, x17, #8
49
+ ; CHECK-NEXT: add x4, x17, #16
50
50
; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
51
- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x17, x16 ]
51
+ ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x16, x15 ]
52
52
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
53
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13 , lsl #1]
53
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, x12 , lsl #1]
54
54
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
55
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
56
- ; CHECK-NEXT: add x18, x18, #16
55
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, x13, lsl #1]
57
56
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
58
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17 , #1, mul vl]
59
- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 ]
60
- ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x3 , #1, mul vl]
57
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16 , #1, mul vl]
58
+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 ]
59
+ ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18 , #1, mul vl]
61
60
; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
62
- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x18, x12]
63
- ; CHECK-NEXT: add x18, x18, x12
61
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #1, mul vl]
64
62
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
65
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13, lsl #1 ]
63
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #1, mul vl ]
66
64
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
67
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
68
- ; CHECK-NEXT: add x18, x18, #16
65
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #1, mul vl]
69
66
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
70
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17 , #2, mul vl]
71
- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 , #1, mul vl]
72
- ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x3 , #2, mul vl]
67
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16 , #2, mul vl]
68
+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 , #1, mul vl]
69
+ ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18 , #2, mul vl]
73
70
; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
74
- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x18, x12]
75
- ; CHECK-NEXT: add x18, x18, x12
71
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #2, mul vl]
76
72
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
77
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13, lsl #1 ]
73
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #2, mul vl ]
78
74
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
79
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1]
80
- ; CHECK-NEXT: add x18, x18, #16
75
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #2, mul vl]
81
76
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
82
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17 , #3, mul vl]
83
- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 , #2, mul vl]
84
- ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x3 , #3, mul vl]
77
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x16 , #3, mul vl]
78
+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 , #2, mul vl]
79
+ ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x18 , #3, mul vl]
85
80
; CHECK-NEXT: fmad z4.h, p0/m, z0.h, z5.h
86
- ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x18, x12]
87
- ; CHECK-NEXT: add x18, x18, x12
81
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x17, #3, mul vl]
88
82
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z1.h
89
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x13, lsl #1 ]
83
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x3, #3, mul vl ]
90
84
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z2.h
91
- ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x18, x14, lsl #1 ]
85
+ ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #3, mul vl ]
92
86
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
93
- ; CHECK-NEXT: st1h { z4.h }, p0, [x17 , #3, mul vl]
94
- ; CHECK-NEXT: addvl x17, x17 , #4
95
- ; CHECK-NEXT: cmp x17 , x11
87
+ ; CHECK-NEXT: st1h { z4.h }, p0, [x16 , #3, mul vl]
88
+ ; CHECK-NEXT: addvl x16, x16 , #4
89
+ ; CHECK-NEXT: cmp x16 , x11
96
90
; CHECK-NEXT: b.lo .LBB0_4
97
91
; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us
98
92
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
0 commit comments