Skip to content

Commit c52cd83

Browse files
committed
[AArch64] Improve codegen of store lane 0 instructions by directly storing the subregister.
For 0-lane stores, we used to generate code similar to: fmov w8, s0; str w8, [x0, x1, lsl #2] instead of: str s0, [x0, x1, lsl #2]. To correct that: for store lane 0 patterns, directly match to STR <subreg>0. Byte-sized instructions don't have the special case for a 0 index, because FPR8s are defined to have untyped content. rdar://16372710 Differential Revision: http://reviews.llvm.org/D6772 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225181 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 19d9f34 commit c52cd83

File tree

2 files changed

+119
-0
lines changed

2 files changed

+119
-0
lines changed

lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1889,6 +1889,33 @@ let Predicates = [IsLE] in {
18891889
}
18901890
} // AddedComplexity = 10
18911891

1892+
// Match stores from lane 0 to the appropriate subreg's store.
1893+
multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
1894+
ValueType VecTy, ValueType STy,
1895+
SubRegIndex SubRegIdx,
1896+
Instruction STRW, Instruction STRX> {
1897+
1898+
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
1899+
(ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
1900+
(STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
1901+
GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
1902+
1903+
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
1904+
(ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
1905+
(STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
1906+
GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
1907+
}
1908+
1909+
let AddedComplexity = 19 in {
1910+
defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
1911+
defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
1912+
defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
1913+
defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
1914+
defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
1915+
defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
1916+
defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
1917+
}
1918+
18921919
//---
18931920
// (unsigned immediate)
18941921
defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",

test/CodeGen/AArch64/arm64-st1.ll

Lines changed: 92 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,16 @@ define void @st1lane_16b(<16 x i8> %A, i8* %D) {
88
ret void
99
}
1010

11+
define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) {
12+
; CHECK-LABEL: st1lane0_ro_16b
13+
; CHECK: umov.b w[[WREG:[0-9]+]], v0[0]
14+
; CHECK: strb w[[WREG]], [x0, x1]
15+
%ptr = getelementptr i8* %D, i64 %offset
16+
%tmp = extractelement <16 x i8> %A, i32 0
17+
store i8 %tmp, i8* %ptr
18+
ret void
19+
}
20+
1121
define void @st1lane_8h(<8 x i16> %A, i16* %D) {
1222
; CHECK-LABEL: st1lane_8h
1323
; CHECK: st1.h
@@ -16,6 +26,15 @@ define void @st1lane_8h(<8 x i16> %A, i16* %D) {
1626
ret void
1727
}
1828

29+
define void @st1lane0_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) {
30+
; CHECK-LABEL: st1lane0_ro_8h
31+
; CHECK: str h0, [x0, x1, lsl #1]
32+
%ptr = getelementptr i16* %D, i64 %offset
33+
%tmp = extractelement <8 x i16> %A, i32 0
34+
store i16 %tmp, i16* %ptr
35+
ret void
36+
}
37+
1938
define void @st1lane_4s(<4 x i32> %A, i32* %D) {
2039
; CHECK-LABEL: st1lane_4s
2140
; CHECK: st1.s
@@ -24,6 +43,15 @@ define void @st1lane_4s(<4 x i32> %A, i32* %D) {
2443
ret void
2544
}
2645

46+
define void @st1lane0_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) {
47+
; CHECK-LABEL: st1lane0_ro_4s
48+
; CHECK: str s0, [x0, x1, lsl #2]
49+
%ptr = getelementptr i32* %D, i64 %offset
50+
%tmp = extractelement <4 x i32> %A, i32 0
51+
store i32 %tmp, i32* %ptr
52+
ret void
53+
}
54+
2755
define void @st1lane_4s_float(<4 x float> %A, float* %D) {
2856
; CHECK-LABEL: st1lane_4s_float
2957
; CHECK: st1.s
@@ -32,6 +60,15 @@ define void @st1lane_4s_float(<4 x float> %A, float* %D) {
3260
ret void
3361
}
3462

63+
define void @st1lane0_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) {
64+
; CHECK-LABEL: st1lane0_ro_4s_float
65+
; CHECK: str s0, [x0, x1, lsl #2]
66+
%ptr = getelementptr float* %D, i64 %offset
67+
%tmp = extractelement <4 x float> %A, i32 0
68+
store float %tmp, float* %ptr
69+
ret void
70+
}
71+
3572
define void @st1lane_2d(<2 x i64> %A, i64* %D) {
3673
; CHECK-LABEL: st1lane_2d
3774
; CHECK: st1.d
@@ -40,6 +77,15 @@ define void @st1lane_2d(<2 x i64> %A, i64* %D) {
4077
ret void
4178
}
4279

80+
define void @st1lane0_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) {
81+
; CHECK-LABEL: st1lane0_ro_2d
82+
; CHECK: str d0, [x0, x1, lsl #3]
83+
%ptr = getelementptr i64* %D, i64 %offset
84+
%tmp = extractelement <2 x i64> %A, i32 0
85+
store i64 %tmp, i64* %ptr
86+
ret void
87+
}
88+
4389
define void @st1lane_2d_double(<2 x double> %A, double* %D) {
4490
; CHECK-LABEL: st1lane_2d_double
4591
; CHECK: st1.d
@@ -48,6 +94,15 @@ define void @st1lane_2d_double(<2 x double> %A, double* %D) {
4894
ret void
4995
}
5096

97+
define void @st1lane0_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) {
98+
; CHECK-LABEL: st1lane0_ro_2d_double
99+
; CHECK: str d0, [x0, x1, lsl #3]
100+
%ptr = getelementptr double* %D, i64 %offset
101+
%tmp = extractelement <2 x double> %A, i32 0
102+
store double %tmp, double* %ptr
103+
ret void
104+
}
105+
51106
define void @st1lane_8b(<8 x i8> %A, i8* %D) {
52107
; CHECK-LABEL: st1lane_8b
53108
; CHECK: st1.b
@@ -56,6 +111,16 @@ define void @st1lane_8b(<8 x i8> %A, i8* %D) {
56111
ret void
57112
}
58113

114+
define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) {
115+
; CHECK-LABEL: st1lane0_ro_8b
116+
; CHECK: umov.b w[[WREG:[0-9]+]], v0[0]
117+
; CHECK: strb w[[WREG]], [x0, x1]
118+
%ptr = getelementptr i8* %D, i64 %offset
119+
%tmp = extractelement <8 x i8> %A, i32 0
120+
store i8 %tmp, i8* %ptr
121+
ret void
122+
}
123+
59124
define void @st1lane_4h(<4 x i16> %A, i16* %D) {
60125
; CHECK-LABEL: st1lane_4h
61126
; CHECK: st1.h
@@ -64,6 +129,15 @@ define void @st1lane_4h(<4 x i16> %A, i16* %D) {
64129
ret void
65130
}
66131

132+
define void @st1lane0_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) {
133+
; CHECK-LABEL: st1lane0_ro_4h
134+
; CHECK: str h0, [x0, x1, lsl #1]
135+
%ptr = getelementptr i16* %D, i64 %offset
136+
%tmp = extractelement <4 x i16> %A, i32 0
137+
store i16 %tmp, i16* %ptr
138+
ret void
139+
}
140+
67141
define void @st1lane_2s(<2 x i32> %A, i32* %D) {
68142
; CHECK-LABEL: st1lane_2s
69143
; CHECK: st1.s
@@ -72,6 +146,15 @@ define void @st1lane_2s(<2 x i32> %A, i32* %D) {
72146
ret void
73147
}
74148

149+
define void @st1lane0_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) {
150+
; CHECK-LABEL: st1lane0_ro_2s
151+
; CHECK: str s0, [x0, x1, lsl #2]
152+
%ptr = getelementptr i32* %D, i64 %offset
153+
%tmp = extractelement <2 x i32> %A, i32 0
154+
store i32 %tmp, i32* %ptr
155+
ret void
156+
}
157+
75158
define void @st1lane_2s_float(<2 x float> %A, float* %D) {
76159
; CHECK-LABEL: st1lane_2s_float
77160
; CHECK: st1.s
@@ -80,6 +163,15 @@ define void @st1lane_2s_float(<2 x float> %A, float* %D) {
80163
ret void
81164
}
82165

166+
define void @st1lane0_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) {
167+
; CHECK-LABEL: st1lane0_ro_2s_float
168+
; CHECK: str s0, [x0, x1, lsl #2]
169+
%ptr = getelementptr float* %D, i64 %offset
170+
%tmp = extractelement <2 x float> %A, i32 0
171+
store float %tmp, float* %ptr
172+
ret void
173+
}
174+
83175
define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
84176
; CHECK-LABEL: st2lane_16b
85177
; CHECK: st2.b

0 commit comments

Comments
 (0)