@@ -31,6 +31,45 @@ define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferen
31
31
ret <4 x float > %r2
32
32
}
33
33
34
; Codegen test: builds a <4 x float> from three scalar float loads taken at
; element indices 0, 1 and 2 of the pointed-to <4 x float>. The element-2
; value is the one inserted into lane 3 of the returned vector.
; The assertion comments below record the expected instruction sequence for
; each check prefix; leave them untouched unless the output is regenerated.
+ define <4 x float > @load_float4_float3_0122 (<4 x float >* nocapture readonly dereferenceable (16 )) {
35
+ ; SSE2-LABEL: load_float4_float3_0122:
36
+ ; SSE2: # %bb.0:
37
+ ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
38
+ ; SSE2-NEXT: movups (%rdi), %xmm0
39
+ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
40
+ ; SSE2-NEXT: retq
41
+ ;
42
+ ; SSSE3-LABEL: load_float4_float3_0122:
43
+ ; SSSE3: # %bb.0:
44
+ ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
45
+ ; SSSE3-NEXT: movups (%rdi), %xmm0
46
+ ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
47
+ ; SSSE3-NEXT: retq
48
+ ;
49
+ ; SSE41-LABEL: load_float4_float3_0122:
50
+ ; SSE41: # %bb.0:
51
+ ; SSE41-NEXT: movups (%rdi), %xmm0
52
+ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
53
+ ; SSE41-NEXT: retq
54
+ ;
55
+ ; AVX-LABEL: load_float4_float3_0122:
56
+ ; AVX: # %bb.0:
57
+ ; AVX-NEXT: vmovups (%rdi), %xmm0
58
+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
59
+ ; AVX-NEXT: retq
60
+ %p0 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 0
61
+ %p1 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 1
62
+ %p2 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 2
63
+ %ld0 = load float , float * %p0 , align 4
64
+ %ld1 = load float , float * %p1 , align 4
65
+ %ld2 = load float , float * %p2 , align 4
66
+ %r0 = insertelement <4 x float > undef , float %ld0 , i32 0
67
+ %r1 = insertelement <4 x float > %r0 , float %ld1 , i32 1
68
+ %r2 = insertelement <4 x float > %r1 , float %ld2 , i32 2 ; NOTE(review): %r2 is not referenced again in this function
69
+ %r3 = insertelement <4 x float > %r1 , float %ld2 , i32 3 ; built on %r1 (not %r2), so lane 2 of the result stays undef - confirm whether the "_0122" name intended %r2 here
70
+ ret <4 x float > %r3
71
+ }
72
+
34
73
define <8 x float > @load_float8_float3 (<4 x float >* nocapture readonly dereferenceable (16 )) {
35
74
; SSE-LABEL: load_float8_float3:
36
75
; SSE: # %bb.0:
@@ -53,6 +92,45 @@ define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferen
53
92
ret <8 x float > %r2
54
93
}
55
94
95
; Codegen test: same three scalar float loads (element indices 0, 1 and 2 of
; the pointed-to <4 x float>) as the 4-wide variant, but the result type is
; <8 x float>. Only lanes 0, 1 and 3 of the returned value are written by the
; insertelement chain that reaches the ret; lanes 4-7 are never inserted into.
; The assertion comments below record the expected instruction sequence for
; each check prefix; leave them untouched unless the output is regenerated.
+ define <8 x float > @load_float8_float3_0122 (<4 x float >* nocapture readonly dereferenceable (16 )) {
96
+ ; SSE2-LABEL: load_float8_float3_0122:
97
+ ; SSE2: # %bb.0:
98
+ ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
99
+ ; SSE2-NEXT: movups (%rdi), %xmm0
100
+ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
101
+ ; SSE2-NEXT: retq
102
+ ;
103
+ ; SSSE3-LABEL: load_float8_float3_0122:
104
+ ; SSSE3: # %bb.0:
105
+ ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
106
+ ; SSSE3-NEXT: movups (%rdi), %xmm0
107
+ ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
108
+ ; SSSE3-NEXT: retq
109
+ ;
110
+ ; SSE41-LABEL: load_float8_float3_0122:
111
+ ; SSE41: # %bb.0:
112
+ ; SSE41-NEXT: movups (%rdi), %xmm0
113
+ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
114
+ ; SSE41-NEXT: retq
115
+ ;
116
+ ; AVX-LABEL: load_float8_float3_0122:
117
+ ; AVX: # %bb.0:
118
+ ; AVX-NEXT: vmovups (%rdi), %xmm0
119
+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
120
+ ; AVX-NEXT: retq
121
+ %p0 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 0
122
+ %p1 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 1
123
+ %p2 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 2
124
+ %ld0 = load float , float * %p0 , align 4
125
+ %ld1 = load float , float * %p1 , align 4
126
+ %ld2 = load float , float * %p2 , align 4
127
+ %r0 = insertelement <8 x float > undef , float %ld0 , i32 0
128
+ %r1 = insertelement <8 x float > %r0 , float %ld1 , i32 1
129
+ %r2 = insertelement <8 x float > %r1 , float %ld2 , i32 2 ; NOTE(review): %r2 is not referenced again in this function
130
+ %r3 = insertelement <8 x float > %r1 , float %ld2 , i32 3 ; built on %r1 (not %r2), so lane 2 of the result stays undef - confirm whether the "_0122" name intended %r2 here
131
+ ret <8 x float > %r3
132
+ }
133
+
56
134
define <4 x float > @load_float4_float3_as_float2_float (<4 x float >* nocapture readonly dereferenceable (16 )) {
57
135
; SSE-LABEL: load_float4_float3_as_float2_float:
58
136
; SSE: # %bb.0:
@@ -75,6 +153,47 @@ define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture re
75
153
ret <4 x float > %10
76
154
}
77
155
156
; Codegen test: reads the first two floats through a <2 x float> load of the
; bitcast pointer (align 4) and a third float through a scalar load at element
; index 2. The two vector elements go to lanes 0 and 1; the scalar is the
; value inserted into lane 3 of the returned <4 x float>.
; The assertion comments below record the expected instruction sequence for
; each check prefix; leave them untouched unless the output is regenerated.
+ define <4 x float > @load_float4_float3_as_float2_float_0122 (<4 x float >* nocapture readonly dereferenceable (16 )) {
157
+ ; SSE2-LABEL: load_float4_float3_as_float2_float_0122:
158
+ ; SSE2: # %bb.0:
159
+ ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
160
+ ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
161
+ ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
162
+ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
163
+ ; SSE2-NEXT: retq
164
+ ;
165
+ ; SSSE3-LABEL: load_float4_float3_as_float2_float_0122:
166
+ ; SSSE3: # %bb.0:
167
+ ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
168
+ ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
169
+ ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
170
+ ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
171
+ ; SSSE3-NEXT: retq
172
+ ;
173
+ ; SSE41-LABEL: load_float4_float3_as_float2_float_0122:
174
+ ; SSE41: # %bb.0:
175
+ ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
176
+ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
177
+ ; SSE41-NEXT: retq
178
+ ;
179
+ ; AVX-LABEL: load_float4_float3_as_float2_float_0122:
180
+ ; AVX: # %bb.0:
181
+ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
182
+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
183
+ ; AVX-NEXT: retq
184
+ %2 = bitcast <4 x float >* %0 to <2 x float >*
185
+ %3 = load <2 x float >, <2 x float >* %2 , align 4
186
+ %4 = extractelement <2 x float > %3 , i32 0
187
+ %5 = insertelement <4 x float > undef , float %4 , i32 0
188
+ %6 = extractelement <2 x float > %3 , i32 1
189
+ %7 = insertelement <4 x float > %5 , float %6 , i32 1
190
+ %8 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 2
191
+ %9 = load float , float * %8 , align 4
192
+ %10 = insertelement <4 x float > %7 , float %9 , i32 2 ; NOTE(review): %10 is not referenced again in this function
193
+ %11 = insertelement <4 x float > %7 , float %9 , i32 3 ; built on %7 (not %10), so lane 2 of the result stays undef - confirm whether the "_0122" name intended %10 here
194
+ ret <4 x float > %11
195
+ }
196
+
78
197
define <4 x float > @load_float4_float3_trunc (<4 x float >* nocapture readonly dereferenceable (16 )) {
79
198
; SSE-LABEL: load_float4_float3_trunc:
80
199
; SSE: # %bb.0:
@@ -103,6 +222,99 @@ define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly der
103
222
ret <4 x float > %16
104
223
}
105
224
225
; Codegen test: reads the data as two i64 loads - one from the base pointer
; (align 16) and one from the address of element 2 (align 8) - then rebuilds
; floats by trunc / lshr-by-32 / bitcast. Low half of the first i64 goes to
; lane 0, its high half to lane 1, and the low half of the second i64 is the
; value inserted into lane 3 of the returned <4 x float>.
; The assertion comments below record the expected instruction sequence for
; each check prefix; leave them untouched unless the output is regenerated.
+ define <4 x float > @load_float4_float3_trunc_0122 (<4 x float >* nocapture readonly dereferenceable (16 )) {
226
+ ; SSE2-LABEL: load_float4_float3_trunc_0122:
227
+ ; SSE2: # %bb.0:
228
+ ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
229
+ ; SSE2-NEXT: movaps (%rdi), %xmm0
230
+ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
231
+ ; SSE2-NEXT: retq
232
+ ;
233
+ ; SSSE3-LABEL: load_float4_float3_trunc_0122:
234
+ ; SSSE3: # %bb.0:
235
+ ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
236
+ ; SSSE3-NEXT: movaps (%rdi), %xmm0
237
+ ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
238
+ ; SSSE3-NEXT: retq
239
+ ;
240
+ ; SSE41-LABEL: load_float4_float3_trunc_0122:
241
+ ; SSE41: # %bb.0:
242
+ ; SSE41-NEXT: movaps (%rdi), %xmm0
243
+ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
244
+ ; SSE41-NEXT: retq
245
+ ;
246
+ ; AVX-LABEL: load_float4_float3_trunc_0122:
247
+ ; AVX: # %bb.0:
248
+ ; AVX-NEXT: vmovaps (%rdi), %xmm0
249
+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
250
+ ; AVX-NEXT: retq
251
+ %2 = bitcast <4 x float >* %0 to i64*
252
+ %3 = load i64 , i64* %2 , align 16
253
+ %4 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 2
254
+ %5 = bitcast float * %4 to i64*
255
+ %6 = load i64 , i64* %5 , align 8
256
+ %7 = trunc i64 %3 to i32
257
+ %8 = bitcast i32 %7 to float
258
+ %9 = insertelement <4 x float > undef , float %8 , i32 0
259
+ %10 = lshr i64 %3 , 32
260
+ %11 = trunc i64 %10 to i32
261
+ %12 = bitcast i32 %11 to float
262
+ %13 = insertelement <4 x float > %9 , float %12 , i32 1
263
+ %14 = trunc i64 %6 to i32
264
+ %15 = bitcast i32 %14 to float
265
+ %16 = insertelement <4 x float > %13 , float %15 , i32 2 ; NOTE(review): %16 is not referenced again in this function
266
+ %17 = insertelement <4 x float > %13 , float %15 , i32 3 ; built on %13 (not %16), so lane 2 of the result stays undef - confirm whether the "_0122" name intended %16 here
267
+ ret <4 x float > %17
268
+ }
269
+
270
; Codegen test: reads the data as two i64 loads - one from the base pointer
; (align 16) and one from the address of element 2 (align 8) - then rebuilds
; all four floats by trunc / lshr-by-32 / bitcast. Each i64 contributes its
; low half and its high half, inserted into lanes 0, 1, 2 and 3 in that order
; through a single insertelement chain that ends at the ret.
; The assertion comments below record the expected instruction sequence for
; each check prefix; leave them untouched unless the output is regenerated.
+ define <4 x float > @load_float4_float3_trunc_0123 (<4 x float >* nocapture readonly dereferenceable (16 )) {
271
+ ; SSE2-LABEL: load_float4_float3_trunc_0123:
272
+ ; SSE2: # %bb.0:
273
+ ; SSE2-NEXT: movaps (%rdi), %xmm0
274
+ ; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
275
+ ; SSE2-NEXT: retq
276
+ ;
277
+ ; SSSE3-LABEL: load_float4_float3_trunc_0123:
278
+ ; SSSE3: # %bb.0:
279
+ ; SSSE3-NEXT: movaps (%rdi), %xmm0
280
+ ; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
281
+ ; SSSE3-NEXT: retq
282
+ ;
283
+ ; SSE41-LABEL: load_float4_float3_trunc_0123:
284
+ ; SSE41: # %bb.0:
285
+ ; SSE41-NEXT: movaps (%rdi), %xmm0
286
+ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
287
+ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
288
+ ; SSE41-NEXT: retq
289
+ ;
290
+ ; AVX-LABEL: load_float4_float3_trunc_0123:
291
+ ; AVX: # %bb.0:
292
+ ; AVX-NEXT: vmovaps (%rdi), %xmm0
293
+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
294
+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
295
+ ; AVX-NEXT: retq
296
+ %2 = bitcast <4 x float >* %0 to i64*
297
+ %3 = load i64 , i64* %2 , align 16
298
+ %4 = getelementptr inbounds <4 x float >, <4 x float >* %0 , i64 0 , i64 2
299
+ %5 = bitcast float * %4 to i64*
300
+ %6 = load i64 , i64* %5 , align 8
301
+ %7 = trunc i64 %3 to i32
302
+ %8 = bitcast i32 %7 to float
303
+ %9 = insertelement <4 x float > undef , float %8 , i32 0
304
+ %10 = lshr i64 %3 , 32
305
+ %11 = trunc i64 %10 to i32
306
+ %12 = bitcast i32 %11 to float
307
+ %13 = insertelement <4 x float > %9 , float %12 , i32 1
308
+ %14 = trunc i64 %6 to i32
309
+ %15 = bitcast i32 %14 to float
310
+ %16 = insertelement <4 x float > %13 , float %15 , i32 2
311
+ %17 = lshr i64 %6 , 32 ; high 32 bits of the second i64 become the lane-3 value
312
+ %18 = trunc i64 %17 to i32
313
+ %19 = bitcast i32 %18 to float
314
+ %20 = insertelement <4 x float > %16 , float %19 , i32 3 ; chained on %16, so lane 2 is kept
315
+ ret <4 x float > %20
316
+ }
317
+
106
318
; PR21780
107
319
define <4 x double > @load_double4_0u2u (double * nocapture readonly dereferenceable (32 )) {
108
320
; SSE2-LABEL: load_double4_0u2u:
0 commit comments