Commit 7c01fe3

Address review comments
1 parent 065e985 commit 7c01fe3

File tree

1 file changed: +268 -10 lines changed

llvm/test/CodeGen/X86/arithmetic_fence2.ll

Lines changed: 268 additions & 10 deletions
@@ -192,11 +192,11 @@ define half @f7(half %a) nounwind {
 ; X64-NEXT: callq __truncsfhf2@PLT
 ; X64-NEXT: popq %rax
 ; X64-NEXT: retq
-%1 = fadd fast half %a, %a
-%t = call half @llvm.arithmetic.fence.f16(half %1)
-%2 = fadd fast half %a, %a
-%3 = fadd fast half %1, %2
-ret half %3
+%b = fadd half %a, %a
+%c = call half @llvm.arithmetic.fence.f16(half %b)
+%d = fadd half %a, %a
+%e = fadd half %b, %d
+ret half %e
 }

 define bfloat @f8(bfloat %a) nounwind {
@@ -233,15 +233,273 @@ define bfloat @f8(bfloat %a) nounwind {
 ; X64-NEXT: callq __truncsfbf2@PLT
 ; X64-NEXT: popq %rax
 ; X64-NEXT: retq
-%1 = fadd fast bfloat %a, %a
-%t = call bfloat @llvm.arithmetic.fence.bf16(bfloat %1)
-%2 = fadd fast bfloat %a, %a
-%3 = fadd fast bfloat %1, %2
-ret bfloat %3
+%b = fadd bfloat %a, %a
+%c = call bfloat @llvm.arithmetic.fence.bf16(bfloat %b)
+%d = fadd bfloat %a, %a
+%e = fadd bfloat %b, %d
+ret bfloat %e
+}
+
+define <2 x half> @f9(<2 x half> %a) nounwind {
+; X86-LABEL: f9:
+; X86: # %bb.0:
+; X86-NEXT: subl $36, %esp
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $16, %xmm1
+; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: addl $36, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: f9:
+; X64: # %bb.0:
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: psrld $16, %xmm0
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movss (%rsp), %xmm0 # 4-byte Reload
+; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+%b = fadd <2 x half> %a, %a
+%c = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %b)
+%d = fadd <2 x half> %a, %a
+%e = fadd <2 x half> %b, %d
+ret <2 x half> %e
+}
+
+define <4 x bfloat> @f10(<4 x bfloat> %a) nounwind {
+; X86-LABEL: f10:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $56, %esp
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: psrld $16, %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %esi
+; X86-NEXT: psrlq $48, %xmm1
+; X86-NEXT: pextrw $0, %xmm1, %edi
+; X86-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; X86-NEXT: pextrw $0, %xmm2, %ecx
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: movd %edi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: movd %esi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
+; X86-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm1, (%esp)
+; X86-NEXT: pextrw $0, %xmm0, %esi
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %edi
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %ebx
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %ebx
+; X86-NEXT: movd %ebx, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: movd %edi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: movd %esi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
+; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: addl $56, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: f10:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrld $16, %xmm1
+; X64-NEXT: pextrw $0, %xmm1, %ebp
+; X64-NEXT: pextrw $0, %xmm0, %r15d
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; X64-NEXT: pextrw $0, %xmm1, %r14d
+; X64-NEXT: psrlq $48, %xmm0
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: shll $16, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: pextrw $0, %xmm0, %ebx
+; X64-NEXT: shll $16, %r14d
+; X64-NEXT: movd %r14d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: pextrw $0, %xmm0, %r14d
+; X64-NEXT: shll $16, %r15d
+; X64-NEXT: movd %r15d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: pextrw $0, %xmm0, %r15d
+; X64-NEXT: shll $16, %ebp
+; X64-NEXT: movd %ebp, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: shll $16, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: shll $16, %r15d
+; X64-NEXT: movd %r15d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: shll $16, %r14d
+; X64-NEXT: movd %r14d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; X64-NEXT: shll $16, %ebx
+; X64-NEXT: movd %ebx, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2@PLT
+; X64-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+%b = fadd <4 x bfloat> %a, %a
+%c = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %b)
+%d = fadd <4 x bfloat> %a, %a
+%e = fadd <4 x bfloat> %b, %d
+ret <4 x bfloat> %e
 }

 declare half @llvm.arithmetic.fence.f16(half)
 declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
+declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
+declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
 declare float @llvm.arithmetic.fence.f32(float)
 declare double @llvm.arithmetic.fence.f64(double)
 declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
