@@ -192,11 +192,11 @@ define half @f7(half %a) nounwind {
; X64-NEXT: callq __truncsfhf2@PLT
; X64-NEXT: popq %rax
; X64-NEXT: retq
- %1 = fadd fast half %a, %a
- %t = call half @llvm.arithmetic.fence.f16(half %1)
- %2 = fadd fast half %a, %a
- %3 = fadd fast half %1, %2
- ret half %3
+ %b = fadd half %a, %a
+ %c = call half @llvm.arithmetic.fence.f16(half %b)
+ %d = fadd half %a, %a
+ %e = fadd half %b, %d
+ ret half %e
}

define bfloat @f8(bfloat %a) nounwind {
@@ -233,15 +233,273 @@ define bfloat @f8(bfloat %a) nounwind {
; X64-NEXT: callq __truncsfbf2@PLT
; X64-NEXT: popq %rax
; X64-NEXT: retq
- %1 = fadd fast bfloat %a, %a
- %t = call bfloat @llvm.arithmetic.fence.bf16(bfloat %1)
- %2 = fadd fast bfloat %a, %a
- %3 = fadd fast bfloat %1, %2
- ret bfloat %3
+ %b = fadd bfloat %a, %a
+ %c = call bfloat @llvm.arithmetic.fence.bf16(bfloat %b)
+ %d = fadd bfloat %a, %a
+ %e = fadd bfloat %b, %d
+ ret bfloat %e
+ }
+
+ define <2 x half> @f9(<2 x half> %a) nounwind {
+ ; X86-LABEL: f9:
+ ; X86: # %bb.0:
+ ; X86-NEXT: subl $36, %esp
+ ; X86-NEXT: movdqa %xmm0, %xmm1
+ ; X86-NEXT: psrld $16, %xmm1
+ ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: pextrw $0, %xmm0, %eax
+ ; X86-NEXT: movw %ax, (%esp)
+ ; X86-NEXT: calll __extendhfsf2
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+ ; X86-NEXT: pextrw $0, %xmm0, %eax
+ ; X86-NEXT: movw %ax, (%esp)
+ ; X86-NEXT: fstps {{[0-9]+}}(%esp)
+ ; X86-NEXT: calll __extendhfsf2
+ ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: fstps {{[0-9]+}}(%esp)
+ ; X86-NEXT: calll __truncsfhf2
+ ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: calll __truncsfhf2
+ ; X86-NEXT: pextrw $0, %xmm0, %eax
+ ; X86-NEXT: movw %ax, (%esp)
+ ; X86-NEXT: calll __extendhfsf2
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+ ; X86-NEXT: pextrw $0, %xmm0, %eax
+ ; X86-NEXT: movw %ax, (%esp)
+ ; X86-NEXT: fstps {{[0-9]+}}(%esp)
+ ; X86-NEXT: calll __extendhfsf2
+ ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: fstps {{[0-9]+}}(%esp)
+ ; X86-NEXT: calll __truncsfhf2
+ ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: calll __truncsfhf2
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+ ; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; X86-NEXT: addl $36, %esp
+ ; X86-NEXT: retl
+ ;
+ ; X64-LABEL: f9:
+ ; X64: # %bb.0:
+ ; X64-NEXT: subq $40, %rsp
+ ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+ ; X64-NEXT: psrld $16, %xmm0
+ ; X64-NEXT: callq __extendhfsf2@PLT
+ ; X64-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+ ; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+ ; X64-NEXT: callq __extendhfsf2@PLT
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfhf2@PLT
+ ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+ ; X64-NEXT: movss (%rsp), %xmm0 # 4-byte Reload
+ ; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfhf2@PLT
+ ; X64-NEXT: callq __extendhfsf2@PLT
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfhf2@PLT
+ ; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+ ; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+ ; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+ ; X64-NEXT: callq __extendhfsf2@PLT
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfhf2@PLT
+ ; X64-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+ ; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+ ; X64-NEXT: addq $40, %rsp
+ ; X64-NEXT: retq
+ %b = fadd <2 x half> %a, %a
+ %c = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %b)
+ %d = fadd <2 x half> %a, %a
+ %e = fadd <2 x half> %b, %d
+ ret <2 x half> %e
+ }
+
+ define <4 x bfloat> @f10(<4 x bfloat> %a) nounwind {
+ ; X86-LABEL: f10:
+ ; X86: # %bb.0:
+ ; X86-NEXT: pushl %ebx
+ ; X86-NEXT: pushl %edi
+ ; X86-NEXT: pushl %esi
+ ; X86-NEXT: subl $56, %esp
+ ; X86-NEXT: movdqa %xmm0, %xmm1
+ ; X86-NEXT: movdqa %xmm0, %xmm2
+ ; X86-NEXT: pextrw $0, %xmm0, %eax
+ ; X86-NEXT: psrld $16, %xmm0
+ ; X86-NEXT: pextrw $0, %xmm0, %esi
+ ; X86-NEXT: psrlq $48, %xmm1
+ ; X86-NEXT: pextrw $0, %xmm1, %edi
+ ; X86-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
+ ; X86-NEXT: pextrw $0, %xmm2, %ecx
+ ; X86-NEXT: shll $16, %eax
+ ; X86-NEXT: movd %eax, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: shll $16, %ecx
+ ; X86-NEXT: movd %ecx, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+ ; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: shll $16, %edi
+ ; X86-NEXT: movd %edi, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+ ; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: shll $16, %esi
+ ; X86-NEXT: movd %esi, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
+ ; X86-NEXT: # xmm1 = mem[0],zero,zero,zero
+ ; X86-NEXT: movss %xmm1, (%esp)
+ ; X86-NEXT: pextrw $0, %xmm0, %esi
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+ ; X86-NEXT: pextrw $0, %xmm0, %edi
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+ ; X86-NEXT: pextrw $0, %xmm0, %ebx
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: pextrw $0, %xmm0, %eax
+ ; X86-NEXT: shll $16, %eax
+ ; X86-NEXT: movd %eax, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: shll $16, %ebx
+ ; X86-NEXT: movd %ebx, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+ ; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: shll $16, %edi
+ ; X86-NEXT: movd %edi, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+ ; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: movss %xmm0, (%esp)
+ ; X86-NEXT: shll $16, %esi
+ ; X86-NEXT: movd %esi, %xmm0
+ ; X86-NEXT: addss %xmm0, %xmm0
+ ; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+ ; X86-NEXT: movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+ ; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+ ; X86-NEXT: movd %xmm0, (%esp)
+ ; X86-NEXT: calll __truncsfbf2
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
+ ; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+ ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+ ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+ ; X86-NEXT: movdqa %xmm1, %xmm0
+ ; X86-NEXT: addl $56, %esp
+ ; X86-NEXT: popl %esi
+ ; X86-NEXT: popl %edi
+ ; X86-NEXT: popl %ebx
+ ; X86-NEXT: retl
+ ;
+ ; X64-LABEL: f10:
+ ; X64: # %bb.0:
+ ; X64-NEXT: pushq %rbp
+ ; X64-NEXT: pushq %r15
+ ; X64-NEXT: pushq %r14
+ ; X64-NEXT: pushq %rbx
+ ; X64-NEXT: subq $56, %rsp
+ ; X64-NEXT: movdqa %xmm0, %xmm1
+ ; X64-NEXT: psrld $16, %xmm1
+ ; X64-NEXT: pextrw $0, %xmm1, %ebp
+ ; X64-NEXT: pextrw $0, %xmm0, %r15d
+ ; X64-NEXT: movdqa %xmm0, %xmm1
+ ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+ ; X64-NEXT: pextrw $0, %xmm1, %r14d
+ ; X64-NEXT: psrlq $48, %xmm0
+ ; X64-NEXT: pextrw $0, %xmm0, %eax
+ ; X64-NEXT: shll $16, %eax
+ ; X64-NEXT: movd %eax, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: pextrw $0, %xmm0, %ebx
+ ; X64-NEXT: shll $16, %r14d
+ ; X64-NEXT: movd %r14d, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: pextrw $0, %xmm0, %r14d
+ ; X64-NEXT: shll $16, %r15d
+ ; X64-NEXT: movd %r15d, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: pextrw $0, %xmm0, %r15d
+ ; X64-NEXT: shll $16, %ebp
+ ; X64-NEXT: movd %ebp, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: pextrw $0, %xmm0, %eax
+ ; X64-NEXT: shll $16, %eax
+ ; X64-NEXT: movd %eax, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+ ; X64-NEXT: shll $16, %r15d
+ ; X64-NEXT: movd %r15d, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+ ; X64-NEXT: shll $16, %r14d
+ ; X64-NEXT: movd %r14d, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+ ; X64-NEXT: shll $16, %ebx
+ ; X64-NEXT: movd %ebx, %xmm0
+ ; X64-NEXT: addss %xmm0, %xmm0
+ ; X64-NEXT: callq __truncsfbf2@PLT
+ ; X64-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+ ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+ ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+ ; X64-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+ ; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+ ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; X64-NEXT: addq $56, %rsp
+ ; X64-NEXT: popq %rbx
+ ; X64-NEXT: popq %r14
+ ; X64-NEXT: popq %r15
+ ; X64-NEXT: popq %rbp
+ ; X64-NEXT: retq
+ %b = fadd <4 x bfloat> %a, %a
+ %c = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %b)
+ %d = fadd <4 x bfloat> %a, %a
+ %e = fadd <4 x bfloat> %b, %d
+ ret <4 x bfloat> %e
}

declare half @llvm.arithmetic.fence.f16(half)
declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
+ declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
+ declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)
declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)