@@ -344,6 +344,70 @@ for.end:
344
344
ret float %.sroa.speculated
345
345
}
346
346
347
; Min reduction (fcmp olt + select) over half elements, using attribute
; group #1. The checks expect a vscale x 8 vectorization interleaved by 2:
; two wide loads and two fcmp/select pairs in vector.body, then a combining
; fcmp/select and one llvm.vector.reduce.fmin call in middle.block.
; The label directive spells out the full function name so it cannot match
; the sibling function that shares the @fmin_fast prefix.
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define half @fmin_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
; CHECK-LABEL: @fmin_fast_half_zvfhmin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
; CHECK-NEXT: call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  ; Running minimum, seeded with 0.0.
  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
  ; Consecutive half elements are 2 bytes apart, so 2 is the largest
  ; alignment that can hold on every iteration.
  %0 = load half, ptr %arrayidx, align 2
  %cmp.i = fcmp olt half %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret half %.sroa.speculated
}
378
+
379
; Min reduction (fcmp olt + select) over bfloat elements, using attribute
; group #2. The checks expect a vscale x 8 vectorization interleaved by 2:
; two wide loads and two fcmp/select pairs in vector.body, then a combining
; fcmp/select and one llvm.vector.reduce.fmin call in middle.block.
; The label directive spells out the full function name so it cannot match
; the sibling function that shares the @fmin_fast prefix.
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define bfloat @fmin_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
; CHECK-LABEL: @fmin_fast_bfloat_zvfbfmin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
; CHECK-NEXT: call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  ; Running minimum, seeded with 0.0.
  %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
  ; Consecutive bfloat elements are 2 bytes apart, so 2 is the largest
  ; alignment that can hold on every iteration.
  %0 = load bfloat, ptr %arrayidx, align 2
  %cmp.i = fcmp olt bfloat %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret bfloat %.sroa.speculated
}
410
+
347
411
; FMAX (FAST)
348
412
349
413
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
@@ -378,6 +442,70 @@ for.end:
378
442
ret float %.sroa.speculated
379
443
}
380
444
445
; Max reduction (fcmp fast ogt + select) over half elements, using attribute
; group #1. The checks expect a vscale x 8 vectorization interleaved by 2:
; two wide loads and two fcmp/select pairs in vector.body, then a combining
; fast fcmp/select and one fast llvm.vector.reduce.fmax call in middle.block.
; The label directive spells out the full function name so it cannot match
; the sibling function that shares the @fmax_fast prefix.
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define half @fmax_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
; CHECK-LABEL: @fmax_fast_half_zvfhmin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
; CHECK-NEXT: call fast half @llvm.vector.reduce.fmax.nxv8f16(<vscale x 8 x half> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  ; Running maximum, seeded with 0.0.
  %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
  ; Consecutive half elements are 2 bytes apart, so 2 is the largest
  ; alignment that can hold on every iteration.
  %0 = load half, ptr %arrayidx, align 2
  %cmp.i = fcmp fast ogt half %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret half %.sroa.speculated
}
476
+
477
; Max reduction (fcmp fast ogt + select) over bfloat elements, using
; attribute group #2. The checks expect a vscale x 8 vectorization
; interleaved by 2: two wide loads and two fcmp/select pairs in vector.body,
; then a combining fast fcmp/select and one fast llvm.vector.reduce.fmax
; call in middle.block.
; The label directive spells out the full function name so it cannot match
; the sibling function that shares the @fmax_fast prefix.
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define bfloat @fmax_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
; CHECK-LABEL: @fmax_fast_bfloat_zvfbfmin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
; CHECK-NEXT: call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  ; Running maximum, seeded with 0.0.
  %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
  ; Consecutive bfloat elements are 2 bytes apart, so 2 is the largest
  ; alignment that can hold on every iteration.
  %0 = load bfloat, ptr %arrayidx, align 2
  %cmp.i = fcmp fast ogt bfloat %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret bfloat %.sroa.speculated
}
508
+
381
509
; Reduction cannot be vectorized
382
510
383
511
; MUL
@@ -591,6 +719,8 @@ for.end:
591
719
declare float @llvm.fmuladd.f32(float, float, float)

; Function attribute groups. All three set "no-nans-fp-math" and
; "no-signed-zeros-fp-math"; #1 and #2 additionally set per-function
; target features.
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
; #1: adds +zfhmin,+zvfhmin; referenced by the *_half_zvfhmin functions.
attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfhmin,+zvfhmin" }
; #2: adds +zfbfmin,+zvfbfmin; referenced by the *_bfloat_zvfbfmin functions.
attributes #2 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfbfmin,+zvfbfmin" }
594
724
595
725
!0 = distinct !{!0 , !1 , !2 , !3 , !4 }
596
726
!1 = !{!"llvm.loop.vectorize.width" , i32 8 }
0 commit comments