Skip to content

Commit 03505a0

Browse files
authored
[RISCV] Enable scalable loop vectorization for fmax/fmin reductions with f16/bf16 type for zvfhmin/zvfbfmin (#129629)
This PR enables scalable loop vectorization for fmax and fmin reductions with f16/bf16 types when only zvfhmin/zvfbfmin are enabled. After #128800, we can promote the fmax/fmin reductions with f16/bf16 types to f32 reductions for zvfhmin/zvfbfmin.
1 parent 47fb9c4 commit 03505a0

File tree

2 files changed

+138
-9
lines changed

2 files changed

+138
-9
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -349,27 +349,26 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
349349
if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, Ty)))
350350
return false;
351351

352-
// We can't promote f16/bf16 fadd reductions and scalable vectors can't be
353-
// expanded.
354-
// TODO: Promote f16/bf16 fmin/fmax reductions
355-
if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
356-
return false;
357-
358352
switch (RdxDesc.getRecurrenceKind()) {
359353
case RecurKind::Add:
360-
case RecurKind::FAdd:
361354
case RecurKind::And:
362355
case RecurKind::Or:
363356
case RecurKind::Xor:
364357
case RecurKind::SMin:
365358
case RecurKind::SMax:
366359
case RecurKind::UMin:
367360
case RecurKind::UMax:
361+
case RecurKind::IAnyOf:
368362
case RecurKind::FMin:
369363
case RecurKind::FMax:
370-
case RecurKind::FMulAdd:
371-
case RecurKind::IAnyOf:
364+
return true;
372365
case RecurKind::FAnyOf:
366+
case RecurKind::FAdd:
367+
case RecurKind::FMulAdd:
368+
// We can't promote f16/bf16 fadd reductions and scalable vectors can't be
369+
// expanded.
370+
if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
371+
return false;
373372
return true;
374373
default:
375374
return false;

llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,70 @@ for.end:
344344
ret float %.sroa.speculated
345345
}
346346

347+
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
348+
define half @fmin_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
349+
; CHECK-LABEL: @fmin_fast
350+
; CHECK: vector.body:
351+
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
352+
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
353+
; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD1]]
354+
; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD2]]
355+
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
356+
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
357+
; CHECK: middle.block:
358+
; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
359+
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
360+
; CHECK-NEXT: call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> %[[SEL]])
361+
entry:
362+
br label %for.body
363+
364+
for.body:
365+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
366+
%sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
367+
%arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
368+
%0 = load half, ptr %arrayidx, align 4
369+
%cmp.i = fcmp olt half %0, %sum.07
370+
%.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
371+
%iv.next = add nuw nsw i64 %iv, 1
372+
%exitcond.not = icmp eq i64 %iv.next, %n
373+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
374+
375+
for.end:
376+
ret half %.sroa.speculated
377+
}
378+
379+
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
380+
define bfloat @fmin_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
381+
; CHECK-LABEL: @fmin_fast
382+
; CHECK: vector.body:
383+
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
384+
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
385+
; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD1]]
386+
; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD2]]
387+
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
388+
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
389+
; CHECK: middle.block:
390+
; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
391+
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
392+
; CHECK-NEXT: call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
393+
entry:
394+
br label %for.body
395+
396+
for.body:
397+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
398+
%sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
399+
%arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
400+
%0 = load bfloat, ptr %arrayidx, align 4
401+
%cmp.i = fcmp olt bfloat %0, %sum.07
402+
%.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
403+
%iv.next = add nuw nsw i64 %iv, 1
404+
%exitcond.not = icmp eq i64 %iv.next, %n
405+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
406+
407+
for.end:
408+
ret bfloat %.sroa.speculated
409+
}
410+
347411
; FMAX (FAST)
348412

349413
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
@@ -378,6 +442,70 @@ for.end:
378442
ret float %.sroa.speculated
379443
}
380444

445+
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
446+
define half @fmax_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
447+
; CHECK-LABEL: @fmax_fast
448+
; CHECK: vector.body:
449+
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
450+
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
451+
; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD1]]
452+
; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD2]]
453+
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
454+
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
455+
; CHECK: middle.block:
456+
; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
457+
; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
458+
; CHECK-NEXT: call fast half @llvm.vector.reduce.fmax.nxv8f16(<vscale x 8 x half> %[[SEL]])
459+
entry:
460+
br label %for.body
461+
462+
for.body:
463+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
464+
%sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
465+
%arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
466+
%0 = load half, ptr %arrayidx, align 4
467+
%cmp.i = fcmp fast ogt half %0, %sum.07
468+
%.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
469+
%iv.next = add nuw nsw i64 %iv, 1
470+
%exitcond.not = icmp eq i64 %iv.next, %n
471+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
472+
473+
for.end:
474+
ret half %.sroa.speculated
475+
}
476+
477+
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
478+
define bfloat @fmax_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
479+
; CHECK-LABEL: @fmax_fast
480+
; CHECK: vector.body:
481+
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
482+
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
483+
; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD1]]
484+
; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD2]]
485+
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
486+
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
487+
; CHECK: middle.block:
488+
; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
489+
; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
490+
; CHECK-NEXT: call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
491+
entry:
492+
br label %for.body
493+
494+
for.body:
495+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
496+
%sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
497+
%arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
498+
%0 = load bfloat, ptr %arrayidx, align 4
499+
%cmp.i = fcmp fast ogt bfloat %0, %sum.07
500+
%.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
501+
%iv.next = add nuw nsw i64 %iv, 1
502+
%exitcond.not = icmp eq i64 %iv.next, %n
503+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
504+
505+
for.end:
506+
ret bfloat %.sroa.speculated
507+
}
508+
381509
; Reduction cannot be vectorized
382510

383511
; MUL
@@ -591,6 +719,8 @@ for.end:
591719
declare float @llvm.fmuladd.f32(float, float, float)
592720

593721
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
722+
attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfhmin,+zvfhmin"}
723+
attributes #2 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfbfmin,+zvfbfmin"}
594724

595725
!0 = distinct !{!0, !1, !2, !3, !4}
596726
!1 = !{!"llvm.loop.vectorize.width", i32 8}

0 commit comments

Comments
 (0)