-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[RISCV] Enable scalable loop vectorization for fmax/fmin reductions with f16/bf16 type for zvfhmin/zvfbfmin #129629
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Enable scalable loop vectorization for fmax/fmin reductions with f16/bf16 type for zvfhmin/zvfbfmin #129629
Conversation
…ith f16/bf16 type for zvfhmin/zvfbfmin This PR enables scalable loop vectorization for fmax and fmin reductions with f16/bf16 type when only zvfhmin/zvfbfmin are enabled. After llvm#128800, we can promote the fmax/fmin reductions with f16/bf16 type to f32 reductions for zvfhmin/zvfbfmin.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v Author: Jim Lin (tclin914) Changes: This PR enables scalable loop vectorization for fmax and fmin reductions with f16/bf16 type when only zvfhmin/zvfbfmin are enabled. After #128800, we can promote the fmax/fmin reductions with f16/bf16 type to f32 reductions for zvfhmin/zvfbfmin. Full diff: https://github.com/llvm/llvm-project/pull/129629.diff 2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 3f57560d3c127..65369f99cf56e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -349,15 +349,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, Ty)))
return false;
- // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
- // expanded.
- // TODO: Promote f16/bf16 fmin/fmax reductions
- if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
- return false;
-
switch (RdxDesc.getRecurrenceKind()) {
case RecurKind::Add:
- case RecurKind::FAdd:
case RecurKind::And:
case RecurKind::Or:
case RecurKind::Xor:
@@ -365,11 +358,17 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
case RecurKind::SMax:
case RecurKind::UMin:
case RecurKind::UMax:
+ case RecurKind::IAnyOf:
+ case RecurKind::FAnyOf:
case RecurKind::FMin:
case RecurKind::FMax:
+ return true;
+ case RecurKind::FAdd:
case RecurKind::FMulAdd:
- case RecurKind::IAnyOf:
- case RecurKind::FAnyOf:
+ // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
+ // expanded.
+ if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
+ return false;
return true;
default:
return false;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index 01a2a757dea5d..c3c2b8ee012a0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -344,6 +344,70 @@ for.end:
ret float %.sroa.speculated
}
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fmin_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
+; CHECK-NEXT: call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %cmp.i = fcmp olt half %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret half %.sroa.speculated
+}
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define bfloat @fmin_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
+; CHECK-NEXT: call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
+ %0 = load bfloat, ptr %arrayidx, align 4
+ %cmp.i = fcmp olt bfloat %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret bfloat %.sroa.speculated
+}
+
; FMAX (FAST)
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
@@ -378,6 +442,70 @@ for.end:
ret float %.sroa.speculated
}
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fmax_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
+; CHECK-NEXT: call fast half @llvm.vector.reduce.fmax.nxv8f16(<vscale x 8 x half> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %cmp.i = fcmp fast ogt half %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret half %.sroa.speculated
+}
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define bfloat @fmax_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
+; CHECK-NEXT: call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
+ %0 = load bfloat, ptr %arrayidx, align 4
+ %cmp.i = fcmp fast ogt bfloat %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret bfloat %.sroa.speculated
+}
+
; Reduction cannot be vectorized
; MUL
@@ -591,6 +719,8 @@ for.end:
declare float @llvm.fmuladd.f32(float, float, float)
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfhmin,+zvfhmin"}
+attributes #2 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfbfmin,+zvfbfmin"}
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.width", i32 8}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…ith f16/bf16 type for zvfhmin/zvfbfmin (llvm#129629) This PR enables scalable loop vectorization for fmax and fmin reductions with f16/bf16 type when only zvfhmin/zvfbfmin are enabled. After llvm#128800, we can promote the fmax/fmin reductions with f16/bf16 type to f32 reductions for zvfhmin/zvfbfmin.
This PR enables scalable loop vectorization for fmax and fmin reductions with f16/bf16 type when only zvfhmin/zvfbfmin are enabled.
After #128800, we can promote the fmax/fmin reductions with f16/bf16 type to f32 reductions for zvfhmin/zvfbfmin.