[RISCV] Vectorize phi for loop carried @llvm.vp.reduce.* #131974
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: MingYan (NexMing)

Changes

This patch is the vector predication version of commit 15b0fab.

Patch is 52.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131974.diff

3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 5be5345cca73a..39877fb511ec3 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -137,7 +137,8 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
if (expandVPStrideLoad(I))
return true;
- if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd)
+ if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
+ !isa<VPReductionIntrinsic>(&I))
return false;
auto *PHI = dyn_cast<PHINode>(I.getOperand(0));
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
index 3bbdd1a257fdb..4e5f6e0f65489 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
@@ -42,3 +42,459 @@ vector.body:
exit:
ret float %acc
}
+
+define i32 @vp_reduce_add(ptr %a) {
+; CHECK-LABEL: vp_reduce_add:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB1_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredsum.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB1_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_and(ptr %a) {
+; CHECK-LABEL: vp_reduce_and:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB2_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredand.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB2_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_or(ptr %a) {
+; CHECK-LABEL: vp_reduce_or:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB3_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredor.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB3_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_xor(ptr %a) {
+; CHECK-LABEL: vp_reduce_xor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB4_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredxor.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB4_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smax(ptr %a) {
+; CHECK-LABEL: vp_reduce_smax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB5_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmax.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB5_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smin(ptr %a) {
+; CHECK-LABEL: vp_reduce_smin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB6_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmin.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB6_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 2147483647, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umax(ptr %a) {
+; CHECK-LABEL: vp_reduce_umax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB7_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmaxu.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB7_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umin(ptr %a) {
+; CHECK-LABEL: vp_reduce_umin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB8_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredminu.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB8_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define float @vp_reduce_fadd(ptr %a) {
+; CHECK-LABEL: vp_reduce_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB9_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredosum.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB9_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fadd.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmax(ptr %a) {
+; CHECK-LABEL: vp_reduce_fmax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB10_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredmax.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB10_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmax.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmin(ptr %a) {
+; CHECK-LABEL: vp_reduce_fmin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB11_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredmin.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB11_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %r...
[truncated]
The original patch only supported ordered fadd. Why does this need to support more than just vp.reduce.fadd?
Oh, I guess it's because of the scalar input that the regular reduce intrinsics don't have?
That's right. I think this optimization will not change the order of operations. |
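For reference, a rough sketch of the intrinsic signatures involved (paraphrased from the LangRef, not part of this patch): the plain integer reductions have no start operand, while ordered fadd and every VP reduction take a scalar start value that can be carried by a loop phi.

```llvm
; Plain integer reduction: no scalar start operand, so no loop-carried scalar phi.
declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)

; Ordered fadd reduction: takes a scalar start value (handled by the original commit).
declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)

; VP reductions: every one takes a scalar start value, plus a mask and an EVL,
; so any of them can feed a scalar accumulator phi in a loop.
declare i32 @llvm.vp.reduce.add.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
```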
This patch is the vector predication version of commit 15b0fab.
LGTM, thanks
Can you update the comment above visitIntrinsicInst to say something like
// This affects ordered fadd reductions and VP reductions that have a scalar start value.
// This tries to vectorize any scalar phis that feed into these reductions:
Could someone help merge this PR?
I'm just noticing it looks like you're a member of the LLVM organization. Does that mean you already have commit access?
I applied for permissions back when LLVM was using Phabricator, but they may have been revoked due to long-term inactivity. Do I need to reapply for permissions now?
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/17/builds/6891. Here is the relevant piece of the build log for reference.
Oh yes, I see you've been moved to the triagers team. I think according to the RFC you can create a new GitHub issue to restore commit access? @tstellar I couldn't find anything in https://llvm.org/docs/DeveloperPolicy.html#obtaining-commit-access about regaining commit access as described in the RFC; should that be documented somewhere?
LLVM vector predication reduction intrinsics return a scalar result, but on RISC-V the vector reduction instructions write the result to the first element of a vector register. So when a reduction in a loop uses a scalar phi, we end up with unnecessary scalar moves:

```asm
loop:
  vmv.s.x v8, zero
  vredsum.vs v8, v10, v8
  vmv.x.s a0, v8
```

This mainly affects vector predication reductions. This patch tries to vectorize any scalar phis that feed into a vector predication reduction in RISCVCodeGenPrepare, converting:

```llvm
vector.body:
  %red.phi = phi i32 [ ..., %entry ], [ %red, %vector.body ]
  %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
```

to

```llvm
vector.body:
  %red.phi = phi <vscale x 2 x i32> [ ..., %entry ], [ %acc.vec, %vector.body ]
  %phi.scalar = extractelement <vscale x 2 x i32> %red.phi, i64 0
  %acc = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %phi.scalar, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %acc.vec = insertelement <vscale x 2 x i32> poison, i32 %acc, i64 0
```

This eliminates the scalar -> vector -> scalar crossing during instruction selection.

---------

Co-authored-by: yanming <[email protected]>
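To make the entry side of that conversion concrete, here is a hypothetical sketch (based on the description above and the vp_reduce_and test, not code taken from the patch) of how a reduction with a non-zero start value such as INT_MIN could look once the phi is vectorized: the start value is inserted into lane 0 of a vector in the preheader, so the accumulator never crosses back into scalar registers inside the loop. %wide.load and %evl stand for the loop's masked load and EVL computation.

```llvm
entry:
  ; assumed: the scalar start value becomes a lane-0 insert in the preheader,
  ; which is why the tests show vmv.s.x hoisted out of the loop
  %start.vec = insertelement <vscale x 2 x i32> poison, i32 -2147483648, i64 0
  br label %vector.body

vector.body:
  %red.phi = phi <vscale x 2 x i32> [ %start.vec, %entry ], [ %acc.vec, %vector.body ]
  %phi.scalar = extractelement <vscale x 2 x i32> %red.phi, i64 0
  %acc = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %phi.scalar, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %acc.vec = insertelement <vscale x 2 x i32> poison, i32 %acc, i64 0
```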