[RISCVISel] Compute leading zeros for RISCVISD::VCPOP_VL node #127705


Merged: 2 commits into llvm:main on Feb 19, 2025

Conversation

@vbe-sc (Contributor) commented Feb 18, 2025

This patch adds handling of the RISCVISD::VCPOP_VL node in RISCVTargetLowering::computeKnownBitsForTargetNode. It eliminates redundant zero-extension instructions.

@llvmbot (Member) commented Feb 18, 2025

@llvm/pr-subscribers-backend-risc-v

Author: Vladislav Belov (vbe-sc)

Changes

This patch adds handling of the RISCVISD::VCPOP_VL node in RISCVTargetLowering::computeKnownBitsForTargetNode. It eliminates redundant zero-extension instructions.
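For intuition (an editorial sketch, not part of the patch): `vcpop.m` counts the set bits of a mask over at most VL elements, so its result never exceeds VL, and every result bit from the bit width of VL's maximum value upward is known zero. The standalone C++ sketch below models that bound; `knownZeroMaskForVcpop` is a made-up name, and `std::bit_width` stands in for LLVM's `KnownBits::countMaxActiveBits()` (the review thread below settles on that call; the diff as first posted used `countMaxTrailingZeros()`).

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

// Editorial sketch, not LLVM code: a vcpop result is bounded by VL, so all
// result bits at positions >= bit_width(vlMax) are known to be zero.
// std::bit_width plays the role of KnownBits::countMaxActiveBits() in LLVM.
uint64_t knownZeroMaskForVcpop(uint64_t vlMax) {
  unsigned activeBits = std::bit_width(vlMax);
  return activeBits >= 64 ? 0 : ~((uint64_t{1} << activeBits) - 1);
}

int main() {
  // VL = 8: the popcount is in [0, 8], which fits in 4 bits, so bits 4 and
  // above are known zero and a later zext of the result is redundant.
  assert(knownZeroMaskForVcpop(8) == ~uint64_t{0xF});
}
```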


Full diff: https://github.com/llvm/llvm-project/pull/127705.diff

4 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+5)
  • (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll (-5)
  • (added) llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll (+18)
  • (added) llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll (+198)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 98c25bc93a8a2..28aa6e46c58c7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -19462,6 +19462,11 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known = Known.intersectWith(Known2);
     break;
   }
+  case RISCVISD::VCPOP_VL: {
+    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
+    Known.Zero.setBitsFrom(Known2.countMaxTrailingZeros());
+    break;
+  }
   case RISCVISD::CZERO_EQZ:
   case RISCVISD::CZERO_NEZ:
     Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll
index 0d31ec5f78435..9f1a3f78024a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll
@@ -100,7 +100,6 @@ define zeroext i1 @vreduce_or_v2i1(<2 x i1> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vcpop.m a0, v0
-; CHECK-NEXT:    snez a0, a0
 ; CHECK-NEXT:    ret
   %red = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %v)
   ret i1 %red
@@ -113,7 +112,6 @@ define zeroext i1 @vreduce_xor_v2i1(<2 x i1> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vcpop.m a0, v0
-; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    ret
   %red = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %v)
   ret i1 %red
@@ -140,7 +138,6 @@ define zeroext i1 @vreduce_umax_v2i1(<2 x i1> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vcpop.m a0, v0
-; CHECK-NEXT:    snez a0, a0
 ; CHECK-NEXT:    ret
   %red = call i1 @llvm.vector.reduce.umax.v2i1(<2 x i1> %v)
   ret i1 %red
@@ -181,7 +178,6 @@ define zeroext i1 @vreduce_smin_v2i1(<2 x i1> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vcpop.m a0, v0
-; CHECK-NEXT:    snez a0, a0
 ; CHECK-NEXT:    ret
   %red = call i1 @llvm.vector.reduce.smin.v2i1(<2 x i1> %v)
   ret i1 %red
@@ -691,7 +687,6 @@ define zeroext i1 @vreduce_add_v2i1(<2 x i1> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vcpop.m a0, v0
-; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    ret
   %red = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> %v)
   ret i1 %red
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll
new file mode 100644
index 0000000000000..7c569da9291db
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64
+
+define i32 @test(<8 x i1> %mask) {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vcpop.m a0, v0
+; CHECK-NEXT:    ret
+    %1 = bitcast <8 x i1> %mask to i8
+    %2 = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %1)
+    %3 = zext nneg i8 %2 to i32
+    ret i32 %3
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
new file mode 100644
index 0000000000000..16c4ade7fa9cb
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64
+
+define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef signext %c, i32 noundef signext %n) {
+; RV32-LABEL: test_store1:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    blez a3, .LBB0_6
+; RV32-NEXT:  # %bb.1: # %for.body.preheader
+; RV32-NEXT:    li a4, 8
+; RV32-NEXT:    bltu a3, a4, .LBB0_7
+; RV32-NEXT:  # %bb.2: # %for.body.preheader
+; RV32-NEXT:    sub a4, a0, a1
+; RV32-NEXT:    sltu a5, a0, a1
+; RV32-NEXT:    neg a5, a5
+; RV32-NEXT:    sltiu a4, a4, 32
+; RV32-NEXT:    seqz a5, a5
+; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    bnez a4, .LBB0_7
+; RV32-NEXT:  # %bb.3: # %vector.ph
+; RV32-NEXT:    lui a5, 524288
+; RV32-NEXT:    addi a5, a5, -8
+; RV32-NEXT:    and a5, a3, a5
+; RV32-NEXT:    li a7, 0
+; RV32-NEXT:    li a6, 0
+; RV32-NEXT:  .LBB0_4: # %vector.body
+; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-NEXT:    slli t0, a7, 2
+; RV32-NEXT:    addi t1, a7, 8
+; RV32-NEXT:    add t0, a1, t0
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vle32.v v8, (t0)
+; RV32-NEXT:    sltu a7, t1, a7
+; RV32-NEXT:    xor t0, t1, a5
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    vmslt.vx v10, v8, a2
+; RV32-NEXT:    vcompress.vm v12, v8, v10
+; RV32-NEXT:    vcpop.m a7, v10
+; RV32-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
+; RV32-NEXT:    vse32.v v12, (a0)
+; RV32-NEXT:    slli a7, a7, 2
+; RV32-NEXT:    or t0, t0, a6
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    mv a7, t1
+; RV32-NEXT:    bnez t0, .LBB0_4
+; RV32-NEXT:  # %bb.5: # %middle.block
+; RV32-NEXT:    bne a5, a3, .LBB0_9
+; RV32-NEXT:  .LBB0_6: # %for.cond.cleanup
+; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB0_7:
+; RV32-NEXT:    li a5, 0
+; RV32-NEXT:    li a4, 0
+; RV32-NEXT:    j .LBB0_9
+; RV32-NEXT:  .LBB0_8: # %for.inc
+; RV32-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; RV32-NEXT:    addi a5, a5, 1
+; RV32-NEXT:    seqz a6, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    xor a6, a5, a3
+; RV32-NEXT:    or a6, a6, a4
+; RV32-NEXT:    beqz a6, .LBB0_6
+; RV32-NEXT:  .LBB0_9: # %for.body
+; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-NEXT:    slli a6, a5, 2
+; RV32-NEXT:    add a6, a1, a6
+; RV32-NEXT:    lw a6, 0(a6)
+; RV32-NEXT:    bge a6, a2, .LBB0_8
+; RV32-NEXT:  # %bb.10: # %if.then
+; RV32-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; RV32-NEXT:    addi a7, a0, 4
+; RV32-NEXT:    sw a6, 0(a0)
+; RV32-NEXT:    mv a0, a7
+; RV32-NEXT:    j .LBB0_8
+;
+; RV64-LABEL: test_store1:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    blez a3, .LBB0_6
+; RV64-NEXT:  # %bb.1: # %for.body.preheader
+; RV64-NEXT:    li a5, 8
+; RV64-NEXT:    li a4, 0
+; RV64-NEXT:    bltu a3, a5, .LBB0_7
+; RV64-NEXT:  # %bb.2: # %for.body.preheader
+; RV64-NEXT:    sub a5, a0, a1
+; RV64-NEXT:    li a6, 31
+; RV64-NEXT:    bgeu a6, a5, .LBB0_7
+; RV64-NEXT:  # %bb.3: # %vector.ph
+; RV64-NEXT:    lui a4, 524288
+; RV64-NEXT:    addiw a4, a4, -8
+; RV64-NEXT:    and a4, a3, a4
+; RV64-NEXT:    slli a5, a4, 2
+; RV64-NEXT:    add a5, a5, a1
+; RV64-NEXT:    mv a6, a1
+; RV64-NEXT:  .LBB0_4: # %vector.body
+; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vle32.v v8, (a6)
+; RV64-NEXT:    addi a6, a6, 32
+; RV64-NEXT:    vmslt.vx v10, v8, a2
+; RV64-NEXT:    vcompress.vm v12, v8, v10
+; RV64-NEXT:    vcpop.m a7, v10
+; RV64-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
+; RV64-NEXT:    vse32.v v12, (a0)
+; RV64-NEXT:    slli a7, a7, 2
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    bne a6, a5, .LBB0_4
+; RV64-NEXT:  # %bb.5: # %middle.block
+; RV64-NEXT:    bne a4, a3, .LBB0_7
+; RV64-NEXT:  .LBB0_6: # %for.cond.cleanup
+; RV64-NEXT:    ret
+; RV64-NEXT:  .LBB0_7: # %for.body.preheader13
+; RV64-NEXT:    slli a4, a4, 2
+; RV64-NEXT:    slli a5, a3, 2
+; RV64-NEXT:    add a3, a1, a4
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    j .LBB0_9
+; RV64-NEXT:  .LBB0_8: # %for.inc
+; RV64-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; RV64-NEXT:    addi a3, a3, 4
+; RV64-NEXT:    beq a3, a1, .LBB0_6
+; RV64-NEXT:  .LBB0_9: # %for.body
+; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-NEXT:    lw a4, 0(a3)
+; RV64-NEXT:    bge a4, a2, .LBB0_8
+; RV64-NEXT:  # %bb.10: # %if.then
+; RV64-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; RV64-NEXT:    addi a5, a0, 4
+; RV64-NEXT:    sw a4, 0(a0)
+; RV64-NEXT:    mv a0, a5
+; RV64-NEXT:    j .LBB0_8
+entry:
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %dst11 = ptrtoint ptr %dst to i64
+  %src12 = ptrtoint ptr %src to i64
+  %wide.trip.count = zext nneg i32 %n to i64
+  %min.iters.check = icmp ult i32 %n, 8
+  %0 = sub i64 %dst11, %src12
+  %diff.check = icmp ult i64 %0, 32
+  %or.cond = or i1 %min.iters.check, %diff.check
+  br i1 %or.cond, label %for.body.preheader13, label %vector.ph
+
+for.body.preheader13:                             ; preds = %middle.block, %for.body.preheader
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  %dst.addr.09.ph = phi ptr [ %dst, %for.body.preheader ], [ %monotonic.add, %middle.block ]
+  br label %for.body
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 2147483640
+  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %c, i64 0
+  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %monotonic.iv = phi ptr [ %dst, %vector.ph ], [ %monotonic.add, %vector.body ]
+  %1 = getelementptr inbounds i32, ptr %src, i64 %index
+  %wide.load = load <8 x i32>, ptr %1, align 4
+  %2 = icmp slt <8 x i32> %wide.load, %broadcast.splat
+  tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %wide.load, ptr align 4 %monotonic.iv, <8 x i1> %2)
+  %3 = bitcast <8 x i1> %2 to i8
+  %4 = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %3)
+  %5 = shl nuw nsw i8 %4, 2
+  %6 = zext nneg i8 %5 to i64
+  %monotonic.add = getelementptr inbounds i8, ptr %monotonic.iv, i64 %6
+  %index.next = add nuw i64 %index, 8
+  %7 = icmp eq i64 %index.next, %n.vec
+  br i1 %7, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
+
+for.cond.cleanup:                                 ; preds = %for.inc, %middle.block, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader13, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ %indvars.iv.ph, %for.body.preheader13 ]
+  %dst.addr.09 = phi ptr [ %dst.addr.1, %for.inc ], [ %dst.addr.09.ph, %for.body.preheader13 ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %8 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp slt i32 %8, %c
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %incdec.ptr = getelementptr inbounds i8, ptr %dst.addr.09, i64 4
+  store i32 %8, ptr %dst.addr.09, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %dst.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %dst.addr.09, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

@@ -19462,6 +19462,11 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = Known.intersectWith(Known2);
break;
}
case RISCVISD::VCPOP_VL: {
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
Known.Zero.setBitsFrom(Known2.countMaxTrailingZeros());
Collaborator
This isn't correct. It would need Known.Zero.setBitsFrom(Known2.countMaxActiveBits());

Contributor Author
You are right, thanks. Fixed.
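To make the distinction concrete (an editorial worked example, not from the thread): with VL known to be 8, the popcount result lies in [0, 8]. countMaxTrailingZeros(8) is 3, so setBitsFrom(3) would claim bit 3 is always zero, yet a result of 8 (0b1000) has exactly that bit set. countMaxActiveBits(8) is 4, and setBitsFrom(4) correctly clears only bits 4 and above.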

@@ -100,7 +100,6 @@ define zeroext i1 @vreduce_or_v2i1(<2 x i1> %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vcpop.m a0, v0
Collaborator
The population count here is 0, 1, or 2. The return value must be 0 or 1. It's not legal to remove the snez since that would allow 2 to be returned.

Contributor Author
Yes, you are right. With your fix it doesn't touch this code anymore.
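Concretely (an editorial note): for <2 x i1> the VL is 2, so the popcount is in [0, 2], while the zeroext i1 return value must be 0 or 1; the snez can only be dropped if known bits prove the result is already 0 or 1. The original countMaxTrailingZeros(2) = 1 wrongly claimed that, whereas the corrected countMaxActiveBits(2) = 2 admits a result of 2, so the snez stays.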

@vbe-sc force-pushed the vbe-sc/vcpop-optimization branch from fda2e7e to 9ef3072 on February 18, 2025 at 22:21
@vbe-sc requested a review from topperc on February 18, 2025 at 22:26
@topperc (Collaborator) left a comment:
LGTM

@asi-sc merged commit b9a1e58 into llvm:main on Feb 19, 2025
8 checks passed