Skip to content

Reland [CodeGenPrepare] Convert ctpop(X) ==/!= 1 into ctpop(X) u</u> 2/1 (#111284) #111998

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2111,6 +2111,31 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) {
return false;
}

/// Rewrite a popcount-based "is power of two" test into an unsigned range
/// check, which some targets lower better than an equality compare.
///
/// Only fires when the ctpop result is known to be non-zero, in which case:
///   `ctpop(X) == 1`  is rewritten in place to  `ctpop(X) u< 2`
///   `ctpop(X) != 1`  is rewritten in place to  `ctpop(X) u> 1`
///
/// \p TLI and \p TTI are not consulted by this function.
/// \returns true if \p Cmp was modified, false otherwise.
static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
                               const TargetTransformInfo &TTI,
                               const DataLayout &DL) {
  // Only an equality compare of a ctpop call against the constant 1 qualifies.
  ICmpInst::Predicate CmpPred;
  const bool IsCtpopVsOne =
      match(Cmp, m_ICmp(CmpPred, m_Intrinsic<Intrinsic::ctpop>(), m_One()));
  if (!IsCtpopVsOne || !ICmpInst::isEquality(CmpPred))
    return false;

  // Without a non-zero guarantee on the popcount, `u< 2` would also accept 0.
  auto *Ctpop = cast<IntrinsicInst>(Cmp->getOperand(0));
  if (!isKnownNonZero(Ctpop, DL))
    return false;

  if (CmpPred == ICmpInst::ICMP_NE) {
    // `!= 1` keeps its constant operand; only the predicate changes.
    Cmp->setPredicate(ICmpInst::ICMP_UGT);
    return true;
  }

  // `== 1` becomes `u< 2`: bump the constant, then switch the predicate.
  Cmp->setOperand(1, ConstantInt::get(Ctpop->getType(), 2));
  Cmp->setPredicate(ICmpInst::ICMP_ULT);
  return true;
}

bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
Expand All @@ -2130,6 +2155,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (foldFCmpToFPClassTest(Cmp, *TLI, *DL))
return true;

if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL))
return true;

return false;
}

Expand Down
68 changes: 61 additions & 7 deletions llvm/test/CodeGen/AArch64/arm64-popcnt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
; CHECK-NONEON-LABEL: cnt32_advsimd:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr w9, w0, #1
; CHECK-NONEON-NEXT: mov w8, #16843009
; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NONEON-NEXT: and w9, w9, #0x55555555
; CHECK-NONEON-NEXT: sub w9, w0, w9
; CHECK-NONEON-NEXT: lsr w10, w9, #2
Expand Down Expand Up @@ -50,7 +50,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
; CHECK-NONEON-LABEL: cnt32_advsimd_2:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr w9, w0, #1
; CHECK-NONEON-NEXT: mov w8, #16843009
; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NONEON-NEXT: and w9, w9, #0x55555555
; CHECK-NONEON-NEXT: sub w9, w0, w9
; CHECK-NONEON-NEXT: lsr w10, w9, #2
Expand Down Expand Up @@ -86,7 +86,7 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
; CHECK-NONEON-LABEL: cnt64_advsimd:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr x9, x0, #1
; CHECK-NONEON-NEXT: mov x8, #72340172838076673
; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101
; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555
; CHECK-NONEON-NEXT: sub x9, x0, x9
; CHECK-NONEON-NEXT: lsr x10, x9, #2
Expand Down Expand Up @@ -114,7 +114,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
; CHECK-LABEL: cnt32:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w9, w0, #1
; CHECK-NEXT: mov w8, #16843009
; CHECK-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NEXT: and w9, w9, #0x55555555
; CHECK-NEXT: sub w9, w0, w9
; CHECK-NEXT: lsr w10, w9, #2
Expand All @@ -130,7 +130,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
; CHECK-NONEON-LABEL: cnt32:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr w9, w0, #1
; CHECK-NONEON-NEXT: mov w8, #16843009
; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101
; CHECK-NONEON-NEXT: and w9, w9, #0x55555555
; CHECK-NONEON-NEXT: sub w9, w0, w9
; CHECK-NONEON-NEXT: lsr w10, w9, #2
Expand All @@ -155,7 +155,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
; CHECK-LABEL: cnt64:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr x9, x0, #1
; CHECK-NEXT: mov x8, #72340172838076673
; CHECK-NEXT: mov x8, #72340172838076673 // =0x101010101010101
; CHECK-NEXT: and x9, x9, #0x5555555555555555
; CHECK-NEXT: sub x9, x0, x9
; CHECK-NEXT: lsr x10, x9, #2
Expand All @@ -171,7 +171,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
; CHECK-NONEON-LABEL: cnt64:
; CHECK-NONEON: // %bb.0:
; CHECK-NONEON-NEXT: lsr x9, x0, #1
; CHECK-NONEON-NEXT: mov x8, #72340172838076673
; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101
; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555
; CHECK-NONEON-NEXT: sub x9, x0, x9
; CHECK-NONEON-NEXT: lsr x10, x9, #2
Expand Down Expand Up @@ -278,5 +278,59 @@ define i1 @ctpop32_ne_one(i32 %x) nounwind readnone {
ret i1 %cmp
}

define i1 @ctpop32_eq_one_nonzero(i32 %x) {
; CHECK-LABEL: ctpop32_eq_one_nonzero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub w8, w0, #1
; CHECK-NEXT: tst w0, w8
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
;
; CHECK-NONEON-LABEL: ctpop32_eq_one_nonzero:
; CHECK-NONEON: // %bb.0: // %entry
; CHECK-NONEON-NEXT: sub w8, w0, #1
; CHECK-NONEON-NEXT: tst w0, w8
; CHECK-NONEON-NEXT: cset w0, eq
; CHECK-NONEON-NEXT: ret
;
; CHECK-CSSC-LABEL: ctpop32_eq_one_nonzero:
; CHECK-CSSC: // %bb.0: // %entry
; CHECK-CSSC-NEXT: sub w8, w0, #1
; CHECK-CSSC-NEXT: tst w0, w8
; CHECK-CSSC-NEXT: cset w0, eq
; CHECK-CSSC-NEXT: ret
entry:
%popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp eq i32 %popcnt, 1
ret i1 %cmp
}

define i1 @ctpop32_ne_one_nonzero(i32 %x) {
; CHECK-LABEL: ctpop32_ne_one_nonzero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub w8, w0, #1
; CHECK-NEXT: tst w0, w8
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
;
; CHECK-NONEON-LABEL: ctpop32_ne_one_nonzero:
; CHECK-NONEON: // %bb.0: // %entry
; CHECK-NONEON-NEXT: sub w8, w0, #1
; CHECK-NONEON-NEXT: tst w0, w8
; CHECK-NONEON-NEXT: cset w0, ne
; CHECK-NONEON-NEXT: ret
;
; CHECK-CSSC-LABEL: ctpop32_ne_one_nonzero:
; CHECK-CSSC: // %bb.0: // %entry
; CHECK-CSSC-NEXT: sub w8, w0, #1
; CHECK-CSSC-NEXT: tst w0, w8
; CHECK-CSSC-NEXT: cset w0, ne
; CHECK-CSSC-NEXT: ret
entry:
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp ne i32 %popcnt, 1
ret i1 %cmp
}

declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
39 changes: 39 additions & 0 deletions llvm/test/CodeGen/RISCV/rv32zbb.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1441,3 +1441,42 @@ define i32 @srai_slli2(i16 signext %0) {
%3 = sext i16 %sext to i32
ret i32 %3
}

define i1 @ctpop32_eq_one_nonzero(i32 %x) {
; RV32I-LABEL: ctpop32_eq_one_nonzero:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: addi a1, a0, -1
; RV32I-NEXT: and a0, a0, a1
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop32_eq_one_nonzero:
; RV32ZBB: # %bb.0: # %entry
; RV32ZBB-NEXT: cpop a0, a0
; RV32ZBB-NEXT: sltiu a0, a0, 2
; RV32ZBB-NEXT: ret
entry:
%popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp eq i32 %popcnt, 1
ret i1 %cmp
}

define i1 @ctpop32_ne_one_nonzero(i32 %x) {
; RV32I-LABEL: ctpop32_ne_one_nonzero:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: addi a1, a0, -1
; RV32I-NEXT: and a0, a0, a1
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop32_ne_one_nonzero:
; RV32ZBB: # %bb.0: # %entry
; RV32ZBB-NEXT: cpop a0, a0
; RV32ZBB-NEXT: sltiu a0, a0, 2
; RV32ZBB-NEXT: xori a0, a0, 1
; RV32ZBB-NEXT: ret
entry:
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp ne i32 %popcnt, 1
ret i1 %cmp
}
81 changes: 81 additions & 0 deletions llvm/test/CodeGen/RISCV/rv64zbb.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1618,3 +1618,84 @@ entry:
%5 = add nsw i32 %4, %0
ret i32 %5
}

define i1 @ctpop32_eq_one_nonzero(i32 %x) {
; RV64I-LABEL: ctpop32_eq_one_nonzero:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: addi a1, a0, -1
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop32_eq_one_nonzero:
; RV64ZBB: # %bb.0: # %entry
; RV64ZBB-NEXT: cpopw a0, a0
; RV64ZBB-NEXT: sltiu a0, a0, 2
; RV64ZBB-NEXT: ret
entry:
%popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp eq i32 %popcnt, 1
ret i1 %cmp
}

define i1 @ctpop32_ne_one_nonzero(i32 %x) {
; RV64I-LABEL: ctpop32_ne_one_nonzero:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: addi a1, a0, -1
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop32_ne_one_nonzero:
; RV64ZBB: # %bb.0: # %entry
; RV64ZBB-NEXT: cpopw a0, a0
; RV64ZBB-NEXT: sltiu a0, a0, 2
; RV64ZBB-NEXT: xori a0, a0, 1
; RV64ZBB-NEXT: ret
entry:
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp ne i32 %popcnt, 1
ret i1 %cmp
}

define i1 @ctpop64_eq_one_nonzero(i64 %x) {
; RV64I-LABEL: ctpop64_eq_one_nonzero:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: addi a1, a0, -1
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop64_eq_one_nonzero:
; RV64ZBB: # %bb.0: # %entry
; RV64ZBB-NEXT: cpop a0, a0
; RV64ZBB-NEXT: sltiu a0, a0, 2
; RV64ZBB-NEXT: ret
entry:
%popcnt = call range(i64 1, 65) i64 @llvm.ctpop.i64(i64 %x)
%cmp = icmp eq i64 %popcnt, 1
ret i1 %cmp
}

; Negative test for the ctpop ==/!= 1 rewrite: the range attribute on the
; ctpop call below starts at 0, so the result is not known to be non-zero.
; The expected Zbb output still compares the popcount for equality with 1
; (cpopw, addi -1, seqz) instead of using an unsigned less-than-2 check.
define i1 @ctpop32_eq_one_maybezero(i32 %x) {
; RV64I-LABEL: ctpop32_eq_one_maybezero:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: addiw a1, a0, -1
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop32_eq_one_maybezero:
; RV64ZBB: # %bb.0: # %entry
; RV64ZBB-NEXT: cpopw a0, a0
; RV64ZBB-NEXT: addi a0, a0, -1
; RV64ZBB-NEXT: seqz a0, a0
; RV64ZBB-NEXT: ret
entry:
%popcnt = call range(i32 0, 16) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp eq i32 %popcnt, 1
ret i1 %cmp
}
45 changes: 43 additions & 2 deletions llvm/test/CodeGen/X86/ispow2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) {
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
; CHECK-AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-AVX512-NEXT: vzeroupper
Expand Down Expand Up @@ -155,7 +155,7 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) {
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
; CHECK-AVX512-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticing: this is probably not profitable for vectors on x86-64, particularly if there is no AVX512.

Copy link
Member Author

@dtcxzyw dtcxzyw Oct 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain it further? I have checked the Intel Intrinsics Guide. Both vpcmpneqq and vpcmpgtq take 3 cycles.

; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-AVX512-NEXT: vzeroupper
Expand Down Expand Up @@ -220,3 +220,44 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
%r = icmp ne <4 x i64> %cnt, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i1> %r
}


define i1 @ctpop32_eq_one_nonzero(i32 %x) {
; CHECK-NOBMI-LABEL: ctpop32_eq_one_nonzero:
; CHECK-NOBMI: # %bb.0: # %entry
; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax
; CHECK-NOBMI-NEXT: testl %eax, %edi
; CHECK-NOBMI-NEXT: sete %al
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI2-LABEL: ctpop32_eq_one_nonzero:
; CHECK-BMI2: # %bb.0: # %entry
; CHECK-BMI2-NEXT: blsrl %edi, %eax
; CHECK-BMI2-NEXT: sete %al
; CHECK-BMI2-NEXT: retq
entry:
%popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp eq i32 %popcnt, 1
ret i1 %cmp
}

define i1 @ctpop32_ne_one_nonzero(i32 %x) {
; CHECK-NOBMI-LABEL: ctpop32_ne_one_nonzero:
; CHECK-NOBMI: # %bb.0: # %entry
; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax
; CHECK-NOBMI-NEXT: testl %eax, %edi
; CHECK-NOBMI-NEXT: setne %al
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI2-LABEL: ctpop32_ne_one_nonzero:
; CHECK-BMI2: # %bb.0: # %entry
; CHECK-BMI2-NEXT: blsrl %edi, %eax
; CHECK-BMI2-NEXT: setne %al
; CHECK-BMI2-NEXT: retq
entry:
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
%cmp = icmp ne i32 %popcnt, 1
ret i1 %cmp
}
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/known-never-zero.ll
Original file line number Diff line number Diff line change
Expand Up @@ -555,9 +555,9 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
; X86-NEXT: por %xmm2, %xmm0
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: paddd %xmm0, %xmm1
; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pxor %xmm1, %xmm0
; X86-NEXT: pcmpgtd %xmm1, %xmm0
; X86-NEXT: pand %xmm1, %xmm0
; X86-NEXT: pxor %xmm1, %xmm1
; X86-NEXT: pcmpeqd %xmm1, %xmm0
; X86-NEXT: psrld $31, %xmm0
; X86-NEXT: retl
;
Expand All @@ -566,10 +566,10 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpsrld $31, %xmm0, %xmm0
; X64-NEXT: retq
%z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> <i32 54, i32 23, i32 12, i32 1>)
%r = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %z)
Expand Down
Loading