Skip to content

[CodeGenPrepare] Unfold slow ctpop when used in power-of-two test #102731

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 73 additions & 28 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ class CodeGenPrepare {
bool optimizeURem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool unfoldPowerOf2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
};
Expand Down Expand Up @@ -1762,6 +1763,75 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}

// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
// The same transformation exists in DAG combiner, but we repeat it here because
// DAG builder can break the pattern by moving icmp into a successor block.
bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
CmpPredicate Pred;
Value *X;
const APInt *C;

// (icmp (ctpop x), c)
if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
m_APIntAllowPoison(C))))
return false;

// We're only interested in "is power of 2 [or zero]" patterns.
bool IsStrictlyPowerOf2Test = ICmpInst::isEquality(Pred) && *C == 1;
bool IsPowerOf2OrZeroTest = (Pred == CmpInst::ICMP_ULT && *C == 2) ||
(Pred == CmpInst::ICMP_UGT && *C == 1);
if (!IsStrictlyPowerOf2Test && !IsPowerOf2OrZeroTest)
return false;

// Some targets have better codegen for `ctpop(x) u</u>= 2/1`than for
// `ctpop(x) ==/!= 1`. If ctpop is fast, only try changing the comparison,
// and otherwise expand ctpop into a few simple instructions.
Type *OpTy = X->getType();
if (TLI->isCtpopFast(TLI->getValueType(*DL, OpTy))) {
// Look for `ctpop(x) ==/!= 1`, where `ctpop(x)` is known to be non-zero.
if (!IsStrictlyPowerOf2Test || !isKnownNonZero(Cmp->getOperand(0), *DL))
return false;

// ctpop(x) == 1 -> ctpop(x) u< 2
// ctpop(x) != 1 -> ctpop(x) u> 1
if (Pred == ICmpInst::ICMP_EQ) {
Cmp->setOperand(1, ConstantInt::get(OpTy, 2));
Cmp->setPredicate(ICmpInst::ICMP_ULT);
} else {
Cmp->setPredicate(ICmpInst::ICMP_UGT);
}
return true;
}

Value *NewCmp;
if (IsPowerOf2OrZeroTest ||
(IsStrictlyPowerOf2Test && isKnownNonZero(Cmp->getOperand(0), *DL))) {
// ctpop(x) u< 2 -> (x & (x - 1)) == 0
// ctpop(x) u> 1 -> (x & (x - 1)) != 0
IRBuilder<> Builder(Cmp);
Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
Value *And = Builder.CreateAnd(X, Sub);
CmpInst::Predicate NewPred =
(Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_EQ)
? CmpInst::ICMP_EQ
: CmpInst::ICMP_NE;
NewCmp = Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(OpTy));
} else {
// ctpop(x) == 1 -> (x ^ (x - 1)) u> (x - 1)
// ctpop(x) != 1 -> (x ^ (x - 1)) u<= (x - 1)
IRBuilder<> Builder(Cmp);
Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
Value *Xor = Builder.CreateXor(X, Sub);
CmpInst::Predicate NewPred =
Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
NewCmp = Builder.CreateICmp(NewPred, Xor, Sub);
}

Cmp->replaceAllUsesWith(NewCmp);
RecursivelyDeleteTriviallyDeadInstructions(Cmp);
return true;
}

/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced. This is a clear win except on
/// targets with multiple condition code registers (PowerPC), where it might
Expand Down Expand Up @@ -2148,31 +2218,6 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) {
return false;
}

/// Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`.
/// This function converts `ctpop(X) ==/!= 1` into `ctpop(X) u</u> 2/1` if the
/// result cannot be zero.
static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
const TargetTransformInfo &TTI,
const DataLayout &DL) {
CmpPredicate Pred;
if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(), m_One())))
return false;
if (!ICmpInst::isEquality(Pred))
return false;
auto *II = cast<IntrinsicInst>(Cmp->getOperand(0));

if (isKnownNonZero(II, DL)) {
if (Pred == ICmpInst::ICMP_EQ) {
Cmp->setOperand(1, ConstantInt::get(II->getType(), 2));
Cmp->setPredicate(ICmpInst::ICMP_ULT);
} else {
Cmp->setPredicate(ICmpInst::ICMP_UGT);
}
return true;
}
return false;
}

bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
Expand All @@ -2183,6 +2228,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (combineToUSubWithOverflow(Cmp, ModifiedDT))
return true;

if (unfoldPowerOf2Test(Cmp))
return true;

if (foldICmpWithDominatingICmp(Cmp, *TLI))
return true;

Expand All @@ -2192,9 +2240,6 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (foldFCmpToFPClassTest(Cmp, *TLI, *DL))
return true;

if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL))
return true;

return false;
}

Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11945,23 +11945,23 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ugt_1_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
; PWR5-NEXT: addi 5, 4, -1
; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: subfic 3, 3, 0
; PWR5-NEXT: subfe 3, 3, 3
; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: subfic 4, 4, 0
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
;
; PWR6-LABEL: ugt_1_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
; PWR6-NEXT: addi 5, 4, -1
; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: subfic 3, 3, 0
; PWR6-NEXT: subfe 3, 3, 3
; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: subfic 4, 4, 0
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
Expand Down Expand Up @@ -12016,23 +12016,23 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ult_2_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
; PWR5-NEXT: addi 5, 4, -1
; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: addic 3, 3, -1
; PWR5-NEXT: subfe 3, 3, 3
; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: addic 4, 4, -1
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
;
; PWR6-LABEL: ult_2_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
; PWR6-NEXT: addi 5, 4, -1
; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: addic 3, 3, -1
; PWR6-NEXT: subfe 3, 3, 3
; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: addic 4, 4, -1
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
Expand Down
Loading
Loading