Skip to content

Commit a23c221

Browse files
committed
[DAG] Add generic i8 CTPOP lowering using i32 MUL
Limit this behind a TLI.shouldAllowMultiplyInBitCounts callback, as so far only x86 really benefits from this. Fixes #79823
1 parent ea29842 commit a23c221

File tree

5 files changed

+370
-379
lines changed

5 files changed

+370
-379
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3258,6 +3258,12 @@ class TargetLoweringBase {
32583258
return false;
32593259
}
32603260

3261+
/// Return true if CTPOP/CTTZ/CTLZ/PARITY expansions should try to use integer
3262+
/// multiplies should the input value be suitable.
3263+
virtual bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const {
3264+
return false;
3265+
}
3266+
32613267
// Should we fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x))
32623268
// A) where y has a single bit set?
32633269
virtual bool shouldFoldSelectWithSingleBitTest(EVT VT,

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8639,7 +8639,22 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
86398639
if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
86408640
return SDValue();
86418641

8642-
// This is the "best" algorithm from
8642+
// i8 CTPOP - if i32 MUL is efficient, attempt multiply-mask-multiply.
8643+
if (VT == MVT::i8 && shouldAllowMultiplyInBitCounts(MVT::i8, MVT::i32)) {
8644+
SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
8645+
Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
8646+
Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
8647+
DAG.getConstant(0x08040201U, dl, MVT::i32));
8648+
Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
8649+
DAG.getShiftAmountConstant(3, MVT::i32, dl));
8650+
Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
8651+
Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
8652+
Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
8653+
DAG.getShiftAmountConstant(28, MVT::i32, dl));
8654+
return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
8655+
}
8656+
8657+
// This is the "best" fallback algorithm from
86438658
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
86448659
SDValue Mask55 =
86458660
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,10 @@ namespace llvm {
11741174

11751175
bool shouldSplatInsEltVarIndex(EVT VT) const override;
11761176

1177+
bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const override {
1178+
return CntVT.isScalarInteger() && isOperationLegal(ISD::MUL, MulVT);
1179+
}
1180+
11771181
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
11781182
// Converting to sat variants holds little benefit on X86 as we will just
11791183
// need to saturate the value back using fp arithmetic.

llvm/test/CodeGen/X86/ctpop-combine.ll

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
8888
;
8989
; NO-POPCOUNT-LABEL: test4:
9090
; NO-POPCOUNT: # %bb.0:
91-
; NO-POPCOUNT-NEXT: movl %edi, %ecx
92-
; NO-POPCOUNT-NEXT: andb $127, %cl
93-
; NO-POPCOUNT-NEXT: shrb %dil
94-
; NO-POPCOUNT-NEXT: andb $21, %dil
95-
; NO-POPCOUNT-NEXT: subb %dil, %cl
96-
; NO-POPCOUNT-NEXT: movl %ecx, %eax
97-
; NO-POPCOUNT-NEXT: andb $51, %al
98-
; NO-POPCOUNT-NEXT: shrb $2, %cl
99-
; NO-POPCOUNT-NEXT: andb $51, %cl
100-
; NO-POPCOUNT-NEXT: addb %al, %cl
101-
; NO-POPCOUNT-NEXT: movl %ecx, %eax
102-
; NO-POPCOUNT-NEXT: shrb $4, %al
103-
; NO-POPCOUNT-NEXT: addb %cl, %al
104-
; NO-POPCOUNT-NEXT: andb $15, %al
91+
; NO-POPCOUNT-NEXT: andl $127, %edi
92+
; NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201
93+
; NO-POPCOUNT-NEXT: shrl $3, %eax
94+
; NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
95+
; NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
96+
; NO-POPCOUNT-NEXT: shrl $28, %eax
97+
; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax
10598
; NO-POPCOUNT-NEXT: retq
10699
%x2 = and i8 %x, 127
107100
%count = tail call i8 @llvm.ctpop.i8(i8 %x2)

0 commit comments

Comments
 (0)