Skip to content

Commit 64594e7

Browse files
committed
[DAG] Add generic i8 CTPOP lowering using i32 MUL
Limit this behind a TLI.shouldAllowMultiplyInBitCounts callback, as so far only x86 really benefits from this. Fixes #79823
1 parent 648eb7c commit 64594e7

File tree

5 files changed

+56
-51
lines changed

5 files changed

+56
-51
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3258,6 +3258,12 @@ class TargetLoweringBase {
32583258
return false;
32593259
}
32603260

3261+
/// Return true if CTPOP/CTTZ/CTLZ/PARITY expansions should try to use integer
3262+
/// multiplies should the input value be suitable.
3263+
virtual bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const {
3264+
return false;
3265+
}
3266+
32613267
// Should we fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x))
32623268
// A) where y has a single bit set?
32633269
virtual bool shouldFoldSelectWithSingleBitTest(EVT VT,

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8639,6 +8639,24 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
86398639
if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
86408640
return SDValue();
86418641

8642+
// i8 CTPOP - if the target has an efficient i32 MUL, attempt a multiply-mask-multiply expansion.
8643+
if (VT == MVT::i8 && shouldAllowMultiplyInBitCounts(MVT::i8, MVT::i32) &&
8644+
isOperationLegal(ISD::AND, MVT::i32) &&
8645+
isOperationLegal(ISD::SRL, MVT::i32) &&
8646+
isOperationLegal(ISD::MUL, MVT::i32)) {
8647+
SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
8648+
Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
8649+
Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
8650+
DAG.getConstant(0x08040201U, dl, MVT::i32));
8651+
Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
8652+
DAG.getShiftAmountConstant(3, MVT::i32, dl));
8653+
Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
8654+
Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
8655+
Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
8656+
DAG.getShiftAmountConstant(28, MVT::i32, dl));
8657+
return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
8658+
}
8659+
86428660
// This is the "best" algorithm from
86438661
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
86448662
SDValue Mask55 =

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,10 @@ namespace llvm {
11741174

11751175
bool shouldSplatInsEltVarIndex(EVT VT) const override;
11761176

1177+
bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const override {
1178+
return CntVT.isScalarInteger() && isOperationLegal(ISD::MUL, MulVT);
1179+
}
1180+
11771181
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
11781182
// Converting to sat variants holds little benefit on X86 as we will just
11791183
// need to saturate the value back using fp arithmetic.

llvm/test/CodeGen/X86/ctpop-combine.ll

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
8888
;
8989
; NO-POPCOUNT-LABEL: test4:
9090
; NO-POPCOUNT: # %bb.0:
91-
; NO-POPCOUNT-NEXT: movl %edi, %ecx
92-
; NO-POPCOUNT-NEXT: andb $127, %cl
93-
; NO-POPCOUNT-NEXT: shrb %dil
94-
; NO-POPCOUNT-NEXT: andb $21, %dil
95-
; NO-POPCOUNT-NEXT: subb %dil, %cl
96-
; NO-POPCOUNT-NEXT: movl %ecx, %eax
97-
; NO-POPCOUNT-NEXT: andb $51, %al
98-
; NO-POPCOUNT-NEXT: shrb $2, %cl
99-
; NO-POPCOUNT-NEXT: andb $51, %cl
100-
; NO-POPCOUNT-NEXT: addb %al, %cl
101-
; NO-POPCOUNT-NEXT: movl %ecx, %eax
102-
; NO-POPCOUNT-NEXT: shrb $4, %al
103-
; NO-POPCOUNT-NEXT: addb %cl, %al
104-
; NO-POPCOUNT-NEXT: andb $15, %al
91+
; NO-POPCOUNT-NEXT: andl $127, %edi
92+
; NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201
93+
; NO-POPCOUNT-NEXT: shrl $3, %eax
94+
; NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111
95+
; NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
96+
; NO-POPCOUNT-NEXT: shrl $28, %eax
97+
; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax
10598
; NO-POPCOUNT-NEXT: retq
10699
%x2 = and i8 %x, 127
107100
%count = tail call i8 @llvm.ctpop.i8(i8 %x2)

llvm/test/CodeGen/X86/popcnt.ll

Lines changed: 21 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,37 +10,24 @@
1010
define i8 @cnt8(i8 %x) nounwind readnone {
1111
; X86-LABEL: cnt8:
1212
; X86: # %bb.0:
13-
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
14-
; X86-NEXT: movl %ecx, %eax
15-
; X86-NEXT: shrb %al
16-
; X86-NEXT: andb $85, %al
17-
; X86-NEXT: subb %al, %cl
18-
; X86-NEXT: movl %ecx, %eax
19-
; X86-NEXT: andb $51, %al
20-
; X86-NEXT: shrb $2, %cl
21-
; X86-NEXT: andb $51, %cl
22-
; X86-NEXT: addb %al, %cl
23-
; X86-NEXT: movl %ecx, %eax
24-
; X86-NEXT: shrb $4, %al
25-
; X86-NEXT: addb %cl, %al
26-
; X86-NEXT: andb $15, %al
13+
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
14+
; X86-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
15+
; X86-NEXT: shrl $3, %eax
16+
; X86-NEXT: andl $286331153, %eax # imm = 0x11111111
17+
; X86-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
18+
; X86-NEXT: shrl $28, %eax
19+
; X86-NEXT: # kill: def $al killed $al killed $eax
2720
; X86-NEXT: retl
2821
;
2922
; X64-LABEL: cnt8:
3023
; X64: # %bb.0:
31-
; X64-NEXT: movl %edi, %eax
32-
; X64-NEXT: shrb %al
33-
; X64-NEXT: andb $85, %al
34-
; X64-NEXT: subb %al, %dil
35-
; X64-NEXT: movl %edi, %ecx
36-
; X64-NEXT: andb $51, %cl
37-
; X64-NEXT: shrb $2, %dil
38-
; X64-NEXT: andb $51, %dil
39-
; X64-NEXT: addb %dil, %cl
40-
; X64-NEXT: movl %ecx, %eax
41-
; X64-NEXT: shrb $4, %al
42-
; X64-NEXT: addb %cl, %al
43-
; X64-NEXT: andb $15, %al
24+
; X64-NEXT: movzbl %dil, %eax
25+
; X64-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
26+
; X64-NEXT: shrl $3, %eax
27+
; X64-NEXT: andl $286331153, %eax # imm = 0x11111111
28+
; X64-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
29+
; X64-NEXT: shrl $28, %eax
30+
; X64-NEXT: # kill: def $al killed $al killed $eax
4431
; X64-NEXT: retq
4532
;
4633
; X86-POPCNT-LABEL: cnt8:
@@ -59,16 +46,13 @@ define i8 @cnt8(i8 %x) nounwind readnone {
5946
;
6047
; X64-NDD-LABEL: cnt8:
6148
; X64-NDD: # %bb.0:
62-
; X64-NDD-NEXT: shrb %dil, %al
63-
; X64-NDD-NEXT: andb $85, %al
64-
; X64-NDD-NEXT: subb %al, %dil, %al
65-
; X64-NDD-NEXT: andb $51, %al, %cl
66-
; X64-NDD-NEXT: shrb $2, %al
67-
; X64-NDD-NEXT: andb $51, %al
68-
; X64-NDD-NEXT: addb %cl, %al
69-
; X64-NDD-NEXT: shrb $4, %al, %cl
70-
; X64-NDD-NEXT: addb %cl, %al
71-
; X64-NDD-NEXT: andb $15, %al
49+
; X64-NDD-NEXT: movzbl %dil, %eax
50+
; X64-NDD-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201
51+
; X64-NDD-NEXT: shrl $3, %eax
52+
; X64-NDD-NEXT: andl $286331153, %eax # imm = 0x11111111
53+
; X64-NDD-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111
54+
; X64-NDD-NEXT: shrl $28, %eax
55+
; X64-NDD-NEXT: # kill: def $al killed $al killed $eax
7256
; X64-NDD-NEXT: retq
7357
%cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
7458
ret i8 %cnt

0 commit comments

Comments
 (0)