[DAG] Add generic i8 CTPOP lowering using i32 MUL

RKSimon · RKSimon · commit 64594e71d522 · 2024-01-31T16:34:35.000Z
Limit this behind a TLI.shouldAllowMultiplyInBitCounts callback as so far on x86 really benefits from this Fixes #79823
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3258,6 +3258,12 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if CTPOP/CTTZ/CTLZ/PARITY expansions should try to use integer
+  /// multiples should the input value be suitable.
+  virtual bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const {
+    return false;
+  }
+
   // Should we fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x))
   // A) where y has a single bit set?
   virtual bool shouldFoldSelectWithSingleBitTest(EVT VT,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8639,6 +8639,24 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
   if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
     return SDValue();
 
+  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+  if (VT == MVT::i8 && shouldAllowMultiplyInBitCounts(MVT::i8, MVT::i32) &&
+      isOperationLegal(ISD::AND, MVT::i32) &&
+      isOperationLegal(ISD::SRL, MVT::i32) &&
+      isOperationLegal(ISD::MUL, MVT::i32)) {
+    SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
+    Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
+                     DAG.getConstant(0x08040201U, dl, MVT::i32));
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(3, MVT::i32, dl));
+    Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(28, MVT::i32, dl));
+    return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
+  }
+
   // This is the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   SDValue Mask55 =
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1174,6 +1174,10 @@ namespace llvm {
 
     bool shouldSplatInsEltVarIndex(EVT VT) const override;
 
+    bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const override {
+      return CntVT.isScalarInteger() && isOperationLegal(ISD::MUL, MulVT);
+    }
+
     bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
       // Converting to sat variants holds little benefit on X86 as we will just
       // need to saturate the value back using fp arithmatic.
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: test4:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; NO-POPCOUNT-NEXT:    andb $127, %cl
-; NO-POPCOUNT-NEXT:    shrb %dil
-; NO-POPCOUNT-NEXT:    andb $21, %dil
-; NO-POPCOUNT-NEXT:    subb %dil, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    andb $51, %al
-; NO-POPCOUNT-NEXT:    shrb $2, %cl
-; NO-POPCOUNT-NEXT:    andb $51, %cl
-; NO-POPCOUNT-NEXT:    addb %al, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    shrb $4, %al
-; NO-POPCOUNT-NEXT:    addb %cl, %al
-; NO-POPCOUNT-NEXT:    andb $15, %al
+; NO-POPCOUNT-NEXT:    andl $127, %edi
+; NO-POPCOUNT-NEXT:    imull $134480385, %edi, %eax # imm = 0x8040201
+; NO-POPCOUNT-NEXT:    shrl $3, %eax
+; NO-POPCOUNT-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    shrl $28, %eax
+; NO-POPCOUNT-NEXT:    # kill: def $al killed $al killed $eax
 ; NO-POPCOUNT-NEXT:    retq
   %x2 = and i8 %x, 127
   %count = tail call i8 @llvm.ctpop.i8(i8 %x2)
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
@@ -10,37 +10,24 @@
 define i8 @cnt8(i8 %x) nounwind readnone {
 ; X86-LABEL: cnt8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb %al
-; X86-NEXT:    andb $85, %al
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andb $51, %al
-; X86-NEXT:    shrb $2, %cl
-; X86-NEXT:    andb $51, %cl
-; X86-NEXT:    addb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $4, %al
-; X86-NEXT:    addb %cl, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X86-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X86-NEXT:    shrl $28, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cnt8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrb %al
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    subb %al, %dil
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    andb $51, %cl
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    addb %dil, %cl
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %cl, %al
-; X64-NEXT:    andb $15, %al
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NEXT:    shrl $28, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt8:
@@ -59,16 +46,13 @@ define i8 @cnt8(i8 %x) nounwind readnone {
 ;
 ; X64-NDD-LABEL: cnt8:
 ; X64-NDD:       # %bb.0:
-; X64-NDD-NEXT:    shrb %dil, %al
-; X64-NDD-NEXT:    andb $85, %al
-; X64-NDD-NEXT:    subb %al, %dil, %al
-; X64-NDD-NEXT:    andb $51, %al, %cl
-; X64-NDD-NEXT:    shrb $2, %al
-; X64-NDD-NEXT:    andb $51, %al
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    shrb $4, %al, %cl
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    andb $15, %al
+; X64-NDD-NEXT:    movzbl %dil, %eax
+; X64-NDD-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NDD-NEXT:    shrl $3, %eax
+; X64-NDD-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    shrl $28, %eax
+; X64-NDD-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NDD-NEXT:    retq
   %cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
   ret i8 %cnt