@@ -427,7 +427,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
427
427
// on the dest that popcntl hasn't had since Cannon Lake.
428
428
setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
429
429
} else {
430
- setOperationAction(ISD::CTPOP , MVT::i8 , Expand );
430
+ setOperationAction(ISD::CTPOP , MVT::i8 , Custom );
431
431
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
432
432
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
433
433
if (Subtarget.is64Bit())
@@ -30989,12 +30989,12 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30989
30989
30990
30990
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
30991
30991
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
30992
- static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30992
+ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
30993
+ const X86Subtarget &Subtarget,
30993
30994
SelectionDAG &DAG) {
30994
30995
MVT VT = Op.getSimpleValueType();
30995
30996
assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
30996
30997
"Unknown CTPOP type to handle");
30997
- SDLoc DL(Op.getNode());
30998
30998
SDValue Op0 = Op.getOperand(0);
30999
30999
31000
31000
// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
@@ -31035,9 +31035,27 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
31035
31035
31036
31036
/// Custom lowering entry point for ISD::CTPOP.
///
/// Two cases are handled:
///  - scalar i8: expanded inline to a multiply-mask-multiply sequence that is
///    evaluated in i32 and then converted back to i8;
///  - anything else: must be a vector type (asserted) and is forwarded to
///    LowerVectorCTPOP.
///
/// \param Op         The CTPOP node to lower; its operand 0 is the value
///                   whose set bits are counted.
/// \param Subtarget  Passed through unchanged to LowerVectorCTPOP.
/// \param DAG        The DAG in which the replacement nodes are built.
/// \returns The replacement value; for the i8 path it has type i8.
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // i8 CTPOP - multiply-mask-multiply carried out in i32.
  // NOTE(review): nothing here checks that i32 MUL is actually efficient on
  // the subtarget; this path is taken for every i8 CTPOP that reaches us.
  if (VT == MVT::i8) {
    SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);

    // Widen the i8 source so the multiplies below have room to work in.
    SDValue Src = DAG.getZExtOrTrunc(Op.getOperand(0), DL, MVT::i32);

    // 0x08040201 == (1 << 27) | (1 << 18) | (1 << 9) | 1, so the product holds
    // four copies of the 8-bit source at bit offsets 0, 9, 18 and 27. The
    // copies are 9 bits apart and therefore never overlap or carry.
    SDValue Copies = DAG.getNode(ISD::MUL, DL, MVT::i32, Src,
                                 DAG.getConstant(0x08040201U, DL, MVT::i32));

    // After shifting right by 3, the low bit of each of the eight nibbles is
    // a distinct bit of the source (x3, x7, x2, x6, x1, x5, x0, x4 from the
    // lowest nibble upwards); the mask keeps only those bits.
    SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Copies,
                                  DAG.getShiftAmountConstant(3, MVT::i32, DL));
    SDValue Nibbles = DAG.getNode(ISD::AND, DL, MVT::i32, Shifted, Mask11);

    // Multiplying by 0x11111111 accumulates all eight nibbles into the top
    // nibble. Each nibble is 0 or 1, so every partial sum is at most 8 and no
    // nibble overflows into its neighbour.
    SDValue Sum = DAG.getNode(ISD::MUL, DL, MVT::i32, Nibbles, Mask11);

    // The top nibble now holds the population count (0..8).
    SDValue Count = DAG.getNode(ISD::SRL, DL, MVT::i32, Sum,
                                DAG.getShiftAmountConstant(28, MVT::i32, DL));
    return DAG.getZExtOrTrunc(Count, DL, VT);
  }

  assert(VT.isVector() &&
         "We only do custom lowering for i8 and vector population count.");
  return LowerVectorCTPOP(Op, DL, Subtarget, DAG);
}
31042
31060
31043
31061
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
0 commit comments