-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[SDag][ARM][RISCV] Allow lowering CTPOP into a libcall #101786
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The main change is adding CTPOP to `RuntimeLibcalls.def` to allow targets to use LibCall action for CTPOP. DAG legalizers are changed accordingly.
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-llvm-ir Author: Sergei Barannikov (s-barannikov) ChangesThis is a reland of #99752 with the bug fixed. Original commit message: The main change is adding CTPOP to Patch is 118.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101786.diff 20 Files Affected:
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index 89aaf6d1ad83f..3dd75622b8e43 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -85,6 +85,9 @@ HANDLE_LIBCALL(NEG_I64, "__negdi2")
HANDLE_LIBCALL(CTLZ_I32, "__clzsi2")
HANDLE_LIBCALL(CTLZ_I64, "__clzdi2")
HANDLE_LIBCALL(CTLZ_I128, "__clzti2")
+HANDLE_LIBCALL(CTPOP_I32, "__popcountsi2")
+HANDLE_LIBCALL(CTPOP_I64, "__popcountdi2")
+HANDLE_LIBCALL(CTPOP_I128, "__popcountti2")
// Floating-point
HANDLE_LIBCALL(ADD_F32, "__addsf3")
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bdb7917073020..8a10f95ca20f6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -129,7 +129,8 @@ class SelectionDAGLegalize {
ArrayRef<int> Mask) const;
std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
- TargetLowering::ArgListTy &&Args, bool isSigned);
+ TargetLowering::ArgListTy &&Args,
+ bool IsSigned, EVT RetVT);
std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
void ExpandFrexpLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
@@ -151,6 +152,9 @@ class SelectionDAGLegalize {
RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128,
RTLIB::Libcall Call_PPCF128,
SmallVectorImpl<SDValue> &Results);
+ SDValue ExpandBitCountingLibCall(SDNode *Node, RTLIB::Libcall CallI32,
+ RTLIB::Libcall CallI64,
+ RTLIB::Libcall CallI128);
void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
@@ -2059,9 +2063,10 @@ SDValue SelectionDAGLegalize::ExpandSPLAT_VECTOR(SDNode *Node) {
// register, return the lo part and set the hi part to the by-reg argument in
// the first. If it does fit into a single register, return the result and
// leave the Hi part unset.
-std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
- TargetLowering::ArgListTy &&Args,
- bool isSigned) {
+std::pair<SDValue, SDValue>
+SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+ TargetLowering::ArgListTy &&Args,
+ bool IsSigned, EVT RetVT) {
EVT CodePtrTy = TLI.getPointerTy(DAG.getDataLayout());
SDValue Callee;
if (const char *LibcallName = TLI.getLibcallName(LC))
@@ -2072,7 +2077,6 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L
Node->getOperationName(&DAG));
}
- EVT RetVT = Node->getValueType(0);
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
// By default, the input chain to this libcall is the entry node of the
@@ -2092,7 +2096,7 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L
InChain = TCChain;
TargetLowering::CallLoweringInfo CLI(DAG);
- bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned);
+ bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, IsSigned);
CLI.setDebugLoc(SDLoc(Node))
.setChain(InChain)
.setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
@@ -2128,7 +2132,8 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L
Args.push_back(Entry);
}
- return ExpandLibCall(LC, Node, std::move(Args), isSigned);
+ return ExpandLibCall(LC, Node, std::move(Args), isSigned,
+ Node->getValueType(0));
}
void SelectionDAGLegalize::ExpandFrexpLibCall(
@@ -2155,7 +2160,8 @@ void SelectionDAGLegalize::ExpandFrexpLibCall(
TargetLowering::ArgListTy Args = {FPArgEntry, PtrArgEntry};
RTLIB::Libcall LC = RTLIB::getFREXP(VT);
- auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
+ auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args),
+ /*IsSigned=*/false, VT);
// FIXME: Get type of int for libcall declaration and cast
@@ -2243,6 +2249,50 @@ void SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node,
ExpandFPLibCall(Node, LC, Results);
}
+SDValue SelectionDAGLegalize::ExpandBitCountingLibCall(
+ SDNode *Node, RTLIB::Libcall CallI32, RTLIB::Libcall CallI64,
+ RTLIB::Libcall CallI128) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i32:
+ LC = CallI32;
+ break;
+ case MVT::i64:
+ LC = CallI64;
+ break;
+ case MVT::i128:
+ LC = CallI128;
+ break;
+ }
+
+ // Bit-counting libcalls have one unsigned argument and return `int`.
+ // Note that `int` may be illegal on this target; ExpandLibCall will
+ // take care of promoting it to a legal type.
+ SDValue Op = Node->getOperand(0);
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize());
+
+ TargetLowering::ArgListEntry Arg;
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Arg.Node = Op;
+ Arg.Ty = ArgTy;
+ Arg.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, /*IsSigned=*/false);
+ Arg.IsZExt = !Arg.IsSExt;
+
+ SDValue Res = ExpandLibCall(LC, Node, TargetLowering::ArgListTy{Arg},
+ /*IsSigned=*/true, IntVT)
+ .first;
+
+ // If ExpandLibCall created a tail call, the result was already
+ // of the correct type. Otherwise, we need to sign extend it.
+ if (Res.getValueType() != MVT::Other)
+ Res = DAG.getSExtOrTrunc(Res, SDLoc(Node), Node->getValueType(0));
+ return Res;
+}
+
/// Issue libcalls to __{u}divmod to compute div / rem pairs.
void
SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
@@ -5000,19 +5050,12 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::MUL_I64, RTLIB::MUL_I128));
break;
case ISD::CTLZ_ZERO_UNDEF:
- switch (Node->getSimpleValueType(0).SimpleTy) {
- default:
- llvm_unreachable("LibCall explicitly requested, but not available");
- case MVT::i32:
- Results.push_back(ExpandLibCall(RTLIB::CTLZ_I32, Node, false).first);
- break;
- case MVT::i64:
- Results.push_back(ExpandLibCall(RTLIB::CTLZ_I64, Node, false).first);
- break;
- case MVT::i128:
- Results.push_back(ExpandLibCall(RTLIB::CTLZ_I128, Node, false).first);
- break;
- }
+ Results.push_back(ExpandBitCountingLibCall(
+ Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64, RTLIB::CTLZ_I128));
+ break;
+ case ISD::CTPOP:
+ Results.push_back(ExpandBitCountingLibCall(
+ Node, RTLIB::CTPOP_I32, RTLIB::CTPOP_I64, RTLIB::CTPOP_I128));
break;
case ISD::RESET_FPENV: {
// It is legalized to call 'fesetenv(FE_DFL_ENV)'. On most targets
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b1ada66aa9aeb..0b0789b75dd50 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3850,15 +3850,35 @@ void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
Hi = DAG.getConstant(0, dl, NVT);
}
-void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
- SDValue &Lo, SDValue &Hi) {
- SDLoc dl(N);
+void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (TLI.getOperationAction(ISD::CTPOP, VT) == TargetLoweringBase::LibCall) {
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::CTPOP_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::CTPOP_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::CTPOP_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC) &&
+ "LibCall explicitly requested, but not available");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize());
+ SDValue Res = TLI.makeLibCall(DAG, LC, IntVT, Op, CallOptions, DL).first;
+ SplitInteger(DAG.getSExtOrTrunc(Res, DL, VT), Lo, Hi);
+ return;
+ }
+
// ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
- GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ GetExpandedInteger(Op, Lo, Hi);
EVT NVT = Lo.getValueType();
- Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo),
- DAG.getNode(ISD::CTPOP, dl, NVT, Hi));
- Hi = DAG.getConstant(0, dl, NVT);
+ Lo = DAG.getNode(ISD::ADD, DL, NVT, DAG.getNode(ISD::CTPOP, DL, NVT, Lo),
+ DAG.getNode(ISD::CTPOP, DL, NVT, Hi));
+ Hi = DAG.getConstant(0, DL, NVT);
}
void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7fa83a5999dfe..0989b79771c03 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9161,8 +9161,9 @@ SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const {
!isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
return SDValue();
- // Emit Table Lookup if ISD::CTLZ and ISD::CTPOP are not legal.
- if (!VT.isVector() && isOperationExpand(ISD::CTPOP, VT) &&
+ // Emit Table Lookup if ISD::CTPOP used in the fallback path below is going
+ // to be expanded or converted to a libcall.
+ if (!VT.isVector() && !isOperationLegalOrCustomOrPromote(ISD::CTPOP, VT) &&
!isOperationLegal(ISD::CTLZ, VT))
if (SDValue V = CTTZTableLookup(Node, DAG, dl, VT, Op, NumBitsPerElt))
return V;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 75d16a42d0205..1d1ea22f6aac4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1204,7 +1204,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, LibCall);
+ setOperationAction(ISD::CTPOP, MVT::i64, LibCall);
if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9ee60b9db2837..c16b1ca742d7b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -393,7 +393,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
}
} else {
- setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
+ setOperationAction(ISD::CTTZ, XLenVT, Expand);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::CTPOP, MVT::i128, LibCall);
+ else
+ setOperationAction(ISD::CTPOP, MVT::i32, LibCall);
+ setOperationAction(ISD::CTPOP, MVT::i64, LibCall);
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
}
diff --git a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll
index 380f65b19b8fa..c0ecc63b82dca 100644
--- a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll
@@ -159,7 +159,7 @@ define void @bitreverse() {
define void @ctpop() {
; NOZVBB-LABEL: 'ctpop'
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %2 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %3 = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %4 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> undef)
@@ -169,7 +169,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %8 = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %10 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> undef)
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %12 = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %13 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %14 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> undef)
@@ -179,7 +179,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %18 = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %19 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %20 = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> undef)
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %22 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %23 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %24 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> undef)
@@ -189,7 +189,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %28 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %29 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %30 = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> undef)
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %32 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %33 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %34 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> undef)
@@ -202,7 +202,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVBB-LABEL: 'ctpop'
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> undef)
@@ -212,7 +212,7 @@ define void @ctpop() {
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> undef)
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> undef)
@@ -222,7 +222,7 @@ define void @ctpop() {
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> undef)
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> undef)
@@ -232,7 +232,7 @@ define void @ctpop() {
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> undef)
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> undef)
diff --git a/llvm/test/CodeGen/ARM/popcnt.ll b/llvm/test/CodeGen/ARM/popcnt.ll
index edcae5e141e73..fc4387320ef77...
[truncated]
|
@llvm/pr-subscribers-llvm-analysis Author: Sergei Barannikov (s-barannikov) ChangesThis is a reland of #99752 with the bug fixed. Original commit message: The main change is adding CTPOP to Patch is 118.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101786.diff 20 Files Affected:
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index 89aaf6d1ad83f..3dd75622b8e43 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -85,6 +85,9 @@ HANDLE_LIBCALL(NEG_I64, "__negdi2")
HANDLE_LIBCALL(CTLZ_I32, "__clzsi2")
HANDLE_LIBCALL(CTLZ_I64, "__clzdi2")
HANDLE_LIBCALL(CTLZ_I128, "__clzti2")
+HANDLE_LIBCALL(CTPOP_I32, "__popcountsi2")
+HANDLE_LIBCALL(CTPOP_I64, "__popcountdi2")
+HANDLE_LIBCALL(CTPOP_I128, "__popcountti2")
// Floating-point
HANDLE_LIBCALL(ADD_F32, "__addsf3")
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bdb7917073020..8a10f95ca20f6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -129,7 +129,8 @@ class SelectionDAGLegalize {
ArrayRef<int> Mask) const;
std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
- TargetLowering::ArgListTy &&Args, bool isSigned);
+ TargetLowering::ArgListTy &&Args,
+ bool IsSigned, EVT RetVT);
std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
void ExpandFrexpLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
@@ -151,6 +152,9 @@ class SelectionDAGLegalize {
RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128,
RTLIB::Libcall Call_PPCF128,
SmallVectorImpl<SDValue> &Results);
+ SDValue ExpandBitCountingLibCall(SDNode *Node, RTLIB::Libcall CallI32,
+ RTLIB::Libcall CallI64,
+ RTLIB::Libcall CallI128);
void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
@@ -2059,9 +2063,10 @@ SDValue SelectionDAGLegalize::ExpandSPLAT_VECTOR(SDNode *Node) {
// register, return the lo part and set the hi part to the by-reg argument in
// the first. If it does fit into a single register, return the result and
// leave the Hi part unset.
-std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
- TargetLowering::ArgListTy &&Args,
- bool isSigned) {
+std::pair<SDValue, SDValue>
+SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+ TargetLowering::ArgListTy &&Args,
+ bool IsSigned, EVT RetVT) {
EVT CodePtrTy = TLI.getPointerTy(DAG.getDataLayout());
SDValue Callee;
if (const char *LibcallName = TLI.getLibcallName(LC))
@@ -2072,7 +2077,6 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L
Node->getOperationName(&DAG));
}
- EVT RetVT = Node->getValueType(0);
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
// By default, the input chain to this libcall is the entry node of the
@@ -2092,7 +2096,7 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L
InChain = TCChain;
TargetLowering::CallLoweringInfo CLI(DAG);
- bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned);
+ bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, IsSigned);
CLI.setDebugLoc(SDLoc(Node))
.setChain(InChain)
.setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
@@ -2128,7 +2132,8 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L
Args.push_back(Entry);
}
- return ExpandLibCall(LC, Node, std::move(Args), isSigned);
+ return ExpandLibCall(LC, Node, std::move(Args), isSigned,
+ Node->getValueType(0));
}
void SelectionDAGLegalize::ExpandFrexpLibCall(
@@ -2155,7 +2160,8 @@ void SelectionDAGLegalize::ExpandFrexpLibCall(
TargetLowering::ArgListTy Args = {FPArgEntry, PtrArgEntry};
RTLIB::Libcall LC = RTLIB::getFREXP(VT);
- auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
+ auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args),
+ /*IsSigned=*/false, VT);
// FIXME: Get type of int for libcall declaration and cast
@@ -2243,6 +2249,50 @@ void SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node,
ExpandFPLibCall(Node, LC, Results);
}
+SDValue SelectionDAGLegalize::ExpandBitCountingLibCall(
+ SDNode *Node, RTLIB::Libcall CallI32, RTLIB::Libcall CallI64,
+ RTLIB::Libcall CallI128) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i32:
+ LC = CallI32;
+ break;
+ case MVT::i64:
+ LC = CallI64;
+ break;
+ case MVT::i128:
+ LC = CallI128;
+ break;
+ }
+
+ // Bit-counting libcalls have one unsigned argument and return `int`.
+ // Note that `int` may be illegal on this target; ExpandLibCall will
+ // take care of promoting it to a legal type.
+ SDValue Op = Node->getOperand(0);
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize());
+
+ TargetLowering::ArgListEntry Arg;
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Arg.Node = Op;
+ Arg.Ty = ArgTy;
+ Arg.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, /*IsSigned=*/false);
+ Arg.IsZExt = !Arg.IsSExt;
+
+ SDValue Res = ExpandLibCall(LC, Node, TargetLowering::ArgListTy{Arg},
+ /*IsSigned=*/true, IntVT)
+ .first;
+
+ // If ExpandLibCall created a tail call, the result was already
+ // of the correct type. Otherwise, we need to sign extend it.
+ if (Res.getValueType() != MVT::Other)
+ Res = DAG.getSExtOrTrunc(Res, SDLoc(Node), Node->getValueType(0));
+ return Res;
+}
+
/// Issue libcalls to __{u}divmod to compute div / rem pairs.
void
SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
@@ -5000,19 +5050,12 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::MUL_I64, RTLIB::MUL_I128));
break;
case ISD::CTLZ_ZERO_UNDEF:
- switch (Node->getSimpleValueType(0).SimpleTy) {
- default:
- llvm_unreachable("LibCall explicitly requested, but not available");
- case MVT::i32:
- Results.push_back(ExpandLibCall(RTLIB::CTLZ_I32, Node, false).first);
- break;
- case MVT::i64:
- Results.push_back(ExpandLibCall(RTLIB::CTLZ_I64, Node, false).first);
- break;
- case MVT::i128:
- Results.push_back(ExpandLibCall(RTLIB::CTLZ_I128, Node, false).first);
- break;
- }
+ Results.push_back(ExpandBitCountingLibCall(
+ Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64, RTLIB::CTLZ_I128));
+ break;
+ case ISD::CTPOP:
+ Results.push_back(ExpandBitCountingLibCall(
+ Node, RTLIB::CTPOP_I32, RTLIB::CTPOP_I64, RTLIB::CTPOP_I128));
break;
case ISD::RESET_FPENV: {
// It is legalized to call 'fesetenv(FE_DFL_ENV)'. On most targets
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b1ada66aa9aeb..0b0789b75dd50 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3850,15 +3850,35 @@ void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
Hi = DAG.getConstant(0, dl, NVT);
}
-void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
- SDValue &Lo, SDValue &Hi) {
- SDLoc dl(N);
+void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (TLI.getOperationAction(ISD::CTPOP, VT) == TargetLoweringBase::LibCall) {
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::CTPOP_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::CTPOP_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::CTPOP_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC) &&
+ "LibCall explicitly requested, but not available");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize());
+ SDValue Res = TLI.makeLibCall(DAG, LC, IntVT, Op, CallOptions, DL).first;
+ SplitInteger(DAG.getSExtOrTrunc(Res, DL, VT), Lo, Hi);
+ return;
+ }
+
// ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
- GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ GetExpandedInteger(Op, Lo, Hi);
EVT NVT = Lo.getValueType();
- Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo),
- DAG.getNode(ISD::CTPOP, dl, NVT, Hi));
- Hi = DAG.getConstant(0, dl, NVT);
+ Lo = DAG.getNode(ISD::ADD, DL, NVT, DAG.getNode(ISD::CTPOP, DL, NVT, Lo),
+ DAG.getNode(ISD::CTPOP, DL, NVT, Hi));
+ Hi = DAG.getConstant(0, DL, NVT);
}
void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7fa83a5999dfe..0989b79771c03 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9161,8 +9161,9 @@ SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const {
!isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
return SDValue();
- // Emit Table Lookup if ISD::CTLZ and ISD::CTPOP are not legal.
- if (!VT.isVector() && isOperationExpand(ISD::CTPOP, VT) &&
+ // Emit Table Lookup if ISD::CTPOP used in the fallback path below is going
+ // to be expanded or converted to a libcall.
+ if (!VT.isVector() && !isOperationLegalOrCustomOrPromote(ISD::CTPOP, VT) &&
!isOperationLegal(ISD::CTLZ, VT))
if (SDValue V = CTTZTableLookup(Node, DAG, dl, VT, Op, NumBitsPerElt))
return V;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 75d16a42d0205..1d1ea22f6aac4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1204,7 +1204,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i32, LibCall);
+ setOperationAction(ISD::CTPOP, MVT::i64, LibCall);
if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9ee60b9db2837..c16b1ca742d7b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -393,7 +393,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
}
} else {
- setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
+ setOperationAction(ISD::CTTZ, XLenVT, Expand);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::CTPOP, MVT::i128, LibCall);
+ else
+ setOperationAction(ISD::CTPOP, MVT::i32, LibCall);
+ setOperationAction(ISD::CTPOP, MVT::i64, LibCall);
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
}
diff --git a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll
index 380f65b19b8fa..c0ecc63b82dca 100644
--- a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll
@@ -159,7 +159,7 @@ define void @bitreverse() {
define void @ctpop() {
; NOZVBB-LABEL: 'ctpop'
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %2 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %3 = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %4 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> undef)
@@ -169,7 +169,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %8 = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %10 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> undef)
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %12 = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %13 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %14 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> undef)
@@ -179,7 +179,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %18 = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %19 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %20 = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> undef)
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %22 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %23 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %24 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> undef)
@@ -189,7 +189,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %28 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %29 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %30 = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> undef)
-; NOZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
+; NOZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %32 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %33 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> undef)
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %34 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> undef)
@@ -202,7 +202,7 @@ define void @ctpop() {
; NOZVBB-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVBB-LABEL: 'ctpop'
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call i8 @llvm.ctpop.i8(i8 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> undef)
@@ -212,7 +212,7 @@ define void @ctpop() {
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> undef)
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> undef)
@@ -222,7 +222,7 @@ define void @ctpop() {
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> undef)
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> undef)
@@ -232,7 +232,7 @@ define void @ctpop() {
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> undef)
-; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
+; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> undef)
; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> undef)
diff --git a/llvm/test/CodeGen/ARM/popcnt.ll b/llvm/test/CodeGen/ARM/popcnt.ll
index edcae5e141e73..fc4387320ef77...
[truncated]
|
For what it's worth, not everything links against the compiler runtime, such as the Linux kernel. This change breaks building the kernel for RISC-V, as noticed by some CI infrastructure that built with the original change (#99752):
|
What about x86 CPUs that don't have POPCNT? |
It appears the kernel handles this with an alternative that detects the feature and falls back to an out of line asm routine if it is not supported (perhaps because of something like this?): https://elixir.bootlin.com/linux/v6.10.3/source/arch/x86/include/asm/arch_hweight.h https://elixir.bootlin.com/linux/v6.10.3/source/arch/x86/lib/hweight.S |
gcc does generate a libcall for __builtin_popcount link and somehow manages to build the kernel. Or the kernel doesn't use builtins and it is the compiler that emitted a libcall even though it wasn't in the source? Note that popcount is not the only libcall that the compiler can generate, but it appears to be only one that breaks the build. |
Some interesting discussions for a similar gcc bug(?): |
It looks like LookIdiomRecognize already bails out if popcount isn't supported by hardware, and the hook is correctly implemented on RISC-V. It would be nice to have a reproducer. |
I'll see if I can reduce one down from the kernel sources. |
long is_power_of_2_n, inode_readahead_blks_store_t;
enum { attr_reserved_clusters } inode_readahead_blks_store_count;
long inode_readahead_blks_store() {
_Bool __trans_tmp_1 = is_power_of_2_n & (is_power_of_2_n - 1);
if (inode_readahead_blks_store_t &&
(__trans_tmp_1 || inode_readahead_blks_store_t > 0))
return 2;
return inode_readahead_blks_store_count;
} which shows
|
That's just checking if something is a power of 2 or 0 which shouldnt' require a popcnt. The middle end likes to canonicalize it to popcnt but the backend is supposed to undo it when popcnt isn't supported. |
This is how the backend undoes it:
Without this canonicalization:
I would argue that this canonicalization is harmful without hardware popcount. |
The problem seems to be that the popcount and branch end up in different basic blocks so the backend can't undo it. The code to undo it is in |
It seems so. This time it is SelectionDAGBuilder to blame, it transforms
The outcome is the same: the popcount becomes liveout of its basic block. |
Just in case it matters, this broke ARM kernel builds too: https://github.com/ClangBuiltLinux/continuous-integration2/actions/runs/10240673671/job/28328987353
I assume this is the same root cause (since the reproducer exhibits the same behavior with |
There is no need for this just yet. It is possible that there are other cases when the call is emitted, but the core issue should be the same. |
Done in #102731. |
@nathanchance I'm landing this now. Please let me know if the issue persists. |
@s-barannikov Thanks for the heads up. Unfortunately, even with 5080a02, I still see
I will try to reduce something out tonight or tomorrow. |
Thanks, |
Looks like it does: https://github.com/torvalds/linux/blob/master/lib/clz_ctz.c |
@koachan you might be interested in this as well. |
The change as is breaks the Linux kernel build as pointed out in the comments.
FYI, this showed up as issues while building for armv7 windows in MSVC style environments; in an MSVC style environment, we don't have the regular libgcc/compiler-rt builtins, only the ones shipped with MSVC. I can work around it with a change like this: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index bdebd842b011..d65d48a6611f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1221,8 +1221,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::i32, LibCall);
- setOperationAction(ISD::CTPOP, MVT::i64, LibCall);
+ if (Subtarget->isTargetMSVC()) {
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ } else {
+ setOperationAction(ISD::CTPOP, MVT::i32, LibCall);
+ setOperationAction(ISD::CTPOP, MVT::i64, LibCall);
+ }
if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 7329d3f2055f..9a9a9b45adb2 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -340,6 +340,7 @@ public:
bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); }
bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetMSVC() const { return TargetTriple.isWindowsMSVCEnvironment(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); } It's probably fine to just use the existing I see this was already somewhat turned off again, but I'd appreciate if you can fold this in before reenabling! |
Hmmm so what is the takeaway here? I'm having difficulties following the discussion. |
This is my understanding, yes. |
I could try to push for something like this in the kernel: diff --git a/lib/Makefile b/lib/Makefile
index f07b24ce1b3f..0240fa7d6b5b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -52,7 +52,7 @@ obj-y += lockref.o
obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
- list_sort.o uuid.o iov_iter.o clz_ctz.o \
+ list_sort.o uuid.o iov_iter.o clz_ctz.o popcount.o \
bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \
percpu-refcount.o rhashtable.o base64.o \
once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
diff --git a/lib/popcount.c b/lib/popcount.c
new file mode 100644
index 000000000000..0234961cc35e
--- /dev/null
+++ b/lib/popcount.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * The functions in this file aren't called directly, but may be emitted
+ * by the compiler.
+ */
+
+#include <linux/bitops.h>
+#include <linux/export.h>
+
+int __popcountsi2(unsigned int val);
+int __popcountsi2(unsigned int val)
+{
+ return __arch_hweight32(val);
+}
+EXPORT_SYMBOL(__popcountsi2);
+
+int __popcountdi2(unsigned long long val);
+int __popcountdi2(unsigned long long val)
+{
+ return __arch_hweight64(val);
+}
+EXPORT_SYMBOL(__popcountdi2); which would at least use existing architecture optimized popcount implementations if they exist, which would hopefully overcome Linus’s recent complaint around compiler optimizations inserting libcalls (resulting in
I wonder if that matches what GCC does, which could be additional justification? I do understand the kernel is being special here so I can pursue the addition of |
I don't mind sharing the headache 😅. And, who knows, maybe this patch will be welcomed?
I think this option only applies to
GCC generates calls to
Will be much appreciated, thank you! I'll continue trying to fix the issue on the compiler side. |
Heh, I sent a message to Linus yesterday and the reaction was generally what I expected. His suggestion was effectively diff --git a/lib/hweight.c b/lib/hweight.c
index c94586b62551..00b2ff970203 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -66,3 +66,8 @@ unsigned long __sw_hweight64(__u64 w)
#endif
}
EXPORT_SYMBOL(__sw_hweight64);
+
+int __popcountsi2(unsigned int w) __alias(__sw_hweight32);
+EXPORT_SYMBOL(__popcountsi2);
+int __popcountdi2(__u64 w) __alias(__sw_hweight64);
+EXPORT_SYMBOL(__popcountdi2); which does also resolve the error but I realized that neither of these suggestions I have proposed will work in their current form because (at least for the RISC-V configuration that I chcked)
|
If I adjust For some functions this issue is worked around by comparing the optimized function's name with the name of the new callee and bailing out if they are the same. I can't find right away where I saw it; it wasn't done generically, only for some (or maybe just a couple) functions. We could do the same for Even if the compiler stops generating calls to popcount as part of ctz/clz expansion, it will still recognize |
This is a reland of #99752 with the bug fixed (see test diff in the third commit in this PR). All `popcount` libcalls return `int`, but `ISD::CTPOP` returns the type of the argument, which can be wider than `int`. The fix is to make DAG legalizer pass the correct return type to `makeLibCall` and sign-extend the result afterwards. Original commit message: The main change is adding CTPOP to `RuntimeLibcalls.def` to allow targets to use LibCall action for CTPOP. DAG legalizers are changed accordingly. Pull Request: llvm/llvm-project#101786
…01786) This is a reland of #99752 with the bug fixed (see test diff in the third commit in this PR). All `popcount` libcalls return `int`, but `ISD::CTPOP` returns the type of the argument, which can be wider than `int`. The fix is to make DAG legalizer pass the correct return type to `makeLibCall` and sign-extend the result afterwards. Original commit message: The main change is adding CTPOP to `RuntimeLibcalls.def` to allow targets to use LibCall action for CTPOP. DAG legalizers are changed accordingly. Pull Request: llvm/llvm-project#101786
This is a reland of llvm#99752 with the bug fixed (see test diff in the third commit in this PR). All `popcount` libcalls return `int`, but `ISD::CTPOP` returns the type of the argument, which can be wider than `int`. The fix is to make DAG legalizer pass the correct return type to `makeLibCall` and sign-extend the result afterwards. Original commit message: The main change is adding CTPOP to `RuntimeLibcalls.def` to allow targets to use LibCall action for CTPOP. DAG legalizers are changed accordingly. Pull Request: llvm#101786
The change as is breaks the Linux kernel build as pointed out in the comments.
This is a reland of llvm#99752 with the bug fixed (see test diff in the third commit in this PR). All `popcount` libcalls return `int`, but `ISD::CTPOP` returns the type of the argument, which can be wider than `int`. The fix is to make DAG legalizer pass the correct return type to `makeLibCall` and sign-extend the result afterwards. Original commit message: The main change is adding CTPOP to `RuntimeLibcalls.def` to allow targets to use LibCall action for CTPOP. DAG legalizers are changed accordingly. Pull Request: llvm#101786
The change as is breaks the Linux kernel build as pointed out in the comments.
This is a reland of llvm#99752 with the bug fixed (see test diff in the third commit in this PR). All `popcount` libcalls return `int`, but `ISD::CTPOP` returns the type of the argument, which can be wider than `int`. The fix is to make DAG legalizer pass the correct return type to `makeLibCall` and sign-extend the result afterwards. Original commit message: The main change is adding CTPOP to `RuntimeLibcalls.def` to allow targets to use LibCall action for CTPOP. DAG legalizers are changed accordingly. Pull Request: llvm#101786
The change as is breaks the Linux kernel build as pointed out in the comments.
The change as is breaks the Linux kernel build as pointed out in the comments.
This is a reland of #99752 with the bug fixed (see test diff in the third commit in this PR).
All
popcount
libcalls returnsint
, butISD::CTPOP
returns the type of the argument, which can be wider thanint
. The fix is to make DAG legalizer pass the correct return type tomakeLibCall
and sign-extend the result afterwards.