Skip to content

[SDAG] Allow folding stack slots into sincos/frexp in more cases #118117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/SelectionDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,9 @@ class SelectionDAG {
// Maximum depth for recursive analysis such as computeKnownBits, etc.
static constexpr unsigned MaxRecursionDepth = 6;

// Returns the maximum steps for SDNode->hasPredecessor() like searches.
static unsigned getHasPredecessorMaxSteps();

explicit SelectionDAG(const TargetMachine &TM, CodeGenOptLevel);
SelectionDAG(const SelectionDAG &) = delete;
SelectionDAG &operator=(const SelectionDAG &) = delete;
Expand Down
9 changes: 3 additions & 6 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,6 @@ static cl::opt<bool> EnableVectorFCopySignExtendRound(
"combiner-vector-fcopysign-extend-round", cl::Hidden, cl::init(false),
cl::desc(
"Enable merging extends and rounds into FCOPYSIGN on vector types"));

static cl::opt<unsigned int>
MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192),
cl::desc("DAG combiner limit number of steps when searching DAG "
"for predecessor nodes"));

namespace {

class DAGCombiner {
Expand Down Expand Up @@ -18929,6 +18923,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
// can be folded with this one. We should do this to avoid having to keep
// a copy of the original base pointer.
SmallVector<SDNode *, 16> OtherUses;
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
if (isa<ConstantSDNode>(Offset))
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end();
Expand Down Expand Up @@ -19093,6 +19088,7 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
return false;

SmallPtrSet<const SDNode *, 32> Visited;
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
for (SDNode *Use : BasePtr->uses()) {
if (Use == Ptr.getNode())
continue;
Expand Down Expand Up @@ -19139,6 +19135,7 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
// 2) Op must be independent of N, i.e. Op is neither a predecessor
// nor a successor of N. Otherwise, if Op is folded that would
// create a cycle.
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
for (SDNode *Op : Ptr->uses()) {
// Check for #1.
if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
Expand Down
82 changes: 72 additions & 10 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,17 @@ static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
cl::desc("Number limit for gluing ld/st of memcpy."),
cl::Hidden, cl::init(0));

static cl::opt<unsigned>
MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192),
cl::desc("DAG combiner limit number of steps when searching DAG "
"for predecessor nodes"));

static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
}

unsigned SelectionDAG::getHasPredecessorMaxSteps() { return MaxSteps; }

//===----------------------------------------------------------------------===//
// ConstantFPSDNode Class
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -2474,6 +2481,51 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
return Subvectors[0];
}

/// Given a store node \p StoreNode, return true if it is safe to fold that node
/// into \p FPNode, which expands to a library call with output pointers.
static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode,
SDNode *FPNode) {
SmallVector<const SDNode *, 8> Worklist;
SmallVector<const SDNode *, 8> DeferredNodes;
SmallPtrSet<const SDNode *, 16> Visited;

// Skip FPNode use by StoreNode (that's the use we want to fold into FPNode).
for (SDValue Op : StoreNode->ops())
if (Op.getNode() != FPNode)
Comment on lines +2493 to +2494
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't you only want to visit the chain?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably overly cautious, but I want to make sure that there's no user of another value of the node that occurs as a predecessor of the store (via any value) to avoid creating cycles.

Worklist.push_back(Op.getNode());

unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
while (!Worklist.empty()) {
const SDNode *Node = Worklist.pop_back_val();
auto [_, Inserted] = Visited.insert(Node);
if (!Inserted)
continue;

if (MaxSteps > 0 && Visited.size() >= MaxSteps)
return false;

// Reached the FPNode (would result in a cycle).
// OR Reached CALLSEQ_START (would result in nested call sequences).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needing to special case calls feels wrong. Can you just use reachesChainWithoutSideEffects?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, reachesChainWithoutSideEffects() does not do the check I need. It simply walks the chain, only allowing loads or token factors to be included (and it's not clear what the "dest" would be; it does not always need to be the entry node here).

I think this really is a special case. The issue comes from the stores being within a call sequence, which means if they're folded into the expansion, we'll get nested call sequences, which is illegal.

if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
return false;

if (Node->getOpcode() == ISD::CALLSEQ_END) {
// Defer looking into call sequences (so we can check we're outside one).
// We still need to look through these for the predecessor check.
DeferredNodes.push_back(Node);
continue;
}

for (SDValue Op : Node->ops())
Worklist.push_back(Op.getNode());
}

// True if we're outside a call sequence and don't have the FPNode as a
// predecessor. No cycles or nested call sequences possible.
return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes,
MaxSteps);
}

bool SelectionDAG::expandMultipleResultFPLibCall(
RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
std::optional<unsigned> CallRetResNo) {
Expand Down Expand Up @@ -2502,26 +2554,35 @@ bool SelectionDAG::expandMultipleResultFPLibCall(

// Find users of the node that store the results (and share input chains). The
// destination pointers can be used instead of creating stack allocations.
// FIXME: This should allow stores with the same chains (not just the entry
// chain), but there's a risk the store is within a (CALLSEQ_START,
// CALLSEQ_END) pair, which after this expansion will lead to nested call
// sequences.
SDValue InChain = getEntryNode();
SDValue StoresInChain;
SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
for (SDNode *User : Node->uses()) {
if (!ISD::isNormalStore(User))
continue;
auto *ST = cast<StoreSDNode>(User);
SDValue StoreValue = ST->getValue();
unsigned ResNo = StoreValue.getResNo();
// Ensure the store corresponds to an output pointer.
if (CallRetResNo == ResNo)
continue;
// Ensure the store to the default address space and not atomic or volatile.
if (!ST->isSimple() || ST->getAddressSpace() != 0)
continue;
// Ensure all store chains are the same (so they don't alias).
if (StoresInChain && ST->getChain() != StoresInChain)
continue;
// Ensure the store is properly aligned.
Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
if (CallRetResNo == ResNo || !ST->isSimple() ||
ST->getAddressSpace() != 0 ||
ST->getAlign() <
getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
ST->getChain() != InChain)
if (ST->getAlign() <
getDataLayout().getABITypeAlign(StoreType->getScalarType()))
continue;
// Avoid:
// 1. Creating cyclic dependencies.
// 2. Expanding the node to a call within a call sequence.
if (!canFoldStoreIntoLibCallOutputPointers(ST, Node))
continue;
ResultStores[ResNo] = ST;
StoresInChain = ST->getChain();
}

TargetLowering::ArgListTy Args;
Expand Down Expand Up @@ -2563,6 +2624,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
Type *RetType = CallRetResNo.has_value()
? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
: Type::getVoidTy(Ctx);
SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
TLI->getPointerTy(getDataLayout()));
TargetLowering::CallLoweringInfo CLI(*this);
Expand Down
34 changes: 34 additions & 0 deletions llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,37 @@ entry:
store double %cos, ptr %out_cos, align 4
ret void
}

declare void @foo(ptr, ptr)

define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: can_fold_with_call_in_chain:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset b8, -32
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: fmov s8, s0
; CHECK-NEXT: bl foo
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: mov x1, x19
; CHECK-NEXT: bl sincosf
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%sin = tail call float @llvm.sin.f32(float %x)
%cos = tail call float @llvm.cos.f32(float %x)
Comment on lines +283 to +284
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this is a codegen solution to an IR problem. These could have moved past the call

Copy link
Member Author

@MacDue MacDue Dec 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I follow. In SDAG, these don't have any particular place (as they're not side-effecting nodes until after the expansion).

The test here shows the two stores after the call to @foo, can still be folded into the @sincos library call, even though they're both chained to the call to @foo (as they don't alias with each other).

call void @foo(ptr %a, ptr %b)
store float %sin, ptr %a, align 4
store float %cos, ptr %b, align 4
ret void
}
32 changes: 10 additions & 22 deletions llvm/test/CodeGen/PowerPC/f128-arith.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
; CHECK-LABEL: qpFREXP:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mflr r0
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: stdu r1, -32(r1)
; CHECK-NEXT: std r0, 48(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset lr, 16
; CHECK-NEXT: .cfi_offset r30, -16
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: stdu r1, -64(r1)
; CHECK-NEXT: std r0, 80(r1)
; CHECK-NEXT: addi r5, r1, 44
; CHECK-NEXT: mr r30, r4
; CHECK-NEXT: lxv v2, 0(r3)
; CHECK-NEXT: mr r5, r4
; CHECK-NEXT: bl frexpf128
; CHECK-NEXT: nop
; CHECK-NEXT: lwz r3, 44(r1)
; CHECK-NEXT: stw r3, 0(r30)
; CHECK-NEXT: addi r1, r1, 64
; CHECK-NEXT: addi r1, r1, 32
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
;
; CHECK-P8-LABEL: qpFREXP:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: mflr r0
; CHECK-P8-NEXT: .cfi_def_cfa_offset 64
; CHECK-P8-NEXT: stdu r1, -32(r1)
; CHECK-P8-NEXT: std r0, 48(r1)
; CHECK-P8-NEXT: .cfi_def_cfa_offset 32
; CHECK-P8-NEXT: .cfi_offset lr, 16
; CHECK-P8-NEXT: .cfi_offset r30, -16
; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-P8-NEXT: stdu r1, -64(r1)
; CHECK-P8-NEXT: std r0, 80(r1)
; CHECK-P8-NEXT: addi r5, r1, 44
; CHECK-P8-NEXT: mr r30, r4
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
; CHECK-P8-NEXT: mr r5, r4
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: bl frexpf128
; CHECK-P8-NEXT: nop
; CHECK-P8-NEXT: lwz r3, 44(r1)
; CHECK-P8-NEXT: stw r3, 0(r30)
; CHECK-P8-NEXT: addi r1, r1, 64
; CHECK-P8-NEXT: addi r1, r1, 32
; CHECK-P8-NEXT: ld r0, 16(r1)
; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-P8-NEXT: mtlr r0
; CHECK-P8-NEXT: blr
entry:
Expand Down
Loading
Loading