-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[SDAG] Allow folding stack slots into sincos/frexp in more cases #118117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
227798b
4347f35
794f434
3ffca6b
edf4e72
b2b304d
c3ee9d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -111,10 +111,17 @@ static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max", | |
cl::desc("Number limit for gluing ld/st of memcpy."), | ||
cl::Hidden, cl::init(0)); | ||
|
||
static cl::opt<unsigned> | ||
MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), | ||
cl::desc("DAG combiner limit number of steps when searching DAG " | ||
"for predecessor nodes")); | ||
|
||
static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) { | ||
LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G);); | ||
} | ||
|
||
unsigned SelectionDAG::getHasPredecessorMaxSteps() { return MaxSteps; } | ||
|
||
//===----------------------------------------------------------------------===// | ||
// ConstantFPSDNode Class | ||
//===----------------------------------------------------------------------===// | ||
|
@@ -2474,6 +2481,51 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, | |
return Subvectors[0]; | ||
} | ||
|
||
/// Given a store node \p StoreNode, return true if it is safe to fold that node | ||
/// into \p FPNode, which expands to a library call with output pointers. | ||
static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, | ||
SDNode *FPNode) { | ||
SmallVector<const SDNode *, 8> Worklist; | ||
SmallVector<const SDNode *, 8> DeferredNodes; | ||
SmallPtrSet<const SDNode *, 16> Visited; | ||
|
||
// Skip FPNode use by StoreNode (that's the use we want to fold into FPNode). | ||
for (SDValue Op : StoreNode->ops()) | ||
if (Op.getNode() != FPNode) | ||
Worklist.push_back(Op.getNode()); | ||
|
||
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); | ||
while (!Worklist.empty()) { | ||
MacDue marked this conversation as resolved.
Show resolved
Hide resolved
|
||
const SDNode *Node = Worklist.pop_back_val(); | ||
auto [_, Inserted] = Visited.insert(Node); | ||
if (!Inserted) | ||
continue; | ||
|
||
if (MaxSteps > 0 && Visited.size() >= MaxSteps) | ||
return false; | ||
|
||
// Reached the FPNode (would result in a cycle). | ||
// OR Reached CALLSEQ_START (would result in nested call sequences). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Needing to special case calls feels wrong. Can you just use reachesChainWithoutSideEffects? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I think this really is a special case. The issue comes from the stores being within a call sequence, which means if they're folded into the expansion, we'll get nested call sequences, which is illegal. |
||
if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START) | ||
return false; | ||
|
||
if (Node->getOpcode() == ISD::CALLSEQ_END) { | ||
// Defer looking into call sequences (so we can check we're outside one). | ||
// We still need to look through these for the predecessor check. | ||
DeferredNodes.push_back(Node); | ||
continue; | ||
} | ||
|
||
for (SDValue Op : Node->ops()) | ||
Worklist.push_back(Op.getNode()); | ||
} | ||
|
||
// True if we're outside a call sequence and don't have the FPNode as a | ||
// predecessor. No cycles or nested call sequences possible. | ||
return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes, | ||
MaxSteps); | ||
} | ||
|
||
bool SelectionDAG::expandMultipleResultFPLibCall( | ||
RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results, | ||
std::optional<unsigned> CallRetResNo) { | ||
|
@@ -2502,26 +2554,35 @@ bool SelectionDAG::expandMultipleResultFPLibCall( | |
|
||
// Find users of the node that store the results (and share input chains). The | ||
// destination pointers can be used instead of creating stack allocations. | ||
// FIXME: This should allow stores with the same chains (not just the entry | ||
// chain), but there's a risk the store is within a (CALLSEQ_START, | ||
// CALLSEQ_END) pair, which after this expansion will lead to nested call | ||
// sequences. | ||
SDValue InChain = getEntryNode(); | ||
SDValue StoresInChain; | ||
SmallVector<StoreSDNode *, 2> ResultStores(NumResults); | ||
for (SDNode *User : Node->uses()) { | ||
if (!ISD::isNormalStore(User)) | ||
continue; | ||
auto *ST = cast<StoreSDNode>(User); | ||
SDValue StoreValue = ST->getValue(); | ||
unsigned ResNo = StoreValue.getResNo(); | ||
// Ensure the store corresponds to an output pointer. | ||
if (CallRetResNo == ResNo) | ||
continue; | ||
// Ensure the store to the default address space and not atomic or volatile. | ||
if (!ST->isSimple() || ST->getAddressSpace() != 0) | ||
continue; | ||
// Ensure all store chains are the same (so they don't alias). | ||
if (StoresInChain && ST->getChain() != StoresInChain) | ||
continue; | ||
// Ensure the store is properly aligned. | ||
Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx); | ||
if (CallRetResNo == ResNo || !ST->isSimple() || | ||
ST->getAddressSpace() != 0 || | ||
ST->getAlign() < | ||
getDataLayout().getABITypeAlign(StoreType->getScalarType()) || | ||
ST->getChain() != InChain) | ||
if (ST->getAlign() < | ||
getDataLayout().getABITypeAlign(StoreType->getScalarType())) | ||
continue; | ||
// Avoid: | ||
// 1. Creating cyclic dependencies. | ||
// 2. Expanding the node to a call within a call sequence. | ||
if (!canFoldStoreIntoLibCallOutputPointers(ST, Node)) | ||
continue; | ||
ResultStores[ResNo] = ST; | ||
StoresInChain = ST->getChain(); | ||
} | ||
|
||
TargetLowering::ArgListTy Args; | ||
|
@@ -2563,6 +2624,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall( | |
Type *RetType = CallRetResNo.has_value() | ||
? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx) | ||
: Type::getVoidTy(Ctx); | ||
SDValue InChain = StoresInChain ? StoresInChain : getEntryNode(); | ||
SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName, | ||
TLI->getPointerTy(getDataLayout())); | ||
TargetLowering::CallLoweringInfo CLI(*this); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -253,3 +253,37 @@ entry: | |
store double %cos, ptr %out_cos, align 4 | ||
ret void | ||
} | ||
|
||
declare void @foo(ptr, ptr) | ||
|
||
define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) { | ||
; CHECK-LABEL: can_fold_with_call_in_chain: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill | ||
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill | ||
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill | ||
; CHECK-NEXT: .cfi_def_cfa_offset 32 | ||
; CHECK-NEXT: .cfi_offset w19, -8 | ||
; CHECK-NEXT: .cfi_offset w20, -16 | ||
; CHECK-NEXT: .cfi_offset w30, -24 | ||
; CHECK-NEXT: .cfi_offset b8, -32 | ||
; CHECK-NEXT: mov x19, x1 | ||
; CHECK-NEXT: mov x20, x0 | ||
; CHECK-NEXT: fmov s8, s0 | ||
; CHECK-NEXT: bl foo | ||
; CHECK-NEXT: fmov s0, s8 | ||
; CHECK-NEXT: mov x0, x20 | ||
; CHECK-NEXT: mov x1, x19 | ||
; CHECK-NEXT: bl sincosf | ||
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload | ||
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload | ||
; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload | ||
; CHECK-NEXT: ret | ||
entry: | ||
%sin = tail call float @llvm.sin.f32(float %x) | ||
%cos = tail call float @llvm.cos.f32(float %x) | ||
Comment on lines
+283
to
+284
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if this is a codegen solution to an IR problem. These could have moved past the call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I follow. In SDAG, these don't have any particular place (as they're not side-effecting nodes until after the expansion). The test here shows the two stores after the call to |
||
call void @foo(ptr %a, ptr %b) | ||
store float %sin, ptr %a, align 4 | ||
store float %cos, ptr %b, align 4 | ||
ret void | ||
} | ||
MacDue marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't you only want to visit the chain?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is probably overly cautious, but I want to make sure that there's no user of another value of the node that occurs as a predecessor of the store (via any value) to avoid creating cycles.