-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[SDAG] Allow folding stack slots into sincos/frexp in more cases #118117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
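@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-backend-powerpc
Author: Benjamin Maxwell (MacDue)
Changes: This adds a new helper, `CanFoldStoreIntoFPLibCall()`, which checks that folding a store into the FP node (1) does not create a cyclic dependency and (2) does not expand the node to a call within a call sequence.
These two properties are checked as part of the same traversal in `CanFoldStoreIntoFPLibCall()`.
Patch is 26.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118117.diff
6 Files Affected: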
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 182529123ec6d8..8ddb4bcb8fb212 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2474,6 +2474,45 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
return Subvectors[0];
}
+/// Given a store node \p StoreNode, return true if it is safe to fold that node
+/// into \p FPNode, which expands to a library call with output pointers. Returns false if walking the operands of \p StoreNode (other than \p FPNode itself) reaches \p FPNode or an ISD::CALLSEQ_START node, or if the final predecessor check over the deferred ISD::CALLSEQ_END nodes finds \p FPNode.
+static bool CanFoldStoreIntoFPLibCall(StoreSDNode *StoreNode, SDNode *FPNode) {
+ SmallVector<const SDNode *, 8> Worklist; // Nodes still to be visited by the walk below.
+ SmallVector<const SDNode *, 8> DeferredNodes; // CALLSEQ_END nodes whose operands are not walked by the loop below.
+ SmallPtrSet<const SDNode *, 16> Visited; // Nodes already popped from the worklist (also handed to the final predecessor check).
+
+ // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode). Every other operand of the store seeds the walk.
+ for (SDValue Op : StoreNode->ops())
+ if (Op.getNode() != FPNode)
+ Worklist.push_back(Op.getNode());
+
+ while (!Worklist.empty()) {
+ const SDNode *Node = Worklist.pop_back_val();
+ auto [_, Inserted] = Visited.insert(Node);
+ if (!Inserted)
+ continue; // Already visited; do not walk its operands again.
+
+ // Reached the FPNode (would result in a cycle).
+ // OR Reached CALLSEQ_START (would result in nested call sequences).
+ if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
+ return false;
+
+ if (Node->getOpcode() == ISD::CALLSEQ_END) {
+ // Defer looking into call sequences (so we can check we're outside one).
+ // We still need to look through these for the predecessor check. The operands of this node are therefore not pushed onto the worklist here.
+ DeferredNodes.push_back(Node);
+ continue;
+ }
+
+ for (SDValue Op : Node->ops())
+ Worklist.push_back(Op.getNode());
+ }
+
+ // True if we're outside a call sequence and don't have the FPNode as a
+ // predecessor. No cycles or nested call sequences possible. The deferred CALLSEQ_END nodes are passed as the worklist of the predecessor check, together with the Visited set built above.
+ return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes);
+}
+
bool SelectionDAG::expandMultipleResultFPLibCall(
RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
std::optional<unsigned> CallRetResNo) {
@@ -2502,11 +2541,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
// Find users of the node that store the results (and share input chains). The
// destination pointers can be used instead of creating stack allocations.
- // FIXME: This should allow stores with the same chains (not just the entry
- // chain), but there's a risk the store is within a (CALLSEQ_START,
- // CALLSEQ_END) pair, which after this expansion will lead to nested call
- // sequences.
- SDValue InChain = getEntryNode();
+ SDValue StoresInChain{};
SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
for (SDNode *User : Node->uses()) {
if (!ISD::isNormalStore(User))
@@ -2515,13 +2550,25 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
SDValue StoreValue = ST->getValue();
unsigned ResNo = StoreValue.getResNo();
Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
- if (CallRetResNo == ResNo || !ST->isSimple() ||
+ if (
+ // Ensure the store corresponds to an output pointer.
+ CallRetResNo == ResNo ||
+ // Ensure the store is not atomic or volatile.
+ !ST->isSimple() ||
+ // Ensure the store is in the default address space.
ST->getAddressSpace() != 0 ||
+ // Ensure the store is properly aligned.
ST->getAlign() <
getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
- ST->getChain() != InChain)
+ // Ensure all store chains are the same (so they don't alias).
+ (StoresInChain && ST->getChain() != StoresInChain)
+ // Avoid:
+ // 1. Creating cyclic dependencies.
+ // 2. Expanding the node to a call within a call sequence.
+ || !CanFoldStoreIntoFPLibCall(ST, Node))
continue;
ResultStores[ResNo] = ST;
+ StoresInChain = ST->getChain();
}
TargetLowering::ArgListTy Args;
@@ -2563,6 +2610,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
Type *RetType = CallRetResNo.has_value()
? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
: Type::getVoidTy(Ctx);
+ SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
TLI->getPointerTy(getDataLayout()));
TargetLowering::CallLoweringInfo CLI(*this);
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index 8ef8b5d13b62d4..c5fef61c96af3a 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -253,3 +253,37 @@ entry:
store double %cos, ptr %out_cos, align 4
ret void
}
+
+declare void @foo(ptr, ptr)
+
+define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: can_fold_with_call_in_chain:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset b8, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: fmov s8, s0
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: fmov s0, s8
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ call void @foo(ptr %a, ptr %b)
+ store float %sin, ptr %a, align 4
+ store float %cos, ptr %b, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index 35e5d61947ead7..decc4a38f7ccd4 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
; CHECK-LABEL: qpFREXP:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mflr r0
-; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: stdu r1, -32(r1)
+; CHECK-NEXT: std r0, 48(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset lr, 16
-; CHECK-NEXT: .cfi_offset r30, -16
-; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT: stdu r1, -64(r1)
-; CHECK-NEXT: std r0, 80(r1)
-; CHECK-NEXT: addi r5, r1, 44
-; CHECK-NEXT: mr r30, r4
; CHECK-NEXT: lxv v2, 0(r3)
+; CHECK-NEXT: mr r5, r4
; CHECK-NEXT: bl frexpf128
; CHECK-NEXT: nop
-; CHECK-NEXT: lwz r3, 44(r1)
-; CHECK-NEXT: stw r3, 0(r30)
-; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: addi r1, r1, 32
; CHECK-NEXT: ld r0, 16(r1)
-; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
;
; CHECK-P8-LABEL: qpFREXP:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: mflr r0
-; CHECK-P8-NEXT: .cfi_def_cfa_offset 64
+; CHECK-P8-NEXT: stdu r1, -32(r1)
+; CHECK-P8-NEXT: std r0, 48(r1)
+; CHECK-P8-NEXT: .cfi_def_cfa_offset 32
; CHECK-P8-NEXT: .cfi_offset lr, 16
-; CHECK-P8-NEXT: .cfi_offset r30, -16
-; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT: stdu r1, -64(r1)
-; CHECK-P8-NEXT: std r0, 80(r1)
-; CHECK-P8-NEXT: addi r5, r1, 44
-; CHECK-P8-NEXT: mr r30, r4
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT: mr r5, r4
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: bl frexpf128
; CHECK-P8-NEXT: nop
-; CHECK-P8-NEXT: lwz r3, 44(r1)
-; CHECK-P8-NEXT: stw r3, 0(r30)
-; CHECK-P8-NEXT: addi r1, r1, 64
+; CHECK-P8-NEXT: addi r1, r1, 32
; CHECK-P8-NEXT: ld r0, 16(r1)
-; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-P8-NEXT: mtlr r0
; CHECK-P8-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 74dec76a02e892..4a77b4d32cdda6 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -543,50 +543,42 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) nounwind {
define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwind {
; RV32IFD-LABEL: test_frexp_v4f32_v4i32:
; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: addi sp, sp, -64
-; RV32IFD-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: addi sp, sp, -48
+; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs1, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs2, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs3, 8(sp) # 8-byte Folded Spill
; RV32IFD-NEXT: fmv.s fs0, fa3
; RV32IFD-NEXT: fmv.s fs1, fa2
; RV32IFD-NEXT: fmv.s fs2, fa1
; RV32IFD-NEXT: mv s0, a0
-; RV32IFD-NEXT: addi a0, sp, 8
+; RV32IFD-NEXT: addi a0, a0, 16
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs3, fa0
-; RV32IFD-NEXT: addi a0, sp, 12
+; RV32IFD-NEXT: addi a0, s0, 20
; RV32IFD-NEXT: fmv.s fa0, fs2
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs2, fa0
-; RV32IFD-NEXT: addi a0, sp, 16
+; RV32IFD-NEXT: addi a0, s0, 24
; RV32IFD-NEXT: fmv.s fa0, fs1
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs1, fa0
-; RV32IFD-NEXT: addi a0, sp, 20
+; RV32IFD-NEXT: addi a0, s0, 28
; RV32IFD-NEXT: fmv.s fa0, fs0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: lw a0, 8(sp)
-; RV32IFD-NEXT: lw a1, 12(sp)
-; RV32IFD-NEXT: lw a2, 16(sp)
-; RV32IFD-NEXT: lw a3, 20(sp)
-; RV32IFD-NEXT: sw a0, 16(s0)
-; RV32IFD-NEXT: sw a1, 20(s0)
-; RV32IFD-NEXT: sw a2, 24(s0)
-; RV32IFD-NEXT: sw a3, 28(s0)
; RV32IFD-NEXT: fsw fs3, 0(s0)
; RV32IFD-NEXT: fsw fs2, 4(s0)
; RV32IFD-NEXT: fsw fs1, 8(s0)
; RV32IFD-NEXT: fsw fa0, 12(s0)
-; RV32IFD-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: addi sp, sp, 64
+; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs1, 24(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs2, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs3, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: addi sp, sp, 48
; RV32IFD-NEXT: ret
;
; RV64IFD-LABEL: test_frexp_v4f32_v4i32:
@@ -639,52 +631,44 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
;
; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: mv s0, a4
; RV32IZFINXZDINX-NEXT: mv s1, a3
; RV32IZFINXZDINX-NEXT: mv s2, a2
; RV32IZFINXZDINX-NEXT: mv a2, a1
; RV32IZFINXZDINX-NEXT: mv s3, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 8
+; RV32IZFINXZDINX-NEXT: addi a1, a0, 16
; RV32IZFINXZDINX-NEXT: mv a0, a2
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s4, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 12
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 20
; RV32IZFINXZDINX-NEXT: mv a0, s2
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s2, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 16
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 24
; RV32IZFINXZDINX-NEXT: mv a0, s1
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s1, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 28
; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: lw a1, 8(sp)
-; RV32IZFINXZDINX-NEXT: lw a2, 12(sp)
-; RV32IZFINXZDINX-NEXT: lw a3, 16(sp)
-; RV32IZFINXZDINX-NEXT: lw a4, 20(sp)
-; RV32IZFINXZDINX-NEXT: sw a1, 16(s3)
-; RV32IZFINXZDINX-NEXT: sw a2, 20(s3)
-; RV32IZFINXZDINX-NEXT: sw a3, 24(s3)
-; RV32IZFINXZDINX-NEXT: sw a4, 28(s3)
; RV32IZFINXZDINX-NEXT: sw s4, 0(s3)
; RV32IZFINXZDINX-NEXT: sw s2, 4(s3)
; RV32IZFINXZDINX-NEXT: sw s1, 8(s3)
; RV32IZFINXZDINX-NEXT: sw a0, 12(s3)
-; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: addi sp, sp, 32
; RV32IZFINXZDINX-NEXT: ret
;
; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
@@ -1096,41 +1080,34 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
; RV32IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: addi sp, sp, -48
-; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs1, 24(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs2, 16(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fmv.s fs0, fa3
-; RV32IFD-NEXT: fmv.s fs1, fa2
-; RV32IFD-NEXT: fmv.s fs2, fa1
+; RV32IFD-NEXT: addi sp, sp, -32
+; RV32IFD-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fmv.s fs0, fa2
+; RV32IFD-NEXT: fmv.s fs1, fa1
+; RV32IFD-NEXT: fmv.s fs2, fa0
; RV32IFD-NEXT: mv s0, a0
-; RV32IFD-NEXT: mv a0, sp
+; RV32IFD-NEXT: addi a0, a0, 12
+; RV32IFD-NEXT: fmv.s fa0, fa3
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 4
-; RV32IFD-NEXT: fmv.s fa0, fs2
+; RV32IFD-NEXT: addi a0, s0, 8
+; RV32IFD-NEXT: fmv.s fa0, fs0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 8
+; RV32IFD-NEXT: addi a0, s0, 4
; RV32IFD-NEXT: fmv.s fa0, fs1
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 12
-; RV32IFD-NEXT: fmv.s fa0, fs0
+; RV32IFD-NEXT: fmv.s fa0, fs2
+; RV32IFD-NEXT: mv a0, s0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: lw a0, 0(sp)
-; RV32IFD-NEXT: lw a1, 4(sp)
-; RV32IFD-NEXT: lw a2, 8(sp)
-; RV32IFD-NEXT: lw a3, 12(sp)
-; RV32IFD-NEXT: sw a0, 0(s0)
-; RV32IFD-NEXT: sw a1, 4(s0)
-; RV32IFD-NEXT: sw a2, 8(s0)
-; RV32IFD-NEXT: sw a3, 12(s0)
-; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs1, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs2, 16(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: addi sp, sp, 48
+; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: addi sp, sp, 32
; RV32IFD-NEXT: ret
;
; RV64IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
@@ -1174,43 +1151,34 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
;
; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: mv s0, a4
-; RV32IZFINXZDINX-NEXT: mv s1, a3
-; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv a2, a1
+; RV32IZFINXZDINX-NEXT: addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: mv s0, a3
+; RV32IZFINXZDINX-NEXT: mv s1, a2
+; RV32IZFINXZDINX-NEXT: mv s2, a1
; RV32IZFINXZDINX-NEXT: mv s3, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 12
-; RV32IZFINXZDINX-NEXT: mv a0, a2
+; RV32IZFINXZDINX-NEXT: addi a1, a0, 12
+; RV32IZFINXZDINX-NEXT: mv a0, a4
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 16
-; RV32IZFINXZDINX-NEXT: mv a0, s2
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 8
+; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 4
; RV32IZFINXZDINX-NEXT: mv a0, s1
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 24
-; RV32IZFINXZDINX-NEXT: mv a0, s0
+; RV32IZFINXZDINX-NEXT: mv a0, s2
+; RV32IZFINXZDINX-NEXT: mv a1, s3
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: lw a0, 12(sp)
-; RV32IZFINXZDINX-NEXT: lw a1, 16(sp)
-; RV32IZFINXZDINX-NEXT: lw a2, 20(sp)
-; RV32IZFINXZDINX-NEXT: lw a3, 24(sp)
-; RV32IZFINXZDINX-NEXT: sw a0, 0(s3)
-; RV32IZFINXZDINX-NEXT: sw a1, 4(s3)
-; RV32IZFINXZDINX-NEXT: sw a2, 8(s3)
-; RV32IZFINXZDINX-NEXT: sw a3, 12(s3)
-; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 48
+; R...
[truncated]
|
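@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
Changes: This adds a new helper, `CanFoldStoreIntoFPLibCall()`, which checks that folding a store into the FP node (1) does not create a cyclic dependency and (2) does not expand the node to a call within a call sequence.
These two properties are checked as part of the same traversal in `CanFoldStoreIntoFPLibCall()`.
Patch is 26.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118117.diff
6 Files Affected: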
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 182529123ec6d8..8ddb4bcb8fb212 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2474,6 +2474,45 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
return Subvectors[0];
}
+/// Given a store node \p StoreNode, return true if it is safe to fold that node
+/// into \p FPNode, which expands to a library call with output pointers. Returns false if walking the operands of \p StoreNode (other than \p FPNode itself) reaches \p FPNode or an ISD::CALLSEQ_START node, or if the final predecessor check over the deferred ISD::CALLSEQ_END nodes finds \p FPNode.
+static bool CanFoldStoreIntoFPLibCall(StoreSDNode *StoreNode, SDNode *FPNode) {
+ SmallVector<const SDNode *, 8> Worklist; // Nodes still to be visited by the walk below.
+ SmallVector<const SDNode *, 8> DeferredNodes; // CALLSEQ_END nodes whose operands are not walked by the loop below.
+ SmallPtrSet<const SDNode *, 16> Visited; // Nodes already popped from the worklist (also handed to the final predecessor check).
+
+ // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode). Every other operand of the store seeds the walk.
+ for (SDValue Op : StoreNode->ops())
+ if (Op.getNode() != FPNode)
+ Worklist.push_back(Op.getNode());
+
+ while (!Worklist.empty()) {
+ const SDNode *Node = Worklist.pop_back_val();
+ auto [_, Inserted] = Visited.insert(Node);
+ if (!Inserted)
+ continue; // Already visited; do not walk its operands again.
+
+ // Reached the FPNode (would result in a cycle).
+ // OR Reached CALLSEQ_START (would result in nested call sequences).
+ if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
+ return false;
+
+ if (Node->getOpcode() == ISD::CALLSEQ_END) {
+ // Defer looking into call sequences (so we can check we're outside one).
+ // We still need to look through these for the predecessor check. The operands of this node are therefore not pushed onto the worklist here.
+ DeferredNodes.push_back(Node);
+ continue;
+ }
+
+ for (SDValue Op : Node->ops())
+ Worklist.push_back(Op.getNode());
+ }
+
+ // True if we're outside a call sequence and don't have the FPNode as a
+ // predecessor. No cycles or nested call sequences possible. The deferred CALLSEQ_END nodes are passed as the worklist of the predecessor check, together with the Visited set built above.
+ return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes);
+}
+
bool SelectionDAG::expandMultipleResultFPLibCall(
RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
std::optional<unsigned> CallRetResNo) {
@@ -2502,11 +2541,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
// Find users of the node that store the results (and share input chains). The
// destination pointers can be used instead of creating stack allocations.
- // FIXME: This should allow stores with the same chains (not just the entry
- // chain), but there's a risk the store is within a (CALLSEQ_START,
- // CALLSEQ_END) pair, which after this expansion will lead to nested call
- // sequences.
- SDValue InChain = getEntryNode();
+ SDValue StoresInChain{};
SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
for (SDNode *User : Node->uses()) {
if (!ISD::isNormalStore(User))
@@ -2515,13 +2550,25 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
SDValue StoreValue = ST->getValue();
unsigned ResNo = StoreValue.getResNo();
Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
- if (CallRetResNo == ResNo || !ST->isSimple() ||
+ if (
+ // Ensure the store corresponds to an output pointer.
+ CallRetResNo == ResNo ||
+ // Ensure the store is not atomic or volatile.
+ !ST->isSimple() ||
+ // Ensure the store is in the default address space.
ST->getAddressSpace() != 0 ||
+ // Ensure the store is properly aligned.
ST->getAlign() <
getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
- ST->getChain() != InChain)
+ // Ensure all store chains are the same (so they don't alias).
+ (StoresInChain && ST->getChain() != StoresInChain)
+ // Avoid:
+ // 1. Creating cyclic dependencies.
+ // 2. Expanding the node to a call within a call sequence.
+ || !CanFoldStoreIntoFPLibCall(ST, Node))
continue;
ResultStores[ResNo] = ST;
+ StoresInChain = ST->getChain();
}
TargetLowering::ArgListTy Args;
@@ -2563,6 +2610,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
Type *RetType = CallRetResNo.has_value()
? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
: Type::getVoidTy(Ctx);
+ SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
TLI->getPointerTy(getDataLayout()));
TargetLowering::CallLoweringInfo CLI(*this);
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index 8ef8b5d13b62d4..c5fef61c96af3a 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -253,3 +253,37 @@ entry:
store double %cos, ptr %out_cos, align 4
ret void
}
+
+declare void @foo(ptr, ptr)
+
+define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: can_fold_with_call_in_chain:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset b8, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: fmov s8, s0
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: fmov s0, s8
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ call void @foo(ptr %a, ptr %b)
+ store float %sin, ptr %a, align 4
+ store float %cos, ptr %b, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index 35e5d61947ead7..decc4a38f7ccd4 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
; CHECK-LABEL: qpFREXP:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mflr r0
-; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: stdu r1, -32(r1)
+; CHECK-NEXT: std r0, 48(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset lr, 16
-; CHECK-NEXT: .cfi_offset r30, -16
-; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT: stdu r1, -64(r1)
-; CHECK-NEXT: std r0, 80(r1)
-; CHECK-NEXT: addi r5, r1, 44
-; CHECK-NEXT: mr r30, r4
; CHECK-NEXT: lxv v2, 0(r3)
+; CHECK-NEXT: mr r5, r4
; CHECK-NEXT: bl frexpf128
; CHECK-NEXT: nop
-; CHECK-NEXT: lwz r3, 44(r1)
-; CHECK-NEXT: stw r3, 0(r30)
-; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: addi r1, r1, 32
; CHECK-NEXT: ld r0, 16(r1)
-; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
;
; CHECK-P8-LABEL: qpFREXP:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: mflr r0
-; CHECK-P8-NEXT: .cfi_def_cfa_offset 64
+; CHECK-P8-NEXT: stdu r1, -32(r1)
+; CHECK-P8-NEXT: std r0, 48(r1)
+; CHECK-P8-NEXT: .cfi_def_cfa_offset 32
; CHECK-P8-NEXT: .cfi_offset lr, 16
-; CHECK-P8-NEXT: .cfi_offset r30, -16
-; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT: stdu r1, -64(r1)
-; CHECK-P8-NEXT: std r0, 80(r1)
-; CHECK-P8-NEXT: addi r5, r1, 44
-; CHECK-P8-NEXT: mr r30, r4
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT: mr r5, r4
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: bl frexpf128
; CHECK-P8-NEXT: nop
-; CHECK-P8-NEXT: lwz r3, 44(r1)
-; CHECK-P8-NEXT: stw r3, 0(r30)
-; CHECK-P8-NEXT: addi r1, r1, 64
+; CHECK-P8-NEXT: addi r1, r1, 32
; CHECK-P8-NEXT: ld r0, 16(r1)
-; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-P8-NEXT: mtlr r0
; CHECK-P8-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 74dec76a02e892..4a77b4d32cdda6 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -543,50 +543,42 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) nounwind {
define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwind {
; RV32IFD-LABEL: test_frexp_v4f32_v4i32:
; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: addi sp, sp, -64
-; RV32IFD-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: addi sp, sp, -48
+; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs1, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs2, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs3, 8(sp) # 8-byte Folded Spill
; RV32IFD-NEXT: fmv.s fs0, fa3
; RV32IFD-NEXT: fmv.s fs1, fa2
; RV32IFD-NEXT: fmv.s fs2, fa1
; RV32IFD-NEXT: mv s0, a0
-; RV32IFD-NEXT: addi a0, sp, 8
+; RV32IFD-NEXT: addi a0, a0, 16
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs3, fa0
-; RV32IFD-NEXT: addi a0, sp, 12
+; RV32IFD-NEXT: addi a0, s0, 20
; RV32IFD-NEXT: fmv.s fa0, fs2
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs2, fa0
-; RV32IFD-NEXT: addi a0, sp, 16
+; RV32IFD-NEXT: addi a0, s0, 24
; RV32IFD-NEXT: fmv.s fa0, fs1
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs1, fa0
-; RV32IFD-NEXT: addi a0, sp, 20
+; RV32IFD-NEXT: addi a0, s0, 28
; RV32IFD-NEXT: fmv.s fa0, fs0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: lw a0, 8(sp)
-; RV32IFD-NEXT: lw a1, 12(sp)
-; RV32IFD-NEXT: lw a2, 16(sp)
-; RV32IFD-NEXT: lw a3, 20(sp)
-; RV32IFD-NEXT: sw a0, 16(s0)
-; RV32IFD-NEXT: sw a1, 20(s0)
-; RV32IFD-NEXT: sw a2, 24(s0)
-; RV32IFD-NEXT: sw a3, 28(s0)
; RV32IFD-NEXT: fsw fs3, 0(s0)
; RV32IFD-NEXT: fsw fs2, 4(s0)
; RV32IFD-NEXT: fsw fs1, 8(s0)
; RV32IFD-NEXT: fsw fa0, 12(s0)
-; RV32IFD-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: addi sp, sp, 64
+; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs1, 24(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs2, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs3, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: addi sp, sp, 48
; RV32IFD-NEXT: ret
;
; RV64IFD-LABEL: test_frexp_v4f32_v4i32:
@@ -639,52 +631,44 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
;
; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: mv s0, a4
; RV32IZFINXZDINX-NEXT: mv s1, a3
; RV32IZFINXZDINX-NEXT: mv s2, a2
; RV32IZFINXZDINX-NEXT: mv a2, a1
; RV32IZFINXZDINX-NEXT: mv s3, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 8
+; RV32IZFINXZDINX-NEXT: addi a1, a0, 16
; RV32IZFINXZDINX-NEXT: mv a0, a2
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s4, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 12
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 20
; RV32IZFINXZDINX-NEXT: mv a0, s2
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s2, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 16
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 24
; RV32IZFINXZDINX-NEXT: mv a0, s1
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s1, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 28
; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: lw a1, 8(sp)
-; RV32IZFINXZDINX-NEXT: lw a2, 12(sp)
-; RV32IZFINXZDINX-NEXT: lw a3, 16(sp)
-; RV32IZFINXZDINX-NEXT: lw a4, 20(sp)
-; RV32IZFINXZDINX-NEXT: sw a1, 16(s3)
-; RV32IZFINXZDINX-NEXT: sw a2, 20(s3)
-; RV32IZFINXZDINX-NEXT: sw a3, 24(s3)
-; RV32IZFINXZDINX-NEXT: sw a4, 28(s3)
; RV32IZFINXZDINX-NEXT: sw s4, 0(s3)
; RV32IZFINXZDINX-NEXT: sw s2, 4(s3)
; RV32IZFINXZDINX-NEXT: sw s1, 8(s3)
; RV32IZFINXZDINX-NEXT: sw a0, 12(s3)
-; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: addi sp, sp, 32
; RV32IZFINXZDINX-NEXT: ret
;
; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
@@ -1096,41 +1080,34 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
; RV32IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: addi sp, sp, -48
-; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs1, 24(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs2, 16(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fmv.s fs0, fa3
-; RV32IFD-NEXT: fmv.s fs1, fa2
-; RV32IFD-NEXT: fmv.s fs2, fa1
+; RV32IFD-NEXT: addi sp, sp, -32
+; RV32IFD-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fmv.s fs0, fa2
+; RV32IFD-NEXT: fmv.s fs1, fa1
+; RV32IFD-NEXT: fmv.s fs2, fa0
; RV32IFD-NEXT: mv s0, a0
-; RV32IFD-NEXT: mv a0, sp
+; RV32IFD-NEXT: addi a0, a0, 12
+; RV32IFD-NEXT: fmv.s fa0, fa3
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 4
-; RV32IFD-NEXT: fmv.s fa0, fs2
+; RV32IFD-NEXT: addi a0, s0, 8
+; RV32IFD-NEXT: fmv.s fa0, fs0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 8
+; RV32IFD-NEXT: addi a0, s0, 4
; RV32IFD-NEXT: fmv.s fa0, fs1
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 12
-; RV32IFD-NEXT: fmv.s fa0, fs0
+; RV32IFD-NEXT: fmv.s fa0, fs2
+; RV32IFD-NEXT: mv a0, s0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: lw a0, 0(sp)
-; RV32IFD-NEXT: lw a1, 4(sp)
-; RV32IFD-NEXT: lw a2, 8(sp)
-; RV32IFD-NEXT: lw a3, 12(sp)
-; RV32IFD-NEXT: sw a0, 0(s0)
-; RV32IFD-NEXT: sw a1, 4(s0)
-; RV32IFD-NEXT: sw a2, 8(s0)
-; RV32IFD-NEXT: sw a3, 12(s0)
-; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs1, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs2, 16(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: addi sp, sp, 48
+; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: addi sp, sp, 32
; RV32IFD-NEXT: ret
;
; RV64IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
@@ -1174,43 +1151,34 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
;
; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: mv s0, a4
-; RV32IZFINXZDINX-NEXT: mv s1, a3
-; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv a2, a1
+; RV32IZFINXZDINX-NEXT: addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: mv s0, a3
+; RV32IZFINXZDINX-NEXT: mv s1, a2
+; RV32IZFINXZDINX-NEXT: mv s2, a1
; RV32IZFINXZDINX-NEXT: mv s3, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 12
-; RV32IZFINXZDINX-NEXT: mv a0, a2
+; RV32IZFINXZDINX-NEXT: addi a1, a0, 12
+; RV32IZFINXZDINX-NEXT: mv a0, a4
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 16
-; RV32IZFINXZDINX-NEXT: mv a0, s2
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 8
+; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 4
; RV32IZFINXZDINX-NEXT: mv a0, s1
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 24
-; RV32IZFINXZDINX-NEXT: mv a0, s0
+; RV32IZFINXZDINX-NEXT: mv a0, s2
+; RV32IZFINXZDINX-NEXT: mv a1, s3
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: lw a0, 12(sp)
-; RV32IZFINXZDINX-NEXT: lw a1, 16(sp)
-; RV32IZFINXZDINX-NEXT: lw a2, 20(sp)
-; RV32IZFINXZDINX-NEXT: lw a3, 24(sp)
-; RV32IZFINXZDINX-NEXT: sw a0, 0(s3)
-; RV32IZFINXZDINX-NEXT: sw a1, 4(s3)
-; RV32IZFINXZDINX-NEXT: sw a2, 8(s3)
-; RV32IZFINXZDINX-NEXT: sw a3, 12(s3)
-; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 48
+; R...
[truncated]
|
continue; | ||
|
||
// Reached the FPNode (would result in a cycle). | ||
// OR Reached CALLSEQ_START (would result in nested call sequences). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needing to special case calls feels wrong. Can you just use reachesChainWithoutSideEffects?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, `reachesChainWithoutSideEffects()` does not do the check I need. It simply walks the chain, only allowing loads or token factors to be included (and it's not clear what the "dest" would be; it does not always need to be the entry node here).
I think this really is a special case. The issue comes from the stores being within a call sequence, which means if they're folded into the expansion, we'll get nested call sequences, which is illegal.
%sin = tail call float @llvm.sin.f32(float %x) | ||
%cos = tail call float @llvm.cos.f32(float %x) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if this is a codegen solution to an IR problem. These could have moved past the call
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I follow. In SDAG, these don't have any particular place (as they're not side-effecting nodes until after the expansion).
The test here shows that the two stores after the call to `@foo` can still be folded into the `@sincos` library call, even though they're both chained to the call to `@foo` (as they don't alias with each other).
for (SDValue Op : StoreNode->ops()) | ||
if (Op.getNode() != FPNode) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't you only want to visit the chain?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is probably overly cautious, but I want to make sure that there's no user of another value of the node that occurs as a predecessor of the store (via any value) to avoid creating cycles.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one request.
This adds a new helper `CanFoldStoreIntoFPLibCall()` to check that it is safe to fold a store into a node that will expand to a library call that takes output pointers. This requires checking for two (independent) properties:

1. The store is not within a CALLSEQ_START..CALLSEQ_END pair
   * If it is, the expansion would lead to nested call sequences (which is invalid)
2. The node does not appear as a predecessor to the store
   * If it does, attempting to merge the store into the call would result in a cycle in the DAG

These two properties are checked as part of the same traversal in `CanFoldStoreIntoFPLibCall()`.
e916271
to
c3ee9d0
Compare
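This adds a new helper `canFoldStoreIntoLibCallOutputPointers()` to check that it is safe to fold a store into a node that will expand to a library call that takes output pointers. This requires checking for two (independent) properties:

1. The store is not within a CALLSEQ_START..CALLSEQ_END pair
   * If it is, the expansion would lead to nested call sequences (which is invalid)
2. The node does not appear as a predecessor to the store
   * If it does, attempting to merge the store into the call would result in a cycle in the DAG

These two properties are checked as part of the same traversal in `canFoldStoreIntoLibCallOutputPointers()`.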