Skip to content

[SDAG][X86] Remove hack needed to avoid missing x87 FPU stack pops #128055

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Mar 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 0 additions & 16 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2635,22 +2635,6 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
Results.push_back(LoadResult);
}

if (CallRetResNo && !Node->hasAnyUseOfValue(*CallRetResNo)) {
// FIXME: Find a way to avoid updating the root. This is needed for x86,
// which uses a floating-point stack. If (for example) the node to be
// expanded has two results one floating-point which is returned by the
// call, and one integer result, returned via an output pointer. If only the
// integer result is used then the `CopyFromReg` for the FP result may be
// optimized out. This prevents an FP stack pop from being emitted for it.
// Setting the root like this ensures there will be a use of the
// `CopyFromReg` chain, and ensures the FP pop will be emitted.
SDValue NewRoot =
getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
setRoot(NewRoot);
// Ensure the new root is reachable from the results.
Results[0] = getMergeValues({Results[0], NewRoot}, DL);
}

return true;
}

Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6717,6 +6717,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Res);
return;
}
case X86ISD::POP_FROM_X87_REG: {
SDValue Chain = Node->getOperand(0);
Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
SDValue Glue;
if (Node->getNumValues() == 3)
Glue = Node->getOperand(2);
SDValue Copy =
CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
ReplaceNode(Node, Copy.getNode());
return;
}
}

SelectCode(Node);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35114,6 +35114,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVTTP2SIS_SAE)
NODE_NAME_CASE(CVTTP2UIS)
NODE_NAME_CASE(MCVTTP2UIS)
NODE_NAME_CASE(POP_FROM_X87_REG)
}
return nullptr;
#undef NODE_NAME_CASE
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ namespace llvm {
// marker instruction.
CALL_RVMARKER,

/// The same as ISD::CopyFromReg except that this node makes it explicit
/// that it may lower to an x87 FPU stack pop. Optimizations should be more
/// cautious when handling this node than a normal CopyFromReg to avoid
/// removing a required FPU stack pop. A key requirement is optimizations
/// should not optimize any users of a chain that contains a
/// POP_FROM_X87_REG to use a chain from a point earlier than the
/// POP_FROM_X87_REG (which may remove a required FPU stack pop).
POP_FROM_X87_REG,

/// X86 compare and logical compare instructions.
CMP,
FCMP,
Expand Down
21 changes: 17 additions & 4 deletions llvm/lib/Target/X86/X86ISelLoweringCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,15 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
return DAG.getBitcast(ValVT, ValReturned);
}

static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain,
const SDLoc &dl, Register Reg, EVT VT,
SDValue Glue) {
SDVTList VTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue};
return DAG.getNode(X86ISD::POP_FROM_X87_REG, dl, VTs,
ArrayRef(Ops, Glue.getNode() ? 3 : 2));
}

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
Expand Down Expand Up @@ -1145,8 +1154,8 @@ SDValue X86TargetLowering::LowerCallResult(
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1;
if (X87Result && isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
Expand All @@ -1160,8 +1169,12 @@ SDValue X86TargetLowering::LowerCallResult(
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
.getValue(1);
Chain =
X87Result
? getPopFromX87Reg(DAG, Chain, dl, VA.getLocReg(), CopyVT, InGlue)
.getValue(1)
: DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
.getValue(1);
Val = Chain.getValue(0);
InGlue = Chain.getValue(2);
}
Expand Down
105 changes: 105 additions & 0 deletions llvm/test/CodeGen/PowerPC/llvm.modf.ll
Original file line number Diff line number Diff line change
Expand Up @@ -328,3 +328,108 @@ define { ppc_fp128, ppc_fp128 } @test_modf_ppcf128(ppc_fp128 %a) {
%result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
ret { ppc_fp128, ppc_fp128 } %result
}

define ppc_fp128 @test_modf_ppcf128_only_use_intergral(ppc_fp128 %a) {
; CHECK-LABEL: test_modf_ppcf128_only_use_intergral:
; CHECK: # %bb.0:
; CHECK-NEXT: mflr r0
; CHECK-NEXT: stdu r1, -48(r1)
; CHECK-NEXT: std r0, 64(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset lr, 16
; CHECK-NEXT: addi r5, r1, 32
; CHECK-NEXT: bl modfl
; CHECK-NEXT: nop
; CHECK-NEXT: lfd f1, 32(r1)
; CHECK-NEXT: lfd f2, 40(r1)
; CHECK-NEXT: addi r1, r1, 48
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
%result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
ret ppc_fp128 %result.1
}

define ppc_fp128 @test_modf_ppcf128_only_use_fractional(ppc_fp128 %a) {
; CHECK-LABEL: test_modf_ppcf128_only_use_fractional:
; CHECK: # %bb.0:
; CHECK-NEXT: mflr r0
; CHECK-NEXT: stdu r1, -48(r1)
; CHECK-NEXT: std r0, 64(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset lr, 16
; CHECK-NEXT: addi r5, r1, 32
; CHECK-NEXT: bl modfl
; CHECK-NEXT: nop
; CHECK-NEXT: addi r1, r1, 48
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
%result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
ret ppc_fp128 %result.1
}

define { ppc_fp128, ppc_fp128 } @test_modf_ppcf128_tail_call(ppc_fp128 %a) {
; CHECK-LABEL: test_modf_ppcf128_tail_call:
; CHECK: # %bb.0:
; CHECK-NEXT: mflr r0
; CHECK-NEXT: stdu r1, -48(r1)
; CHECK-NEXT: std r0, 64(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset lr, 16
; CHECK-NEXT: addi r5, r1, 32
; CHECK-NEXT: bl modfl
; CHECK-NEXT: nop
; CHECK-NEXT: lfd f3, 32(r1)
; CHECK-NEXT: lfd f4, 40(r1)
; CHECK-NEXT: addi r1, r1, 48
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
%result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
ret { ppc_fp128, ppc_fp128 } %result
}

define ppc_fp128 @test_modf_ppcf128_only_use_intergral_tail_call(ppc_fp128 %a) {
; CHECK-LABEL: test_modf_ppcf128_only_use_intergral_tail_call:
; CHECK: # %bb.0:
; CHECK-NEXT: mflr r0
; CHECK-NEXT: stdu r1, -48(r1)
; CHECK-NEXT: std r0, 64(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset lr, 16
; CHECK-NEXT: addi r5, r1, 32
; CHECK-NEXT: bl modfl
; CHECK-NEXT: nop
; CHECK-NEXT: lfd f1, 32(r1)
; CHECK-NEXT: lfd f2, 40(r1)
; CHECK-NEXT: addi r1, r1, 48
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
%result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
ret ppc_fp128 %result.1
}

define ppc_fp128 @test_modf_ppcf128_only_use_fractional_tail_call(ppc_fp128 %a) {
; CHECK-LABEL: test_modf_ppcf128_only_use_fractional_tail_call:
; CHECK: # %bb.0:
; CHECK-NEXT: mflr r0
; CHECK-NEXT: stdu r1, -48(r1)
; CHECK-NEXT: std r0, 64(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset lr, 16
; CHECK-NEXT: addi r5, r1, 32
; CHECK-NEXT: bl modfl
; CHECK-NEXT: nop
; CHECK-NEXT: addi r1, r1, 48
; CHECK-NEXT: ld r0, 16(r1)
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
%result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
ret ppc_fp128 %result.1
}