Skip to content

Commit 55fdecc

Browse files
authored
[SDAG][X86] Remove hack needed to avoid missing x87 FPU stack pops (#128055)
If a (two-result) node like `FMODF` or `FFREXP` is expanded to a library call, where said library has the function prototype like: `float(float, float*)` -- that is it returns a float from the call and via an output pointer. The first result of the node maps to the value returned by value and the second result maps to the value returned via the output pointer. If only the second result is used after the expansion, we hit an issue on x87 targets: ``` // Before expansion: t0, t1 = fmodf x return t1 // t0 is unused ``` Expanded result: ``` ptr = alloca ch0 = call modf ptr t0, ch1 = copy_from_reg, ch0 // t0 unused t1, ch2 = ldr ptr, ch1 return t1 ``` So far things are alright, but the DAGCombiner optimizes this to: ``` ptr = alloca ch0 = call modf ptr // copy_from_reg optimized out t1, ch1 = ldr ptr, ch0 return t1 ``` On most targets this is fine. The optimized out `copy_from_reg` is unused and is a NOP. However, x87 uses a floating-point stack, and if the `copy_from_reg` is optimized out it won't emit a pop needed to remove the unused result. The prior solution for this was to attach the chain from the `copy_from_reg` to the root, which did work, however, the root is not always available (it's set to null during legalize types). So the alternate solution in this patch is to replace the `copy_from_reg` with an `X86ISD::POP_FROM_X87_REG` within the X86 call lowering. This node is the same as `copy_from_reg` except this node makes it explicit that it may lower to an x87 FPU stack pop. Optimizations should be more cautious when handling this node than a normal CopyFromReg to avoid removing a required FPU stack pop. ``` ptr = alloca ch0 = call modf ptr t0, ch1 = pop_from_x87_reg, ch0 // t0 unused t1, ch2 = ldr ptr, ch1 return t1 ``` Using this node ensures a required x87 FPU pop is not removed due to the DAGCombiner. This is an alternate solution for #127976.
1 parent 742fa8a commit 55fdecc

File tree

6 files changed

+143
-20
lines changed

6 files changed

+143
-20
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2635,22 +2635,6 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
26352635
Results.push_back(LoadResult);
26362636
}
26372637

2638-
if (CallRetResNo && !Node->hasAnyUseOfValue(*CallRetResNo)) {
2639-
// FIXME: Find a way to avoid updating the root. This is needed for x86,
2640-
// which uses a floating-point stack. If (for example) the node to be
2641-
// expanded has two results one floating-point which is returned by the
2642-
// call, and one integer result, returned via an output pointer. If only the
2643-
// integer result is used then the `CopyFromReg` for the FP result may be
2644-
// optimized out. This prevents an FP stack pop from being emitted for it.
2645-
// Setting the root like this ensures there will be a use of the
2646-
// `CopyFromReg` chain, and ensures the FP pop will be emitted.
2647-
SDValue NewRoot =
2648-
getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
2649-
setRoot(NewRoot);
2650-
// Ensure the new root is reachable from the results.
2651-
Results[0] = getMergeValues({Results[0], NewRoot}, DL);
2652-
}
2653-
26542638
return true;
26552639
}
26562640

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6717,6 +6717,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
67176717
ReplaceNode(Node, Res);
67186718
return;
67196719
}
6720+
case X86ISD::POP_FROM_X87_REG: {
6721+
SDValue Chain = Node->getOperand(0);
6722+
Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
6723+
SDValue Glue;
6724+
if (Node->getNumValues() == 3)
6725+
Glue = Node->getOperand(2);
6726+
SDValue Copy =
6727+
CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
6728+
ReplaceNode(Node, Copy.getNode());
6729+
return;
6730+
}
67206731
}
67216732

67226733
SelectCode(Node);

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35114,6 +35114,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3511435114
NODE_NAME_CASE(CVTTP2SIS_SAE)
3511535115
NODE_NAME_CASE(CVTTP2UIS)
3511635116
NODE_NAME_CASE(MCVTTP2UIS)
35117+
NODE_NAME_CASE(POP_FROM_X87_REG)
3511735118
}
3511835119
return nullptr;
3511935120
#undef NODE_NAME_CASE

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@ namespace llvm {
8181
// marker instruction.
8282
CALL_RVMARKER,
8383

84+
/// The same as ISD::CopyFromReg except that this node makes it explicit
85+
/// that it may lower to an x87 FPU stack pop. Optimizations should be more
86+
/// cautious when handling this node than a normal CopyFromReg to avoid
87+
/// removing a required FPU stack pop. A key requirement is optimizations
88+
/// should not optimize any users of a chain that contains a
89+
/// POP_FROM_X87_REG to use a chain from a point earlier than the
90+
/// POP_FROM_X87_REG (which may remove a required FPU stack pop).
91+
POP_FROM_X87_REG,
92+
8493
/// X86 compare and logical compare instructions.
8594
CMP,
8695
FCMP,

llvm/lib/Target/X86/X86ISelLoweringCall.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1095,6 +1095,15 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
10951095
return DAG.getBitcast(ValVT, ValReturned);
10961096
}
10971097

1098+
static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain,
1099+
const SDLoc &dl, Register Reg, EVT VT,
1100+
SDValue Glue) {
1101+
SDVTList VTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
1102+
SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue};
1103+
return DAG.getNode(X86ISD::POP_FROM_X87_REG, dl, VTs,
1104+
ArrayRef(Ops, Glue.getNode() ? 3 : 2));
1105+
}
1106+
10981107
/// Lower the result values of a call into the
10991108
/// appropriate copies out of appropriate physical registers.
11001109
///
@@ -1145,8 +1154,8 @@ SDValue X86TargetLowering::LowerCallResult(
11451154
// If we prefer to use the value in xmm registers, copy it out as f80 and
11461155
// use a truncate to move it from fp stack reg to xmm reg.
11471156
bool RoundAfterCopy = false;
1148-
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
1149-
isScalarFPTypeInSSEReg(VA.getValVT())) {
1157+
bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1;
1158+
if (X87Result && isScalarFPTypeInSSEReg(VA.getValVT())) {
11501159
if (!Subtarget.hasX87())
11511160
report_fatal_error("X87 register return with X87 disabled");
11521161
CopyVT = MVT::f80;
@@ -1160,8 +1169,12 @@ SDValue X86TargetLowering::LowerCallResult(
11601169
Val =
11611170
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
11621171
} else {
1163-
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1164-
.getValue(1);
1172+
Chain =
1173+
X87Result
1174+
? getPopFromX87Reg(DAG, Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1175+
.getValue(1)
1176+
: DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1177+
.getValue(1);
11651178
Val = Chain.getValue(0);
11661179
InGlue = Chain.getValue(2);
11671180
}

llvm/test/CodeGen/PowerPC/llvm.modf.ll

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,3 +328,108 @@ define { ppc_fp128, ppc_fp128 } @test_modf_ppcf128(ppc_fp128 %a) {
328328
%result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
329329
ret { ppc_fp128, ppc_fp128 } %result
330330
}
331+
332+
define ppc_fp128 @test_modf_ppcf128_only_use_intergral(ppc_fp128 %a) {
333+
; CHECK-LABEL: test_modf_ppcf128_only_use_intergral:
334+
; CHECK: # %bb.0:
335+
; CHECK-NEXT: mflr r0
336+
; CHECK-NEXT: stdu r1, -48(r1)
337+
; CHECK-NEXT: std r0, 64(r1)
338+
; CHECK-NEXT: .cfi_def_cfa_offset 48
339+
; CHECK-NEXT: .cfi_offset lr, 16
340+
; CHECK-NEXT: addi r5, r1, 32
341+
; CHECK-NEXT: bl modfl
342+
; CHECK-NEXT: nop
343+
; CHECK-NEXT: lfd f1, 32(r1)
344+
; CHECK-NEXT: lfd f2, 40(r1)
345+
; CHECK-NEXT: addi r1, r1, 48
346+
; CHECK-NEXT: ld r0, 16(r1)
347+
; CHECK-NEXT: mtlr r0
348+
; CHECK-NEXT: blr
349+
%result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
350+
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
351+
ret ppc_fp128 %result.1
352+
}
353+
354+
define ppc_fp128 @test_modf_ppcf128_only_use_fractional(ppc_fp128 %a) {
355+
; CHECK-LABEL: test_modf_ppcf128_only_use_fractional:
356+
; CHECK: # %bb.0:
357+
; CHECK-NEXT: mflr r0
358+
; CHECK-NEXT: stdu r1, -48(r1)
359+
; CHECK-NEXT: std r0, 64(r1)
360+
; CHECK-NEXT: .cfi_def_cfa_offset 48
361+
; CHECK-NEXT: .cfi_offset lr, 16
362+
; CHECK-NEXT: addi r5, r1, 32
363+
; CHECK-NEXT: bl modfl
364+
; CHECK-NEXT: nop
365+
; CHECK-NEXT: addi r1, r1, 48
366+
; CHECK-NEXT: ld r0, 16(r1)
367+
; CHECK-NEXT: mtlr r0
368+
; CHECK-NEXT: blr
369+
%result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
370+
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
371+
ret ppc_fp128 %result.1
372+
}
373+
374+
define { ppc_fp128, ppc_fp128 } @test_modf_ppcf128_tail_call(ppc_fp128 %a) {
375+
; CHECK-LABEL: test_modf_ppcf128_tail_call:
376+
; CHECK: # %bb.0:
377+
; CHECK-NEXT: mflr r0
378+
; CHECK-NEXT: stdu r1, -48(r1)
379+
; CHECK-NEXT: std r0, 64(r1)
380+
; CHECK-NEXT: .cfi_def_cfa_offset 48
381+
; CHECK-NEXT: .cfi_offset lr, 16
382+
; CHECK-NEXT: addi r5, r1, 32
383+
; CHECK-NEXT: bl modfl
384+
; CHECK-NEXT: nop
385+
; CHECK-NEXT: lfd f3, 32(r1)
386+
; CHECK-NEXT: lfd f4, 40(r1)
387+
; CHECK-NEXT: addi r1, r1, 48
388+
; CHECK-NEXT: ld r0, 16(r1)
389+
; CHECK-NEXT: mtlr r0
390+
; CHECK-NEXT: blr
391+
%result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
392+
ret { ppc_fp128, ppc_fp128 } %result
393+
}
394+
395+
define ppc_fp128 @test_modf_ppcf128_only_use_intergral_tail_call(ppc_fp128 %a) {
396+
; CHECK-LABEL: test_modf_ppcf128_only_use_intergral_tail_call:
397+
; CHECK: # %bb.0:
398+
; CHECK-NEXT: mflr r0
399+
; CHECK-NEXT: stdu r1, -48(r1)
400+
; CHECK-NEXT: std r0, 64(r1)
401+
; CHECK-NEXT: .cfi_def_cfa_offset 48
402+
; CHECK-NEXT: .cfi_offset lr, 16
403+
; CHECK-NEXT: addi r5, r1, 32
404+
; CHECK-NEXT: bl modfl
405+
; CHECK-NEXT: nop
406+
; CHECK-NEXT: lfd f1, 32(r1)
407+
; CHECK-NEXT: lfd f2, 40(r1)
408+
; CHECK-NEXT: addi r1, r1, 48
409+
; CHECK-NEXT: ld r0, 16(r1)
410+
; CHECK-NEXT: mtlr r0
411+
; CHECK-NEXT: blr
412+
%result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
413+
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
414+
ret ppc_fp128 %result.1
415+
}
416+
417+
define ppc_fp128 @test_modf_ppcf128_only_use_fractional_tail_call(ppc_fp128 %a) {
418+
; CHECK-LABEL: test_modf_ppcf128_only_use_fractional_tail_call:
419+
; CHECK: # %bb.0:
420+
; CHECK-NEXT: mflr r0
421+
; CHECK-NEXT: stdu r1, -48(r1)
422+
; CHECK-NEXT: std r0, 64(r1)
423+
; CHECK-NEXT: .cfi_def_cfa_offset 48
424+
; CHECK-NEXT: .cfi_offset lr, 16
425+
; CHECK-NEXT: addi r5, r1, 32
426+
; CHECK-NEXT: bl modfl
427+
; CHECK-NEXT: nop
428+
; CHECK-NEXT: addi r1, r1, 48
429+
; CHECK-NEXT: ld r0, 16(r1)
430+
; CHECK-NEXT: mtlr r0
431+
; CHECK-NEXT: blr
432+
%result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
433+
%result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
434+
ret ppc_fp128 %result.1
435+
}

0 commit comments

Comments
 (0)