-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[SDAG][X86] Remove hack needed to avoid missing x87 FPU stack pops #128055
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-support @llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) ChangesThis patch adds the missing The fix for If a node (two-result) node like If only the second result is used after the expansion, we hit an issue on targets like x87:
Expanded result:
So far things are already, but the DAGCombiner optimizes this to:
On most targets this is fine. The optimized out The prior solution for this was to attach the chain from the
This prevents optimizations from removing the Patch is 44.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128055.diff 11 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 0244c170a2123..b6fff948f89d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -132,6 +132,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::STRICT_FLDEXP: R = SoftenFloatRes_ExpOp(N); break;
case ISD::FFREXP: R = SoftenFloatRes_FFREXP(N); break;
case ISD::FSINCOS: R = SoftenFloatRes_FSINCOS(N); break;
+ case ISD::FMODF: R = SoftenFloatRes_FMODF(N); break;
case ISD::STRICT_FREM:
case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
case ISD::STRICT_FRINT:
@@ -791,27 +792,35 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) {
return ReturnVal;
}
-SDValue
-DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults(SDNode *N,
- RTLIB::Libcall LC) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults(
+ SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo) {
assert(!N->isStrictFPOpcode() && "strictfp not implemented");
EVT VT = N->getValueType(0);
+ assert(VT == N->getValueType(1) &&
+ "expected both return values to have the same type");
+
if (!TLI.getLibcallName(LC))
return SDValue();
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- SDValue FirstResultSlot = DAG.CreateStackTemporary(NVT);
- SDValue SecondResultSlot = DAG.CreateStackTemporary(NVT);
SDLoc DL(N);
- TargetLowering::MakeLibCallOptions CallOptions;
- std::array Ops{GetSoftenedFloat(N->getOperand(0)), FirstResultSlot,
- SecondResultSlot};
- std::array OpsVT{VT, FirstResultSlot.getValueType(),
- SecondResultSlot.getValueType()};
+ SmallVector<SDValue, 3> Ops = {GetSoftenedFloat(N->getOperand(0))};
+ SmallVector<EVT, 3> OpsVT = {VT};
+
+ std::array<SDValue, 2> StackSlots;
+ for (auto [ResNum, _] : enumerate(N->values())) {
+ if (ResNum == CallRetResNo)
+ continue;
+ SDValue StackSlot = DAG.CreateStackTemporary(NVT);
+ Ops.push_back(StackSlot);
+ OpsVT.push_back(StackSlot.getValueType());
+ StackSlots[ResNum] = StackSlot;
+ }
+ TargetLowering::MakeLibCallOptions CallOptions;
// TODO: setTypeListBeforeSoften can't properly express multiple return types,
// but since both returns have the same type it should be okay.
CallOptions.setTypeListBeforeSoften({OpsVT}, VT, true);
@@ -825,8 +834,14 @@ DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults(SDNode *N,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
return DAG.getLoad(NVT, DL, Chain, StackSlot, PtrInfo);
};
- SetSoftenedFloat(SDValue(N, 0), CreateStackLoad(FirstResultSlot));
- SetSoftenedFloat(SDValue(N, 1), CreateStackLoad(SecondResultSlot));
+
+ for (auto [ResNum, SlackSlot] : enumerate(StackSlots)) {
+ if (CallRetResNo == ResNum) {
+ SetSoftenedFloat(SDValue(N, ResNum), ReturnVal);
+ continue;
+ }
+ SetSoftenedFloat(SDValue(N, ResNum), CreateStackLoad(SlackSlot));
+ }
return SDValue();
}
@@ -836,6 +851,11 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSINCOS(SDNode *N) {
N, RTLIB::getSINCOS(N->getValueType(0)));
}
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMODF(SDNode *N) {
+ return SoftenFloatRes_UnaryWithTwoFPResults(
+ N, RTLIB::getMODF(N->getValueType(0)), /*CallRetResNo=*/0);
+}
+
SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
RTLIB::REM_F32,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cac969f7e2185..5cbec5cf409cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -562,7 +562,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
// Convert Float Results to Integer.
void SoftenFloatResult(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC);
- SDValue SoftenFloatRes_UnaryWithTwoFPResults(SDNode *N, RTLIB::Libcall LC);
+ SDValue SoftenFloatRes_UnaryWithTwoFPResults(
+ SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo = {});
SDValue SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC);
SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_ARITH_FENCE(SDNode *N);
@@ -608,6 +609,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue SoftenFloatRes_ExpOp(SDNode *N);
SDValue SoftenFloatRes_FFREXP(SDNode *N);
SDValue SoftenFloatRes_FSINCOS(SDNode *N);
+ SDValue SoftenFloatRes_FMODF(SDNode *N);
SDValue SoftenFloatRes_FREEZE(SDNode *N);
SDValue SoftenFloatRes_FREM(SDNode *N);
SDValue SoftenFloatRes_FRINT(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a3210a10d394..6dadb0a46528b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2616,6 +2616,20 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
auto [Call, CallChain] = TLI->LowerCallTo(CLI);
+ if (CallRetResNo && !Node->hasAnyUseOfValue(*CallRetResNo)) {
+ // FIXME: This is needed for x87, which uses a floating-point stack. If (for
+ // example) the node to be expanded has two results one floating-point which
+ // is returned by the call, and one integer result, returned via an output
+ // pointer. If only the integer result is used then the `CopyFromReg` for
+ // the FP result may be optimized out. This prevents an FP stack pop from
+ // being emitted for it. The `FAKE_USE` node prevents optimizations from
+ // removing the `CopyFromReg` from the chain, and ensures the FP pop will be
+ // emitted. Note: We use an undef pointer as the argument to prevent keeping
+ // any real values live longer than we need to.
+ CallChain = getNode(ISD::FAKE_USE, DL, MVT::Other, CallChain,
+ getUNDEF(TLI->getPointerTy(getDataLayout())));
+ }
+
for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
if (ResNo == CallRetResNo) {
Results.push_back(Call);
@@ -2635,22 +2649,6 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
Results.push_back(LoadResult);
}
- if (CallRetResNo && !Node->hasAnyUseOfValue(*CallRetResNo)) {
- // FIXME: Find a way to avoid updating the root. This is needed for x86,
- // which uses a floating-point stack. If (for example) the node to be
- // expanded has two results one floating-point which is returned by the
- // call, and one integer result, returned via an output pointer. If only the
- // integer result is used then the `CopyFromReg` for the FP result may be
- // optimized out. This prevents an FP stack pop from being emitted for it.
- // Setting the root like this ensures there will be a use of the
- // `CopyFromReg` chain, and ensures the FP pop will be emitted.
- SDValue NewRoot =
- getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
- setRoot(NewRoot);
- // Ensure the new root is reachable from the results.
- Results[0] = getMergeValues({Results[0], NewRoot}, DL);
- }
-
return true;
}
diff --git a/llvm/test/CodeGen/AArch64/llvm.modf.ll b/llvm/test/CodeGen/AArch64/llvm.modf.ll
index 41fe796daca86..ad746c6aef4fe 100644
--- a/llvm/test/CodeGen/AArch64/llvm.modf.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.modf.ll
@@ -45,6 +45,7 @@ define half @test_modf_f16_only_use_integral_part(half %a) {
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: add x0, sp, #12
; CHECK-NEXT: bl modff
+; CHECK-NEXT: // fake_use: $x0
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/ARM/llvm.frexp.ll b/llvm/test/CodeGen/ARM/llvm.frexp.ll
index e79ddbe93336e..179467120687d 100644
--- a/llvm/test/CodeGen/ARM/llvm.frexp.ll
+++ b/llvm/test/CodeGen/ARM/llvm.frexp.ll
@@ -41,6 +41,7 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
; CHECK-NEXT: bl __gnu_h2f_ieee
; CHECK-NEXT: add r1, sp, #4
; CHECK-NEXT: bl frexpf
+; CHECK-NEXT: @ fake_use: $r0
; CHECK-NEXT: ldr r0, [sp, #4]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop {r7, pc}
@@ -132,6 +133,8 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
; CHECK-NEXT: mov r1, r4
; CHECK-NEXT: bl frexpf
; CHECK-NEXT: vld1.32 {d16[0]}, [r5:32]
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: @ fake_use: $r0
; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: add sp, #8
@@ -190,6 +193,7 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: add r1, sp, #4
; CHECK-NEXT: bl frexpf
+; CHECK-NEXT: @ fake_use: $r0
; CHECK-NEXT: ldr r0, [sp, #4]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop {r7, pc}
@@ -265,6 +269,8 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
; CHECK-NEXT: mov r1, r5
; CHECK-NEXT: bl frexpf
; CHECK-NEXT: vld1.32 {d16[0]}, [r4:32]
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: @ fake_use: $r0
; CHECK-NEXT: vld1.32 {d16[1]}, [r5:32]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: add sp, #8
@@ -376,8 +382,13 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) {
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl frexpf
-; CHECK-NEXT: ldrd r1, r0, [sp, #8]
-; CHECK-NEXT: ldrd r3, r2, [sp], #16
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: ldr r0, [sp, #12]
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: ldrd r2, r1, [sp, #4]
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: ldr r3, [sp], #16
; CHECK-NEXT: pop {r4, r5, r6, pc}
%result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a)
%result.1 = extractvalue { <4 x float>, <4 x i32> } %result, 1
@@ -419,6 +430,7 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: add r2, sp, #4
; CHECK-NEXT: bl frexp
+; CHECK-NEXT: @ fake_use: $r0
; CHECK-NEXT: ldr r0, [sp, #4]
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop {r7, pc}
@@ -498,6 +510,8 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
; CHECK-NEXT: mov r2, r7
; CHECK-NEXT: bl frexp
; CHECK-NEXT: vld1.32 {d16[0]}, [r6:32]
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: @ fake_use: $r0
; CHECK-NEXT: vld1.32 {d16[1]}, [r7:32]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: add sp, #12
diff --git a/llvm/test/CodeGen/ARM/llvm.modf.ll b/llvm/test/CodeGen/ARM/llvm.modf.ll
new file mode 100644
index 0000000000000..39996cd55772a
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/llvm.modf.ll
@@ -0,0 +1,507 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=armv6m < %s | FileCheck %s --check-prefix=THUMB
+
+define { half, half } @test_modf_f16(half %a) {
+; CHECK-LABEL: test_modf_f16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: bl __gnu_h2f_ieee
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: ldr r0, [sp, #4]
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: mov r1, r0
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: pop {r4, pc}
+;
+; THUMB-LABEL: test_modf_f16:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: push {r4, lr}
+; THUMB-NEXT: sub sp, #8
+; THUMB-NEXT: uxth r0, r0
+; THUMB-NEXT: bl __gnu_h2f_ieee
+; THUMB-NEXT: add r1, sp, #4
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: mov r4, r0
+; THUMB-NEXT: ldr r0, [sp, #4]
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: mov r1, r0
+; THUMB-NEXT: mov r0, r4
+; THUMB-NEXT: add sp, #8
+; THUMB-NEXT: pop {r4, pc}
+ %result = call { half, half } @llvm.modf.f16(half %a)
+ ret { half, half } %result
+}
+
+define half @test_modf_f16_only_use_fractional_part(half %a) {
+; CHECK-LABEL: test_modf_f16_only_use_fractional_part:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: bl __gnu_h2f_ieee
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: pop {r7, pc}
+;
+; THUMB-LABEL: test_modf_f16_only_use_fractional_part:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: push {r7, lr}
+; THUMB-NEXT: sub sp, #8
+; THUMB-NEXT: uxth r0, r0
+; THUMB-NEXT: bl __gnu_h2f_ieee
+; THUMB-NEXT: add r1, sp, #4
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: add sp, #8
+; THUMB-NEXT: pop {r7, pc}
+ %result = call { half, half } @llvm.modf.f16(half %a)
+ %result.0 = extractvalue { half, half } %result, 0
+ ret half %result.0
+}
+
+define half @test_modf_f16_only_use_integral_part(half %a) {
+; CHECK-LABEL: test_modf_f16_only_use_integral_part:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: bl __gnu_h2f_ieee
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: @ fake_use: $r0
+; CHECK-NEXT: ldr r0, [sp, #4]
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: pop {r7, pc}
+;
+; THUMB-LABEL: test_modf_f16_only_use_integral_part:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: push {r7, lr}
+; THUMB-NEXT: sub sp, #8
+; THUMB-NEXT: uxth r0, r0
+; THUMB-NEXT: bl __gnu_h2f_ieee
+; THUMB-NEXT: add r1, sp, #4
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: ldr r0, [sp, #4]
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: add sp, #8
+; THUMB-NEXT: pop {r7, pc}
+ %result = call { half, half } @llvm.modf.f16(half %a)
+ %result.1 = extractvalue { half, half } %result, 1
+ ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_modf_v2f16(<2 x half> %a) {
+; CHECK-LABEL: test_modf_v2f16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: vpush {d8}
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bl __gnu_h2f_ieee
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: strh.w r0, [sp, #14]
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: bl __gnu_h2f_ieee
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: strh.w r0, [sp, #12]
+; CHECK-NEXT: add r0, sp, #12
+; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32]
+; CHECK-NEXT: ldr r0, [sp, #4]
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: ldr r1, [sp]
+; CHECK-NEXT: strh.w r0, [sp, #10]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: strh.w r0, [sp, #8]
+; CHECK-NEXT: add r0, sp, #8
+; CHECK-NEXT: vmovl.u16 q9, d8
+; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT: vmovl.u16 q8, d16
+; CHECK-NEXT: vmov.32 r0, d18[0]
+; CHECK-NEXT: vmov.32 r1, d18[1]
+; CHECK-NEXT: vmov.32 r2, d16[0]
+; CHECK-NEXT: vmov.32 r3, d16[1]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8}
+; CHECK-NEXT: pop {r4, pc}
+;
+; THUMB-LABEL: test_modf_v2f16:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: push {r4, r5, r6, lr}
+; THUMB-NEXT: sub sp, #8
+; THUMB-NEXT: mov r5, r1
+; THUMB-NEXT: uxth r0, r0
+; THUMB-NEXT: bl __gnu_h2f_ieee
+; THUMB-NEXT: mov r1, sp
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: mov r4, r0
+; THUMB-NEXT: uxth r0, r5
+; THUMB-NEXT: bl __gnu_h2f_ieee
+; THUMB-NEXT: add r1, sp, #4
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: mov r5, r0
+; THUMB-NEXT: ldr r0, [sp]
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: mov r6, r0
+; THUMB-NEXT: ldr r0, [sp, #4]
+; THUMB-NEXT: bl __gnu_f2h_ieee
+; THUMB-NEXT: mov r3, r0
+; THUMB-NEXT: mov r0, r4
+; THUMB-NEXT: mov r1, r5
+; THUMB-NEXT: mov r2, r6
+; THUMB-NEXT: add sp, #8
+; THUMB-NEXT: pop {r4, r5, r6, pc}
+ %result = call { <2 x half>, <2 x half> } @llvm.modf.v2f16(<2 x half> %a)
+ ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_modf_f32(float %a) {
+; CHECK-LABEL: test_modf_f32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: ldr r1, [sp, #4]
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: pop {r7, pc}
+;
+; THUMB-LABEL: test_modf_f32:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: push {r7, lr}
+; THUMB-NEXT: sub sp, #8
+; THUMB-NEXT: add r1, sp, #4
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: ldr r1, [sp, #4]
+; THUMB-NEXT: add sp, #8
+; THUMB-NEXT: pop {r7, pc}
+ %result = call { float, float } @llvm.modf.f32(float %a)
+ ret { float, float } %result
+}
+
+define { <3 x float>, <3 x float> } @test_modf_v3f32(<3 x float> %a) {
+; CHECK-LABEL: test_modf_v3f32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vldr d9, [sp, #40]
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: vmov d8, r2, r3
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vldmia sp, {s0, s1}
+; CHECK-NEXT: add.w r1, r4, #16
+; CHECK-NEXT: vst1.32 {d0}, [r1:64]!
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: vmov s1, r5
+; CHECK-NEXT: vmov s0, r6
+; CHECK-NEXT: vst1.32 {d0}, [r4:64]!
+; CHECK-NEXT: str r0, [r4]
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+;
+; THUMB-LABEL: test_modf_v3f32:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: push {r4, r5, r6, r7, lr}
+; THUMB-NEXT: sub sp, #12
+; THUMB-NEXT: mov r7, r3
+; THUMB-NEXT: mov r5, r2
+; THUMB-NEXT: mov r4, r0
+; THUMB-NEXT: ldr r0, [sp, #32]
+; THUMB-NEXT: add r1, sp, #8
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: mov r6, r0
+; THUMB-NEXT: ldr r0, [sp, #8]
+; THUMB-NEXT: str r0, [r4, #24]
+; THUMB-NEXT: add r1, sp, #4
+; THUMB-NEXT: mov r0, r7
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: mov r7, r0
+; THUMB-NEXT: ldr r0, [sp, #4]
+; THUMB-NEXT: str r0, [r4, #20]
+; THUMB-NEXT: mov r1, sp
+; THUMB-NEXT: mov r0, r5
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: ldr r1, [sp]
+; THUMB-NEXT: str r1, [r4, #16]
+; THUMB-NEXT: stm r4!, {r0, r7}
+; THUMB-NEXT: str r6, [r4]
+; THUMB-NEXT: add sp, #12
+; THUMB-NEXT: pop {r4, r5, r6, r7, pc}
+ %result = call { <3 x float>, <3 x float> } @llvm.modf.v3f32(<3 x float> %a)
+ ret { <3 x float>, <3 x float> } %result
+}
+
+define { <2 x float>, <2 x float> } @test_modf_v2f32(<2 x float> %a) {
+; CHECK-LABEL: test_modf_v2f32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: vpush {d8}
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: vmov d8, r0, r1
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: bl modff
+; CHECK-NEXT: vldr s1, [sp]
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: vldr s0, [sp, #4]
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8}
+; CHECK-NEXT: pop {r4, pc}
+;
+; THUMB-LABEL: test_modf_v2f32:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: push {r4, r5, r7, lr}
+; THUMB-NEXT: sub sp, #8
+; THUMB-NEXT: mov r4, r1
+; THUMB-NEXT: mov r1, sp
+; THUMB-NEXT: bl modff
+; THUMB-NEXT: mov r5, r0
+; THUMB-NEXT: add r1, sp, #4
+; THUMB-NEXT: mo...
[truncated]
|
This is an alternate solution for #127976 |
✅ With the latest revision this PR passed the undef deprecator. |
The "undef deprecator" is triggered by mentioning the word "undef" in comments :/ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There should be no CHAIN_BARRIER machineinstrs. This is missing necessary support code for it, but it should not be necessary. The chain is a pure DAG concept. I haven't considered the DAG CHAIN_BARRIER yet
Ah right 👍 I've updated |
Now that I've removed the |
Kind ping :) |
Moving to new PR
Change-Id: I669316d89c461f410c9cfdf6317860837c50e597
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…c" (#129885)" This broke modff calls on 32-bit x86 Windows. See comment on the PR. > This updates the existing modf[f|l] builtin to be lowered via the > llvm.modf.* intrinsic (rather than directly to a library call). > > The legalization issues exposed by the original PR (#126750) should have > been fixed in #128055 and #129264. This reverts commit cd1d9a8.
…lvm#128055) If a (two-result) node like `FMODF` or `FFREXP` is expanded to a library call, where said library has the function prototype like: `float(float, float*)` -- that is it returns a float from the call and via an output pointer. The first result of the node maps to the value returned by value and the second result maps to the value returned via the output pointer. If only the second result is used after the expansion, we hit an issue on x87 targets: ``` // Before expansion: t0, t1 = fmodf x return t1 // t0 is unused ``` Expanded result: ``` ptr = alloca ch0 = call modf ptr t0, ch1 = copy_from_reg, ch0 // t0 unused t1, ch2 = ldr ptr, ch1 return t1 ``` So far things are alright, but the DAGCombiner optimizes this to: ``` ptr = alloca ch0 = call modf ptr // copy_from_reg optimized out t1, ch1 = ldr ptr, ch0 return t1 ``` On most targets this is fine. The optimized out `copy_from_reg` is unused and is a NOP. However, x87 uses a floating-point stack, and if the `copy_from_reg` is optimized out it won't emit a pop needed to remove the unused result. The prior solution for this was to attach the chain from the `copy_from_reg` to the root, which did work, however, the root is not always available (it's set to null during legalize types). So the alternate solution in this patch is to replace the `copy_from_reg` with an `X86ISD::POP_FROM_X87_REG` within the X86 call lowering. This node is the same as `copy_from_reg` except this node makes it explicit that it may lower to an x87 FPU stack pop. Optimizations should be more cautious when handling this node than a normal CopyFromReg to avoid removing a required FPU stack pop. ``` ptr = alloca ch0 = call modf ptr t0, ch1 = pop_from_x87_reg, ch0 // t0 unused t1, ch2 = ldr ptr, ch1 return t1 ``` Using this node ensures a required x87 FPU pop is not removed due to the DAGCombiner. This is an alternate solution for llvm#127976.
…#129885) Reverts llvm#127987 Original description: This updates the existing modf[f|l] builtin to be lowered via the llvm.modf.* intrinsic (rather than directly to a library call). The legalization issues exposed by the original PR (llvm#126750) should have been fixed in llvm#128055 and llvm#129264.
If a (two-result) node like
FMODF
orFFREXP
is expanded to a library call, where said library has the function prototype like:float(float, float*)
-- that is it returns a float from the call and via an output pointer. The first result of the node maps to the value returned by value and the second result maps to the value returned via the output pointer.If only the second result is used after the expansion, we hit an issue on x87 targets:
Expanded result:
So far things are alright, but the DAGCombiner optimizes this to:
On most targets this is fine. The optimized out
copy_from_reg
is unused and is a NOP. However, x87 uses a floating-point stack, and if thecopy_from_reg
is optimized out it won't emit a pop needed to remove the unused result.The prior solution for this was to attach the chain from the
copy_from_reg
to the root, which did work, however, the root is not always available (it's set to null during legalize types). So the alternate solution in this patch is to replace thecopy_from_reg
with anX86ISD::POP_FROM_X87_REG
within the X86 call lowering. This node is the same ascopy_from_reg
except this node makes it explicit that it may lower to an x87 FPU stack pop. Optimizations should be more cautious when handling this node than a normal CopyFromReg to avoid removing a required FPU stack pop.Using this node ensures a required x87 FPU pop is not removed due to the DAGCombiner.
This is an alternate solution for #127976.