-
Notifications
You must be signed in to change notification settings - Fork 13.6k
DAG: Call SimplifyDemandedBits on fcopysign sign value #97151
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAG: Call SimplifyDemandedBits on fcopysign sign value #97151
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm) ChangesMath library code has quite a few places with complex bit This assumes the position in the float type, which should There appears to be some value in simplifying the value operand Full diff: https://github.com/llvm/llvm-project/pull/97151.diff 6 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 254d63abdf805..eaab2b3421190 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17565,6 +17565,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
+ // We only take the sign bit from the sign operand.
+ EVT SignVT = N1.getValueType();
+ if (SimplifyDemandedBits(N1,
+ APInt::getSignMask(SignVT.getScalarSizeInBits())))
+ return SDValue(N, 0);
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 1eccb55202f02..af4f236c783c6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -11,11 +11,10 @@ define half @test_pown_reduced_fast_f16_known_odd(half %x, i32 %y.arg) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_mul_f16_e64 v0, |v0|, v1
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
+; GFX9-NEXT: v_mul_f16_e64 v1, |v0|, v1
+; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%fabs = call half @llvm.fabs.f16(half %x)
@@ -37,10 +36,9 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1
; GFX9-NEXT: v_bfi_b32 v2, s4, v1, v0
@@ -67,10 +65,9 @@ define float @test_pown_reduced_fast_f32_known_odd(float %x, i32 %y.arg) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0
; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v1
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
+; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, v1
+; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%fabs = call float @llvm.fabs.f32(float %x)
@@ -94,8 +91,6 @@ define <2 x float> @test_pown_reduced_fast_v2f32_known_odd(<2 x float> %x, <2 x
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_mul_f32_e64 v3, |v1|, v3
; GFX9-NEXT: v_mul_f32_e64 v2, |v0|, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0x80000000, v0
; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX9-NEXT: v_bfi_b32 v1, s4, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -118,8 +113,7 @@ define double @test_pown_reduced_fast_f64_known_odd(double %x, i32 %y.arg) #0 {
; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v2
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_mul_f64 v[2:3], |v[0:1]|, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v0, 0x80000000, v1
-; GFX9-NEXT: v_bfi_b32 v1, s4, v3, v0
+; GFX9-NEXT: v_bfi_b32 v1, s4, v3, v1
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
@@ -144,10 +138,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_mul_f64 v[4:5], |v[0:1]|, v[4:5]
; GFX9-NEXT: v_mul_f64 v[6:7], |v[2:3]|, v[6:7]
-; GFX9-NEXT: v_and_b32_e32 v0, 0x80000000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_bfi_b32 v1, s4, v5, v1
-; GFX9-NEXT: v_bfi_b32 v3, s4, v7, v0
+; GFX9-NEXT: v_bfi_b32 v3, s4, v7, v3
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v2, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll b/llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
index aedbb3f6a985a..ebec8c1c4d654 100644
--- a/llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
+++ b/llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
@@ -152,41 +152,36 @@ define i128 @test_copysign(ppc_fp128 %x, ppc_fp128 %y) nounwind {
; PPC32-LABEL: test_copysign:
; PPC32: # %bb.0: # %entry
; PPC32-NEXT: mflr 0
-; PPC32-NEXT: stwu 1, -96(1)
-; PPC32-NEXT: stw 0, 100(1)
-; PPC32-NEXT: stfd 1, 40(1)
-; PPC32-NEXT: lwz 3, 44(1)
-; PPC32-NEXT: stfd 2, 32(1)
-; PPC32-NEXT: stw 3, 60(1)
-; PPC32-NEXT: lwz 3, 40(1)
-; PPC32-NEXT: stfd 3, 72(1)
-; PPC32-NEXT: stw 3, 56(1)
+; PPC32-NEXT: stwu 1, -80(1)
+; PPC32-NEXT: stw 0, 84(1)
+; PPC32-NEXT: stfd 1, 32(1)
; PPC32-NEXT: lwz 3, 36(1)
-; PPC32-NEXT: stfd 4, 64(1)
+; PPC32-NEXT: stfd 2, 24(1)
; PPC32-NEXT: stw 3, 52(1)
; PPC32-NEXT: lwz 3, 32(1)
-; PPC32-NEXT: lfd 1, 56(1)
+; PPC32-NEXT: stfd 3, 56(1)
; PPC32-NEXT: stw 3, 48(1)
-; PPC32-NEXT: lwz 3, 76(1)
-; PPC32-NEXT: lfd 2, 48(1)
-; PPC32-NEXT: stw 3, 92(1)
-; PPC32-NEXT: lwz 3, 72(1)
-; PPC32-NEXT: stw 3, 88(1)
-; PPC32-NEXT: lwz 3, 68(1)
-; PPC32-NEXT: lfd 3, 88(1)
-; PPC32-NEXT: stw 3, 84(1)
-; PPC32-NEXT: lwz 3, 64(1)
-; PPC32-NEXT: stw 3, 80(1)
-; PPC32-NEXT: lfd 4, 80(1)
+; PPC32-NEXT: lwz 3, 28(1)
+; PPC32-NEXT: lfd 4, 64(1)
+; PPC32-NEXT: stw 3, 44(1)
+; PPC32-NEXT: lwz 3, 24(1)
+; PPC32-NEXT: lfd 1, 48(1)
+; PPC32-NEXT: stw 3, 40(1)
+; PPC32-NEXT: lwz 3, 60(1)
+; PPC32-NEXT: lfd 2, 40(1)
+; PPC32-NEXT: stw 3, 76(1)
+; PPC32-NEXT: lwz 3, 56(1)
+; PPC32-NEXT: stw 3, 72(1)
+; PPC32-NEXT: lfd 3, 72(1)
; PPC32-NEXT: bl copysignl
-; PPC32-NEXT: stfd 1, 16(1)
-; PPC32-NEXT: stfd 2, 24(1)
-; PPC32-NEXT: lwz 3, 16(1)
-; PPC32-NEXT: lwz 4, 20(1)
-; PPC32-NEXT: lwz 5, 24(1)
-; PPC32-NEXT: lwz 6, 28(1)
-; PPC32-NEXT: lwz 0, 100(1)
-; PPC32-NEXT: addi 1, 1, 96
+; PPC32-NEXT: stfd 1, 8(1)
+; PPC32-NEXT: stfd 2, 16(1)
+; PPC32-NEXT: lwz 3, 8(1)
+; PPC32-NEXT: lwz 4, 12(1)
+; PPC32-NEXT: lwz 5, 16(1)
+; PPC32-NEXT: lwz 6, 20(1)
+; PPC32-NEXT: lwz 0, 84(1)
+; PPC32-NEXT: addi 1, 1, 80
; PPC32-NEXT: mtlr 0
; PPC32-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll
index a2093f5b5e43a..ced6ff66ef678 100644
--- a/llvm/test/CodeGen/RISCV/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/double-arith.ll
@@ -320,9 +320,7 @@ define double @fsgnjn_d(double %a, double %b) nounwind {
;
; RV64IZFINXZDINX-LABEL: fsgnjn_d:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: li a2, -1
-; RV64IZFINXZDINX-NEXT: slli a2, a2, 63
-; RV64IZFINXZDINX-NEXT: xor a1, a1, a2
+; RV64IZFINXZDINX-NEXT: not a1, a1
; RV64IZFINXZDINX-NEXT: fsgnj.d a0, a0, a1
; RV64IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
index 99835ff59493f..70d3291d3a840 100644
--- a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
@@ -156,20 +156,16 @@ define double @fcopysign_fneg(double %a, double %b) nounwind {
;
; RV64IFD-LABEL: fcopysign_fneg:
; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: li a2, -1
-; RV64IFD-NEXT: slli a2, a2, 63
-; RV64IFD-NEXT: xor a1, a1, a2
-; RV64IFD-NEXT: fmv.d.x fa5, a1
+; RV64IFD-NEXT: fmv.d.x fa5, a0
+; RV64IFD-NEXT: not a0, a1
; RV64IFD-NEXT: fmv.d.x fa4, a0
-; RV64IFD-NEXT: fsgnj.d fa5, fa4, fa5
+; RV64IFD-NEXT: fsgnj.d fa5, fa5, fa4
; RV64IFD-NEXT: fmv.x.d a0, fa5
; RV64IFD-NEXT: ret
;
; RV64IZFINXZDINX-LABEL: fcopysign_fneg:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: li a2, -1
-; RV64IZFINXZDINX-NEXT: slli a2, a2, 63
-; RV64IZFINXZDINX-NEXT: xor a1, a1, a2
+; RV64IZFINXZDINX-NEXT: not a1, a1
; RV64IZFINXZDINX-NEXT: fsgnj.d a0, a0, a1
; RV64IZFINXZDINX-NEXT: ret
%1 = fneg double %b
diff --git a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
index 2495b1ad300c6..53df588bf1c37 100644
--- a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
@@ -119,18 +119,16 @@ define float @fcopysign_fneg(float %a, float %b) nounwind {
;
; RV32IF-LABEL: fcopysign_fneg:
; RV32IF: # %bb.0:
-; RV32IF-NEXT: lui a2, 524288
-; RV32IF-NEXT: xor a1, a1, a2
-; RV32IF-NEXT: fmv.w.x fa5, a1
+; RV32IF-NEXT: fmv.w.x fa5, a0
+; RV32IF-NEXT: not a0, a1
; RV32IF-NEXT: fmv.w.x fa4, a0
-; RV32IF-NEXT: fsgnj.s fa5, fa4, fa5
+; RV32IF-NEXT: fsgnj.s fa5, fa5, fa4
; RV32IF-NEXT: fmv.x.w a0, fa5
; RV32IF-NEXT: ret
;
; RV32IZFINX-LABEL: fcopysign_fneg:
; RV32IZFINX: # %bb.0:
-; RV32IZFINX-NEXT: lui a2, 524288
-; RV32IZFINX-NEXT: xor a1, a1, a2
+; RV32IZFINX-NEXT: not a1, a1
; RV32IZFINX-NEXT: fsgnj.s a0, a0, a1
; RV32IZFINX-NEXT: ret
;
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
@@ -17565,6 +17565,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { | |||
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) | |||
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); | |||
|
|||
// We only take the sign bit from the sign operand. | |||
EVT SignVT = N1.getValueType(); | |||
if (SimplifyDemandedBits(N1, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be able to subsume some of the optimizations above, e.g. copysign(x, abs(y)) -> abs(x)
would fall out if SimplifyDemandedBits knew about extracting the sign bit from abs(x)
.
bc9940c
to
f808c8a
Compare
Math library code has quite a few places with complex bit logic that are ultimately fed into a copysign. This helps avoid some regressions in a future patch. This assumes the position in the float type, which should at least be valid for IEEE types. Not sure if we need to guard against ppc_fp128 or anything else weird. There appears to be some value in simplifying the value operand as well, but I'll address that separately.
1cc5dca
to
627e1f1
Compare
Math library code has quite a few places with complex bit logic that are ultimately fed into a copysign. This helps avoid some regressions in a future patch. This assumes the position in the float type, which should at least be valid for IEEE types. Not sure if we need to guard against ppc_fp128 or anything else weird. There appears to be some value in simplifying the value operand as well, but I'll address that separately.
Math library code has quite a few places with complex bit logic that are ultimately fed into a copysign. This helps avoid some regressions in a future patch. This assumes the position in the float type, which should at least be valid for IEEE types. Not sure if we need to guard against ppc_fp128 or anything else weird. There appears to be some value in simplifying the value operand as well, but I'll address that separately.
Math library code has quite a few places with complex bit
logic that are ultimately fed into a copysign. This helps
avoid some regressions in a future patch.
This assumes the position in the float type, which should
at least be valid for IEEE types. Not sure if we need to guard
against ppc_fp128 or anything else weird.
There appears to be some value in simplifying the value operand
as well, but I'll address that separately.