Skip to content

Commit a284bdb

Browse files
committed
[DAG] Fold fdiv X, c2 -> fmul X, 1/c2 without AllowReciprocal if exact (#93882)
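This moves the combine of fdiv by constant to fmul out of an 'if (Options.UnsafeFPMath || Flags.hasAllowReciprocal())' block, so that it also triggers without those flags when the reciprocal of the constant divisor is exactly representable (i.e. the fold loses no precision). An extra check for Recip.isDenormal() is added to the exact case, as multiple places in the code base refer to denormal constants being unsafe or slow on certain platforms.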
1 parent 53fecef commit a284bdb

File tree

10 files changed

+200
-239
lines changed

10 files changed

+200
-239
lines changed

llvm/include/llvm/ADT/APFloat.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,13 @@ class APFloat : public APFloatBase {
964964
return Val;
965965
}
966966

967+
/// Factory for Positive and Negative One.
968+
///
969+
/// \param Negative True iff the number should be negative.
970+
static APFloat getOne(const fltSemantics &Sem, bool Negative = false) {
971+
return APFloat(Sem, Negative ? -1 : 1);
972+
}
973+
967974
/// Factory for Positive and Negative Infinity.
968975
///
969976
/// \param Negative True iff the number should be negative.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17262,26 +17262,29 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
1726217262
if (SDValue V = combineRepeatedFPDivisors(N))
1726317263
return V;
1726417264

17265-
if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
17266-
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
17267-
if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
17268-
// Compute the reciprocal 1.0 / c2.
17269-
const APFloat &N1APF = N1CFP->getValueAPF();
17270-
APFloat Recip(N1APF.getSemantics(), 1); // 1.0
17271-
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
17272-
// Only do the transform if the reciprocal is a legal fp immediate that
17273-
// isn't too nasty (eg NaN, denormal, ...).
17274-
if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
17275-
(!LegalOperations ||
17276-
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
17277-
// backend)... we should handle this gracefully after Legalize.
17278-
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
17279-
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
17280-
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
17281-
return DAG.getNode(ISD::FMUL, DL, VT, N0,
17282-
DAG.getConstantFP(Recip, DL, VT));
17283-
}
17265+
// fold (fdiv X, c2) -> (fmul X, 1/c2) if there is no loss in precision, or
17266+
// the loss is acceptable with AllowReciprocal.
17267+
if (auto *N1CFP = isConstOrConstSplatFP(N1, true)) {
17268+
// Compute the reciprocal 1.0 / c2.
17269+
const APFloat &N1APF = N1CFP->getValueAPF();
17270+
APFloat Recip = APFloat::getOne(N1APF.getSemantics());
17271+
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
17272+
// Only do the transform if the reciprocal is a legal fp immediate that
17273+
// isn't too nasty (eg NaN, denormal, ...).
17274+
if (((st == APFloat::opOK && !Recip.isDenormal()) ||
17275+
(st == APFloat::opInexact &&
17276+
(Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
17277+
(!LegalOperations ||
17278+
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
17279+
// backend)... we should handle this gracefully after Legalize.
17280+
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
17281+
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
17282+
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
17283+
return DAG.getNode(ISD::FMUL, DL, VT, N0,
17284+
DAG.getConstantFP(Recip, DL, VT));
17285+
}
1728417286

17287+
if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
1728517288
// If this FDIV is part of a reciprocal square root, it may be folded
1728617289
// into a target-specific square root estimate instruction.
1728717290
if (N1.getOpcode() == ISD::FSQRT) {

llvm/test/CodeGen/AArch64/fcvt-fixed.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -412,10 +412,10 @@ define half @scvtf_f16_i32_7(i32 %int) {
412412
; CHECK-NO16-LABEL: scvtf_f16_i32_7:
413413
; CHECK-NO16: // %bb.0:
414414
; CHECK-NO16-NEXT: scvtf s1, w0
415-
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
415+
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
416416
; CHECK-NO16-NEXT: fcvt h1, s1
417417
; CHECK-NO16-NEXT: fcvt s1, h1
418-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
418+
; CHECK-NO16-NEXT: fmul s0, s1, s0
419419
; CHECK-NO16-NEXT: fcvt h0, s0
420420
; CHECK-NO16-NEXT: ret
421421
;
@@ -432,10 +432,10 @@ define half @scvtf_f16_i32_15(i32 %int) {
432432
; CHECK-NO16-LABEL: scvtf_f16_i32_15:
433433
; CHECK-NO16: // %bb.0:
434434
; CHECK-NO16-NEXT: scvtf s1, w0
435-
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
435+
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
436436
; CHECK-NO16-NEXT: fcvt h1, s1
437437
; CHECK-NO16-NEXT: fcvt s1, h1
438-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
438+
; CHECK-NO16-NEXT: fmul s0, s1, s0
439439
; CHECK-NO16-NEXT: fcvt h0, s0
440440
; CHECK-NO16-NEXT: ret
441441
;
@@ -452,10 +452,10 @@ define half @scvtf_f16_i64_7(i64 %long) {
452452
; CHECK-NO16-LABEL: scvtf_f16_i64_7:
453453
; CHECK-NO16: // %bb.0:
454454
; CHECK-NO16-NEXT: scvtf s1, x0
455-
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
455+
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
456456
; CHECK-NO16-NEXT: fcvt h1, s1
457457
; CHECK-NO16-NEXT: fcvt s1, h1
458-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
458+
; CHECK-NO16-NEXT: fmul s0, s1, s0
459459
; CHECK-NO16-NEXT: fcvt h0, s0
460460
; CHECK-NO16-NEXT: ret
461461
;
@@ -472,10 +472,10 @@ define half @scvtf_f16_i64_15(i64 %long) {
472472
; CHECK-NO16-LABEL: scvtf_f16_i64_15:
473473
; CHECK-NO16: // %bb.0:
474474
; CHECK-NO16-NEXT: scvtf s1, x0
475-
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
475+
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
476476
; CHECK-NO16-NEXT: fcvt h1, s1
477477
; CHECK-NO16-NEXT: fcvt s1, h1
478-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
478+
; CHECK-NO16-NEXT: fmul s0, s1, s0
479479
; CHECK-NO16-NEXT: fcvt h0, s0
480480
; CHECK-NO16-NEXT: ret
481481
;
@@ -574,10 +574,10 @@ define half @ucvtf_f16_i32_7(i32 %int) {
574574
; CHECK-NO16-LABEL: ucvtf_f16_i32_7:
575575
; CHECK-NO16: // %bb.0:
576576
; CHECK-NO16-NEXT: ucvtf s1, w0
577-
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
577+
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
578578
; CHECK-NO16-NEXT: fcvt h1, s1
579579
; CHECK-NO16-NEXT: fcvt s1, h1
580-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
580+
; CHECK-NO16-NEXT: fmul s0, s1, s0
581581
; CHECK-NO16-NEXT: fcvt h0, s0
582582
; CHECK-NO16-NEXT: ret
583583
;
@@ -594,10 +594,10 @@ define half @ucvtf_f16_i32_15(i32 %int) {
594594
; CHECK-NO16-LABEL: ucvtf_f16_i32_15:
595595
; CHECK-NO16: // %bb.0:
596596
; CHECK-NO16-NEXT: ucvtf s1, w0
597-
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
597+
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
598598
; CHECK-NO16-NEXT: fcvt h1, s1
599599
; CHECK-NO16-NEXT: fcvt s1, h1
600-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
600+
; CHECK-NO16-NEXT: fmul s0, s1, s0
601601
; CHECK-NO16-NEXT: fcvt h0, s0
602602
; CHECK-NO16-NEXT: ret
603603
;
@@ -614,10 +614,10 @@ define half @ucvtf_f16_i64_7(i64 %long) {
614614
; CHECK-NO16-LABEL: ucvtf_f16_i64_7:
615615
; CHECK-NO16: // %bb.0:
616616
; CHECK-NO16-NEXT: ucvtf s1, x0
617-
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
617+
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
618618
; CHECK-NO16-NEXT: fcvt h1, s1
619619
; CHECK-NO16-NEXT: fcvt s1, h1
620-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
620+
; CHECK-NO16-NEXT: fmul s0, s1, s0
621621
; CHECK-NO16-NEXT: fcvt h0, s0
622622
; CHECK-NO16-NEXT: ret
623623
;
@@ -634,10 +634,10 @@ define half @ucvtf_f16_i64_15(i64 %long) {
634634
; CHECK-NO16-LABEL: ucvtf_f16_i64_15:
635635
; CHECK-NO16: // %bb.0:
636636
; CHECK-NO16-NEXT: ucvtf s1, x0
637-
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
637+
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
638638
; CHECK-NO16-NEXT: fcvt h1, s1
639639
; CHECK-NO16-NEXT: fcvt s1, h1
640-
; CHECK-NO16-NEXT: fdiv s0, s1, s0
640+
; CHECK-NO16-NEXT: fmul s0, s1, s0
641641
; CHECK-NO16-NEXT: fcvt h0, s0
642642
; CHECK-NO16-NEXT: ret
643643
;

llvm/test/CodeGen/AArch64/fdiv-const.ll

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
define float @divf32_2(float %a) nounwind {
55
; CHECK-LABEL: divf32_2:
66
; CHECK: // %bb.0:
7-
; CHECK-NEXT: fmov s1, #2.00000000
8-
; CHECK-NEXT: fdiv s0, s0, s1
7+
; CHECK-NEXT: fmov s1, #0.50000000
8+
; CHECK-NEXT: fmul s0, s0, s1
99
; CHECK-NEXT: ret
1010
%r = fdiv float %a, 2.0
1111
ret float %r
@@ -46,8 +46,8 @@ define float @divf32_p75_arcp(float %a) nounwind {
4646
define half @divf16_2(half %a) nounwind {
4747
; CHECK-LABEL: divf16_2:
4848
; CHECK: // %bb.0:
49-
; CHECK-NEXT: fmov h1, #2.00000000
50-
; CHECK-NEXT: fdiv h0, h0, h1
49+
; CHECK-NEXT: fmov h1, #0.50000000
50+
; CHECK-NEXT: fmul h0, h0, h1
5151
; CHECK-NEXT: ret
5252
%r = fdiv half %a, 2.0
5353
ret half %r
@@ -67,9 +67,9 @@ define half @divf16_32768(half %a) nounwind {
6767
define half @divf16_32768_arcp(half %a) nounwind {
6868
; CHECK-LABEL: divf16_32768_arcp:
6969
; CHECK: // %bb.0:
70-
; CHECK-NEXT: mov w8, #512 // =0x200
70+
; CHECK-NEXT: mov w8, #30720 // =0x7800
7171
; CHECK-NEXT: fmov h1, w8
72-
; CHECK-NEXT: fmul h0, h0, h1
72+
; CHECK-NEXT: fdiv h0, h0, h1
7373
; CHECK-NEXT: ret
7474
%r = fdiv arcp half %a, 32768.0
7575
ret half %r
@@ -78,8 +78,8 @@ define half @divf16_32768_arcp(half %a) nounwind {
7878
define double @divf64_2(double %a) nounwind {
7979
; CHECK-LABEL: divf64_2:
8080
; CHECK: // %bb.0:
81-
; CHECK-NEXT: fmov d1, #2.00000000
82-
; CHECK-NEXT: fdiv d0, d0, d1
81+
; CHECK-NEXT: fmov d1, #0.50000000
82+
; CHECK-NEXT: fmul d0, d0, d1
8383
; CHECK-NEXT: ret
8484
%r = fdiv double %a, 2.0
8585
ret double %r
@@ -88,8 +88,8 @@ define double @divf64_2(double %a) nounwind {
8888
define <4 x float> @divv4f32_2(<4 x float> %a) nounwind {
8989
; CHECK-LABEL: divv4f32_2:
9090
; CHECK: // %bb.0:
91-
; CHECK-NEXT: movi v1.4s, #64, lsl #24
92-
; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
91+
; CHECK-NEXT: movi v1.4s, #63, lsl #24
92+
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
9393
; CHECK-NEXT: ret
9494
%r = fdiv <4 x float> %a, <float 2.0, float 2.0, float 2.0, float 2.0>
9595
ret <4 x float> %r
@@ -141,9 +141,8 @@ define <4 x float> @divv4f32_24816(<4 x float> %a) nounwind {
141141
define <vscale x 4 x float> @divnxv4f32_2(<vscale x 4 x float> %a) nounwind {
142142
; CHECK-LABEL: divnxv4f32_2:
143143
; CHECK: // %bb.0:
144-
; CHECK-NEXT: fmov z1.s, #2.00000000
145144
; CHECK-NEXT: ptrue p0.s
146-
; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
145+
; CHECK-NEXT: fmul z0.s, p0/m, z0.s, #0.5
147146
; CHECK-NEXT: ret
148147
%r = fdiv <vscale x 4 x float> %a, splat (float 2.0)
149148
ret <vscale x 4 x float> %r

0 commit comments

Comments
 (0)