Skip to content

Commit a205a85

Browse files
Job Henandez Laralntue
Job Henandez Lara
andauthored
[libc][math] Improve fmul performance by using double-double arithmetic. (llvm#107517)
``` Performance tests with inputs in denormal range: -- My function -- Total time : 2731072304 ns Average runtime : 68.2767 ns/op Ops per second : 14646276 op/s -- Other function -- Total time : 3259744268 ns Average runtime : 81.4935 ns/op Ops per second : 12270913 op/s -- Average runtime ratio -- Mine / Other's : 0.837818 Performance tests with inputs in normal range: -- My function -- Total time : 93467258 ns Average runtime : 2.33668 ns/op Ops per second : 427957777 op/s -- Other function -- Total time : 637295452 ns Average runtime : 15.9324 ns/op Ops per second : 62765299 op/s -- Average runtime ratio -- Mine / Other's : 0.146662 Performance tests with inputs in normal range with exponents close to each other: -- My function -- Total time : 95764894 ns Average runtime : 2.39412 ns/op Ops per second : 417690014 op/s -- Other function -- Total time : 639866770 ns Average runtime : 15.9967 ns/op Ops per second : 62513075 op/s -- Average runtime ratio -- Mine / Other's : 0.149664 ``` --------- Co-authored-by: Tue Ly <[email protected]>
1 parent 7d11f52 commit a205a85

File tree

6 files changed

+150
-4
lines changed

6 files changed

+150
-4
lines changed

libc/src/math/generic/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2958,7 +2958,9 @@ add_entrypoint_object(
29582958
HDRS
29592959
../fmul.h
29602960
DEPENDS
2961-
libc.src.__support.FPUtil.generic.mul
2961+
libc.hdr.errno_macros
2962+
libc.hdr.fenv_macros
2963+
libc.src.__support.FPUtil.double_double
29622964
COMPILE_OPTIONS
29632965
-O3
29642966
)

libc/src/math/generic/fmul.cpp

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,115 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8-
98
#include "src/math/fmul.h"
9+
#include "hdr/errno_macros.h"
10+
#include "hdr/fenv_macros.h"
11+
#include "src/__support/FPUtil/double_double.h"
1012
#include "src/__support/FPUtil/generic/mul.h"
1113
#include "src/__support/common.h"
1214
#include "src/__support/macros/config.h"
1315

1416
namespace LIBC_NAMESPACE_DECL {
1517

1618
LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
19+
20+
// Without FMA instructions, fputil::exact_mult is not
21+
// correctly rounded for all rounding modes, so we fall
22+
// back to the generic `fmul` implementation
23+
24+
#ifndef LIBC_TARGET_CPU_HAS_FMA
1725
return fputil::generic::mul<float>(x, y);
18-
}
26+
#else
27+
fputil::DoubleDouble prod = fputil::exact_mult(x, y);
28+
using DoubleBits = fputil::FPBits<double>;
29+
using DoubleStorageType = typename DoubleBits::StorageType;
30+
using FloatBits = fputil::FPBits<float>;
31+
using FloatStorageType = typename FloatBits::StorageType;
32+
DoubleBits x_bits(x);
33+
DoubleBits y_bits(y);
34+
35+
Sign result_sign = x_bits.sign() == y_bits.sign() ? Sign::POS : Sign::NEG;
36+
double result = prod.hi;
37+
DoubleBits hi_bits(prod.hi), lo_bits(prod.lo);
38+
// Check for cases where we need to propagate the sticky bits:
39+
constexpr uint64_t STICKY_MASK = 0xFFF'FFF; // Lower (52 - 23 - 1 = 28 bits)
40+
uint64_t sticky_bits = (hi_bits.uintval() & STICKY_MASK);
41+
if (LIBC_UNLIKELY(sticky_bits == 0)) {
42+
// Might need to propagate sticky bits:
43+
if (!(lo_bits.is_inf_or_nan() || lo_bits.is_zero())) {
44+
// Now prod.lo is nonzero and finite, we need to propagate sticky bits.
45+
if (lo_bits.sign() != hi_bits.sign())
46+
result = DoubleBits(hi_bits.uintval() - 1).get_val();
47+
else
48+
result = DoubleBits(hi_bits.uintval() | 1).get_val();
49+
}
50+
}
51+
52+
float result_f = static_cast<float>(result);
53+
FloatBits rf_bits(result_f);
54+
uint32_t rf_exp = rf_bits.get_biased_exponent();
55+
if (LIBC_LIKELY(rf_exp > 0 && rf_exp < 2 * FloatBits::EXP_BIAS + 1)) {
56+
return result_f;
57+
}
58+
59+
// Now result_f is either inf/nan/zero/denormal.
60+
if (x_bits.is_nan() || y_bits.is_nan()) {
61+
if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan())
62+
fputil::raise_except_if_required(FE_INVALID);
63+
64+
if (x_bits.is_quiet_nan()) {
65+
DoubleStorageType x_payload = x_bits.get_mantissa();
66+
x_payload >>= DoubleBits::FRACTION_LEN - FloatBits::FRACTION_LEN;
67+
return FloatBits::quiet_nan(x_bits.sign(),
68+
static_cast<FloatStorageType>(x_payload))
69+
.get_val();
70+
}
71+
72+
if (y_bits.is_quiet_nan()) {
73+
DoubleStorageType y_payload = y_bits.get_mantissa();
74+
y_payload >>= DoubleBits::FRACTION_LEN - FloatBits::FRACTION_LEN;
75+
return FloatBits::quiet_nan(y_bits.sign(),
76+
static_cast<FloatStorageType>(y_payload))
77+
.get_val();
78+
}
79+
80+
return FloatBits::quiet_nan().get_val();
81+
}
1982

83+
if (x_bits.is_inf()) {
84+
if (y_bits.is_zero()) {
85+
fputil::set_errno_if_required(EDOM);
86+
fputil::raise_except_if_required(FE_INVALID);
87+
88+
return FloatBits::quiet_nan().get_val();
89+
}
90+
91+
return FloatBits::inf(result_sign).get_val();
92+
}
93+
94+
if (y_bits.is_inf()) {
95+
if (x_bits.is_zero()) {
96+
fputil::set_errno_if_required(EDOM);
97+
fputil::raise_except_if_required(FE_INVALID);
98+
return FloatBits::quiet_nan().get_val();
99+
}
100+
101+
return FloatBits::inf(result_sign).get_val();
102+
}
103+
104+
// Now either x or y is zero, and the other one is finite.
105+
if (rf_bits.is_inf()) {
106+
fputil::set_errno_if_required(ERANGE);
107+
return FloatBits::inf(result_sign).get_val();
108+
}
109+
110+
if (x_bits.is_zero() || y_bits.is_zero())
111+
return FloatBits::zero(result_sign).get_val();
112+
113+
fputil::set_errno_if_required(ERANGE);
114+
fputil::raise_except_if_required(FE_UNDERFLOW);
115+
return result_f;
116+
117+
#endif
118+
}
20119
} // namespace LIBC_NAMESPACE_DECL

libc/test/src/math/fmul_test.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,27 @@
1010

1111
#include "src/math/fmul.h"
1212

13+
#include "test/UnitTest/Test.h"
14+
#include "utils/MPFRWrapper/MPFRUtils.h"
15+
1316
LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
17+
18+
TEST_F(LlvmLibcMulTest, SpecialInputs) {
19+
namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
20+
double INPUTS[][2] = {
21+
{0x1.0100010002p8, 0x1.fffcp14},
22+
{0x1.000000b92144p-7, 0x1.62p7},
23+
};
24+
25+
for (size_t i = 0; i < 2; ++i) {
26+
double a = INPUTS[i][0];
27+
28+
for (int j = 0; j < 180; ++j) {
29+
a *= 0.5;
30+
mpfr::BinaryInput<double> input{a, INPUTS[i][1]};
31+
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Mul, input,
32+
LIBC_NAMESPACE::fmul(a, INPUTS[i][1]),
33+
0.5);
34+
}
35+
}
36+
}

libc/test/src/math/performance_testing/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,8 @@ add_perf_binary(
484484
DEPENDS
485485
.binary_op_single_output_diff
486486
libc.src.math.fmul
487+
libc.src.__support.FPUtil.generic.mul
488+
libc.src.__support.FPUtil.fp_bits
487489
COMPILE_OPTIONS
488490
-fno-builtin
489491
)

libc/test/src/math/performance_testing/fmul_perf.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "BinaryOpSingleOutputPerf.h"
10+
#include "src/__support/FPUtil/generic/mul.h"
1011
#include "src/math/fmul.h"
1112

1213
static constexpr size_t DOUBLE_ROUNDS = 40;
1314

1415
float fmul_placeholder_binary(double x, double y) {
15-
return static_cast<float>(x * y);
16+
return LIBC_NAMESPACE::fputil::generic::mul<float>(x, y);
1617
}
1718

1819
int main() {

libc/test/src/math/smoke/fmul_test.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,22 @@
1111
#include "src/math/fmul.h"
1212

1313
LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
14+
15+
TEST_F(LlvmLibcMulTest, SpecialInputs) {
16+
constexpr double INPUTS[][2] = {
17+
{0x1.0100010002p8, 0x1.fffcp14},
18+
{0x1.000000b92144p-7, 0x1.62p7},
19+
};
20+
21+
constexpr float RESULTS[] = {
22+
0x1.00fdfep+23f,
23+
0x1.620002p0f,
24+
};
25+
26+
constexpr size_t N = sizeof(RESULTS) / sizeof(RESULTS[0]);
27+
28+
for (size_t i = 0; i < N; ++i) {
29+
float result = LIBC_NAMESPACE::fmul(INPUTS[i][0], INPUTS[i][1]);
30+
EXPECT_FP_EQ(RESULTS[i], result);
31+
}
32+
}

0 commit comments

Comments
 (0)