[libc][math] Improve fmul performance by using double-double arithmetic. (llvm#107517)

Job Henandez Lara · lntue · web-flow · commit a205a854e06d · 2024-09-14T17:32:22.000-04:00
```
 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 2731072304 ns 
     Average runtime : 68.2767 ns/op 
     Ops per second  : 14646276 op/s 
-- Other function --
     Total time      : 3259744268 ns 
     Average runtime : 81.4935 ns/op 
     Ops per second  : 12270913 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.837818 

 Performance tests with inputs in normal range:
-- My function --
     Total time      : 93467258 ns 
     Average runtime : 2.33668 ns/op 
     Ops per second  : 427957777 op/s 
-- Other function --
     Total time      : 637295452 ns 
     Average runtime : 15.9324 ns/op 
     Ops per second  : 62765299 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.146662 

 Performance tests with inputs in normal range with exponents close to each other:
-- My function --
     Total time      : 95764894 ns 
     Average runtime : 2.39412 ns/op 
     Ops per second  : 417690014 op/s 
-- Other function --
     Total time      : 639866770 ns 
     Average runtime : 15.9967 ns/op 
     Ops per second  : 62513075 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.149664 
```

---------

Co-authored-by: Tue Ly &lt;lntue@google.com&gt;
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
@@ -2958,7 +2958,9 @@ add_entrypoint_object(
   HDRS
     ../fmul.h
   DEPENDS
-    libc.src.__support.FPUtil.generic.mul
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.double_double
   COMPILE_OPTIONS
     -O3
 )
diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp
@@ -5,16 +5,115 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-
 #include "src/math/fmul.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/double_double.h"
 #include "src/__support/FPUtil/generic/mul.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
+
+  // Without FMA instructions, fputil::exact_mult is not
+  // correctly rounded for all rounding modes, so we fall
+  // back to the generic `fmul` implementation
+
+#ifndef LIBC_TARGET_CPU_HAS_FMA
   return fputil::generic::mul<float>(x, y);
-}
+#else
+  fputil::DoubleDouble prod = fputil::exact_mult(x, y);
+  using DoubleBits = fputil::FPBits<double>;
+  using DoubleStorageType = typename DoubleBits::StorageType;
+  using FloatBits = fputil::FPBits<float>;
+  using FloatStorageType = typename FloatBits::StorageType;
+  DoubleBits x_bits(x);
+  DoubleBits y_bits(y);
+
+  Sign result_sign = x_bits.sign() == y_bits.sign() ? Sign::POS : Sign::NEG;
+  double result = prod.hi;
+  DoubleBits hi_bits(prod.hi), lo_bits(prod.lo);
+  // Check for cases where we need to propagate the sticky bits:
+  constexpr uint64_t STICKY_MASK = 0xFFF'FFF; // Lower (52 - 23 - 1 = 28 bits)
+  uint64_t sticky_bits = (hi_bits.uintval() & STICKY_MASK);
+  if (LIBC_UNLIKELY(sticky_bits == 0)) {
+    // Might need to propagate sticky bits:
+    if (!(lo_bits.is_inf_or_nan() || lo_bits.is_zero())) {
+      // Now prod.lo is nonzero and finite, we need to propagate sticky bits.
+      if (lo_bits.sign() != hi_bits.sign())
+        result = DoubleBits(hi_bits.uintval() - 1).get_val();
+      else
+        result = DoubleBits(hi_bits.uintval() | 1).get_val();
+    }
+  }
+
+  float result_f = static_cast<float>(result);
+  FloatBits rf_bits(result_f);
+  uint32_t rf_exp = rf_bits.get_biased_exponent();
+  if (LIBC_LIKELY(rf_exp > 0 && rf_exp < 2 * FloatBits::EXP_BIAS + 1)) {
+    return result_f;
+  }
+
+  // Now result_f is either inf/nan/zero/denormal.
+  if (x_bits.is_nan() || y_bits.is_nan()) {
+    if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan())
+      fputil::raise_except_if_required(FE_INVALID);
+
+    if (x_bits.is_quiet_nan()) {
+      DoubleStorageType x_payload = x_bits.get_mantissa();
+      x_payload >>= DoubleBits::FRACTION_LEN - FloatBits::FRACTION_LEN;
+      return FloatBits::quiet_nan(x_bits.sign(),
+                                  static_cast<FloatStorageType>(x_payload))
+          .get_val();
+    }
+
+    if (y_bits.is_quiet_nan()) {
+      DoubleStorageType y_payload = y_bits.get_mantissa();
+      y_payload >>= DoubleBits::FRACTION_LEN - FloatBits::FRACTION_LEN;
+      return FloatBits::quiet_nan(y_bits.sign(),
+                                  static_cast<FloatStorageType>(y_payload))
+          .get_val();
+    }
+
+    return FloatBits::quiet_nan().get_val();
+  }
 
+  if (x_bits.is_inf()) {
+    if (y_bits.is_zero()) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+
+      return FloatBits::quiet_nan().get_val();
+    }
+
+    return FloatBits::inf(result_sign).get_val();
+  }
+
+  if (y_bits.is_inf()) {
+    if (x_bits.is_zero()) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+      return FloatBits::quiet_nan().get_val();
+    }
+
+    return FloatBits::inf(result_sign).get_val();
+  }
+
+  // Now either x or y is zero, and the other one is finite.
+  if (rf_bits.is_inf()) {
+    fputil::set_errno_if_required(ERANGE);
+    return FloatBits::inf(result_sign).get_val();
+  }
+
+  if (x_bits.is_zero() || y_bits.is_zero())
+    return FloatBits::zero(result_sign).get_val();
+
+  fputil::set_errno_if_required(ERANGE);
+  fputil::raise_except_if_required(FE_UNDERFLOW);
+  return result_f;
+
+#endif
+}
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/math/fmul_test.cpp b/libc/test/src/math/fmul_test.cpp
@@ -10,4 +10,27 @@
 
 #include "src/math/fmul.h"
 
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
 LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
+
+TEST_F(LlvmLibcMulTest, SpecialInputs) {
+  namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+  double INPUTS[][2] = {
+      {0x1.0100010002p8, 0x1.fffcp14},
+      {0x1.000000b92144p-7, 0x1.62p7},
+  };
+
+  for (size_t i = 0; i < 2; ++i) {
+    double a = INPUTS[i][0];
+
+    for (int j = 0; j < 180; ++j) {
+      a *= 0.5;
+      mpfr::BinaryInput<double> input{a, INPUTS[i][1]};
+      ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Mul, input,
+                                     LIBC_NAMESPACE::fmul(a, INPUTS[i][1]),
+                                     0.5);
+    }
+  }
+}
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -484,6 +484,8 @@ add_perf_binary(
   DEPENDS
     .binary_op_single_output_diff
     libc.src.math.fmul
+    libc.src.__support.FPUtil.generic.mul
+    libc.src.__support.FPUtil.fp_bits
   COMPILE_OPTIONS
     -fno-builtin
 )
diff --git a/libc/test/src/math/performance_testing/fmul_perf.cpp b/libc/test/src/math/performance_testing/fmul_perf.cpp
@@ -7,12 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "BinaryOpSingleOutputPerf.h"
+#include "src/__support/FPUtil/generic/mul.h"
 #include "src/math/fmul.h"
 
 static constexpr size_t DOUBLE_ROUNDS = 40;
 
 float fmul_placeholder_binary(double x, double y) {
-  return static_cast<float>(x * y);
+  return LIBC_NAMESPACE::fputil::generic::mul<float>(x, y);
 }
 
 int main() {
diff --git a/libc/test/src/math/smoke/fmul_test.cpp b/libc/test/src/math/smoke/fmul_test.cpp
@@ -11,3 +11,22 @@
 #include "src/math/fmul.h"
 
 LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
+
+TEST_F(LlvmLibcMulTest, SpecialInputs) {
+  constexpr double INPUTS[][2] = {
+      {0x1.0100010002p8, 0x1.fffcp14},
+      {0x1.000000b92144p-7, 0x1.62p7},
+  };
+
+  constexpr float RESULTS[] = {
+      0x1.00fdfep+23f,
+      0x1.620002p0f,
+  };
+
+  constexpr size_t N = sizeof(RESULTS) / sizeof(RESULTS[0]);
+
+  for (size_t i = 0; i < N; ++i) {
+    float result = LIBC_NAMESPACE::fmul(INPUTS[i][0], INPUTS[i][1]);
+    EXPECT_FP_EQ(RESULTS[i], result);
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -484,6 +484,8 @@ add_perf_binary(`
`484`	`484`	`DEPENDS`
`485`	`485`	`.binary_op_single_output_diff`
`486`	`486`	`libc.src.math.fmul`
	`487`	`+ libc.src.__support.FPUtil.generic.mul`
	`488`	`+ libc.src.__support.FPUtil.fp_bits`
`487`	`489`	`COMPILE_OPTIONS`
`488`	`490`	`-fno-builtin`
`489`	`491`	`)`