Skip to content

[libc][math] Optimize generic nearest integer functions #98483

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 21 additions & 16 deletions libc/src/__support/FPUtil/NearestIntegerOperations.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,17 @@ LIBC_INLINE T ceil(T x) {
}

uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
StorageType trunc_mantissa =
static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
bits.set_mantissa(trunc_mantissa);
T trunc_value = bits.get_val();
StorageType x_u = bits.uintval();
StorageType trunc_u =
static_cast<StorageType>((x_u >> trim_size) << trim_size);

// If x is already an integer, return it.
if (trunc_value == x)
if (trunc_u == x_u)
return x;

bits.set_uintval(trunc_u);
T trunc_value = bits.get_val();

// If x is negative, the ceil operation is equivalent to the trunc operation.
if (is_neg)
return trunc_value;
Expand Down Expand Up @@ -130,15 +132,17 @@ LIBC_INLINE T round(T x) {
uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
bool half_bit_set =
bool(bits.get_mantissa() & (StorageType(1) << (trim_size - 1)));
StorageType trunc_mantissa =
static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
bits.set_mantissa(trunc_mantissa);
T trunc_value = bits.get_val();
StorageType x_u = bits.uintval();
StorageType trunc_u =
static_cast<StorageType>((x_u >> trim_size) << trim_size);

// If x is already an integer, return it.
if (trunc_value == x)
if (trunc_u == x_u)
return x;

bits.set_uintval(trunc_u);
T trunc_value = bits.get_val();

if (!half_bit_set) {
// Fractional part is less than 0.5 so round value is the
// same as the trunc value.
Expand Down Expand Up @@ -188,16 +192,17 @@ round_using_specific_rounding_mode(T x, int rnd) {
}

uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
FPBits<T> new_bits = bits;
StorageType trunc_mantissa =
static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
new_bits.set_mantissa(trunc_mantissa);
T trunc_value = new_bits.get_val();
StorageType x_u = bits.uintval();
StorageType trunc_u =
static_cast<StorageType>((x_u >> trim_size) << trim_size);

// If x is already an integer, return it.
if (trunc_value == x)
if (trunc_u == x_u)
return x;

FPBits<T> new_bits(trunc_u);
T trunc_value = new_bits.get_val();

StorageType trim_value =
bits.get_mantissa() &
static_cast<StorageType>(((StorageType(1) << trim_size) - 1));
Expand Down
19 changes: 19 additions & 0 deletions libc/test/src/math/performance_testing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -366,3 +366,22 @@ add_perf_binary(
COMPILE_OPTIONS
-fno-builtin
)

# Performance-test binary built from nearest_integer_funcs_perf.cpp. It depends
# on the float and float16 entry points of ceil, floor, roundeven, round and
# trunc, and is compiled with -fno-builtin.
add_perf_binary(
nearest_integer_funcs_perf
SRCS
nearest_integer_funcs_perf.cpp
DEPENDS
libc.src.math.ceilf
libc.src.math.ceilf16
libc.src.math.floorf
libc.src.math.floorf16
libc.src.math.roundevenf
libc.src.math.roundevenf16
libc.src.math.roundf
libc.src.math.roundf16
libc.src.math.truncf
libc.src.math.truncf16
COMPILE_OPTIONS
-fno-builtin
)
168 changes: 168 additions & 0 deletions libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
//===-- Performance test for nearest integer functions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/FPUtil/FPBits.h"
#include "src/math/ceilf.h"
#include "src/math/ceilf16.h"
#include "src/math/floorf.h"
#include "src/math/floorf16.h"
#include "src/math/roundevenf.h"
#include "src/math/roundevenf16.h"
#include "src/math/roundf.h"
#include "src/math/roundf16.h"
#include "src/math/truncf.h"
#include "src/math/truncf16.h"
#include "test/src/math/performance_testing/Timer.h"

#include <fstream>
#include <math.h>

namespace LIBC_NAMESPACE::testing {

template <typename T> class NearestIntegerPerf {
using FPBits = fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;

public:
typedef T Func(T);

static void run_perf_in_range(Func my_func, Func other_func,
StorageType starting_bit,
StorageType ending_bit, StorageType step,
size_t rounds, std::ofstream &log) {
auto runner = [=](Func func) {
volatile T result;
for (size_t i = 0; i < rounds; i++) {
for (StorageType bits = starting_bit; bits <= ending_bit;
bits += step) {
T x = FPBits(bits).get_val();
result = func(x);
}
}
};

Timer timer;
timer.start();
runner(my_func);
timer.stop();

size_t number_of_runs = (ending_bit - starting_bit) / step + 1;
double my_average =
static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
log << "-- My function --\n";
log << " Total time : " << timer.nanoseconds() << " ns \n";
log << " Average runtime : " << my_average << " ns/op \n";
log << " Ops per second : "
<< static_cast<uint64_t>(1'000'000'000.0 / my_average) << " op/s \n";

timer.start();
runner(other_func);
timer.stop();

double other_average =
static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
log << "-- Other function --\n";
log << " Total time : " << timer.nanoseconds() << " ns \n";
log << " Average runtime : " << other_average << " ns/op \n";
log << " Ops per second : "
<< static_cast<uint64_t>(1'000'000'000.0 / other_average) << " op/s \n";

log << "-- Average runtime ratio --\n";
log << " Mine / Other's : " << my_average / other_average << " \n";
}

static void run_perf(Func my_func, Func other_func, size_t rounds,
const char *log_file) {
std::ofstream log(log_file);
log << "Performance tests with inputs in normal integral range:\n";
run_perf_in_range(
my_func, other_func,
/*starting_bit=*/StorageType((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN),
/*ending_bit=*/
StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN - 1)
<< FPBits::SIG_LEN),
/*step=*/StorageType(1 << FPBits::SIG_LEN),
rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
log << "\n Performance tests with inputs in low integral range:\n";
run_perf_in_range(
my_func, other_func,
/*starting_bit=*/StorageType(1 << FPBits::SIG_LEN),
/*ending_bit=*/StorageType((FPBits::EXP_BIAS - 1) << FPBits::SIG_LEN),
/*step_bit=*/StorageType(1 << FPBits::SIG_LEN),
rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
log << "\n Performance tests with inputs in high integral range:\n";
run_perf_in_range(
my_func, other_func,
/*starting_bit=*/
StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN)
<< FPBits::SIG_LEN),
/*ending_bit=*/
StorageType(FPBits::MAX_BIASED_EXPONENT << FPBits::SIG_LEN),
/*step=*/StorageType(1 << FPBits::SIG_LEN),
rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
log << "\n Performance tests with inputs in normal fractional range:\n";
run_perf_in_range(
my_func, other_func,
/*starting_bit=*/
StorageType(((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN) + 1),
/*ending_bit=*/
StorageType(((FPBits::EXP_BIAS + 2) << FPBits::SIG_LEN) - 1),
/*step=*/StorageType(1), rounds * 2, log);
log << "\n Performance tests with inputs in subnormal fractional range:\n";
run_perf_in_range(my_func, other_func, /*starting_bit=*/StorageType(1),
/*ending_bit=*/StorageType(FPBits::SIG_MASK),
/*step=*/StorageType(1), rounds, log);
}
};

} // namespace LIBC_NAMESPACE::testing

// Benchmarks `my_func` against `other_func` for floating-point type T by
// forwarding to NearestIntegerPerf<T>::run_perf with `rounds` and `filename`.
// Expands to a braced block, so invocations take no trailing semicolon.
// run_perf is invoked twice with identical arguments, including the same
// file name.
// NOTE(review): run_perf opens `filename` with a fresh std::ofstream, so the
// second pass presumably replaces the log written by the first, leaving the
// first pass to act as a warm-up - confirm this is intended.
#define NEAREST_INTEGER_PERF(T, my_func, other_func, rounds, filename) \
{ \
LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::run_perf( \
&my_func, &other_func, rounds, filename); \
LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::run_perf( \
&my_func, &other_func, rounds, filename); \
}

// Base round counts handed to NEAREST_INTEGER_PERF in main(): one for the
// float16 benchmarks and one for the float benchmarks.
static constexpr size_t FLOAT16_ROUNDS{20'000};
static constexpr size_t FLOAT_ROUNDS{40};

// Identity function used as the comparison baseline for the float16
// benchmarks. The system libc is not expected to offer float16 math functions
// (LLVM libc may currently be the only libc that does), so there is no
// external implementation to measure against.
float16 placeholderf16(float16 x) {
  return x;
}

// Identity function used as the comparison baseline where the system libc may
// lack a counterpart, such as the C23 roundeven* functions.
float placeholderf(float x) {
  return x;
}

// Runs every nearest-integer benchmark in sequence. Each NEAREST_INTEGER_PERF
// invocation names the log file that receives its results.
int main() {
// float16 variants: measured against the identity placeholder, since no
// system libc implementation is assumed to exist for them.
NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::ceilf16, ::placeholderf16,
FLOAT16_ROUNDS, "ceilf16_perf.log")
NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::floorf16, ::placeholderf16,
FLOAT16_ROUNDS, "floorf16_perf.log")
NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundevenf16, ::placeholderf16,
FLOAT16_ROUNDS, "roundevenf16_perf.log")
NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundf16, ::placeholderf16,
FLOAT16_ROUNDS, "roundf16_perf.log")
NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::truncf16, ::placeholderf16,
FLOAT16_ROUNDS, "truncf16_perf.log")

// float variants: measured against the global-namespace functions declared
// by <math.h>, except roundevenf, which uses the identity placeholder.
NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::ceilf, ::ceilf, FLOAT_ROUNDS,
"ceilf_perf.log")
NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::floorf, ::floorf, FLOAT_ROUNDS,
"floorf_perf.log")
NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundevenf, ::placeholderf,
FLOAT_ROUNDS, "roundevenf_perf.log")
NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundf, ::roundf, FLOAT_ROUNDS,
"roundf_perf.log")
NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::truncf, ::truncf, FLOAT_ROUNDS,
"truncf_perf.log")

return 0;
}
Loading