Skip to content

Commit 05cdb59

Browse files
overmightyaaryanshukla
authored and committed
[libc][math] Optimize generic nearest integer functions (llvm#98483)
1 parent 0ae536e commit 05cdb59

File tree

3 files changed

+208
-16
lines changed

3 files changed

+208
-16
lines changed

libc/src/__support/FPUtil/NearestIntegerOperations.h

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,17 @@ LIBC_INLINE T ceil(T x) {
7575
}
7676

7777
uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
78-
StorageType trunc_mantissa =
79-
static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
80-
bits.set_mantissa(trunc_mantissa);
81-
T trunc_value = bits.get_val();
78+
StorageType x_u = bits.uintval();
79+
StorageType trunc_u =
80+
static_cast<StorageType>((x_u >> trim_size) << trim_size);
8281

8382
// If x is already an integer, return it.
84-
if (trunc_value == x)
83+
if (trunc_u == x_u)
8584
return x;
8685

86+
bits.set_uintval(trunc_u);
87+
T trunc_value = bits.get_val();
88+
8789
// If x is negative, the ceil operation is equivalent to the trunc operation.
8890
if (is_neg)
8991
return trunc_value;
@@ -130,15 +132,17 @@ LIBC_INLINE T round(T x) {
130132
uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
131133
bool half_bit_set =
132134
bool(bits.get_mantissa() & (StorageType(1) << (trim_size - 1)));
133-
StorageType trunc_mantissa =
134-
static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
135-
bits.set_mantissa(trunc_mantissa);
136-
T trunc_value = bits.get_val();
135+
StorageType x_u = bits.uintval();
136+
StorageType trunc_u =
137+
static_cast<StorageType>((x_u >> trim_size) << trim_size);
137138

138139
// If x is already an integer, return it.
139-
if (trunc_value == x)
140+
if (trunc_u == x_u)
140141
return x;
141142

143+
bits.set_uintval(trunc_u);
144+
T trunc_value = bits.get_val();
145+
142146
if (!half_bit_set) {
143147
// Fractional part is less than 0.5 so round value is the
144148
// same as the trunc value.
@@ -188,16 +192,17 @@ round_using_specific_rounding_mode(T x, int rnd) {
188192
}
189193

190194
uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
191-
FPBits<T> new_bits = bits;
192-
StorageType trunc_mantissa =
193-
static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
194-
new_bits.set_mantissa(trunc_mantissa);
195-
T trunc_value = new_bits.get_val();
195+
StorageType x_u = bits.uintval();
196+
StorageType trunc_u =
197+
static_cast<StorageType>((x_u >> trim_size) << trim_size);
196198

197199
// If x is already an integer, return it.
198-
if (trunc_value == x)
200+
if (trunc_u == x_u)
199201
return x;
200202

203+
FPBits<T> new_bits(trunc_u);
204+
T trunc_value = new_bits.get_val();
205+
201206
StorageType trim_value =
202207
bits.get_mantissa() &
203208
static_cast<StorageType>(((StorageType(1) << trim_size) - 1));

libc/test/src/math/performance_testing/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,3 +366,22 @@ add_perf_binary(
366366
COMPILE_OPTIONS
367367
-fno-builtin
368368
)
369+
370+
# Performance-test binary covering the float and float16 variants of ceil,
# floor, roundeven, round and trunc.
add_perf_binary(
  nearest_integer_funcs_perf
  SRCS
    nearest_integer_funcs_perf.cpp
  DEPENDS
    libc.src.math.ceilf
    libc.src.math.ceilf16
    libc.src.math.floorf
    libc.src.math.floorf16
    libc.src.math.roundevenf
    libc.src.math.roundevenf16
    libc.src.math.roundf
    libc.src.math.roundf16
    libc.src.math.truncf
    libc.src.math.truncf16
  COMPILE_OPTIONS
    -fno-builtin
)
libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
//===-- Performance test for nearest integer functions --------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/FPUtil/FPBits.h"
10+
#include "src/math/ceilf.h"
11+
#include "src/math/ceilf16.h"
12+
#include "src/math/floorf.h"
13+
#include "src/math/floorf16.h"
14+
#include "src/math/roundevenf.h"
15+
#include "src/math/roundevenf16.h"
16+
#include "src/math/roundf.h"
17+
#include "src/math/roundf16.h"
18+
#include "src/math/truncf.h"
19+
#include "src/math/truncf16.h"
20+
#include "test/src/math/performance_testing/Timer.h"
21+
22+
#include <fstream>
23+
#include <math.h>
24+
25+
namespace LIBC_NAMESPACE::testing {
26+
27+
template <typename T> class NearestIntegerPerf {
28+
using FPBits = fputil::FPBits<T>;
29+
using StorageType = typename FPBits::StorageType;
30+
31+
public:
32+
typedef T Func(T);
33+
34+
static void run_perf_in_range(Func my_func, Func other_func,
35+
StorageType starting_bit,
36+
StorageType ending_bit, StorageType step,
37+
size_t rounds, std::ofstream &log) {
38+
auto runner = [=](Func func) {
39+
volatile T result;
40+
for (size_t i = 0; i < rounds; i++) {
41+
for (StorageType bits = starting_bit; bits <= ending_bit;
42+
bits += step) {
43+
T x = FPBits(bits).get_val();
44+
result = func(x);
45+
}
46+
}
47+
};
48+
49+
Timer timer;
50+
timer.start();
51+
runner(my_func);
52+
timer.stop();
53+
54+
size_t number_of_runs = (ending_bit - starting_bit) / step + 1;
55+
double my_average =
56+
static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
57+
log << "-- My function --\n";
58+
log << " Total time : " << timer.nanoseconds() << " ns \n";
59+
log << " Average runtime : " << my_average << " ns/op \n";
60+
log << " Ops per second : "
61+
<< static_cast<uint64_t>(1'000'000'000.0 / my_average) << " op/s \n";
62+
63+
timer.start();
64+
runner(other_func);
65+
timer.stop();
66+
67+
double other_average =
68+
static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
69+
log << "-- Other function --\n";
70+
log << " Total time : " << timer.nanoseconds() << " ns \n";
71+
log << " Average runtime : " << other_average << " ns/op \n";
72+
log << " Ops per second : "
73+
<< static_cast<uint64_t>(1'000'000'000.0 / other_average) << " op/s \n";
74+
75+
log << "-- Average runtime ratio --\n";
76+
log << " Mine / Other's : " << my_average / other_average << " \n";
77+
}
78+
79+
static void run_perf(Func my_func, Func other_func, size_t rounds,
80+
const char *log_file) {
81+
std::ofstream log(log_file);
82+
log << "Performance tests with inputs in normal integral range:\n";
83+
run_perf_in_range(
84+
my_func, other_func,
85+
/*starting_bit=*/StorageType((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN),
86+
/*ending_bit=*/
87+
StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN - 1)
88+
<< FPBits::SIG_LEN),
89+
/*step=*/StorageType(1 << FPBits::SIG_LEN),
90+
rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
91+
log << "\n Performance tests with inputs in low integral range:\n";
92+
run_perf_in_range(
93+
my_func, other_func,
94+
/*starting_bit=*/StorageType(1 << FPBits::SIG_LEN),
95+
/*ending_bit=*/StorageType((FPBits::EXP_BIAS - 1) << FPBits::SIG_LEN),
96+
/*step_bit=*/StorageType(1 << FPBits::SIG_LEN),
97+
rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
98+
log << "\n Performance tests with inputs in high integral range:\n";
99+
run_perf_in_range(
100+
my_func, other_func,
101+
/*starting_bit=*/
102+
StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN)
103+
<< FPBits::SIG_LEN),
104+
/*ending_bit=*/
105+
StorageType(FPBits::MAX_BIASED_EXPONENT << FPBits::SIG_LEN),
106+
/*step=*/StorageType(1 << FPBits::SIG_LEN),
107+
rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
108+
log << "\n Performance tests with inputs in normal fractional range:\n";
109+
run_perf_in_range(
110+
my_func, other_func,
111+
/*starting_bit=*/
112+
StorageType(((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN) + 1),
113+
/*ending_bit=*/
114+
StorageType(((FPBits::EXP_BIAS + 2) << FPBits::SIG_LEN) - 1),
115+
/*step=*/StorageType(1), rounds * 2, log);
116+
log << "\n Performance tests with inputs in subnormal fractional range:\n";
117+
run_perf_in_range(my_func, other_func, /*starting_bit=*/StorageType(1),
118+
/*ending_bit=*/StorageType(FPBits::SIG_MASK),
119+
/*step=*/StorageType(1), rounds, log);
120+
}
121+
};
122+
123+
} // namespace LIBC_NAMESPACE::testing
124+
125+
// Benchmarks my_func against other_func for type T through
// NearestIntegerPerf<T>::run_perf, writing the report to filename. The
// expansion is a braced block, so call sites need no trailing semicolon.
//
// run_perf is invoked twice with identical arguments. Each invocation opens
// the log file anew, so only the second pass's report is left in it.
// NOTE(review): the first pass presumably serves as a warm-up -- confirm.
#define NEAREST_INTEGER_PERF(T, my_func, other_func, rounds, filename)         \
  {                                                                            \
    for (int nearest_integer_perf_pass_ = 0; nearest_integer_perf_pass_ < 2;   \
         ++nearest_integer_perf_pass_) {                                       \
      LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::run_perf(                \
          &my_func, &other_func, rounds, filename);                            \
    }                                                                          \
  }
132+
133+
// Base round count handed to NEAREST_INTEGER_PERF for the float16 benchmarks.
static constexpr size_t FLOAT16_ROUNDS{20'000};
134+
// Base round count handed to NEAREST_INTEGER_PERF for the float benchmarks.
static constexpr size_t FLOAT_ROUNDS{40};
135+
136+
// LLVM libc might be the only libc implementation with support for float16 math
137+
// functions currently. We can't compare our float16 functions against the
138+
// system libc, so we compare them against this placeholder function.
139+
float16 placeholderf16(float16 x) {
  // Identity function: hands the argument back untouched.
  return x;
}
140+
141+
// The system libc might not provide the roundeven* C23 math functions either.
142+
float placeholderf(float x) {
  // Identity function: hands the argument back untouched.
  return x;
}
143+
144+
// Runs every nearest-integer benchmark in turn; each one writes its report to
// its own "<function>_perf.log" file.
int main() {
  // float16 functions, each compared against the identity placeholder.
  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::ceilf16, ::placeholderf16,
                       FLOAT16_ROUNDS, "ceilf16_perf.log")
  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::floorf16, ::placeholderf16,
                       FLOAT16_ROUNDS, "floorf16_perf.log")
  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundevenf16,
                       ::placeholderf16, FLOAT16_ROUNDS,
                       "roundevenf16_perf.log")
  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundf16, ::placeholderf16,
                       FLOAT16_ROUNDS, "roundf16_perf.log")
  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::truncf16, ::placeholderf16,
                       FLOAT16_ROUNDS, "truncf16_perf.log")

  // float functions, compared against the global-namespace C functions, except
  // roundevenf, which is compared against the identity placeholder.
  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::ceilf, ::ceilf, FLOAT_ROUNDS,
                       "ceilf_perf.log")
  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::floorf, ::floorf, FLOAT_ROUNDS,
                       "floorf_perf.log")
  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundevenf, ::placeholderf,
                       FLOAT_ROUNDS, "roundevenf_perf.log")
  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundf, ::roundf, FLOAT_ROUNDS,
                       "roundf_perf.log")
  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::truncf, ::truncf, FLOAT_ROUNDS,
                       "truncf_perf.log")

  return 0;
}

0 commit comments

Comments
 (0)