Skip to content

Commit fcd8ff3

Browse files
jameshu15869AlexisPerry
authored andcommitted
[libc] NVPTX Profiling (llvm#92009)
PR for adding microbenchmarking infrastructure for NVPTX. `nvlink` cannot perform LTO, so we cannot inline `libc` functions and this function call overhead is not adjusted for during microbenchmarking.
1 parent 69fb40d commit fcd8ff3

17 files changed

+644
-11
lines changed

libc/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -401,9 +401,7 @@ if(LLVM_INCLUDE_TESTS)
401401
add_subdirectory(fuzzing)
402402
endif()
403403

404-
if(LIBC_INCLUDE_BENCHMARKS)
405-
add_subdirectory(benchmarks)
406-
endif()
404+
add_subdirectory(benchmarks)
407405

408406
if (LIBC_INCLUDE_DOCS)
409407
add_subdirectory(docs)

libc/benchmarks/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
if(LIBC_TARGET_OS_IS_GPU)
2+
add_subdirectory(gpu)
3+
return()
4+
endif()
5+
6+
# The CPU build depends on Google benchmark.
7+
if(NOT LIBC_INCLUDE_BENCHMARKS)
8+
return()
9+
endif()
10+
111
find_package(Threads)
212

313
set(LLVM_LINK_COMPONENTS
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#include "benchmarks/gpu/BenchmarkLogger.h"
2+
#include "src/__support/CPP/string.h"
3+
#include "src/__support/CPP/string_view.h"
4+
#include "src/__support/OSUtil/io.h" // write_to_stderr
5+
#include "src/__support/big_int.h" // is_big_int
6+
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
7+
#include "src/__support/uint128.h"
8+
9+
#include <stdint.h>
10+
11+
namespace LIBC_NAMESPACE {
12+
namespace benchmarks {
13+
14+
// cpp::string_view specialization
15+
template <>
16+
BenchmarkLogger &
17+
BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
18+
LIBC_NAMESPACE::write_to_stderr(str);
19+
return *this;
20+
}
21+
22+
// cpp::string specialization
23+
template <>
24+
BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
25+
return *this << static_cast<cpp::string_view>(str);
26+
}
27+
28+
// const char* specialization
29+
template <>
30+
BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
31+
return *this << cpp::string_view(str);
32+
}
33+
34+
// char* specialization
35+
template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
36+
return *this << cpp::string_view(str);
37+
}
38+
39+
// char specialization
40+
template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
41+
return *this << cpp::string_view(&ch, 1);
42+
}
43+
44+
// bool specialization
45+
template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
46+
return *this << (cond ? "true" : "false");
47+
}
48+
49+
// void * specialization
50+
template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
51+
return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr));
52+
}
53+
54+
template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
55+
if constexpr (is_big_int_v<T> ||
56+
(cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
57+
(sizeof(T) > sizeof(uint64_t)))) {
58+
static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
59+
const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
60+
return *this << buffer.view();
61+
} else {
62+
return *this << cpp::to_string(t);
63+
}
64+
}
65+
66+
// is_integral specializations
67+
// char is already specialized to handle character
68+
template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
69+
template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
70+
template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
71+
template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
72+
template BenchmarkLogger &
73+
BenchmarkLogger::operator<< <unsigned char>(unsigned char);
74+
template BenchmarkLogger &
75+
BenchmarkLogger::operator<< <unsigned short>(unsigned short);
76+
template BenchmarkLogger &
77+
BenchmarkLogger::operator<< <unsigned int>(unsigned int);
78+
template BenchmarkLogger &
79+
BenchmarkLogger::operator<< <unsigned long>(unsigned long);
80+
template BenchmarkLogger &
81+
BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
82+
83+
#ifdef LIBC_TYPES_HAS_INT128
84+
template BenchmarkLogger &
85+
BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
86+
#endif // LIBC_TYPES_HAS_INT128
87+
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
88+
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
89+
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
90+
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
91+
92+
// TODO: Add floating point formatting once it's supported by StringStream.
93+
94+
BenchmarkLogger log;
95+
96+
} // namespace benchmarks
97+
} // namespace LIBC_NAMESPACE

libc/benchmarks/gpu/BenchmarkLogger.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
//===-- Utilities to log to stderr during GPU benchmarks --------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
10+
#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
11+
12+
namespace LIBC_NAMESPACE {
namespace benchmarks {

// Stream-style logger used by the GPU benchmark harness to report results.
// Only the insertion operator is declared here; its specializations and
// explicit instantiations are provided out of line.
struct BenchmarkLogger {
  constexpr BenchmarkLogger() = default;

  // Logs `value` and returns this logger so insertions can be chained.
  template <typename T> BenchmarkLogger &operator<<(T value);
};

// A global BenchmarkLogger instance to be used in benchmarks.
extern BenchmarkLogger log;

} // namespace benchmarks
} // namespace LIBC_NAMESPACE
26+
27+
#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
add_subdirectory(timing)
2+
3+
add_custom_target(gpu-benchmark)
4+
5+
# Registers a hermetic GPU benchmark and attaches it to the aggregate
# `gpu-benchmark` target.
#
# Usage:
#   add_benchmark(<name>
#     [LINK_LIBRARIES <lib>...]
#     [<any further arguments, forwarded verbatim to add_libc_hermetic>...])
function(add_benchmark benchmark_name)
  # The benchmark harness times itself with clock(), so refuse to configure a
  # benchmark for a target that does not provide that entrypoint.
  if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
    message(FATAL_ERROR "target does not support clock")
  endif()

  # Only LINK_LIBRARIES is interpreted here; everything else lands in
  # BENCHMARK_UNPARSED_ARGUMENTS and is passed through untouched.
  cmake_parse_arguments("BENCHMARK" "" "" "LINK_LIBRARIES" ${ARGN})

  add_libc_hermetic(
    ${benchmark_name}
    IS_BENCHMARK
    LINK_LIBRARIES
      LibcGpuBenchmark.hermetic
      ${BENCHMARK_LINK_LIBRARIES}
    ${BENCHMARK_UNPARSED_ARGUMENTS}
  )

  # Building `gpu-benchmark` builds every registered benchmark.
  get_fq_target_name(${benchmark_name} fq_target_name)
  add_dependencies(gpu-benchmark ${fq_target_name})
endfunction()
27+
28+
# Support library behind every GPU benchmark: add_benchmark() links each
# benchmark against LibcGpuBenchmark.hermetic.
add_unittest_framework_library(
  LibcGpuBenchmark
  SRCS
    LibcGpuBenchmark.cpp
    LibcGpuBenchmarkMain.cpp
    BenchmarkLogger.cpp
  HDRS
    LibcGpuBenchmark.h
    BenchmarkLogger.h
  DEPENDS
    # Internal libc support code.
    libc.src.__support.big_int
    libc.src.__support.c_string
    libc.src.__support.CPP.string
    libc.src.__support.CPP.string_view
    libc.src.__support.CPP.type_traits
    libc.src.__support.CPP.functional
    libc.src.__support.CPP.limits
    libc.src.__support.CPP.algorithm
    libc.src.__support.fixed_point.fx_rep
    libc.src.__support.macros.properties.types
    libc.src.__support.OSUtil.osutil
    libc.src.__support.uint128
    libc.src.__support.FPUtil.sqrt
    libc.src.__support.fixedvector
    # Timing: the clock() entrypoint plus the GPU timing helpers.
    libc.src.time.clock
    libc.benchmarks.gpu.timing.timing
)
55+
56+
add_subdirectory(src)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#include "LibcGpuBenchmark.h"
2+
#include "src/__support/CPP/algorithm.h"
3+
#include "src/__support/CPP/array.h"
4+
#include "src/__support/CPP/string.h"
5+
#include "src/__support/FPUtil/sqrt.h"
6+
#include "src/__support/GPU/utils.h"
7+
#include "src/__support/fixedvector.h"
8+
#include "src/time/gpu/time_utils.h"
9+
10+
namespace LIBC_NAMESPACE {
11+
namespace benchmarks {
12+
13+
FixedVector<Benchmark *, 64> benchmarks;
14+
cpp::array<BenchmarkResult, 1024> results;
15+
16+
void Benchmark::add_benchmark(Benchmark *benchmark) {
17+
benchmarks.push_back(benchmark);
18+
}
19+
20+
// Collapses the per-thread measurements in `results` into one summary.
//
// The first gpu::get_num_threads() entries are consumed. Cycles, standard
// deviation, samples, iterations and time are averaged over the threads;
// min and max are the extremes seen by any thread.
//
// NOTE(review): nothing here bounds get_num_threads() by the array size
// (1024) - confirm the launch configuration never exceeds it.
BenchmarkResult reduce_results(cpp::array<BenchmarkResult, 1024> &results) {
  const uint64_t num_threads = gpu::get_num_threads();

  uint64_t cycles_sum = 0;
  double standard_deviation_sum = 0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;
  // Accumulate in 64 bits: summing 32-bit per-thread counters across many
  // threads can exceed UINT32_MAX and would silently wrap before the division.
  uint64_t samples_sum = 0;
  uint64_t iterations_sum = 0;
  clock_t time_sum = 0;

  for (uint64_t i = 0; i < num_threads; i++) {
    const BenchmarkResult &current_result = results[i];
    cycles_sum += current_result.cycles;
    standard_deviation_sum += current_result.standard_deviation;
    min = cpp::min(min, current_result.min);
    max = cpp::max(max, current_result.max);
    samples_sum += current_result.samples;
    iterations_sum += current_result.total_iterations;
    time_sum += current_result.total_time;
  }

  BenchmarkResult result;
  result.cycles = cycles_sum / num_threads;
  result.standard_deviation = standard_deviation_sum / num_threads;
  result.min = min;
  result.max = max;
  result.samples = samples_sum / num_threads;
  result.total_iterations = iterations_sum / num_threads;
  result.total_time = time_sum / num_threads;
  return result;
}
49+
50+
void Benchmark::run_benchmarks() {
51+
uint64_t id = gpu::get_thread_id();
52+
gpu::sync_threads();
53+
54+
for (Benchmark *benchmark : benchmarks)
55+
results[id] = benchmark->run();
56+
gpu::sync_threads();
57+
if (id == 0) {
58+
for (Benchmark *benchmark : benchmarks) {
59+
BenchmarkResult all_results = reduce_results(results);
60+
constexpr auto GREEN = "\033[32m";
61+
constexpr auto RESET = "\033[0m";
62+
log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n';
63+
log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": "
64+
<< all_results.cycles << " cycles, " << all_results.min << " min, "
65+
<< all_results.max << " max, " << all_results.total_iterations
66+
<< " iterations, " << all_results.total_time << " ns, "
67+
<< static_cast<long>(all_results.standard_deviation) << " stddev\n";
68+
}
69+
}
70+
gpu::sync_threads();
71+
}
72+
73+
// Repeatedly times `wrapper_func` and returns the aggregated measurement.
//
// `wrapper_func` returns the cycle count of one invocation of the code under
// test. The fixed measurement overhead (the minimum of ten calls to
// LIBC_NAMESPACE::overhead()) is subtracted from every reading. Samples of
// `iterations` calls are taken, with `iterations` scaled by
// options.scaling_factor after each sample, until one of these holds:
//   * the time budget options.max_duration is used up,
//   * options.max_samples or options.max_iterations is reached, or
//   * options.min_duration and options.min_samples are satisfied and the
//     estimate changed by less than options.epsilon.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          cpp::function<uint64_t(void)> wrapper_func) {
  BenchmarkResult result;
  RuntimeEstimationProgression rep;
  uint32_t total_iterations = 0;
  uint32_t iterations = options.initial_iterations;
  if (iterations < 1u)
    iterations = 1;

  uint32_t samples = 0;
  uint64_t total_time = 0;
  uint64_t best_guess = 0;
  uint64_t total_cycles = 0;
  // Sum of squared per-iteration cycle counts, kept in floating point so that
  // it cannot wrap around.
  double cycles_squared = 0.0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;

  uint64_t overhead = UINT64_MAX;
  constexpr int OVERHEAD_ITERATIONS = 10;
  for (int i = 0; i < OVERHEAD_ITERATIONS; i++)
    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());

  // The budget has to be signed: an unsigned counter is always >= 0 and wraps
  // around on subtraction, so the loop could never end on the budget.
  for (int64_t time_budget = static_cast<int64_t>(options.max_duration);
       time_budget >= 0;) {
    uint64_t sample_cycles = 0;
    const clock_t start = clock();
    for (uint32_t i = 0; i < iterations; i++) {
      const uint64_t measured = wrapper_func();
      // Saturate at zero: a reading below the overhead would otherwise wrap
      // to a huge value and corrupt max and the running sums.
      const uint64_t cycles = measured > overhead ? measured - overhead : 0;
      max = cpp::max(max, cycles);
      min = cpp::min(min, cycles);
      sample_cycles += cycles;
      cycles_squared += static_cast<double>(cycles) * static_cast<double>(cycles);
    }
    const clock_t end = clock();
    const clock_t duration_ns =
        ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
    total_time += duration_ns;
    time_budget -= duration_ns;
    samples++;
    total_cycles += sample_cycles;

    total_iterations += iterations;
    const double change_ratio =
        rep.compute_improvement({iterations, sample_cycles});
    best_guess = rep.current_estimation;

    if (samples >= options.max_samples || iterations >= options.max_iterations)
      break;
    if (total_time >= options.min_duration && samples >= options.min_samples &&
        change_ratio < options.epsilon)
      break;

    iterations *= options.scaling_factor;
  }

  // Var[X] = E[X^2] - E[X]^2 over all timed iterations. Rounding can push the
  // difference slightly below zero, so clamp before taking the square root.
  const double mean_cycles =
      static_cast<double>(total_cycles) / total_iterations;
  const double mean_square = cycles_squared / total_iterations;
  const double difference = mean_square - mean_cycles * mean_cycles;
  const double variance = difference > 0.0 ? difference : 0.0;

  result.cycles = best_guess;
  result.standard_deviation = fputil::sqrt<double>(variance);
  result.min = min;
  result.max = max;
  result.samples = samples;
  result.total_iterations = total_iterations;
  result.total_time = total_time;
  return result;
}
138+
139+
} // namespace benchmarks
140+
} // namespace LIBC_NAMESPACE

0 commit comments

Comments
 (0)