-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[libc] NVPTX Profiling #92009
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] NVPTX Profiling #92009
Changes from 17 commits
b6b47fb
f8291e9
1129ccc
5c46009
e50ea99
a588fc5
be303da
a41eb32
ab6b6ca
c7c8445
c857891
6073de7
9f23d21
46b5e25
945090f
4aa5e8b
b93318e
cb3b05c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#include "benchmarks/gpu/BenchmarkLogger.h" | ||
#include "src/__support/CPP/string.h" | ||
#include "src/__support/CPP/string_view.h" | ||
#include "src/__support/OSUtil/io.h" // write_to_stderr | ||
#include "src/__support/big_int.h" // is_big_int | ||
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 | ||
#include "src/__support/uint128.h" | ||
|
||
#include <stdint.h> | ||
|
||
namespace LIBC_NAMESPACE { | ||
namespace benchmarks { | ||
|
||
// cpp::string_view specialization | ||
template <> | ||
BenchmarkLogger & | ||
BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) { | ||
LIBC_NAMESPACE::write_to_stderr(str); | ||
return *this; | ||
} | ||
|
||
// cpp::string specialization | ||
template <> | ||
BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) { | ||
return *this << static_cast<cpp::string_view>(str); | ||
} | ||
|
||
// const char* specialization | ||
template <> | ||
BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) { | ||
return *this << cpp::string_view(str); | ||
} | ||
|
||
// char* specialization | ||
template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) { | ||
return *this << cpp::string_view(str); | ||
} | ||
|
||
// char specialization | ||
template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) { | ||
return *this << cpp::string_view(&ch, 1); | ||
} | ||
|
||
// bool specialization | ||
template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) { | ||
return *this << (cond ? "true" : "false"); | ||
} | ||
|
||
// void * specialization
// Prints the address through the hexadecimal formatter (with its radix
// prefix). cpp::to_string yields decimal digits, so pairing it with a literal
// "0x" would label a base-10 number as hex.
template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
  const IntegerToString<uintptr_t, radix::Hex::WithPrefix> buffer(
      reinterpret_cast<uintptr_t>(addr));
  return *this << buffer.view();
}
|
||
// Generic fallback for types without a dedicated specialization above.
// Big integers and unsigned integral types wider than uint64_t go through
// IntegerToString with the prefixed hex radix; everything else is formatted
// with cpp::to_string.
template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
  constexpr bool IS_WIDE_UNSIGNED = cpp::is_integral_v<T> &&
                                    cpp::is_unsigned_v<T> &&
                                    (sizeof(T) > sizeof(uint64_t));
  if constexpr (!(is_big_int_v<T> || IS_WIDE_UNSIGNED)) {
    return *this << cpp::to_string(t);
  } else {
    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
    const IntegerToString<T, radix::Hex::WithPrefix> hex_text(t);
    return *this << hex_text.view();
  }
}
|
||
// is_integral specializations | ||
// char is already specialized to handle character | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned char>(unsigned char); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned short>(unsigned short); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned int>(unsigned int); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned long>(unsigned long); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned long long>(unsigned long long); | ||
|
||
#ifdef LIBC_TYPES_HAS_INT128 | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); | ||
#endif // LIBC_TYPES_HAS_INT128 | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>); | ||
|
||
// TODO: Add floating point formatting once it's supported by StringStream. | ||
|
||
BenchmarkLogger log; | ||
|
||
} // namespace benchmarks | ||
} // namespace LIBC_NAMESPACE |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
//===-- Utilities to log to standard error in benchmarks --------*- C++ -*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H | ||
#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H | ||
|
||
namespace LIBC_NAMESPACE { | ||
namespace benchmarks { | ||
|
||
// A class to log to standard error in the context of hermetic benchmarks. | ||
struct BenchmarkLogger { | ||
constexpr BenchmarkLogger() = default; | ||
template <typename T> BenchmarkLogger &operator<<(T); | ||
}; | ||
|
||
// A global BenchmarkLogger instance to be used in benchmarks. | ||
extern BenchmarkLogger log; | ||
|
||
} // namespace benchmarks | ||
} // namespace LIBC_NAMESPACE | ||
|
||
#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
add_subdirectory(timing) | ||
|
||
add_custom_target(gpu-benchmark) | ||
|
||
function(add_benchmark benchmark_name) | ||
cmake_parse_arguments( | ||
"BENCHMARK" | ||
"" # Optional arguments | ||
"" # Single value arguments | ||
"LINK_LIBRARIES" # Multi-value arguments | ||
${ARGN} | ||
) | ||
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) | ||
message(FATAL_ERROR "target does not support clock") | ||
endif() | ||
add_libc_hermetic( | ||
${benchmark_name} | ||
IS_BENCHMARK | ||
LINK_LIBRARIES | ||
LibcGpuBenchmark.hermetic | ||
${BENCHMARK_LINK_LIBRARIES} | ||
${BENCHMARK_UNPARSED_ARGUMENTS} | ||
) | ||
get_fq_target_name(${benchmark_name} fq_target_name) | ||
add_dependencies(gpu-benchmark ${fq_target_name}) | ||
endfunction(add_benchmark) | ||
|
||
add_unittest_framework_library( | ||
LibcGpuBenchmark | ||
SRCS | ||
LibcGpuBenchmark.cpp | ||
LibcGpuBenchmarkMain.cpp | ||
BenchmarkLogger.cpp | ||
HDRS | ||
LibcGpuBenchmark.h | ||
BenchmarkLogger.h | ||
DEPENDS | ||
libc.src.__support.big_int | ||
libc.src.__support.c_string | ||
libc.src.__support.CPP.string | ||
libc.src.__support.CPP.string_view | ||
libc.src.__support.CPP.type_traits | ||
libc.src.__support.CPP.functional | ||
libc.src.__support.CPP.limits | ||
libc.src.__support.CPP.algorithm | ||
libc.src.__support.fixed_point.fx_rep | ||
libc.src.__support.macros.properties.types | ||
libc.src.__support.OSUtil.osutil | ||
libc.src.__support.uint128 | ||
libc.src.__support.FPUtil.sqrt | ||
libc.src.__support.fixedvector | ||
libc.src.time.clock | ||
libc.benchmarks.gpu.timing.timing | ||
) | ||
|
||
add_subdirectory(src) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
#include "LibcGpuBenchmark.h" | ||
#include "src/__support/CPP/algorithm.h" | ||
#include "src/__support/CPP/array.h" | ||
#include "src/__support/CPP/string.h" | ||
#include "src/__support/FPUtil/sqrt.h" | ||
#include "src/__support/GPU/utils.h" | ||
#include "src/__support/fixedvector.h" | ||
#include "src/time/gpu/time_utils.h" | ||
|
||
namespace LIBC_NAMESPACE { | ||
namespace benchmarks { | ||
|
||
FixedVector<Benchmark *, 64> benchmarks; | ||
cpp::array<BenchmarkResult, 1024> results; | ||
|
||
void Benchmark::add_benchmark(Benchmark *benchmark) { | ||
benchmarks.push_back(benchmark); | ||
} | ||
|
||
// Collapses the per-thread entries of `results` into one BenchmarkResult.
// Only the first gpu::get_num_threads() entries are read. cycles,
// standard_deviation, samples, total_iterations and total_time are averaged
// over those entries; min and max are the extremes found across them.
BenchmarkResult reduce_results(cpp::array<BenchmarkResult, 1024> &results) {
  BenchmarkResult reduced;
  const uint64_t thread_count = gpu::get_num_threads();

  // Running totals (and extremes) over the participating threads.
  uint64_t total_cycles = 0;
  double total_stddev = 0;
  uint64_t lowest = UINT64_MAX;
  uint64_t highest = 0;
  uint32_t total_samples = 0;
  uint32_t total_iters = 0;
  clock_t total_clock = 0;

  for (uint64_t tid = 0; tid < thread_count; ++tid) {
    const BenchmarkResult &entry = results[tid];
    total_cycles += entry.cycles;
    total_stddev += entry.standard_deviation;
    lowest = cpp::min(lowest, entry.min);
    highest = cpp::max(highest, entry.max);
    total_samples += entry.samples;
    total_iters += entry.total_iterations;
    total_clock += entry.total_time;
  }

  reduced.cycles = total_cycles / thread_count;
  reduced.standard_deviation = total_stddev / thread_count;
  reduced.min = lowest;
  reduced.max = highest;
  reduced.samples = total_samples / thread_count;
  reduced.total_iterations = total_iters / thread_count;
  reduced.total_time = total_clock / thread_count;
  return reduced;
}
|
||
void Benchmark::run_benchmarks() { | ||
uint64_t id = gpu::get_thread_id(); | ||
gpu::sync_threads(); | ||
|
||
for (Benchmark *benchmark : benchmarks) | ||
results[id] = benchmark->run(); | ||
gpu::sync_threads(); | ||
if (id == 0) { | ||
for (Benchmark *benchmark : benchmarks) { | ||
BenchmarkResult all_results = reduce_results(results); | ||
constexpr auto GREEN = "\033[32m"; | ||
constexpr auto RESET = "\033[0m"; | ||
log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n'; | ||
log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": " | ||
<< all_results.cycles << " cycles, " << all_results.min << " min, " | ||
<< all_results.max << " max, " << all_results.total_iterations | ||
<< " iterations, " << all_results.total_time << " ns, " | ||
<< static_cast<long>(all_results.standard_deviation) << " stddev\n"; | ||
} | ||
} | ||
gpu::sync_threads(); | ||
} | ||
|
||
// Repeatedly invokes `wrapper_func`, grouping calls into samples, and returns
// the collected statistics.
//
// options:      iteration/sample/duration limits and the convergence epsilon.
// wrapper_func: returns one measurement per call (presumably a cycle count --
//               confirm against the callers). The smallest of ten
//               LIBC_NAMESPACE::overhead() readings is subtracted from it.
//
// Sampling stops when the time budget (options.max_duration) is used up, when
// max_samples or max_iterations is reached, or once min_duration and
// min_samples are satisfied and the estimate changes by less than epsilon.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          cpp::function<uint64_t(void)> wrapper_func) {
  BenchmarkResult result;
  RuntimeEstimationProgression rep;
  uint32_t total_iterations = 0;
  uint32_t iterations = options.initial_iterations;
  if (iterations < 1u)
    iterations = 1;

  uint32_t samples = 0;
  uint64_t total_time = 0;
  uint64_t best_guess = 0;
  uint64_t total_cycles = 0;
  // Sum of squared per-iteration measurements; kept in double so the squares
  // cannot wrap around a 64-bit integer.
  double cycles_squared = 0.0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;

  uint64_t overhead = UINT64_MAX;
  constexpr int OVERHEAD_ITERATIONS = 10;
  for (int i = 0; i < OVERHEAD_ITERATIONS; i++)
    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());

  // The budget is signed on purpose: with an unsigned counter the
  // `time_budget >= 0` test is always true and the subtraction below wraps.
  for (int64_t time_budget = static_cast<int64_t>(options.max_duration);
       time_budget >= 0;) {
    uint64_t sample_cycles = 0;
    const clock_t start = clock();
    for (uint32_t i = 0; i < iterations; i++) {
      const uint64_t raw_cycles = wrapper_func();
      // Clamp at zero: a reading below the measured overhead would otherwise
      // wrap to a huge unsigned value and corrupt max and the sums.
      const uint64_t cycles =
          raw_cycles > overhead ? raw_cycles - overhead : 0;
      max = cpp::max(max, cycles);
      min = cpp::min(min, cycles);
      sample_cycles += cycles;
      cycles_squared +=
          static_cast<double>(cycles) * static_cast<double>(cycles);
    }
    const clock_t end = clock();
    const clock_t duration_ns =
        ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
    total_time += duration_ns;
    time_budget -= static_cast<int64_t>(duration_ns);
    samples++;
    total_cycles += sample_cycles;

    total_iterations += iterations;
    const double change_ratio =
        rep.compute_improvement({iterations, sample_cycles});
    best_guess = rep.current_estimation;

    if (samples >= options.max_samples || iterations >= options.max_iterations)
      break;
    if (total_time >= options.min_duration && samples >= options.min_samples &&
        change_ratio < options.epsilon)
      break;

    // Grow the batch only after a sample has completed (and after the exit
    // checks above), so each successive sample runs more iterations.
    iterations *= options.scaling_factor;
  }
  result.cycles = best_guess;

  // Variance as E[x^2] - E[x]^2 over individual iterations. Rounding can push
  // the difference slightly below zero, so clamp before taking the root.
  const double mean = static_cast<double>(total_cycles) / total_iterations;
  const double variance = cycles_squared / total_iterations - mean * mean;
  result.standard_deviation =
      variance > 0.0 ? fputil::sqrt<double>(variance) : 0.0;
  result.min = min;
  result.max = max;
  result.samples = samples;
  result.total_iterations = total_iterations;
  result.total_time = total_time;
  return result;
}
jhuber6 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
} // namespace benchmarks | ||
} // namespace LIBC_NAMESPACE |
Uh oh!
There was an error while loading. Please reload this page.