Skip to content

Reapply "[LLVM] Make the GPU loader utilities an LLVM tool (#132096)" #132277

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions libc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,6 @@ set(LIBC_NAMESPACE ${default_namespace}
CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'."
)

# We will build the GPU utilities if we are not doing a runtimes build.
option(LIBC_BUILD_GPU_LOADER "Always build the GPU loader utilities" OFF)
if(LIBC_BUILD_GPU_LOADER OR ((NOT LLVM_RUNTIMES_BUILD) AND LLVM_LIBC_GPU_BUILD))
add_subdirectory(utils/gpu)
return()
endif()

option(LIBC_CMAKE_VERBOSE_LOGGING
"Log details warnings and notifications during CMake configuration." OFF)

Expand Down
7 changes: 6 additions & 1 deletion libc/src/__support/RPC/rpc_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@
#define __has_builtin(x) 0
#endif

// Workaround for missing __builtin_is_constant_evaluated in < GCC 10.
// A compiler builtin is not a preprocessor macro, so testing its name with
// `#ifndef` is always true and would replace the real builtin with the stub on
// every compiler. Ask the compiler through __has_builtin instead (which is
// given a fallback definition of 0 above when the compiler lacks it), so the
// stub is only installed when the builtin is genuinely unavailable.
#if !__has_builtin(__builtin_is_constant_evaluated)
#define __builtin_is_constant_evaluated() 0
#endif

// Configs for using the LLVM libc writer interface.
#define LIBC_COPT_USE_C_ASSERT
#define LIBC_COPT_MEMCPY_USE_EMBEDDED_TINY
Expand All @@ -28,7 +33,7 @@
#define LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
#define LIBC_COPT_PRINTF_DISABLE_STRERROR

// The 'long double' type is 8 byte
// The 'long double' type is 8 bytes.
#define LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64

#include "shared/rpc.h"
Expand Down
1 change: 0 additions & 1 deletion libc/utils/gpu/CMakeLists.txt

This file was deleted.

54 changes: 0 additions & 54 deletions libc/utils/gpu/loader/CMakeLists.txt

This file was deleted.

10 changes: 0 additions & 10 deletions libc/utils/gpu/loader/amdgpu/CMakeLists.txt

This file was deleted.

9 changes: 0 additions & 9 deletions libc/utils/gpu/loader/nvptx/CMakeLists.txt

This file was deleted.

4 changes: 0 additions & 4 deletions llvm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -210,10 +210,6 @@ if("${LIBC_TARGET_TRIPLE}" STREQUAL "amdgcn-amd-amdhsa" OR
"${LIBC_TARGET_TRIPLE}" STREQUAL "nvptx64-nvidia-cuda")
set(LLVM_LIBC_GPU_BUILD ON)
endif()
if (NOT "libc" IN_LIST LLVM_ENABLE_PROJECTS AND LLVM_LIBC_GPU_BUILD)
message(STATUS "Enabling libc project to build libc testing tools")
list(APPEND LLVM_ENABLE_PROJECTS "libc")
endif()

# LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the
# `LLVM_ENABLE_PROJECTS` CMake cache variable. This exists for
Expand Down
14 changes: 0 additions & 14 deletions llvm/runtimes/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -534,20 +534,6 @@ if(build_runtimes)
endif()
if(LLVM_LIBC_GPU_BUILD)
list(APPEND extra_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON")
if("libc" IN_LIST RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES)
if(TARGET amdhsa-loader)
list(APPEND extra_cmake_args
"-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:amdhsa-loader>")
list(APPEND extra_deps amdhsa-loader)
endif()
endif()
if("libc" IN_LIST RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES)
if(TARGET nvptx-loader)
list(APPEND extra_cmake_args
"-DRUNTIMES_nvptx64-nvidia-cuda_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:nvptx-loader>")
list(APPEND extra_deps nvptx-loader)
endif()
endif()
if(TARGET clang-offload-packager)
list(APPEND extra_deps clang-offload-packager)
endif()
Expand Down
4 changes: 4 additions & 0 deletions llvm/tools/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
# traversing each directory.
create_llvm_tool_options()

# Force LLVM_TOOL_LLVM_GPU_LOADER_BUILD off when the host compiler is not
# GCC-compatible.
# NOTE(review): presumably the llvm-gpu-loader sources rely on GCC/Clang
# extensions and cannot be compiled otherwise -- confirm.
if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE)
set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF)
endif()

if(NOT LLVM_BUILD_LLVM_DYLIB AND NOT LLVM_BUILD_LLVM_C_DYLIB)
set(LLVM_TOOL_LLVM_SHLIB_BUILD Off)
endif()
Expand Down
46 changes: 46 additions & 0 deletions llvm/tools/llvm-gpu-loader/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Build rules for the llvm-gpu-loader tool. The AMD (HSA) and NVIDIA (CUDA)
# backends are optional: each one is compiled in only when its runtime package
# is found, and is announced to the sources through a compile definition.

# LLVM components the tool links against.
set(LLVM_LINK_COMPONENTS
  BinaryFormat
  Object
  Option
  Support
  FrontendOffloading
  TargetParser
  )

add_llvm_tool(llvm-gpu-loader
  llvm-gpu-loader.cpp

  # TODO: We intentionally split this currently due to statically linking the
  #       GPU runtimes. Dynamically load the dependencies, possibly using the
  #       LLVM offloading API when it is complete.
  PARTIAL_SOURCES_INTENDED

  DEPENDS
  intrinsics_gen
  )

# Locate the RPC server handling interface. It is linked PRIVATE, like every
# other dependency of this executable, since nothing consumes the tool's
# usage requirements.
include(FindLibcCommonUtils)
target_link_libraries(llvm-gpu-loader PRIVATE llvm-libc-common-utilities)

# Check for HSA support for targeting AMD GPUs. The package is optional, so the
# lookup is QUIET and the backend is simply omitted when it is not found.
find_package(hsa-runtime64 1.2.0 QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
if(hsa-runtime64_FOUND)
  target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp)
  target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT)
  target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64)

  # Compatibility with the old amdhsa-loader name.
  add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader)
endif()

# Check for CUDA support for targeting NVIDIA GPUs. Optional in the same way as
# the HSA backend above.
find_package(CUDAToolkit 11.2 QUIET)
if(CUDAToolkit_FOUND)
  target_sources(llvm-gpu-loader PRIVATE nvptx.cpp)
  target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT)
  target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver)

  # Compatibility with the old nvptx-loader name.
  add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
//
//===----------------------------------------------------------------------===//

#include "Loader.h"
#include "llvm-gpu-loader.h"
#include "server.h"

#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
Expand Down Expand Up @@ -260,9 +261,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
// Register RPC callbacks for the malloc and free functions on HSA.
auto malloc_handler = [&](size_t size) -> void * {
void *dev_ptr = nullptr;
if (hsa_status_t err =
hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
/*flags=*/0, &dev_ptr))
if (hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
/*flags=*/0, &dev_ptr))
dev_ptr = nullptr;
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
return dev_ptr;
Expand Down Expand Up @@ -330,9 +330,9 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
return HSA_STATUS_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
size_t size, const LaunchParameters &params,
bool print_resource_usage) {
int load_amdhsa(int argc, const char **argv, const char **envp, void *image,
size_t size, const LaunchParameters &params,
bool print_resource_usage) {
// Initialize the HSA runtime used to communicate with the device.
if (hsa_status_t err = hsa_init())
handle_error(err);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,25 @@
//
//===----------------------------------------------------------------------===//
//
// This file opens a device image passed on the command line and passes it to
// one of the loader implementations for launch.
// This utility is used to launch standard programs onto the GPU in conjunction
// with the LLVM 'libc' project. It is designed to mimic a standard emulator
// workflow, allowing for unit tests to be run on the GPU directly.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"
#include "llvm-gpu-loader.h"

#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/WithColor.h"
#include "llvm/TargetParser/Triple.h"

#include <cerrno>
#include <cstdio>
Expand Down Expand Up @@ -67,12 +71,6 @@ static cl::opt<bool>
cl::desc("Output resource usage of launched kernels"),
cl::init(false), cl::cat(loader_category));

static cl::opt<bool>
no_parallelism("no-parallelism",
cl::desc("Allows only a single process to use the GPU at a "
"time. Useful to suppress out-of-resource errors"),
cl::init(false), cl::cat(loader_category));

static cl::opt<std::string> file(cl::Positional, cl::Required,
cl::desc("<gpu executable>"),
cl::cat(loader_category));
Expand Down Expand Up @@ -115,27 +113,42 @@ int main(int argc, const char **argv, const char **envp) {
llvm::transform(args, std::back_inserter(new_argv),
[](const std::string &arg) { return arg.c_str(); });

// Claim a file lock on the executable so only a single process can enter this
// region if requested. This prevents the loader from spurious failures.
int fd = -1;
if (no_parallelism) {
fd = open(get_main_executable(argv[0]).c_str(), O_RDONLY);
if (flock(fd, LOCK_EX) == -1)
report_error(createStringError("Failed to lock '%s': %s", argv[0],
strerror(errno)));
}

// Drop the loader from the program arguments.
LaunchParameters params{threads_x, threads_y, threads_z,
blocks_x, blocks_y, blocks_z};
int ret = load(new_argv.size(), new_argv.data(), envp,
const_cast<char *>(image.getBufferStart()),
image.getBufferSize(), params, print_resource_usage);

if (no_parallelism) {
if (flock(fd, LOCK_UN) == -1)
report_error(createStringError("Failed to unlock '%s': %s", argv[0],
strerror(errno)));
Expected<llvm::object::ELF64LEObjectFile> elf_or_err =
llvm::object::ELF64LEObjectFile::create(image);
if (!elf_or_err)
report_error(elf_or_err.takeError());

int ret = 1;
if (elf_or_err->getArch() == Triple::amdgcn) {
#ifdef AMDHSA_SUPPORT
LaunchParameters params{threads_x, threads_y, threads_z,
blocks_x, blocks_y, blocks_z};

ret = load_amdhsa(new_argv.size(), new_argv.data(), envp,
const_cast<char *>(image.getBufferStart()),
image.getBufferSize(), params, print_resource_usage);
#else
report_error(createStringError(
"Unsupported architecture; %s",
Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
#endif
} else if (elf_or_err->getArch() == Triple::nvptx64) {
#ifdef NVPTX_SUPPORT
LaunchParameters params{threads_x, threads_y, threads_z,
blocks_x, blocks_y, blocks_z};

ret = load_nvptx(new_argv.size(), new_argv.data(), envp,
const_cast<char *>(image.getBufferStart()),
image.getBufferSize(), params, print_resource_usage);
#else
report_error(createStringError(
"Unsupported architecture; %s",
Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
#endif
} else {
report_error(createStringError(
"Unsupported architecture; %s",
Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
}

return ret;
Expand Down
Loading
Loading