llvm · jhuber6 · Mar 21, 2025 · Mar 20, 2025
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
@@ -59,13 +59,6 @@ set(LIBC_NAMESPACE ${default_namespace}
   CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'."
 )
 
-# We will build the GPU utilities if we are not doing a runtimes build.
-option(LIBC_BUILD_GPU_LOADER "Always build the GPU loader utilities" OFF)
-if(LIBC_BUILD_GPU_LOADER OR ((NOT LLVM_RUNTIMES_BUILD) AND LLVM_LIBC_GPU_BUILD))
-  add_subdirectory(utils/gpu)
-  return()
-endif()
-
 option(LIBC_CMAKE_VERBOSE_LOGGING
   "Log details warnings and notifications during CMake configuration." OFF)
 

diff --git a/libc/src/__support/RPC/rpc_server.h b/libc/src/__support/RPC/rpc_server.h
@@ -20,6 +20,11 @@
 #define __has_builtin(x) 0
 #endif
 
+// Stub __builtin_is_constant_evaluated where __has_builtin reports it missing.
+#if !__has_builtin(__builtin_is_constant_evaluated)
+#define __builtin_is_constant_evaluated() 0
+#endif
+
 // Configs for using the LLVM libc writer interface.
 #define LIBC_COPT_USE_C_ASSERT
 #define LIBC_COPT_MEMCPY_USE_EMBEDDED_TINY
@@ -28,7 +33,7 @@
 #define LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
 #define LIBC_COPT_PRINTF_DISABLE_STRERROR
 
-// The 'long double' type is 8 byte
+// The 'long double' type is 8 bytes.
 #define LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64
 
 #include "shared/rpc.h"

diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
@@ -210,10 +210,6 @@ if("${LIBC_TARGET_TRIPLE}" STREQUAL "amdgcn-amd-amdhsa" OR
    "${LIBC_TARGET_TRIPLE}" STREQUAL "nvptx64-nvidia-cuda")
   set(LLVM_LIBC_GPU_BUILD ON)
 endif()
-if (NOT "libc" IN_LIST LLVM_ENABLE_PROJECTS AND LLVM_LIBC_GPU_BUILD)
-  message(STATUS "Enabling libc project to build libc testing tools")
-  list(APPEND LLVM_ENABLE_PROJECTS "libc")
-endif()
 
 # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the
 # `LLVM_ENABLE_PROJECTS` CMake cache variable.  This exists for

diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
@@ -534,20 +534,6 @@ if(build_runtimes)
   endif()
   if(LLVM_LIBC_GPU_BUILD)
     list(APPEND extra_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON")
-    if("libc" IN_LIST RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES)
-      if(TARGET amdhsa-loader)
-        list(APPEND extra_cmake_args
-             "-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:amdhsa-loader>")
-        list(APPEND extra_deps amdhsa-loader)
-      endif()
-    endif()
-    if("libc" IN_LIST RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES)
-      if(TARGET nvptx-loader)
-        list(APPEND extra_cmake_args
-             "-DRUNTIMES_nvptx64-nvidia-cuda_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:nvptx-loader>")
-        list(APPEND extra_deps nvptx-loader)
-      endif()
-    endif()
     if(TARGET clang-offload-packager)
       list(APPEND extra_deps clang-offload-packager)
     endif()

diff --git a/llvm/tools/CMakeLists.txt b/llvm/tools/CMakeLists.txt
@@ -9,6 +9,10 @@
 # traversing each directory.
 create_llvm_tool_options()
 
+if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE) # Turn the llvm-gpu-loader tool option off for non-GCC-compatible host compilers.
+  set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF) # NOTE(review): presumably because its libc RPC headers need GCC-style builtins; confirm.
+endif()
+
 if(NOT LLVM_BUILD_LLVM_DYLIB AND NOT LLVM_BUILD_LLVM_C_DYLIB)
   set(LLVM_TOOL_LLVM_SHLIB_BUILD Off)
 endif()

diff --git a/llvm/tools/llvm-gpu-loader/CMakeLists.txt b/llvm/tools/llvm-gpu-loader/CMakeLists.txt
@@ -0,0 +1,51 @@
+set(LLVM_LINK_COMPONENTS
+  BinaryFormat
+  Object
+  Option
+  Support
+  FrontendOffloading
+  TargetParser
+)
+
+add_llvm_tool(llvm-gpu-loader
+  llvm-gpu-loader.cpp
+
+  # The backend sources (amdhsa.cpp, nvptx.cpp) are attached further down only
+  # when their runtime is found; presumably why PARTIAL_SOURCES_INTENDED is set.
+  # TODO: We intentionally split this currently due to statically linking the
+  #       GPU runtimes. Dynamically load the dependencies, possibly using the
+  #       LLVM offloading API when it is complete.
+  PARTIAL_SOURCES_INTENDED
+
+  DEPENDS
+  intrinsics_gen
+)
+
+# Locate the RPC server handling interface. llvm-gpu-loader is an executable,
+# so nothing consumes its usage requirements; link PRIVATE like the backends.
+include(FindLibcCommonUtils)
+target_link_libraries(llvm-gpu-loader PRIVATE llvm-libc-common-utilities)
+
+# Check for HSA support for targeting AMD GPUs. The backend is optional: it is
+# compiled in (and AMDHSA_SUPPORT defined) only when the runtime is found.
+find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+if(hsa-runtime64_FOUND)
+  target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp)
+  target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT)
+  target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64)
+
+  # Compatibility with the old amdhsa-loader name.
+  add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader)
+endif()
+
+# Check for CUDA support for targeting NVIDIA GPUs. The backend is optional: it
+# is compiled in (and NVPTX_SUPPORT defined) only when the toolkit is found.
+find_package(CUDAToolkit 11.2 QUIET)
+if(CUDAToolkit_FOUND)
+  target_sources(llvm-gpu-loader PRIVATE nvptx.cpp)
+  target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT)
+  target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver)
+
+  # Compatibility with the old nvptx-loader name.
+  add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader)
+endif()
diff --git a/...utils/gpu/loader/amdgpu/amdhsa-loader.cpp → llvm/tools/llvm-gpu-loader/amdhsa.cpp b/...utils/gpu/loader/amdgpu/amdhsa-loader.cpp → llvm/tools/llvm-gpu-loader/amdhsa.cpp
@@ -13,7 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Loader.h"
+#include "llvm-gpu-loader.h"
+#include "server.h"
 
 #include "hsa/hsa.h"
 #include "hsa/hsa_ext_amd.h"
@@ -260,9 +261,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
         // Register RPC callbacks for the malloc and free functions on HSA.
         auto malloc_handler = [&](size_t size) -> void * {
           void *dev_ptr = nullptr;
-          if (hsa_status_t err =
-                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
-                                               /*flags=*/0, &dev_ptr))
+          if (hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
+                                           /*flags=*/0, &dev_ptr))
             dev_ptr = nullptr;
           hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
           return dev_ptr;
@@ -330,9 +330,9 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
   return HSA_STATUS_SUCCESS;
 }
 
-int load(int argc, const char **argv, const char **envp, void *image,
-         size_t size, const LaunchParameters &params,
-         bool print_resource_usage) {
+int load_amdhsa(int argc, const char **argv, const char **envp, void *image,
+                size_t size, const LaunchParameters &params,
+                bool print_resource_usage) {
   // Initialize the HSA runtime used to communicate with the device.
   if (hsa_status_t err = hsa_init())
     handle_error(err);

diff --git a/libc/utils/gpu/loader/Main.cpp → ...tools/llvm-gpu-loader/llvm-gpu-loader.cpp b/libc/utils/gpu/loader/Main.cpp → ...tools/llvm-gpu-loader/llvm-gpu-loader.cpp
@@ -6,21 +6,25 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file opens a device image passed on the command line and passes it to
-// one of the loader implementations for launch.
+// This utility is used to launch standard programs onto the GPU in conjunction
+// with the LLVM 'libc' project. It is designed to mimic a standard emulator
+// workflow, allowing for unit tests to be run on the GPU directly.
 //
 //===----------------------------------------------------------------------===//
 
-#include "Loader.h"
+#include "llvm-gpu-loader.h"
 
 #include "llvm/BinaryFormat/Magic.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/WithColor.h"
+#include "llvm/TargetParser/Triple.h"
 
 #include <cerrno>
 #include <cstdio>
@@ -67,12 +71,6 @@ static cl::opt<bool>
                          cl::desc("Output resource usage of launched kernels"),
                          cl::init(false), cl::cat(loader_category));
 
-static cl::opt<bool>
-    no_parallelism("no-parallelism",
-                   cl::desc("Allows only a single process to use the GPU at a "
-                            "time. Useful to suppress out-of-resource errors"),
-                   cl::init(false), cl::cat(loader_category));
-
 static cl::opt<std::string> file(cl::Positional, cl::Required,
                                  cl::desc("<gpu executable>"),
                                  cl::cat(loader_category));
@@ -115,27 +113,42 @@ int main(int argc, const char **argv, const char **envp) {
   llvm::transform(args, std::back_inserter(new_argv),
                   [](const std::string &arg) { return arg.c_str(); });
 
-  // Claim a file lock on the executable so only a single process can enter this
-  // region if requested. This prevents the loader from spurious failures.
-  int fd = -1;
-  if (no_parallelism) {
-    fd = open(get_main_executable(argv[0]).c_str(), O_RDONLY);
-    if (flock(fd, LOCK_EX) == -1)
-      report_error(createStringError("Failed to lock '%s': %s", argv[0],
-                                     strerror(errno)));
-  }
-
-  // Drop the loader from the program arguments.
-  LaunchParameters params{threads_x, threads_y, threads_z,
-                          blocks_x,  blocks_y,  blocks_z};
-  int ret = load(new_argv.size(), new_argv.data(), envp,
-                 const_cast<char *>(image.getBufferStart()),
-                 image.getBufferSize(), params, print_resource_usage);
-
-  if (no_parallelism) {
-    if (flock(fd, LOCK_UN) == -1)
-      report_error(createStringError("Failed to unlock '%s': %s", argv[0],
-                                     strerror(errno)));
+  Expected<llvm::object::ELF64LEObjectFile> elf_or_err =
+      llvm::object::ELF64LEObjectFile::create(image);
+  if (!elf_or_err)
+    report_error(elf_or_err.takeError());
+
+  // Pick the backend from the image's ELF architecture. A backend exists only
+  // if its runtime was found at configure time (AMDHSA_SUPPORT/NVPTX_SUPPORT).
+  // Assumes getArchTypeName() text is NUL-terminated, as '%s' requires.
+  const Triple::ArchType arch = elf_or_err->getArch();
+  const char *arch_name = Triple::getArchTypeName(arch).data();
+
+  int ret = 1;
+  if (arch == Triple::amdgcn) {
+#ifdef AMDHSA_SUPPORT
+    LaunchParameters params{threads_x, threads_y, threads_z,
+                            blocks_x,  blocks_y,  blocks_z};
+
+    ret = load_amdhsa(new_argv.size(), new_argv.data(), envp,
+                      const_cast<char *>(image.getBufferStart()),
+                      image.getBufferSize(), params, print_resource_usage);
+#else
+    report_error(createStringError("No %s support in this build", arch_name));
+#endif
+  } else if (arch == Triple::nvptx64) {
+#ifdef NVPTX_SUPPORT
+    LaunchParameters params{threads_x, threads_y, threads_z,
+                            blocks_x,  blocks_y,  blocks_z};
+
+    ret = load_nvptx(new_argv.size(), new_argv.data(), envp,
+                     const_cast<char *>(image.getBufferStart()),
+                     image.getBufferSize(), params, print_resource_usage);
+#else
+    report_error(createStringError("No %s support in this build", arch_name));
+#endif
+  } else {
+    report_error(createStringError("Unsupported architecture; %s", arch_name));
   }
 
   return ret;