Skip to content

Commit 1ea0865

Browse files
authored
[Clang] Add env var for nvptx-arch/amdgpu-arch timeout (#102521)
When working on very busy systems, check-offload frequently fails many tests with this diagnostic:

```
clang: error: cannot determine amdgcn architecture: /tmp/llvm/build/bin/amdgpu-arch: Child timed out: ; consider passing it via '-march'
```

This patch accepts the environment variable `CLANG_TOOLCHAIN_PROGRAM_TIMEOUT` to set the timeout, in seconds, for the nvptx-arch/amdgpu-arch tools. It also increases the default timeout from 10 to 60 seconds.
1 parent 7ede1c4 commit 1ea0865

File tree

9 files changed

+52
-8
lines changed

9 files changed

+52
-8
lines changed

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,10 @@ def warn_drv_amdgpu_cov6: Warning<
9898
"code object v6 is still in development and not ready for production use yet;"
9999
" use at your own risk">;
100100
def err_drv_undetermined_gpu_arch : Error<
101-
"cannot determine %0 architecture: %1; consider passing it via "
102-
"'%2'">;
101+
"cannot determine %0 architecture: %1; consider passing it via '%2'; "
102+
"environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool "
103+
"timeout (integer secs, <=0 is infinite)">;
104+
103105
def warn_drv_multi_gpu_arch : Warning<
104106
"multiple %0 architectures are detected: %1; only the first one is used for "
105107
"'%2'">, InGroup<MultiGPU>;

clang/include/clang/Driver/ToolChain.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,7 @@ class ToolChain {
205205

206206
/// Executes the given \p Executable and returns the stdout.
207207
llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
208-
executeToolChainProgram(StringRef Executable,
209-
unsigned SecondsToWait = 0) const;
208+
executeToolChainProgram(StringRef Executable) const;
210209

211210
void setTripleEnvironment(llvm::Triple::EnvironmentType Env);
212211

clang/lib/Driver/ToolChain.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include "llvm/Support/FileSystem.h"
4141
#include "llvm/Support/FileUtilities.h"
4242
#include "llvm/Support/Path.h"
43+
#include "llvm/Support/Process.h"
4344
#include "llvm/Support/VersionTuple.h"
4445
#include "llvm/Support/VirtualFileSystem.h"
4546
#include "llvm/TargetParser/AArch64TargetParser.h"
@@ -104,8 +105,7 @@ ToolChain::ToolChain(const Driver &D, const llvm::Triple &T,
104105
}
105106

106107
llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
107-
ToolChain::executeToolChainProgram(StringRef Executable,
108-
unsigned SecondsToWait) const {
108+
ToolChain::executeToolChainProgram(StringRef Executable) const {
109109
llvm::SmallString<64> OutputFile;
110110
llvm::sys::fs::createTemporaryFile("toolchain-program", "txt", OutputFile);
111111
llvm::FileRemover OutputRemover(OutputFile.c_str());
@@ -116,6 +116,16 @@ ToolChain::executeToolChainProgram(StringRef Executable,
116116
};
117117

118118
std::string ErrorMessage;
119+
int SecondsToWait = 60;
120+
if (std::optional<std::string> Str =
121+
llvm::sys::Process::GetEnv("CLANG_TOOLCHAIN_PROGRAM_TIMEOUT")) {
122+
if (!llvm::to_integer(*Str, SecondsToWait))
123+
return llvm::createStringError(std::error_code(),
124+
"CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected "
125+
"an integer, got '" +
126+
*Str + "'");
127+
SecondsToWait = std::max(SecondsToWait, 0); // <=0 means infinite
128+
}
119129
if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait,
120130
/*MemoryLimit=*/0, &ErrorMessage))
121131
return llvm::createStringError(std::error_code(),

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -899,7 +899,7 @@ AMDGPUToolChain::getSystemGPUArchs(const ArgList &Args) const {
899899
else
900900
Program = GetProgramPath("amdgpu-arch");
901901

902-
auto StdoutOrErr = executeToolChainProgram(Program, /*SecondsToWait=*/10);
902+
auto StdoutOrErr = executeToolChainProgram(Program);
903903
if (!StdoutOrErr)
904904
return StdoutOrErr.takeError();
905905

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -804,7 +804,7 @@ NVPTXToolChain::getSystemGPUArchs(const ArgList &Args) const {
804804
else
805805
Program = GetProgramPath("nvptx-arch");
806806

807-
auto StdoutOrErr = executeToolChainProgram(Program, /*SecondsToWait=*/10);
807+
auto StdoutOrErr = executeToolChainProgram(Program);
808808
if (!StdoutOrErr)
809809
return StdoutOrErr.takeError();
810810

clang/test/Driver/amdgpu-hip-system-arch.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,11 @@
2929
// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib --offload-new-driver --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 -x hip %s 2>&1 \
3030
// RUN: | FileCheck %s --check-prefix=ARCH-GFX906
3131
// ARCH-GFX906: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx906"
32+
33+
// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed.
34+
// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \
35+
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib \
36+
// RUN: --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 \
37+
// RUN: -x hip %s 2>&1 | \
38+
// RUN: FileCheck %s --check-prefix=BAD-TIMEOUT
39+
// BAD-TIMEOUT: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)

clang/test/Driver/nvptx-cuda-system-arch.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,11 @@
4242
// RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda %s 2>&1 | FileCheck %s --check-prefix=MARCH-sm_89
4343
// MARCH-sm_89: warning: multiple nvptx64 architectures are detected: sm_89, sm_80; only the first one is used for '-march' [-Wmulti-gpu]
4444
// MARCH-sm_89: "-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}}"-target-cpu" "sm_89"
45+
46+
// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed.
47+
// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \
48+
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib \
49+
// RUN: --offload-arch=native --nvptx-arch-tool=%t/nvptx_arch_sm_70 \
50+
// RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda -x cuda %s 2>&1 | \
51+
// RUN: FileCheck %s --check-prefix=BAD-TIMEOUT
52+
// BAD-TIMEOUT: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)

clang/test/Driver/openmp-system-arch.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,19 @@
7575
// RUN: -fopenmp-targets=amdgcn-amd-amdhsa --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
7676
// RUN: | FileCheck %s --check-prefix=AMDGPU
7777
// AMDGPU: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '-march'
78+
79+
// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for nvptx-arch.
80+
// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT=foo \
81+
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
82+
// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib \
83+
// RUN: --nvptx-arch-tool=%t/nvptx_arch_sm_70 %s 2>&1 | \
84+
// RUN: FileCheck %s --check-prefix=BAD-TIMEOUT-NVPTX
85+
// BAD-TIMEOUT-NVPTX: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)
86+
87+
// case when CLANG_TOOLCHAIN_PROGRAM_TIMEOUT is malformed for amdgpu-arch.
88+
// RUN: env CLANG_TOOLCHAIN_PROGRAM_TIMEOUT= \
89+
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
90+
// RUN: -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib \
91+
// RUN: --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 %s 2>&1 | \
92+
// RUN: FileCheck %s --check-prefix=BAD-TIMEOUT-AMDGPU
93+
// BAD-TIMEOUT-AMDGPU: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got ''; consider passing it via '-march'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite)

llvm/utils/lit/lit/TestingConfig.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def fromdefaults(litConfig):
2626
"SYSTEMROOT",
2727
"TERM",
2828
"CLANG",
29+
"CLANG_TOOLCHAIN_PROGRAM_TIMEOUT",
2930
"LLDB",
3031
"LD_PRELOAD",
3132
"LLVM_SYMBOLIZER_PATH",

0 commit comments

Comments
 (0)