Skip to content

Commit aae059e

Browse files
committed
[LinkerWrapper] Pass all files to the device linker
Summary: The linker wrapper's job is to extract embedded device code from fat binaries and create linked images that can then be embedded and executed. In order to support LTO, we originally reinvented all of the LTO handling that `ld.lld` normally does. Primarily, this was because `nvlink` didn't support this at all, and we have special hacks required for offloading languages interacting with archive libraries. Now since I wrote #96561 we should be able to pass all the inputs to the device linker transparently. This has the advantage of allowing the `clang` Driver to do its own handling. Primarily, this will be used to implicitly pass libraries to the device link job to make it more consistent with other toolchains. The JIT support is a notable departure, however there is an option called `--lto-emit-llvm` that performs the exact function where we want the final link job to output LLVM-IR that we can then embed instead. This patch does not fully delete the LTO handling, primarily because I think the SPIR-V people might want it. To see only the relevant patches, ignore the first commit of the nvlink-wrapper. Depends on #96561.
1 parent c747300 commit aae059e

File tree

3 files changed

+68
-46
lines changed

3 files changed

+68
-46
lines changed

clang/test/Driver/linker-wrapper-libs.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ int bar() { return weak; }
4848
// RUN: --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
4949
// RUN: | FileCheck %s --check-prefix=LIBRARY-RESOLVES
5050

51-
// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
51+
// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
5252
// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
5353

5454
//
@@ -72,7 +72,7 @@ int bar() { return weak; }
7272
// RUN: --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
7373
// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL
7474

75-
// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
75+
// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
7676
// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
7777

7878
//
@@ -96,7 +96,7 @@ int bar() { return weak; }
9696
// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-NONE
9797

9898
// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
99-
// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
99+
// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
100100

101101
//
102102
// Check that we do not extract an external weak symbol.
@@ -161,7 +161,7 @@ int bar() { return weak; }
161161
// RUN: --linker-path=/usr/bin/ld %t.o %t.a %t.a -o a.out 2>&1 \
162162
// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-DEFINED
163163

164-
// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
164+
// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
165165
// LIBRARY-GLOBAL-DEFINED-NOT: {{.*}}gfx1030{{.*}}.o
166166
// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
167167

@@ -185,7 +185,7 @@ int bar() { return weak; }
185185
// RUN: --linker-path=/usr/bin/ld %t.o --whole-archive %t.a -o a.out 2>&1 \
186186
// RUN: | FileCheck %s --check-prefix=LIBRARY-WHOLE-ARCHIVE
187187

188-
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
188+
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
189189
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
190-
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_52 {{.*}}.s
190+
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_52 {{.*}}.o
191191
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a {{.*}}.o

clang/test/Driver/linker-wrapper.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ __attribute__((visibility("protected"), used)) int x;
4848
// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \
4949
// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS
5050

51-
// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined {{.*}}.s -save-temps
51+
// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined {{.*}}.o -save-temps
5252

5353
// RUN: clang-offload-packager -o %t.out \
5454
// RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
@@ -147,7 +147,7 @@ __attribute__((visibility("protected"), used)) int x;
147147
// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --clang-backend \
148148
// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CLANG-BACKEND
149149

150-
// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.bc
150+
// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o
151151

152152
// RUN: clang-offload-packager -o %t.out \
153153
// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 60 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,13 @@ Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) {
293293
return *Path;
294294
}
295295

296+
/// We will defer LTO to the target's linker if we are not doing JIT and it is
297+
/// supported by the toolchain.
298+
bool linkerSupportsLTO(const ArgList &Args) {
299+
llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
300+
return Triple.isNVPTX() || Triple.isAMDGPU();
301+
}
302+
296303
/// Returns the hashed value for a constant string.
297304
std::string getHash(StringRef Str) {
298305
llvm::MD5 Hasher;
@@ -555,18 +562,23 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
555562
llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
556563
}
557564

558-
// Pass on -mllvm options to the clang invocation.
559-
for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
560-
CmdArgs.push_back("-mllvm");
561-
CmdArgs.push_back(Arg->getValue());
562-
}
565+
// Pass on -mllvm options to the linker invocation.
566+
for (const opt::Arg *Arg : Args.filtered(OPT_mllvm))
567+
CmdArgs.push_back(
568+
Args.MakeArgString("-Wl,-mllvm=" + StringRef(Arg->getValue())));
563569

564570
if (Args.hasArg(OPT_debug))
565571
CmdArgs.push_back("-g");
566572

567573
if (SaveTemps)
568574
CmdArgs.push_back("-save-temps");
569575

576+
if (SaveTemps && linkerSupportsLTO(Args))
577+
CmdArgs.push_back("-Wl,--save-temps");
578+
579+
if (Args.hasArg(OPT_embed_bitcode))
580+
CmdArgs.push_back("-Wl,--lto-emit-llvm");
581+
570582
if (Verbose)
571583
CmdArgs.push_back("-v");
572584

@@ -587,8 +599,8 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
587599
Args.MakeArgString(Arg.split('=').second)});
588600
}
589601

590-
// The OpenMPOpt pass can introduce new calls and is expensive, we do not want
591-
// this when running CodeGen through clang.
602+
// The OpenMPOpt pass can introduce new calls and is expensive, we do
603+
// not want this when running CodeGen through clang.
592604
if (Args.hasArg(OPT_clang_backend) || Args.hasArg(OPT_builtin_bitcode_EQ))
593605
CmdArgs.append({"-mllvm", "-openmp-opt-disable"});
594606

@@ -772,8 +784,9 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
772784
BumpPtrAllocator Alloc;
773785
StringSaver Saver(Alloc);
774786

775-
// Search for bitcode files in the input and create an LTO input file. If it
776-
// is not a bitcode file, scan its symbol table for symbols we need to save.
787+
// Search for bitcode files in the input and create an LTO input file. If
788+
// it is not a bitcode file, scan its symbol table for symbols we need to
789+
// save.
777790
for (OffloadFile &File : InputFiles) {
778791
MemoryBufferRef Buffer = MemoryBufferRef(File.getBinary()->getImage(), "");
779792

@@ -807,7 +820,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
807820
if (!Name)
808821
return Name.takeError();
809822

810-
// Record if we've seen these symbols in any object or shared libraries.
823+
// Record if we've seen these symbols in any object or shared
824+
// libraries.
811825
if ((*ObjFile)->isRelocatableObject())
812826
UsedInRegularObj.insert(Saver.save(*Name));
813827
else
@@ -844,17 +858,18 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
844858
return false;
845859
};
846860

847-
// We assume visibility of the whole program if every input file was bitcode.
861+
// We assume visibility of the whole program if every input file was
862+
// bitcode.
848863
auto Features = getTargetFeatures(BitcodeInputFiles);
849864
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
850865
Args.hasArg(OPT_builtin_bitcode_EQ) ||
851866
Args.hasArg(OPT_clang_backend)
852867
? createLTO(Args, Features, OutputBitcode)
853868
: createLTO(Args, Features);
854869

855-
// We need to resolve the symbols so the LTO backend knows which symbols need
856-
// to be kept or can be internalized. This is a simplified symbol resolution
857-
// scheme to approximate the full resolution a linker would do.
870+
// We need to resolve the symbols so the LTO backend knows which symbols
871+
// need to be kept or can be internalized. This is a simplified symbol
872+
// resolution scheme to approximate the full resolution a linker would do.
858873
uint64_t Idx = 0;
859874
DenseSet<StringRef> PrevailingSymbols;
860875
for (auto &BitcodeInput : BitcodeInputFiles) {
@@ -886,7 +901,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
886901
// We need LTO to preseve the following global symbols:
887902
// 1) Symbols used in regular objects.
888903
// 2) Sections that will be given a __start/__stop symbol.
889-
// 3) Prevailing symbols that are needed visible to external libraries.
904+
// 3) Prevailing symbols that are needed visible to external
905+
// libraries.
890906
Res.VisibleToRegularObj =
891907
UsedInRegularObj.contains(Sym.getName()) ||
892908
isValidCIdentifier(Sym.getSectionName()) ||
@@ -901,9 +917,9 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
901917
(UsedInSharedLib.contains(Sym.getName()) ||
902918
!Sym.canBeOmittedFromSymbolTable());
903919

904-
// The final definition will reside in this linkage unit if the symbol is
905-
// defined and local to the module. This only checks for bitcode files,
906-
// full assertion will require complete symbol resolution.
920+
// The final definition will reside in this linkage unit if the symbol
921+
// is defined and local to the module. This only checks for bitcode
922+
// files, full assertion will require complete symbol resolution.
907923
Res.FinalDefinitionInLinkageUnit =
908924
Sym.getVisibility() != GlobalValue::DefaultVisibility &&
909925
(!Sym.isUndefined() && !Sym.isCommon());
@@ -956,8 +972,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
956972
return Error::success();
957973
}
958974

959-
// Append the new inputs to the device linker input. If the user requested an
960-
// internalizing link we need to pass the bitcode to clang.
975+
// Append the new inputs to the device linker input. If the user requested
976+
// an internalizing link we need to pass the bitcode to clang.
961977
for (StringRef File :
962978
Args.hasArg(OPT_clang_backend) || Args.hasArg(OPT_builtin_bitcode_EQ)
963979
? BitcodeOutput
@@ -972,10 +988,9 @@ Expected<StringRef> writeOffloadFile(const OffloadFile &File) {
972988

973989
StringRef Prefix =
974990
sys::path::stem(Binary.getMemoryBufferRef().getBufferIdentifier());
975-
StringRef Suffix = getImageKindName(Binary.getImageKind());
976991

977992
auto TempFileOrErr = createOutputFile(
978-
Prefix + "-" + Binary.getTriple() + "-" + Binary.getArch(), Suffix);
993+
Prefix + "-" + Binary.getTriple() + "-" + Binary.getArch(), "o");
979994
if (!TempFileOrErr)
980995
return TempFileOrErr.takeError();
981996

@@ -1188,8 +1203,8 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
11881203
DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_triple_EQ),
11891204
Args.MakeArgString(Input.front().getBinary()->getTriple()));
11901205

1191-
// If every input file is bitcode we have whole program visibility as we do
1192-
// only support static linking with bitcode.
1206+
// If every input file is bitcode we have whole program visibility as we
1207+
// do only support static linking with bitcode.
11931208
auto ContainsBitcode = [](const OffloadFile &F) {
11941209
return identify_magic(F.getBinary()->getImage()) == file_magic::bitcode;
11951210
};
@@ -1277,12 +1292,15 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
12771292
if (File.getBinary()->getOffloadKind() != OFK_None)
12781293
ActiveOffloadKinds.insert(File.getBinary()->getOffloadKind());
12791294

1280-
// First link and remove all the input files containing bitcode.
1295+
// First link and remove all the input files containing bitcode if
1296+
// the target linker does not support it natively.
12811297
SmallVector<StringRef> InputFiles;
1282-
if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
1283-
return Err;
1298+
if (!linkerSupportsLTO(LinkerArgs))
1299+
if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
1300+
return Err;
12841301

1285-
// Write any remaining device inputs to an output file for the linker.
1302+
// Write any remaining device inputs to an output file for the
1303+
// linker.
12861304
for (const OffloadFile &File : Input) {
12871305
auto FileNameOrErr = writeOffloadFile(File);
12881306
if (!FileNameOrErr)
@@ -1291,9 +1309,10 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
12911309
}
12921310

12931311
// Link the remaining device files using the device linker.
1294-
auto OutputOrErr = !Args.hasArg(OPT_embed_bitcode)
1295-
? linkDevice(InputFiles, LinkerArgs)
1296-
: InputFiles.front();
1312+
auto OutputOrErr =
1313+
!Args.hasArg(OPT_embed_bitcode) || linkerSupportsLTO(LinkerArgs)
1314+
? linkDevice(InputFiles, LinkerArgs)
1315+
: InputFiles.front();
12971316
if (!OutputOrErr)
12981317
return OutputOrErr.takeError();
12991318

@@ -1420,12 +1439,14 @@ Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer, OffloadKind Kind,
14201439
bool NewSymbol = Syms.count(Sym.getName()) == 0;
14211440
auto OldSym = NewSymbol ? Sym_None : Syms[Sym.getName()];
14221441

1423-
// We will extract if it defines a currenlty undefined non-weak symbol.
1442+
// We will extract if it defines a currenlty undefined non-weak
1443+
// symbol.
14241444
bool ResolvesStrongReference =
14251445
((OldSym & Sym_Undefined && !(OldSym & Sym_Weak)) &&
14261446
!Sym.isUndefined());
1427-
// We will extract if it defines a new global symbol visible to the host.
1428-
// This is only necessary for code targeting an offloading language.
1447+
// We will extract if it defines a new global symbol visible to the
1448+
// host. This is only necessary for code targeting an offloading
1449+
// language.
14291450
bool NewGlobalSymbol =
14301451
((NewSymbol || (OldSym & Sym_Undefined)) && !Sym.isUndefined() &&
14311452
!Sym.canBeOmittedFromSymbolTable() && Kind != object::OFK_None &&
@@ -1480,8 +1501,9 @@ Expected<bool> getSymbolsFromObject(const ObjectFile &Obj, OffloadKind Kind,
14801501
!(OldSym & Sym_Weak) &&
14811502
!(*FlagsOrErr & SymbolRef::SF_Undefined);
14821503

1483-
// We will extract if it defines a new global symbol visible to the host.
1484-
// This is only necessary for code targeting an offloading language.
1504+
// We will extract if it defines a new global symbol visible to the
1505+
// host. This is only necessary for code targeting an offloading
1506+
// language.
14851507
bool NewGlobalSymbol =
14861508
((NewSymbol || (OldSym & Sym_Undefined)) &&
14871509
!(*FlagsOrErr & SymbolRef::SF_Undefined) && Kind != object::OFK_None &&
@@ -1648,8 +1670,8 @@ getDeviceInput(const ArgList &Args) {
16481670

16491671
Expected<bool> ExtractOrErr =
16501672
getSymbols(Binary.getBinary()->getImage(),
1651-
Binary.getBinary()->getOffloadKind(), /*IsArchive=*/true,
1652-
Saver, Syms[ID]);
1673+
Binary.getBinary()->getOffloadKind(),
1674+
/*IsArchive=*/true, Saver, Syms[ID]);
16531675
if (!ExtractOrErr)
16541676
return ExtractOrErr.takeError();
16551677

0 commit comments

Comments
 (0)