Skip to content

Commit 50547c3

Browse files
committed
[LinkerWrapper] Pass all files to the device linker
Summary: The linker wrapper's job is to extract embedded device code from fat binaries and create linked images that can then be embedded and executed. In order to support LTO, we originally reinvented all of the LTO handling that `ld.lld` normally does. Primarily, this was because `nvlink` didn't support this at all, and we have special hacks required for offloading languages interacting with archive libraries. Now since I wrote llvm#96561 we should be able to pass all the inputs to the device linker transparently. This has the advantage of allowing the `clang` Driver to do its own handling. Primarily, this will be used to implicitly pass libraries to the device link job to make it more consistent with other toolchains. The JIT support is a notable departure, however there is an option called `--lto-emit-llvm` that performs the exact function where we want the final link job to output LLVM-IR that we can then embed instead. This patch does not fully delete the LTO handling, primarily because I think the SPIR-V people might want it. To see only the relevant patches, ignore the first commit of the nvlink-wrapper. Depends on llvm#96561.
1 parent 2d3957a commit 50547c3

File tree

3 files changed

+68
-46
lines changed

3 files changed

+68
-46
lines changed

clang/test/Driver/linker-wrapper-libs.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ int bar() { return weak; }
4848
// RUN: --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
4949
// RUN: | FileCheck %s --check-prefix=LIBRARY-RESOLVES
5050

51-
// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
51+
// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
5252
// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
5353

5454
//
@@ -72,7 +72,7 @@ int bar() { return weak; }
7272
// RUN: --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
7373
// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL
7474

75-
// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
75+
// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
7676
// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
7777

7878
//
@@ -96,7 +96,7 @@ int bar() { return weak; }
9696
// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-NONE
9797

9898
// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
99-
// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
99+
// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
100100

101101
//
102102
// Check that we do not extract an external weak symbol.
@@ -161,7 +161,7 @@ int bar() { return weak; }
161161
// RUN: --linker-path=/usr/bin/ld %t.o %t.a %t.a -o a.out 2>&1 \
162162
// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-DEFINED
163163

164-
// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
164+
// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
165165
// LIBRARY-GLOBAL-DEFINED-NOT: {{.*}}gfx1030{{.*}}.o
166166
// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
167167

@@ -185,7 +185,7 @@ int bar() { return weak; }
185185
// RUN: --linker-path=/usr/bin/ld %t.o --whole-archive %t.a -o a.out 2>&1 \
186186
// RUN: | FileCheck %s --check-prefix=LIBRARY-WHOLE-ARCHIVE
187187

188-
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.s {{.*}}.o
188+
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
189189
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
190-
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_52 {{.*}}.s
190+
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_52 {{.*}}.o
191191
// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a {{.*}}.o

clang/test/Driver/linker-wrapper.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ __attribute__((visibility("protected"), used)) int x;
4848
// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \
4949
// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS
5050

51-
// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined {{.*}}.s -save-temps
51+
// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined {{.*}}.o -save-temps
5252

5353
// RUN: clang-offload-packager -o %t.out \
5454
// RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
@@ -147,7 +147,7 @@ __attribute__((visibility("protected"), used)) int x;
147147
// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --clang-backend \
148148
// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CLANG-BACKEND
149149

150-
// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.bc
150+
// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o
151151

152152
// RUN: clang-offload-packager -o %t.out \
153153
// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 60 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,13 @@ Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) {
242242
return *Path;
243243
}
244244

245+
/// We will defer LTO to the target's linker if we are not doing JIT and it is
246+
/// supported by the toolchain.
247+
bool linkerSupportsLTO(const ArgList &Args) {
248+
llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
249+
return Triple.isNVPTX() || Triple.isAMDGPU();
250+
}
251+
245252
/// Returns the hashed value for a constant string.
246253
std::string getHash(StringRef Str) {
247254
llvm::MD5 Hasher;
@@ -504,18 +511,23 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
504511
llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
505512
}
506513

507-
// Pass on -mllvm options to the clang invocation.
508-
for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
509-
CmdArgs.push_back("-mllvm");
510-
CmdArgs.push_back(Arg->getValue());
511-
}
514+
// Pass on -mllvm options to the linker invocation.
515+
for (const opt::Arg *Arg : Args.filtered(OPT_mllvm))
516+
CmdArgs.push_back(
517+
Args.MakeArgString("-Wl,-mllvm=" + StringRef(Arg->getValue())));
512518

513519
if (Args.hasArg(OPT_debug))
514520
CmdArgs.push_back("-g");
515521

516522
if (SaveTemps)
517523
CmdArgs.push_back("-save-temps");
518524

525+
if (SaveTemps && linkerSupportsLTO(Args))
526+
CmdArgs.push_back("-Wl,--save-temps");
527+
528+
if (Args.hasArg(OPT_embed_bitcode))
529+
CmdArgs.push_back("-Wl,--lto-emit-llvm");
530+
519531
if (Verbose)
520532
CmdArgs.push_back("-v");
521533

@@ -536,8 +548,8 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
536548
Args.MakeArgString(Arg.split('=').second)});
537549
}
538550

539-
// The OpenMPOpt pass can introduce new calls and is expensive, we do not want
540-
// this when running CodeGen through clang.
551+
// The OpenMPOpt pass can introduce new calls and is expensive, we do
552+
// not want this when running CodeGen through clang.
541553
if (Args.hasArg(OPT_clang_backend) || Args.hasArg(OPT_builtin_bitcode_EQ))
542554
CmdArgs.append({"-mllvm", "-openmp-opt-disable"});
543555

@@ -703,8 +715,9 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
703715
BumpPtrAllocator Alloc;
704716
StringSaver Saver(Alloc);
705717

706-
// Search for bitcode files in the input and create an LTO input file. If it
707-
// is not a bitcode file, scan its symbol table for symbols we need to save.
718+
// Search for bitcode files in the input and create an LTO input file. If
719+
// it is not a bitcode file, scan its symbol table for symbols we need to
720+
// save.
708721
for (OffloadFile &File : InputFiles) {
709722
MemoryBufferRef Buffer = MemoryBufferRef(File.getBinary()->getImage(), "");
710723

@@ -738,7 +751,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
738751
if (!Name)
739752
return Name.takeError();
740753

741-
// Record if we've seen these symbols in any object or shared libraries.
754+
// Record if we've seen these symbols in any object or shared
755+
// libraries.
742756
if ((*ObjFile)->isRelocatableObject())
743757
UsedInRegularObj.insert(Saver.save(*Name));
744758
else
@@ -775,17 +789,18 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
775789
return false;
776790
};
777791

778-
// We assume visibility of the whole program if every input file was bitcode.
792+
// We assume visibility of the whole program if every input file was
793+
// bitcode.
779794
auto Features = getTargetFeatures(BitcodeInputFiles);
780795
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
781796
Args.hasArg(OPT_builtin_bitcode_EQ) ||
782797
Args.hasArg(OPT_clang_backend)
783798
? createLTO(Args, Features, OutputBitcode)
784799
: createLTO(Args, Features);
785800

786-
// We need to resolve the symbols so the LTO backend knows which symbols need
787-
// to be kept or can be internalized. This is a simplified symbol resolution
788-
// scheme to approximate the full resolution a linker would do.
801+
// We need to resolve the symbols so the LTO backend knows which symbols
802+
// need to be kept or can be internalized. This is a simplified symbol
803+
// resolution scheme to approximate the full resolution a linker would do.
789804
uint64_t Idx = 0;
790805
DenseSet<StringRef> PrevailingSymbols;
791806
for (auto &BitcodeInput : BitcodeInputFiles) {
@@ -817,7 +832,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
817832
// We need LTO to preseve the following global symbols:
818833
// 1) Symbols used in regular objects.
819834
// 2) Sections that will be given a __start/__stop symbol.
820-
// 3) Prevailing symbols that are needed visible to external libraries.
835+
// 3) Prevailing symbols that are needed visible to external
836+
// libraries.
821837
Res.VisibleToRegularObj =
822838
UsedInRegularObj.contains(Sym.getName()) ||
823839
isValidCIdentifier(Sym.getSectionName()) ||
@@ -832,9 +848,9 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
832848
(UsedInSharedLib.contains(Sym.getName()) ||
833849
!Sym.canBeOmittedFromSymbolTable());
834850

835-
// The final definition will reside in this linkage unit if the symbol is
836-
// defined and local to the module. This only checks for bitcode files,
837-
// full assertion will require complete symbol resolution.
851+
// The final definition will reside in this linkage unit if the symbol
852+
// is defined and local to the module. This only checks for bitcode
853+
// files, full assertion will require complete symbol resolution.
838854
Res.FinalDefinitionInLinkageUnit =
839855
Sym.getVisibility() != GlobalValue::DefaultVisibility &&
840856
(!Sym.isUndefined() && !Sym.isCommon());
@@ -887,8 +903,8 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
887903
return Error::success();
888904
}
889905

890-
// Append the new inputs to the device linker input. If the user requested an
891-
// internalizing link we need to pass the bitcode to clang.
906+
// Append the new inputs to the device linker input. If the user requested
907+
// an internalizing link we need to pass the bitcode to clang.
892908
for (StringRef File :
893909
Args.hasArg(OPT_clang_backend) || Args.hasArg(OPT_builtin_bitcode_EQ)
894910
? BitcodeOutput
@@ -903,10 +919,9 @@ Expected<StringRef> writeOffloadFile(const OffloadFile &File) {
903919

904920
StringRef Prefix =
905921
sys::path::stem(Binary.getMemoryBufferRef().getBufferIdentifier());
906-
StringRef Suffix = getImageKindName(Binary.getImageKind());
907922

908923
auto TempFileOrErr = createOutputFile(
909-
Prefix + "-" + Binary.getTriple() + "-" + Binary.getArch(), Suffix);
924+
Prefix + "-" + Binary.getTriple() + "-" + Binary.getArch(), "o");
910925
if (!TempFileOrErr)
911926
return TempFileOrErr.takeError();
912927

@@ -1119,8 +1134,8 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
11191134
DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_triple_EQ),
11201135
Args.MakeArgString(Input.front().getBinary()->getTriple()));
11211136

1122-
// If every input file is bitcode we have whole program visibility as we do
1123-
// only support static linking with bitcode.
1137+
// If every input file is bitcode we have whole program visibility as we
1138+
// do only support static linking with bitcode.
11241139
auto ContainsBitcode = [](const OffloadFile &F) {
11251140
return identify_magic(F.getBinary()->getImage()) == file_magic::bitcode;
11261141
};
@@ -1208,12 +1223,15 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
12081223
if (File.getBinary()->getOffloadKind() != OFK_None)
12091224
ActiveOffloadKinds.insert(File.getBinary()->getOffloadKind());
12101225

1211-
// First link and remove all the input files containing bitcode.
1226+
// First link and remove all the input files containing bitcode if
1227+
// the target linker does not support it natively.
12121228
SmallVector<StringRef> InputFiles;
1213-
if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
1214-
return Err;
1229+
if (!linkerSupportsLTO(LinkerArgs))
1230+
if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
1231+
return Err;
12151232

1216-
// Write any remaining device inputs to an output file for the linker.
1233+
// Write any remaining device inputs to an output file for the
1234+
// linker.
12171235
for (const OffloadFile &File : Input) {
12181236
auto FileNameOrErr = writeOffloadFile(File);
12191237
if (!FileNameOrErr)
@@ -1222,9 +1240,10 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
12221240
}
12231241

12241242
// Link the remaining device files using the device linker.
1225-
auto OutputOrErr = !Args.hasArg(OPT_embed_bitcode)
1226-
? linkDevice(InputFiles, LinkerArgs)
1227-
: InputFiles.front();
1243+
auto OutputOrErr =
1244+
!Args.hasArg(OPT_embed_bitcode) || linkerSupportsLTO(LinkerArgs)
1245+
? linkDevice(InputFiles, LinkerArgs)
1246+
: InputFiles.front();
12281247
if (!OutputOrErr)
12291248
return OutputOrErr.takeError();
12301249

@@ -1351,12 +1370,14 @@ Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer, OffloadKind Kind,
13511370
bool NewSymbol = Syms.count(Sym.getName()) == 0;
13521371
auto OldSym = NewSymbol ? Sym_None : Syms[Sym.getName()];
13531372

1354-
// We will extract if it defines a currenlty undefined non-weak symbol.
1373+
// We will extract if it defines a currenlty undefined non-weak
1374+
// symbol.
13551375
bool ResolvesStrongReference =
13561376
((OldSym & Sym_Undefined && !(OldSym & Sym_Weak)) &&
13571377
!Sym.isUndefined());
1358-
// We will extract if it defines a new global symbol visible to the host.
1359-
// This is only necessary for code targeting an offloading language.
1378+
// We will extract if it defines a new global symbol visible to the
1379+
// host. This is only necessary for code targeting an offloading
1380+
// language.
13601381
bool NewGlobalSymbol =
13611382
((NewSymbol || (OldSym & Sym_Undefined)) && !Sym.isUndefined() &&
13621383
!Sym.canBeOmittedFromSymbolTable() && Kind != object::OFK_None &&
@@ -1411,8 +1432,9 @@ Expected<bool> getSymbolsFromObject(const ObjectFile &Obj, OffloadKind Kind,
14111432
!(OldSym & Sym_Weak) &&
14121433
!(*FlagsOrErr & SymbolRef::SF_Undefined);
14131434

1414-
// We will extract if it defines a new global symbol visible to the host.
1415-
// This is only necessary for code targeting an offloading language.
1435+
// We will extract if it defines a new global symbol visible to the
1436+
// host. This is only necessary for code targeting an offloading
1437+
// language.
14161438
bool NewGlobalSymbol =
14171439
((NewSymbol || (OldSym & Sym_Undefined)) &&
14181440
!(*FlagsOrErr & SymbolRef::SF_Undefined) && Kind != object::OFK_None &&
@@ -1579,8 +1601,8 @@ getDeviceInput(const ArgList &Args) {
15791601

15801602
Expected<bool> ExtractOrErr =
15811603
getSymbols(Binary.getBinary()->getImage(),
1582-
Binary.getBinary()->getOffloadKind(), /*IsArchive=*/true,
1583-
Saver, Syms[ID]);
1604+
Binary.getBinary()->getOffloadKind(),
1605+
/*IsArchive=*/true, Saver, Syms[ID]);
15841606
if (!ExtractOrErr)
15851607
return ExtractOrErr.takeError();
15861608

0 commit comments

Comments
 (0)