Skip to content

Commit f02b1cc

Browse files
[ASTWriter] Detect more non-affecting FileIDs to reduce source location duplication (llvm#112015)
Currently, any FileID that references a module map file that was required for a compilation is considered as affecting. This misses an important opportunity to reduce the source location space taken by the resulting PCM. In particular, consider the situation where the same module map file is passed multiple times in the dependency chain: ```shell $ clang -fmodule-map-file=foo.modulemap ... -o mod1.pcm $ clang -fmodule-map-file=foo.modulemap -fmodule-file=mod1.pcm ... -o mod2.pcm ... $ clang -fmodule-map-file=foo.modulemap -fmodule-file=mod$((N-1)).pcm ... -o mod$N.pcm ``` Because `foo.modulemap` is read before reading any of the `.pcm` files, we have to create a unique `FileID` for it when creating each module. However, when reading the `.pcm` files, we will reuse the `FileID` loaded from it for the same module map file and the `FileID` we created can never be used again, but we will still mark it as affecting and it will take the source location space in the output PCM. For a chain of N dependencies, this results in the file taking `N * (size of file)` source location space, which could be significant. For examples, we observer internally that some targets that run out of 2GB of source location space end up wasting up to 20% of that space in module maps as described above. I take extra care to still write the InputFile entries for those files that occupied source location space before. It is required for correctness of clang-scan-deps.
1 parent 984bca9 commit f02b1cc

File tree

4 files changed

+129
-11
lines changed

4 files changed

+129
-11
lines changed

clang/include/clang/Serialization/ASTWriter.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,9 @@ class ASTWriter : public ASTDeserializationListener,
496496

497497
/// Mapping from a source location entry to whether it is affecting or not.
498498
llvm::BitVector IsSLocAffecting;
499+
/// Mapping from a source location entry to whether it must be included as
500+
/// input file.
501+
llvm::BitVector IsSLocFileEntryAffecting;
499502

500503
/// Mapping from \c FileID to an index into the FileID adjustment table.
501504
std::vector<FileID> NonAffectingFileIDs;

clang/lib/Serialization/ASTReader.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5866,6 +5866,12 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
58665866
}
58675867

58685868
CurrentModule->Kind = Kind;
5869+
// Note that we may be rewriting an existing location and it is important
5870+
// to keep doing that. In particular, we would like to prefer a
5871+
// `DefinitionLoc` loaded from the module file instead of the location
5872+
// created in the current source manager, because it allows the new
5873+
// location to be marked as "unaffecting" when writing and avoid creating
5874+
// duplicate locations for the same module map file.
58695875
CurrentModule->DefinitionLoc = DefinitionLoc;
58705876
CurrentModule->Signature = F.Signature;
58715877
CurrentModule->IsFromModuleFile = true;

clang/lib/Serialization/ASTWriter.cpp

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "clang/AST/TypeLocVisitor.h"
3939
#include "clang/Basic/Diagnostic.h"
4040
#include "clang/Basic/DiagnosticOptions.h"
41+
#include "clang/Basic/FileEntry.h"
4142
#include "clang/Basic/FileManager.h"
4243
#include "clang/Basic/FileSystemOptions.h"
4344
#include "clang/Basic/IdentifierTable.h"
@@ -81,6 +82,7 @@
8182
#include "llvm/ADT/APSInt.h"
8283
#include "llvm/ADT/ArrayRef.h"
8384
#include "llvm/ADT/DenseMap.h"
85+
#include "llvm/ADT/DenseSet.h"
8486
#include "llvm/ADT/Hashing.h"
8587
#include "llvm/ADT/PointerIntPair.h"
8688
#include "llvm/ADT/STLExtras.h"
@@ -166,18 +168,25 @@ static TypeCode getTypeCodeForTypeClass(Type::TypeClass id) {
166168

167169
namespace {
168170

169-
std::optional<std::set<const FileEntry *>>
171+
struct AffectingModuleMaps {
172+
llvm::DenseSet<FileID> DefinitionFileIDs;
173+
llvm::DenseSet<const FileEntry *> DefinitionFiles;
174+
};
175+
176+
std::optional<AffectingModuleMaps>
170177
GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
171178
if (!PP.getHeaderSearchInfo()
172179
.getHeaderSearchOpts()
173180
.ModulesPruneNonAffectingModuleMaps)
174181
return std::nullopt;
175182

176183
const HeaderSearch &HS = PP.getHeaderSearchInfo();
184+
const SourceManager &SM = PP.getSourceManager();
177185
const ModuleMap &MM = HS.getModuleMap();
178186

179-
std::set<const FileEntry *> ModuleMaps;
180-
std::set<const Module *> ProcessedModules;
187+
llvm::DenseSet<FileID> ModuleMaps;
188+
189+
llvm::DenseSet<const Module *> ProcessedModules;
181190
auto CollectModuleMapsForHierarchy = [&](const Module *M) {
182191
M = M->getTopLevelModule();
183192

@@ -192,13 +201,13 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
192201

193202
// The containing module map is affecting, because it's being pointed
194203
// into by Module::DefinitionLoc.
195-
if (auto FE = MM.getContainingModuleMapFile(Mod))
196-
ModuleMaps.insert(*FE);
204+
if (auto F = MM.getContainingModuleMapFileID(Mod); F.isValid())
205+
ModuleMaps.insert(F);
197206
// For inferred modules, the module map that allowed inferring is not
198207
// related to the virtual containing module map file. It did affect the
199208
// compilation, though.
200-
if (auto FE = MM.getModuleMapFileForUniquing(Mod))
201-
ModuleMaps.insert(*FE);
209+
if (auto UniqF = MM.getModuleMapFileIDForUniquing(Mod); UniqF.isValid())
210+
ModuleMaps.insert(UniqF);
202211

203212
for (auto *SubM : Mod->submodules())
204213
Q.push(SubM);
@@ -268,7 +277,16 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
268277
// just ban module map hierarchies where module map defining a (sub)module X
269278
// includes a module map defining a module that's not a submodule of X.
270279

271-
return ModuleMaps;
280+
llvm::DenseSet<const FileEntry *> ModuleFileEntries;
281+
for (FileID MM : ModuleMaps) {
282+
if (auto *FE = SM.getFileEntryForID(MM))
283+
ModuleFileEntries.insert(FE);
284+
}
285+
286+
AffectingModuleMaps R;
287+
R.DefinitionFileIDs = std::move(ModuleMaps);
288+
R.DefinitionFiles = std::move(ModuleFileEntries);
289+
return std::move(R);
272290
}
273291

274292
class ASTTypeWriter {
@@ -1770,14 +1788,17 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
17701788
continue;
17711789

17721790
// Do not emit input files that do not affect current module.
1773-
if (!IsSLocAffecting[I])
1791+
if (!IsSLocFileEntryAffecting[I])
17741792
continue;
17751793

17761794
InputFileEntry Entry(*Cache->OrigEntry);
17771795
Entry.IsSystemFile = isSystem(File.getFileCharacteristic());
17781796
Entry.IsTransient = Cache->IsTransient;
17791797
Entry.BufferOverridden = Cache->BufferOverridden;
1780-
Entry.IsTopLevel = getAffectingIncludeLoc(SourceMgr, File).isInvalid();
1798+
1799+
FileID IncludeFileID = SourceMgr.getFileID(File.getIncludeLoc());
1800+
Entry.IsTopLevel = IncludeFileID.isInvalid() || IncludeFileID.ID < 0 ||
1801+
!IsSLocFileEntryAffecting[IncludeFileID.ID];
17811802
Entry.IsModuleMap = isModuleMap(File.getFileCharacteristic());
17821803

17831804
uint64_t ContentHash = 0;
@@ -4920,6 +4941,7 @@ void ASTWriter::computeNonAffectingInputFiles() {
49204941
unsigned N = SrcMgr.local_sloc_entry_size();
49214942

49224943
IsSLocAffecting.resize(N, true);
4944+
IsSLocFileEntryAffecting.resize(N, true);
49234945

49244946
if (!WritingModule)
49254947
return;
@@ -4956,10 +4978,12 @@ void ASTWriter::computeNonAffectingInputFiles() {
49564978
continue;
49574979

49584980
// Don't prune module maps that are affecting.
4959-
if (llvm::is_contained(*AffectingModuleMaps, *Cache->OrigEntry))
4981+
if (AffectingModuleMaps->DefinitionFileIDs.contains(FID))
49604982
continue;
49614983

49624984
IsSLocAffecting[I] = false;
4985+
IsSLocFileEntryAffecting[I] =
4986+
AffectingModuleMaps->DefinitionFiles.contains(*Cache->OrigEntry);
49634987

49644988
FileIDAdjustment += 1;
49654989
// Even empty files take up one element in the offset table.
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// Check that the same module map file passed to -fmodule-map-file *and*
2+
// available from one of the `-fmodule-file` does not allocate extra source
3+
// location space. This optimization is important for using module maps in
4+
// large codebases to avoid running out of source location space.
5+
6+
// RUN: rm -rf %t && mkdir %t
7+
// RUN: split-file %s %t
8+
9+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules -fmodule-map-file=%t/base.map -fmodule-name=base -emit-module %t/base.map -o %t/base.pcm
10+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
11+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod1.map \
12+
// RUN: -fmodule-file=%t/base.pcm \
13+
// RUN: -fmodule-name=mod1 -emit-module %t/mod1.map -o %t/mod1.pcm
14+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
15+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod1.map -fmodule-map-file=%t/mod2.map \
16+
// RUN: -fmodule-file=%t/mod1.pcm \
17+
// RUN: -fmodule-name=mod2 -emit-module %t/mod2.map -o %t/mod2.pcm
18+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
19+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod2.map -fmodule-map-file=%t/mod3.map \
20+
// RUN: -fmodule-file=%t/mod2.pcm \
21+
// RUN: -fmodule-name=mod3 -emit-module %t/mod3.map -o %t/mod3.pcm
22+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
23+
// RUN: -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod3.map -fmodule-map-file=%t/mod4.map \
24+
// RUN: -fmodule-file=%t/mod3.pcm \
25+
// RUN: -fmodule-name=mod4 -emit-module %t/mod4.map -o %t/mod4.pcm
26+
// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod4.map -fmodule-file=%t/mod4.pcm -fsyntax-only -verify %t/check_slocs.cc
27+
28+
//--- base.map
29+
module base { header "vector.h" }
30+
//--- mod1.map
31+
module mod1 { header "mod1.h" }
32+
//--- mod2.map
33+
module mod2 { header "mod2.h" }
34+
//--- mod3.map
35+
module mod3 { header "mod3.h" }
36+
//--- mod4.map
37+
module mod4 { header "mod4.h" }
38+
//--- check_slocs.cc
39+
#include "mod4.h"
40+
#pragma clang __debug sloc_usage // expected-remark {{source manager location address space usage}}
41+
// expected-note@* {{% of available space}}
42+
43+
// Module map files files that were specified on the command line are entered twice (once when parsing command-line, once loaded from the .pcm)
44+
// Those that not specified on the command line must be entered once.
45+
46+
// [email protected]:1 {{file entered 2 times}}
47+
// [email protected]:1 {{file entered 2 times}}
48+
// [email protected]:1 {{file entered 1 time}}
49+
// [email protected]:1 {{file entered 1 time}}
50+
// [email protected]:1 {{file entered 1 time}}
51+
// expected-note@* + {{file entered}}
52+
53+
54+
//--- vector.h
55+
#ifndef VECTOR_H
56+
#define VECTOR_H
57+
#endif
58+
59+
//--- mod1.h
60+
#ifndef MOD1
61+
#define MOD1
62+
#include "vector.h"
63+
int mod1();
64+
#endif
65+
//--- mod2.h
66+
#ifndef MOD2
67+
#define MOD2
68+
#include "vector.h"
69+
#include "mod1.h"
70+
int mod2();
71+
#endif
72+
//--- mod3.h
73+
#ifndef MOD3
74+
#define MOD3
75+
#include "vector.h"
76+
#include "mod2.h"
77+
int mod3();
78+
#endif
79+
//--- mod4.h
80+
#ifndef MOD4
81+
#define MOD4
82+
#include "vector.h"
83+
#include "mod3.h"
84+
int mod4();
85+
#endif

0 commit comments

Comments
 (0)