Skip to content

Commit c4e1c89

Browse files
SC llvm team
authored and committed
Merged main:c1e95b2e5e61 into origin/amd-gfx:e60c0ac07789
Local branch origin/amd-gfx e60c0ac Merged main:fcaefc2c19eb into origin/amd-gfx:25b40737e2c7 Remote branch main c1e95b2 [RISCV] Fix matching bug in VLA shuffle lowering (llvm#134750)
2 parents e60c0ac + c1e95b2 commit c4e1c89

File tree

117 files changed

+2193
-151
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

117 files changed

+2193
-151
lines changed

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ def err_drv_no_cuda_libdevice : Error<
6767
"libdevice">;
6868

6969
def err_drv_no_rocm_device_lib : Error<
70-
"cannot find ROCm device library%select{| for %1| for ABI version %1}0; provide its path via "
70+
"cannot find ROCm device library%select{| for %1| for ABI version %1"
71+
"%select{|, which requires ROCm %3 or higher}2}0; provide its path via "
7172
"'--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build "
7273
"without ROCm device library">;
7374
def err_drv_no_hip_runtime : Error<

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -935,7 +935,13 @@ bool RocmInstallationDetector::checkCommonBitcodeLibs(
935935
return false;
936936
}
937937
if (ABIVer.requiresLibrary() && getABIVersionPath(ABIVer).empty()) {
938-
D.Diag(diag::err_drv_no_rocm_device_lib) << 2 << ABIVer.toString();
938+
// Starting from COV6, we will report minimum ROCm version requirement in
939+
// the error message.
940+
if (ABIVer.getAsCodeObjectVersion() < 6)
941+
D.Diag(diag::err_drv_no_rocm_device_lib) << 2 << ABIVer.toString() << 0;
942+
else
943+
D.Diag(diag::err_drv_no_rocm_device_lib)
944+
<< 2 << ABIVer.toString() << 1 << "6.3";
939945
return false;
940946
}
941947
return true;

clang/lib/Driver/ToolChains/ROCm.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,11 @@ struct DeviceLibABIVersion {
3737
/// and below works with ROCm 5.0 and below which does not have
3838
/// abi_version_*.bc. Code object v5 requires abi_version_500.bc.
3939
bool requiresLibrary() { return ABIVersion >= 500; }
40-
std::string toString() {
40+
std::string toString() { return Twine(getAsCodeObjectVersion()).str(); }
41+
42+
unsigned getAsCodeObjectVersion() const {
4143
assert(ABIVersion % 100 == 0 && "Not supported");
42-
return Twine(ABIVersion / 100).str();
44+
return ABIVersion / 100;
4345
}
4446
};
4547

clang/test/Driver/hip-device-libs.hip

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,4 +254,4 @@
254254
// NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_400.bc"
255255
// NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_500.bc"
256256
// NOABI5: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
257-
// NOABI6: error: cannot find ROCm device library for ABI version 6; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
257+
// NOABI6: error: cannot find ROCm device library for ABI version 6, which requires ROCm 6.3 or higher; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library

compiler-rt/lib/ctx_profile/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ endif()
2727
add_compiler_rt_runtime(clang_rt.ctx_profile
2828
STATIC
2929
ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
30-
OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
30+
OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc RTSanitizerCommonSymbolizer
3131
CFLAGS ${EXTRA_FLAGS}
3232
SOURCES ${CTX_PROFILE_SOURCES}
3333
ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}

compiler-rt/lib/ctx_profile/CtxInstrContextNode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class ContextNode final {
127127
/// MUTEXDECL takes one parameter, the name of a field that is a mutex.
128128
#define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \
129129
PTRDECL(FunctionData, Next) \
130+
VOLATILE_PTRDECL(void, EntryAddress) \
130131
VOLATILE_PTRDECL(ContextRoot, CtxRoot) \
131132
VOLATILE_PTRDECL(ContextNode, FlatCtx) \
132133
MUTEXDECL(Mutex)

compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "CtxInstrProfiling.h"
10+
#include "RootAutoDetector.h"
1011
#include "sanitizer_common/sanitizer_allocator_internal.h"
1112
#include "sanitizer_common/sanitizer_atomic.h"
1213
#include "sanitizer_common/sanitizer_atomic_clang.h"
@@ -43,6 +44,12 @@ Arena *FlatCtxArena = nullptr;
4344
__thread bool IsUnderContext = false;
4445
__sanitizer::atomic_uint8_t ProfilingStarted = {};
4546

47+
__sanitizer::atomic_uintptr_t RootDetector = {};
48+
RootAutoDetector *getRootDetector() {
49+
return reinterpret_cast<RootAutoDetector *>(
50+
__sanitizer::atomic_load_relaxed(&RootDetector));
51+
}
52+
4653
// utility to taint a pointer by setting the LSB. There is an assumption
4754
// throughout that the addresses of contexts are even (really, they should be
4855
// align(8), but "even"-ness is the minimum assumption)
@@ -201,7 +208,7 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
201208
return Ret;
202209
}
203210

204-
ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
211+
ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
205212
uint32_t NumCounters) {
206213
if (ContextNode *Existing = Data.FlatCtx)
207214
return Existing;
@@ -232,6 +239,7 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
232239
auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
233240
Data.FlatCtx = Ret;
234241

242+
Data.EntryAddress = Callee;
235243
Data.Next = reinterpret_cast<FunctionData *>(
236244
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
237245
while (!__sanitizer::atomic_compare_exchange_strong(
@@ -296,8 +304,9 @@ ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
296304
return TheScratchContext;
297305
}
298306

299-
ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
300-
uint32_t NumCounters) {
307+
ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
308+
uint32_t NumCounters, uint32_t NumCallsites,
309+
ContextRoot *CtxRoot) {
301310

302311
// 1) if we are currently collecting a contextual profile, fetch a ContextNode
303312
// in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
@@ -316,27 +325,32 @@ ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
316325
// entered once and never exit. They should be assumed to be entered before
317326
// profiling starts - because profiling should start after the server is up
318327
// and running (which is equivalent to "message pumps are set up").
319-
ContextRoot *R = __llvm_ctx_profile_current_context_root;
320-
if (!R) {
328+
if (!CtxRoot) {
329+
if (auto *RAD = getRootDetector())
330+
RAD->sample();
331+
else if (auto *CR = Data.CtxRoot)
332+
return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
321333
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
322334
return TheScratchContext;
323335
else
324336
return markAsScratch(
325-
onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
337+
onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
326338
}
327-
auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
339+
auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
328340
if (Ins)
329-
Iter->second =
330-
getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
341+
Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
342+
NumCounters, 0);
331343
return markAsScratch(onContextEnter(*Iter->second));
332344
}
333345

334346
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
335347
GUID Guid, uint32_t NumCounters,
336348
uint32_t NumCallsites) {
349+
auto *CtxRoot = __llvm_ctx_profile_current_context_root;
337350
// fast "out" if we're not even doing contextual collection.
338-
if (!__llvm_ctx_profile_current_context_root)
339-
return getUnhandledContext(*Data, Guid, NumCounters);
351+
if (!CtxRoot)
352+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
353+
nullptr);
340354

341355
// also fast "out" if the caller is scratch. We can see if it's scratch by
342356
// looking at the interior pointer into the subcontexts vector that the caller
@@ -345,7 +359,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
345359
// precisely, aligned - 8 values)
346360
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
347361
if (!CallsiteContext || isScratch(CallsiteContext))
348-
return getUnhandledContext(*Data, Guid, NumCounters);
362+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
363+
CtxRoot);
349364

350365
// if the callee isn't the expected one, return scratch.
351366
// Signal handler(s) could have been invoked at any point in the execution.
@@ -363,7 +378,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
363378
// for that case.
364379
auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
365380
if (ExpectedCallee != Callee)
366-
return getUnhandledContext(*Data, Guid, NumCounters);
381+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
382+
CtxRoot);
367383

368384
auto *Callsite = *CallsiteContext;
369385
// in the case of indirect calls, we will have all seen targets forming a
@@ -388,21 +404,23 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
388404
ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
389405
uint32_t Counters,
390406
uint32_t Callsites) {
407+
391408
return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid,
392409
Counters, Callsites);
393410
}
394411

395412
void __llvm_ctx_profile_release_context(FunctionData *FData)
396413
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
414+
const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
415+
if (!CurrentRoot || FData->CtxRoot != CurrentRoot)
416+
return;
397417
IsUnderContext = false;
398-
if (__llvm_ctx_profile_current_context_root) {
399-
__llvm_ctx_profile_current_context_root = nullptr;
400-
assert(FData->CtxRoot);
401-
FData->CtxRoot->Taken.Unlock();
402-
}
418+
assert(FData->CtxRoot);
419+
__llvm_ctx_profile_current_context_root = nullptr;
420+
FData->CtxRoot->Taken.Unlock();
403421
}
404422

405-
void __llvm_ctx_profile_start_collection() {
423+
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
406424
size_t NumMemUnits = 0;
407425
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
408426
&AllContextsMutex);
@@ -418,12 +436,28 @@ void __llvm_ctx_profile_start_collection() {
418436
resetContextNode(*Root->FirstUnhandledCalleeNode);
419437
__sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
420438
}
439+
if (AutodetectDuration) {
440+
// we leak RD intentionally. Knowing when to free it is tricky, there's a
441+
// race condition with functions observing the `RootDetector` as non-null.
442+
// This can be addressed but the alternatives have some added complexity and
443+
// it's not (yet) worth it.
444+
auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
445+
RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
446+
RD->start();
447+
} else {
448+
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
449+
}
421450
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
422-
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
423451
}
424452

425453
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
426454
__sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
455+
if (auto *RD = getRootDetector()) {
456+
__sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
457+
"finished well before attempting to fetch a context");
458+
RD->join();
459+
}
460+
427461
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
428462
&AllContextsMutex);
429463

@@ -448,8 +482,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
448482
const auto *Pos = reinterpret_cast<const FunctionData *>(
449483
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
450484
for (; Pos; Pos = Pos->Next)
451-
Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
452-
Pos->FlatCtx->counters_size());
485+
if (!Pos->CtxRoot)
486+
Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
487+
Pos->FlatCtx->counters_size());
453488
Writer.endFlatSection();
454489
return true;
455490
}

compiler-rt/lib/ctx_profile/CtxInstrProfiling.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
207207

208208
/// Prepares for collection. Currently this resets counter values but preserves
209209
/// internal context tree structure.
210-
void __llvm_ctx_profile_start_collection();
210+
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0);
211211

212212
/// Completely free allocated memory.
213213
void __llvm_ctx_profile_free();

compiler-rt/lib/ctx_profile/RootAutoDetector.cpp

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "RootAutoDetector.h"
1010

11+
#include "CtxInstrProfiling.h"
1112
#include "sanitizer_common/sanitizer_common.h"
1213
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
1314
#include <assert.h>
@@ -17,6 +18,99 @@
1718
using namespace __ctx_profile;
1819
template <typename T> using Set = DenseMap<T, bool>;
1920

21+
namespace __sanitizer {
22+
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
23+
bool request_fast, u32 max_depth) {
24+
// We can't implement the fast variant. The fast variant ends up invoking an
25+
// external allocator, because of pthread_attr_getstack. If this happens
26+
// during an allocation of the program being instrumented, a non-reentrant
27+
// lock may be taken (this was observed). The allocator called by
28+
// pthread_attr_getstack will also try to take that lock.
29+
UnwindSlow(pc, max_depth);
30+
}
31+
} // namespace __sanitizer
32+
33+
RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
34+
GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
35+
Parent.AllSamples.PushBack(this);
36+
}
37+
38+
void RootAutoDetector::start() {
39+
atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
40+
pthread_create(
41+
&WorkerThread, nullptr,
42+
+[](void *Ctx) -> void * {
43+
RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
44+
SleepForSeconds(RAD->WaitSeconds);
45+
// To avoid holding the AllSamplesMutex, make a snapshot of all the
46+
// thread samples collected so far
47+
Vector<PerThreadSamples *> SamplesSnapshot;
48+
{
49+
GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
50+
SamplesSnapshot.Resize(RAD->AllSamples.Size());
51+
for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
52+
SamplesSnapshot[I] = RAD->AllSamples[I];
53+
}
54+
DenseMap<uptr, uint64_t> AllRoots;
55+
for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
56+
GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M);
57+
SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
58+
auto [FAddr, Count] = KVP;
59+
AllRoots[FAddr] += Count;
60+
return true;
61+
});
62+
}
63+
// FIXME: as a next step, establish a minimum relative nr of samples
64+
// per root that would qualify it as a root.
65+
for (auto *FD = reinterpret_cast<FunctionData *>(
66+
atomic_load_relaxed(&RAD->FunctionDataListHead));
67+
FD; FD = FD->Next) {
68+
if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
69+
FD->getOrAllocateContextRoot();
70+
}
71+
}
72+
atomic_store_relaxed(&RAD->Self, 0);
73+
return nullptr;
74+
},
75+
this);
76+
}
77+
78+
void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
79+
80+
void RootAutoDetector::sample() {
81+
// tracking reentry in case we want to re-explore fast stack unwind - which
82+
// does potentially re-enter the runtime because it calls the instrumented
83+
// allocator because of pthread_attr_getstack. See the notes also on
84+
// UnwindImpl above.
85+
static thread_local bool Entered = false;
86+
static thread_local uint64_t Entries = 0;
87+
if (Entered || (++Entries % SampleRate))
88+
return;
89+
Entered = true;
90+
collectStack();
91+
Entered = false;
92+
}
93+
94+
void RootAutoDetector::collectStack() {
95+
GET_CALLER_PC_BP;
96+
BufferedStackTrace CurrentStack;
97+
CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
98+
// 2 stack frames would be very unlikely to mean anything, since at least the
99+
// compiler-rt frame - which can't be inlined - should be observable, which
100+
// counts as 1; we can be even more aggressive with this number.
101+
if (CurrentStack.size <= 2)
102+
return;
103+
static thread_local PerThreadSamples *ThisThreadSamples =
104+
new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
105+
PerThreadSamples(*this);
106+
107+
if (!ThisThreadSamples->M.TryLock())
108+
return;
109+
110+
ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
111+
ThisThreadSamples->M.Unlock();
112+
}
113+
20114
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
21115
// this requires --linkopt=-Wl,--export-dynamic
22116
Dl_info Info;

0 commit comments

Comments
 (0)