Skip to content

Commit 9a10132

Browse files
authored
[Offload] Allow to record kernel launch stack traces (#100472)
Similar to (de)allocation traces, we can record kernel launch stack traces and display them in case of an error. However, the AMD GPU plugin signal handler, which is invoked on memory faults, cannot pinpoint the offending kernel. Instead, we print `<NUM>` many traces, where `<NUM>` is set via `OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=<NUM>`. The recording uses a ring buffer of fixed size (for now 8). For `trap` errors, we print the actual kernel name, and the trace if recorded.
1 parent 33960ce commit 9a10132

File tree

14 files changed

+531
-12
lines changed

14 files changed

+531
-12
lines changed

offload/include/Shared/EnvironmentVar.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ struct StringParser {
2828
/// Class for reading and checking environment variables. Currently working with
2929
/// integer, floats, std::string and bool types.
3030
template <typename Ty> class Envar {
31+
llvm::StringRef Name;
3132
Ty Data;
3233
bool IsPresent;
3334
bool Initialized;
@@ -53,7 +54,7 @@ template <typename Ty> class Envar {
5354
/// take the value read from the environment variable, or the default if it
5455
/// was not set or not correct. This constructor is not fallible.
5556
Envar(llvm::StringRef Name, Ty Default = Ty())
56-
: Data(Default), IsPresent(false), Initialized(true) {
57+
: Name(Name), Data(Default), IsPresent(false), Initialized(true) {
5758

5859
if (const char *EnvStr = getenv(Name.data())) {
5960
// Check whether the envar is defined and valid.
@@ -84,6 +85,9 @@ template <typename Ty> class Envar {
8485
/// Get the definitive value.
8586
operator Ty() const { return get(); }
8687

88+
/// Return the environment variable name.
89+
llvm::StringRef getName() const { return Name; }
90+
8791
/// Indicate whether the environment variable was defined and valid.
8892
bool isPresent() const { return IsPresent; }
8993

offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ typedef enum {
3131
HSA_STATUS_ERROR = 0x1000,
3232
HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
3333
HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
34+
HSA_STATUS_ERROR_EXCEPTION = 0x1016,
3435
} hsa_status_t;
3536

3637
hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@
1313
#include <atomic>
1414
#include <cassert>
1515
#include <cstddef>
16+
#include <cstdint>
1617
#include <deque>
18+
#include <functional>
1719
#include <mutex>
1820
#include <string>
1921
#include <system_error>
2022
#include <unistd.h>
2123
#include <unordered_map>
2224

25+
#include "ErrorReporting.h"
2326
#include "Shared/APITypes.h"
2427
#include "Shared/Debug.h"
2528
#include "Shared/Environment.h"
@@ -43,6 +46,7 @@
4346
#include "llvm/Support/FileSystem.h"
4447
#include "llvm/Support/MemoryBuffer.h"
4548
#include "llvm/Support/Program.h"
49+
#include "llvm/Support/Signals.h"
4650
#include "llvm/Support/raw_ostream.h"
4751

4852
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
685689
AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
686690

687691
/// Lazily initialize a new queue belonging to a specific agent.
688-
Error init(hsa_agent_t Agent, int32_t QueueSize) {
692+
Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
689693
if (Queue)
690694
return Plugin::success();
691695
hsa_status_t Status =
692696
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
693-
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
697+
&Device, UINT32_MAX, UINT32_MAX, &Queue);
694698
return Plugin::check(Status, "Error in hsa_queue_create: %s");
695699
}
696700

@@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
875879
}
876880

877881
/// Callback that will be called when an error is detected on the HSA queue.
878-
static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
879-
auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
880-
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
881-
}
882+
static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
883+
void *Data);
882884

883885
/// The HSA queue.
884886
hsa_queue_t *Queue;
@@ -1484,6 +1486,8 @@ struct AMDGPUStreamTy {
14841486
return true;
14851487
}
14861488

1489+
/// Return the queue this stream is currently assigned to. Callers in this
/// file check the result for null before dereferencing it.
const AMDGPUQueueTy *getQueue() const { return Queue; }
1490+
14871491
/// Record the state of the stream on an event.
14881492
Error recordEvent(AMDGPUEventTy &Event) const;
14891493

@@ -1594,7 +1598,7 @@ struct AMDGPUStreamManagerTy final
15941598
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
15951599

15961600
AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1597-
: GenericDeviceResourceManagerTy(Device),
1601+
: GenericDeviceResourceManagerTy(Device), Device(Device),
15981602
OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
15991603
NextQueue(0), Agent(HSAAgent) {}
16001604

@@ -1603,7 +1607,7 @@ struct AMDGPUStreamManagerTy final
16031607
QueueSize = HSAQueueSize;
16041608
MaxNumQueues = NumHSAQueues;
16051609
// Initialize one queue eagerly
1606-
if (auto Err = Queues.front().init(Agent, QueueSize))
1610+
if (auto Err = Queues.front().init(Device, Agent, QueueSize))
16071611
return Err;
16081612

16091613
return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,14 +1664,17 @@ struct AMDGPUStreamManagerTy final
16601664
}
16611665

16621666
// Make sure the queue is initialized, then add user & assign.
1663-
if (auto Err = Queues[Index].init(Agent, QueueSize))
1667+
if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
16641668
return Err;
16651669
Queues[Index].addUser();
16661670
Stream->Queue = &Queues[Index];
16671671

16681672
return Plugin::success();
16691673
}
16701674

1675+
/// The device associated with this stream manager.
1676+
GenericDeviceTy &Device;
1677+
16711678
/// Envar for controlling the tracking of busy HSA queues.
16721679
BoolEnvar OMPX_QueueTracking;
16731680

@@ -3074,7 +3081,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
30743081
Initialized = true;
30753082

30763083
// Register event handler to detect memory errors on the devices.
3077-
Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
3084+
Status = hsa_amd_register_system_event_handler(eventHandler, this);
30783085
if (auto Err = Plugin::check(
30793086
Status, "Error in hsa_amd_register_system_event_handler: %s"))
30803087
return std::move(Err);
@@ -3209,7 +3216,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
32093216

32103217
private:
32113218
/// Event handler that will be called by ROCr if an event is detected.
3212-
static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
3219+
static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
3220+
void *PluginPtr) {
32133221
if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
32143222
return HSA_STATUS_SUCCESS;
32153223

@@ -3240,6 +3248,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
32403248
uint32_t Node = -1;
32413249
hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
32423250

3251+
AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
3252+
for (uint32_t I = 0, E = Plugin.getNumDevices();
3253+
Node != uint32_t(-1) && I < E; ++I) {
3254+
AMDGPUDeviceTy &AMDGPUDevice =
3255+
reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
3256+
auto KernelTraceInfoRecord =
3257+
AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3258+
3259+
uint32_t DeviceNode = -1;
3260+
if (auto Err =
3261+
AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
3262+
consumeError(std::move(Err));
3263+
continue;
3264+
}
3265+
if (DeviceNode != Node)
3266+
continue;
3267+
3268+
ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
3269+
}
3270+
32433271
// Abort the execution since we do not recover from this error.
32443272
FATAL_MESSAGE(1,
32453273
"Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3508,28 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
34803508
return Alloc;
34813509
}
34823510

3511+
/// Callback invoked when an error is detected on the HSA queue \p Source.
///
/// \p Data is the user pointer handed to hsa_queue_create, i.e. the address
/// of the GenericDeviceTy that owns the queue. For hardware exceptions the
/// trapping kernel is reported first; every status ends in a fatal message.
void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
                                  void *Data) {
  // The queue was registered with a GenericDeviceTy pointer, so recover that
  // exact type from the void pointer before downcasting to the AMDGPU device.
  auto &AMDGPUDevice =
      *static_cast<AMDGPUDeviceTy *>(static_cast<GenericDeviceTy *>(Data));

  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
    auto KernelTraceInfoRecord =
        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();

    // A recorded launch belongs to the faulting queue if the stream stored in
    // its async info is currently bound to \p Source.
    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
        [Source](__tgt_async_info &AsyncInfo) {
          auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
          if (!Stream || !Stream->getQueue())
            return false;
          return Stream->getQueue()->Queue == Source;
        };
    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
                                      AsyncInfoWrapperMatcher);
  }

  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}
3532+
34833533
} // namespace plugin
34843534
} // namespace target
34853535
} // namespace omp

offload/plugins-nextgen/common/include/ErrorReporting.h

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "PluginInterface.h"
1515
#include "Shared/EnvironmentVar.h"
1616

17+
#include "llvm/ADT/STLExtras.h"
1718
#include "llvm/ADT/SmallString.h"
1819
#include "llvm/ADT/StringRef.h"
1920
#include "llvm/Support/ErrorHandling.h"
@@ -216,6 +217,90 @@ class ErrorReporter {
216217
getAllocTyName(ATI->Kind).data(), DevicePtr);
217218
#undef DEALLOCATION_ERROR
218219
}
220+
221+
/// Report that a kernel encountered a trap instruction.
222+
static void reportTrapInKernel(
223+
GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
224+
std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
225+
assert(AsyncInfoWrapperMatcher && "A matcher is required");
226+
227+
uint32_t Idx = 0;
228+
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
229+
auto KTI = KTIR.getKernelTraceInfo(I);
230+
if (KTI.Kernel == nullptr)
231+
break;
232+
// Skip kernels issued in other queues.
233+
if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
234+
continue;
235+
Idx = I;
236+
break;
237+
}
238+
239+
auto KTI = KTIR.getKernelTraceInfo(Idx);
240+
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
241+
reportError("Kernel '%s'", KTI.Kernel->getName());
242+
reportError("execution interrupted by hardware trap instruction");
243+
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo))) {
244+
if (!KTI.LaunchTrace.empty())
245+
reportStackTrace(KTI.LaunchTrace);
246+
else
247+
print(Yellow, "Use '%s=1' to show the stack trace of the kernel\n",
248+
Device.OMPX_TrackNumKernelLaunches.getName().data());
249+
}
250+
abort();
251+
}
252+
253+
/// Report the kernel traces taken from \p KTIR, up to
254+
/// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
255+
static void reportKernelTraces(GenericDeviceTy &Device,
256+
KernelTraceInfoRecordTy &KTIR) {
257+
uint32_t NumKTIs = 0;
258+
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
259+
auto KTI = KTIR.getKernelTraceInfo(I);
260+
if (KTI.Kernel == nullptr)
261+
break;
262+
++NumKTIs;
263+
}
264+
if (NumKTIs == 0) {
265+
print(BoldRed, "No kernel launches known\n");
266+
return;
267+
}
268+
269+
uint32_t TracesToShow =
270+
std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
271+
if (TracesToShow == 0) {
272+
if (NumKTIs == 1)
273+
print(BoldLightPurple, "Display only launched kernel:\n");
274+
else
275+
print(BoldLightPurple, "Display last %u kernels launched:\n", NumKTIs);
276+
} else {
277+
if (NumKTIs == 1)
278+
print(BoldLightPurple, "Display kernel launch trace:\n");
279+
else
280+
print(BoldLightPurple,
281+
"Display %u of the %u last kernel launch traces:\n", TracesToShow,
282+
NumKTIs);
283+
}
284+
285+
for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
286+
auto KTI = KTIR.getKernelTraceInfo(Idx);
287+
if (NumKTIs == 1)
288+
print(BoldLightPurple, "Kernel '%s'\n", KTI.Kernel->getName());
289+
else
290+
print(BoldLightPurple, "Kernel %d: '%s'\n", I, KTI.Kernel->getName());
291+
reportStackTrace(KTI.LaunchTrace);
292+
++I;
293+
}
294+
295+
if (NumKTIs != 1) {
296+
print(Yellow,
297+
"Use '%s=<num>' to adjust the number of shown stack traces (%u "
298+
"now, up to %zu)\n",
299+
Device.OMPX_TrackNumKernelLaunches.getName().data(),
300+
Device.OMPX_TrackNumKernelLaunches.get(), KTIR.size());
301+
}
302+
// TODO: Let users know how to serialize kernels
303+
}
219304
};
220305

221306
} // namespace plugin

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,44 @@ struct AllocationTraceInfoTy {
412412
std::mutex Lock;
413413
};
414414

415+
/// Information about an allocation, when it has been allocated, and when/if it
416+
/// has been deallocated, for error reporting purposes.
417+
struct KernelTraceInfoTy {
418+
419+
/// The launched kernel.
420+
GenericKernelTy *Kernel;
421+
422+
/// The stack trace of the launch itself.
423+
std::string LaunchTrace;
424+
425+
/// The async info the kernel was launched in.
426+
__tgt_async_info *AsyncInfo;
427+
};
428+
429+
struct KernelTraceInfoRecordTy {
430+
KernelTraceInfoRecordTy() { KTIs.fill({}); }
431+
432+
/// Return the (maximal) record size.
433+
auto size() const { return KTIs.size(); }
434+
435+
/// Create a new kernel trace info and add it into the record.
436+
void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
437+
__tgt_async_info *AsyncInfo) {
438+
KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
439+
Idx = (Idx + 1) % size();
440+
}
441+
442+
/// Return the \p I'th last kernel trace info.
443+
auto getKernelTraceInfo(int32_t I) const {
444+
// Note that kernel trace infos "grow forward", so lookup is backwards.
445+
return KTIs[(Idx - I - 1 + size()) % size()];
446+
}
447+
448+
private:
449+
std::array<KernelTraceInfoTy, 8> KTIs;
450+
unsigned Idx = 0;
451+
};
452+
415453
/// Class representing a map of host pinned allocations. We track these pinned
416454
/// allocations, so memory transfers involving these buffers can be optimized.
417455
class PinnedAllocationMapTy {
@@ -900,6 +938,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
900938
/// been deallocated, both for error reporting purposes.
901939
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
902940

941+
/// Record of the most recently launched kernels, for error reporting purposes.
942+
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
943+
944+
/// Environment variable holding the number of kernel launch stack traces to
945+
/// show on errors; launch stack traces are only recorded if it is non-zero.
946+
UInt32Envar OMPX_TrackNumKernelLaunches =
947+
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
948+
903949
private:
904950
/// Get and set the stack size and heap size for the device. If not used, the
905951
/// plugin can implement the setters as no-op and setting the output

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1468,6 +1468,18 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
14681468
GenericKernelTy &GenericKernel =
14691469
*reinterpret_cast<GenericKernelTy *>(EntryPtr);
14701470

1471+
{
1472+
std::string StackTrace;
1473+
if (OMPX_TrackNumKernelLaunches) {
1474+
llvm::raw_string_ostream OS(StackTrace);
1475+
llvm::sys::PrintStackTrace(OS);
1476+
}
1477+
1478+
auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
1479+
(*KernelTraceInfoRecord)
1480+
.emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
1481+
}
1482+
14711483
auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
14721484
AsyncInfoWrapper);
14731485

0 commit comments

Comments
 (0)