Skip to content

[Offload] Improve error reporting on memory faults #104254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3264,8 +3264,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
}
if (DeviceNode != Node)
continue;

void *DevicePtr = (void *)Event->memory_fault.virtual_address;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonder if we should just treat these as uintptr_t

std::string S;
llvm::raw_string_ostream OS(S);
OS << llvm::format("Memory access fault by GPU %" PRIu32
" (agent 0x%" PRIx64
") at virtual address %p. Reasons: %s",
Node, Event->memory_fault.agent.handle,
(void *)Event->memory_fault.virtual_address,
llvm::join(Reasons, ", ").c_str());
ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
/*Abort*/ true);
}

// Abort the execution since we do not recover from this error.
Expand Down
67 changes: 61 additions & 6 deletions offload/plugins-nextgen/common/include/ErrorReporting.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,13 @@ class ErrorReporter {

if (ATI->HostPtr)
print(BoldLightPurple,
"Last allocation of size %lu for host pointer %p:\n", ATI->Size,
ATI->HostPtr);
"Last allocation of size %lu for host pointer %p -> device pointer "
"%p:\n",
ATI->Size, ATI->HostPtr, ATI->DevicePtr);
else
print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
print(BoldLightPurple,
"Last allocation of size %lu -> device pointer %p:\n", ATI->Size,
ATI->DevicePtr);
reportStackTrace(ATI->AllocationTrace);
if (!ATI->LastAllocationInfo)
return;
Expand All @@ -174,10 +177,13 @@ class ErrorReporter {
ATI->Size);
reportStackTrace(ATI->DeallocationTrace);
if (ATI->HostPtr)
print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
I, ATI->HostPtr);
print(
BoldLightPurple,
" #%u Prior allocation for host pointer %p -> device pointer %p:\n",
I, ATI->HostPtr, ATI->DevicePtr);
else
print(BoldLightPurple, " #%u Prior allocation:\n", I);
print(BoldLightPurple, " #%u Prior allocation -> device pointer %p:\n",
I, ATI->DevicePtr);
reportStackTrace(ATI->AllocationTrace);
++I;
}
Expand Down Expand Up @@ -219,6 +225,55 @@ class ErrorReporter {
#undef DEALLOCATION_ERROR
}

static void reportMemoryAccessError(GenericDeviceTy &Device, void *DevicePtr,
std::string &ErrorStr, bool Abort) {
reportError(ErrorStr.c_str());

if (!Device.OMPX_TrackAllocationTraces) {
print(Yellow, "Use '%s=true' to track device allocations\n",
Device.OMPX_TrackAllocationTraces.getName().data());
if (Abort)
abortExecution();
return;
}
uintptr_t Distance = false;
auto *ATI =
Device.getClosestAllocationTraceInfoForAddr(DevicePtr, Distance);
if (!ATI) {
print(Cyan,
"No host-issued allocations; device pointer %p might be "
"a global, stack, or shared location\n",
DevicePtr);
if (Abort)
abortExecution();
return;
}
if (!Distance) {
print(Cyan, "Device pointer %p points into%s host-issued allocation:\n",
DevicePtr, ATI->DeallocationTrace.empty() ? "" : " prior");
reportAllocationInfo(ATI);
if (Abort)
abortExecution();
return;
}

bool IsClose = Distance < (1L << 29L /*512MB=*/);
print(Cyan,
"Device pointer %p does not point into any (current or prior) "
"host-issued allocation%s.\n",
DevicePtr,
IsClose ? "" : " (might be a global, stack, or shared location)");
if (IsClose) {
print(Cyan,
"Closest host-issued allocation (distance %" PRIuPTR
" byte%s; might be by page):\n",
Distance, Distance > 1 ? "s" : "");
reportAllocationInfo(ATI);
}
if (Abort)
abortExecution();
}

/// Report that a kernel encountered a trap instruction.
static void reportTrapInKernel(
GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
Expand Down
46 changes: 41 additions & 5 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,42 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// been deallocated, both for error reporting purposes.
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;

/// Return the allocation trace info for a device pointer, that is the
/// allocation into which this device pointer points to (or pointed into).
AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
for (auto &It : *AllocationTraceMap) {
if (It.first <= DevicePtr &&
advanceVoidPtr(It.first, It.second->Size) > DevicePtr)
return It.second;
}
return nullptr;
}

/// Return the allocation trace info for a device pointer, that is the
/// allocation into which this device pointer points to (or pointed into).
AllocationTraceInfoTy *
getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
Distance = 0;
if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
return ATI;
}

AllocationTraceInfoTy *ATI = nullptr;
uintptr_t DevicePtrI = uintptr_t(DevicePtr);
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
for (auto &It : *AllocationTraceMap) {
uintptr_t Begin = uintptr_t(It.second->DevicePtr);
uintptr_t End = Begin + It.second->Size - 1;
uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
if (ATI && ItDistance > Distance)
continue;
ATI = It.second;
Distance = ItDistance;
}
return ATI;
}

/// Map to record kernel have been launchedl, for error reporting purposes.
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;

Expand All @@ -946,6 +982,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar OMPX_TrackNumKernelLaunches =
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);

/// Environment variable to determine if stack traces for allocations and
/// deallocations are tracked.
BoolEnvar OMPX_TrackAllocationTraces =
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);

private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
Expand Down Expand Up @@ -996,11 +1037,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar OMPX_InitialNumStreams;
UInt32Envar OMPX_InitialNumEvents;

/// Environment variable to determine if stack traces for allocations and
/// deallocations are tracked.
BoolEnvar OMPX_TrackAllocationTraces =
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);

/// Array of images loaded into the device. Images are automatically
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;
Expand Down
6 changes: 3 additions & 3 deletions offload/test/sanitizer/double_free.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}double_free.c:24
//
// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
// NDEBG: main
Expand All @@ -49,7 +49,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}double_free.c:22
//
// CHECK: #0 Prior allocation:
// CHECK: #0 Prior allocation -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
// NDEBG: main
Expand All @@ -61,7 +61,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}double_free.c:20
//
// CHECK: #1 Prior allocation:
// CHECK: #1 Prior allocation -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
// NDEBG: main
Expand Down
2 changes: 1 addition & 1 deletion offload/test/sanitizer/double_free_racy.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ int main(void) {
// CHECK: dataDelete
// CHECK: omp_target_free

// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
2 changes: 1 addition & 1 deletion offload/test/sanitizer/free_wrong_ptr_kind.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}free_wrong_ptr_kind.c:22
//
// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: llvm_omp_target_alloc_host
// NDEBG: main
Expand Down
2 changes: 1 addition & 1 deletion offload/test/sanitizer/free_wrong_ptr_kind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:25
//
// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: llvm_omp_target_alloc_shared
// NDEBG: main
Expand Down
40 changes: 40 additions & 0 deletions offload/test/sanitizer/ptr_outside_alloc_1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void llvm_omp_target_free_host(void *Ptr, int DeviceNum);

int main() {
int N = (1 << 30);
char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
char *P;
#pragma omp target map(from : P)
{
P = &A[0];
*P = 3;
}
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
// TRACE: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
// TRACE: Closest host-issued allocation (distance 4096 bytes; might be by page):
// TRACE: Last allocation of size 1073741824
// clang-format on
#pragma omp target
{ P[-4] = 5; }

llvm_omp_target_free_host(A, omp_get_default_device());
}
26 changes: 26 additions & 0 deletions offload/test/sanitizer/ptr_outside_alloc_2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

int main() {
int N = (1 << 30);
char *A = (char *)malloc(N);
#pragma omp target map(A[ : N])
{ A[N] = 3; }
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// CHECK: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
// CHECK: Closest host-issued allocation (distance 1 byte; might be by page):
// CHECK: Last allocation of size 1073741824
// clang-format on
}
39 changes: 39 additions & 0 deletions offload/test/sanitizer/use_after_free_1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void llvm_omp_target_free_host(void *Ptr, int DeviceNum);

int main() {
int N = (1 << 30);
char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
char *P;
#pragma omp target map(from : P)
{
P = &A[N / 2];
*P = 3;
}
llvm_omp_target_free_host(A, omp_get_default_device());
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
// TRACE: Device pointer [[PTR]] points into prior host-issued allocation:
// TRACE: Last deallocation:
// TRACE: Last allocation of size 1073741824
// clang-format on
#pragma omp target
{ *P = 5; }
}
32 changes: 32 additions & 0 deletions offload/test/sanitizer/use_after_free_2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

int main() {
int N = (1 << 30);
char *A = (char *)malloc(N);
char *P;
#pragma omp target map(A[ : N]) map(from : P)
{
P = &A[N / 2];
*P = 3;
}
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
// CHECK: Last deallocation:
// CHECK: Last allocation of size 1073741824
// clang-format on
#pragma omp target
{ *P = 5; }
}
Loading