Skip to content

Commit 3b76115

Browse files
authored
[Offload] Improve error reporting on memory faults (#104254)
Since we can already track allocations, we can diagnose memory faults to some degree. If the fault happens in a prior allocation (use after free) or "close but outside" one, we can provide that information to the user. Note that the fault address might be page aligned, and not all accesses trigger a fault, especially for allocations that are backed by a MemoryManager. Still, if people disable the MemoryManager or the allocation is big enough, we can sometimes provide valueable feedback.
1 parent 216d6a0 commit 3b76115

11 files changed

+256
-18
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3264,8 +3264,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
32643264
}
32653265
if (DeviceNode != Node)
32663266
continue;
3267-
3267+
void *DevicePtr = (void *)Event->memory_fault.virtual_address;
3268+
std::string S;
3269+
llvm::raw_string_ostream OS(S);
3270+
OS << llvm::format("Memory access fault by GPU %" PRIu32
3271+
" (agent 0x%" PRIx64
3272+
") at virtual address %p. Reasons: %s",
3273+
Node, Event->memory_fault.agent.handle,
3274+
(void *)Event->memory_fault.virtual_address,
3275+
llvm::join(Reasons, ", ").c_str());
32683276
ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
3277+
ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
3278+
/*Abort*/ true);
32693279
}
32703280

32713281
// Abort the execution since we do not recover from this error.

offload/plugins-nextgen/common/include/ErrorReporting.h

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -157,10 +157,13 @@ class ErrorReporter {
157157

158158
if (ATI->HostPtr)
159159
print(BoldLightPurple,
160-
"Last allocation of size %lu for host pointer %p:\n", ATI->Size,
161-
ATI->HostPtr);
160+
"Last allocation of size %lu for host pointer %p -> device pointer "
161+
"%p:\n",
162+
ATI->Size, ATI->HostPtr, ATI->DevicePtr);
162163
else
163-
print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
164+
print(BoldLightPurple,
165+
"Last allocation of size %lu -> device pointer %p:\n", ATI->Size,
166+
ATI->DevicePtr);
164167
reportStackTrace(ATI->AllocationTrace);
165168
if (!ATI->LastAllocationInfo)
166169
return;
@@ -174,10 +177,13 @@ class ErrorReporter {
174177
ATI->Size);
175178
reportStackTrace(ATI->DeallocationTrace);
176179
if (ATI->HostPtr)
177-
print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
178-
I, ATI->HostPtr);
180+
print(
181+
BoldLightPurple,
182+
" #%u Prior allocation for host pointer %p -> device pointer %p:\n",
183+
I, ATI->HostPtr, ATI->DevicePtr);
179184
else
180-
print(BoldLightPurple, " #%u Prior allocation:\n", I);
185+
print(BoldLightPurple, " #%u Prior allocation -> device pointer %p:\n",
186+
I, ATI->DevicePtr);
181187
reportStackTrace(ATI->AllocationTrace);
182188
++I;
183189
}
@@ -219,6 +225,55 @@ class ErrorReporter {
219225
#undef DEALLOCATION_ERROR
220226
}
221227

228+
static void reportMemoryAccessError(GenericDeviceTy &Device, void *DevicePtr,
229+
std::string &ErrorStr, bool Abort) {
230+
reportError(ErrorStr.c_str());
231+
232+
if (!Device.OMPX_TrackAllocationTraces) {
233+
print(Yellow, "Use '%s=true' to track device allocations\n",
234+
Device.OMPX_TrackAllocationTraces.getName().data());
235+
if (Abort)
236+
abortExecution();
237+
return;
238+
}
239+
uintptr_t Distance = false;
240+
auto *ATI =
241+
Device.getClosestAllocationTraceInfoForAddr(DevicePtr, Distance);
242+
if (!ATI) {
243+
print(Cyan,
244+
"No host-issued allocations; device pointer %p might be "
245+
"a global, stack, or shared location\n",
246+
DevicePtr);
247+
if (Abort)
248+
abortExecution();
249+
return;
250+
}
251+
if (!Distance) {
252+
print(Cyan, "Device pointer %p points into%s host-issued allocation:\n",
253+
DevicePtr, ATI->DeallocationTrace.empty() ? "" : " prior");
254+
reportAllocationInfo(ATI);
255+
if (Abort)
256+
abortExecution();
257+
return;
258+
}
259+
260+
bool IsClose = Distance < (1L << 29L /*512MB=*/);
261+
print(Cyan,
262+
"Device pointer %p does not point into any (current or prior) "
263+
"host-issued allocation%s.\n",
264+
DevicePtr,
265+
IsClose ? "" : " (might be a global, stack, or shared location)");
266+
if (IsClose) {
267+
print(Cyan,
268+
"Closest host-issued allocation (distance %" PRIuPTR
269+
" byte%s; might be by page):\n",
270+
Distance, Distance > 1 ? "s" : "");
271+
reportAllocationInfo(ATI);
272+
}
273+
if (Abort)
274+
abortExecution();
275+
}
276+
222277
/// Report that a kernel encountered a trap instruction.
223278
static void reportTrapInKernel(
224279
GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -938,6 +938,42 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
938938
/// been deallocated, both for error reporting purposes.
939939
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
940940

941+
/// Return the allocation trace info for a device pointer, that is the
942+
/// allocation into which this device pointer points to (or pointed into).
943+
AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
944+
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
945+
for (auto &It : *AllocationTraceMap) {
946+
if (It.first <= DevicePtr &&
947+
advanceVoidPtr(It.first, It.second->Size) > DevicePtr)
948+
return It.second;
949+
}
950+
return nullptr;
951+
}
952+
953+
/// Return the allocation trace info for a device pointer, that is the
954+
/// allocation into which this device pointer points to (or pointed into).
955+
AllocationTraceInfoTy *
956+
getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
957+
Distance = 0;
958+
if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
959+
return ATI;
960+
}
961+
962+
AllocationTraceInfoTy *ATI = nullptr;
963+
uintptr_t DevicePtrI = uintptr_t(DevicePtr);
964+
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
965+
for (auto &It : *AllocationTraceMap) {
966+
uintptr_t Begin = uintptr_t(It.second->DevicePtr);
967+
uintptr_t End = Begin + It.second->Size - 1;
968+
uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
969+
if (ATI && ItDistance > Distance)
970+
continue;
971+
ATI = It.second;
972+
Distance = ItDistance;
973+
}
974+
return ATI;
975+
}
976+
941977
/// Map to record kernel have been launchedl, for error reporting purposes.
942978
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
943979

@@ -946,6 +982,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
946982
UInt32Envar OMPX_TrackNumKernelLaunches =
947983
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
948984

985+
/// Environment variable to determine if stack traces for allocations and
986+
/// deallocations are tracked.
987+
BoolEnvar OMPX_TrackAllocationTraces =
988+
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
989+
949990
private:
950991
/// Get and set the stack size and heap size for the device. If not used, the
951992
/// plugin can implement the setters as no-op and setting the output
@@ -996,11 +1037,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
9961037
UInt32Envar OMPX_InitialNumStreams;
9971038
UInt32Envar OMPX_InitialNumEvents;
9981039

999-
/// Environment variable to determine if stack traces for allocations and
1000-
/// deallocations are tracked.
1001-
BoolEnvar OMPX_TrackAllocationTraces =
1002-
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
1003-
10041040
/// Array of images loaded into the device. Images are automatically
10051041
/// deallocated by the allocator.
10061042
llvm::SmallVector<DeviceImageTy *> LoadedImages;

offload/test/sanitizer/double_free.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ int main(void) {
3636
// NDEBG: main
3737
// DEBUG: main {{.*}}double_free.c:24
3838
//
39-
// CHECK: Last allocation of size 8:
39+
// CHECK: Last allocation of size 8 -> device pointer
4040
// CHECK: dataAlloc
4141
// CHECK: omp_target_alloc
4242
// NDEBG: main
@@ -49,7 +49,7 @@ int main(void) {
4949
// NDEBG: main
5050
// DEBUG: main {{.*}}double_free.c:22
5151
//
52-
// CHECK: #0 Prior allocation:
52+
// CHECK: #0 Prior allocation -> device pointer
5353
// CHECK: dataAlloc
5454
// CHECK: omp_target_alloc
5555
// NDEBG: main
@@ -61,7 +61,7 @@ int main(void) {
6161
// NDEBG: main
6262
// DEBUG: main {{.*}}double_free.c:20
6363
//
64-
// CHECK: #1 Prior allocation:
64+
// CHECK: #1 Prior allocation -> device pointer
6565
// CHECK: dataAlloc
6666
// CHECK: omp_target_alloc
6767
// NDEBG: main

offload/test/sanitizer/double_free_racy.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,6 @@ int main(void) {
2828
// CHECK: dataDelete
2929
// CHECK: omp_target_free
3030

31-
// CHECK: Last allocation of size 8:
31+
// CHECK: Last allocation of size 8 -> device pointer
3232
// CHECK: dataAlloc
3333
// CHECK: omp_target_alloc

offload/test/sanitizer/free_wrong_ptr_kind.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ int main(void) {
2828
// NDEBG: main
2929
// DEBUG: main {{.*}}free_wrong_ptr_kind.c:22
3030
//
31-
// CHECK: Last allocation of size 8:
31+
// CHECK: Last allocation of size 8 -> device pointer
3232
// CHECK: dataAlloc
3333
// CHECK: llvm_omp_target_alloc_host
3434
// NDEBG: main

offload/test/sanitizer/free_wrong_ptr_kind.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ int main(void) {
3131
// NDEBG: main
3232
// DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:25
3333
//
34-
// CHECK: Last allocation of size 8:
34+
// CHECK: Last allocation of size 8 -> device pointer
3535
// CHECK: dataAlloc
3636
// CHECK: llvm_omp_target_alloc_shared
3737
// NDEBG: main
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compileopt-generic
3+
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
4+
// RUN: %libomptarget-compileopt-generic
5+
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
6+
// clang-format on
7+
8+
// UNSUPPORTED: aarch64-unknown-linux-gnu
9+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
10+
// UNSUPPORTED: x86_64-pc-linux-gnu
11+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
12+
// UNSUPPORTED: s390x-ibm-linux-gnu
13+
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
14+
15+
#include <omp.h>
16+
17+
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
18+
void llvm_omp_target_free_host(void *Ptr, int DeviceNum);
19+
20+
int main() {
21+
int N = (1 << 30);
22+
char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
23+
char *P;
24+
#pragma omp target map(from : P)
25+
{
26+
P = &A[0];
27+
*P = 3;
28+
}
29+
// clang-format off
30+
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
31+
// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
32+
// TRACE: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
33+
// TRACE: Closest host-issued allocation (distance 4096 bytes; might be by page):
34+
// TRACE: Last allocation of size 1073741824
35+
// clang-format on
36+
#pragma omp target
37+
{ P[-4] = 5; }
38+
39+
llvm_omp_target_free_host(A, omp_get_default_device());
40+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compileopt-generic
3+
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
4+
// clang-format on
5+
6+
// UNSUPPORTED: aarch64-unknown-linux-gnu
7+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
8+
// UNSUPPORTED: x86_64-pc-linux-gnu
9+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
10+
// UNSUPPORTED: s390x-ibm-linux-gnu
11+
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
12+
13+
#include <omp.h>
14+
15+
int main() {
16+
int N = (1 << 30);
17+
char *A = (char *)malloc(N);
18+
#pragma omp target map(A[ : N])
19+
{ A[N] = 3; }
20+
// clang-format off
21+
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
22+
// CHECK: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
23+
// CHECK: Closest host-issued allocation (distance 1 byte; might be by page):
24+
// CHECK: Last allocation of size 1073741824
25+
// clang-format on
26+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compileopt-generic
3+
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
4+
// RUN: %libomptarget-compileopt-generic
5+
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
6+
// clang-format on
7+
8+
// UNSUPPORTED: aarch64-unknown-linux-gnu
9+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
10+
// UNSUPPORTED: x86_64-pc-linux-gnu
11+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
12+
// UNSUPPORTED: s390x-ibm-linux-gnu
13+
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
14+
15+
#include <omp.h>
16+
17+
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
18+
void llvm_omp_target_free_host(void *Ptr, int DeviceNum);
19+
20+
int main() {
21+
int N = (1 << 30);
22+
char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
23+
char *P;
24+
#pragma omp target map(from : P)
25+
{
26+
P = &A[N / 2];
27+
*P = 3;
28+
}
29+
llvm_omp_target_free_host(A, omp_get_default_device());
30+
// clang-format off
31+
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
32+
// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
33+
// TRACE: Device pointer [[PTR]] points into prior host-issued allocation:
34+
// TRACE: Last deallocation:
35+
// TRACE: Last allocation of size 1073741824
36+
// clang-format on
37+
#pragma omp target
38+
{ *P = 5; }
39+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compileopt-generic
3+
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
4+
// clang-format on
5+
6+
// UNSUPPORTED: aarch64-unknown-linux-gnu
7+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
8+
// UNSUPPORTED: x86_64-pc-linux-gnu
9+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
10+
// UNSUPPORTED: s390x-ibm-linux-gnu
11+
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
12+
13+
#include <omp.h>
14+
15+
int main() {
16+
int N = (1 << 30);
17+
char *A = (char *)malloc(N);
18+
char *P;
19+
#pragma omp target map(A[ : N]) map(from : P)
20+
{
21+
P = &A[N / 2];
22+
*P = 3;
23+
}
24+
// clang-format off
25+
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
26+
// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
27+
// CHECK: Last deallocation:
28+
// CHECK: Last allocation of size 1073741824
29+
// clang-format on
30+
#pragma omp target
31+
{ *P = 5; }
32+
}

0 commit comments

Comments
 (0)