Skip to content

Commit 84bb0f0

Browse files
committed
std: stop using TLS in signal handler
TLS is not async-signal-safe, making its use in the signal handler used to detect stack overflows unsound (cf. #133698). POSIX, however, lists two thread-specific identifiers that can be obtained in a signal handler: the current `pthread_t` and the address of `errno`. Since `pthread_equal` is not async-signal-safe, `pthread_t` should be considered opaque, so for our purposes, `&errno` is the only option. This, however, works nicely: we can use the address as a key into a map that stores information for each thread. This PR uses a `BTreeMap` protected by a spin lock to hold the guard page address and thread name and thus fixes #133698.
1 parent 097cd98 commit 84bb0f0

File tree

2 files changed

+183
-38
lines changed

2 files changed

+183
-38
lines changed

library/std/src/sys/pal/unix/stack_overflow.rs

+54-38
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,18 @@ impl Drop for Handler {
2525
}
2626
}
2727

28+
#[cfg(any(
29+
target_os = "linux",
30+
target_os = "freebsd",
31+
target_os = "hurd",
32+
target_os = "macos",
33+
target_os = "netbsd",
34+
target_os = "openbsd",
35+
target_os = "solaris",
36+
target_os = "illumos",
37+
))]
38+
mod thread_info;
39+
2840
#[cfg(any(
2941
target_os = "linux",
3042
target_os = "freebsd",
@@ -46,22 +58,13 @@ mod imp {
4658
use libc::{mmap64, mprotect, munmap};
4759

4860
use super::Handler;
49-
use crate::cell::Cell;
61+
use super::thread_info::{delete_current_info, set_current_info, with_current_info};
5062
use crate::ops::Range;
5163
use crate::sync::OnceLock;
5264
use crate::sync::atomic::{Atomic, AtomicBool, AtomicPtr, AtomicUsize, Ordering};
5365
use crate::sys::pal::unix::os;
54-
use crate::{io, mem, ptr, thread};
55-
56-
// We use a TLS variable to store the address of the guard page. While TLS
57-
// variables are not guaranteed to be signal-safe, this works out in practice
58-
// since we make sure to write to the variable before the signal stack is
59-
// installed, thereby ensuring that the variable is always allocated when
60-
// the signal handler is called.
61-
thread_local! {
62-
// FIXME: use `Range` once that implements `Copy`.
63-
static GUARD: Cell<(usize, usize)> = const { Cell::new((0, 0)) };
64-
}
66+
use crate::thread::with_current_name;
67+
use crate::{io, mem, panic, ptr};
6568

6669
// Signal handler for the SIGSEGV and SIGBUS handlers. We've got guard pages
6770
// (unmapped pages) at the end of every thread's stack, so if a thread ends
@@ -93,29 +96,35 @@ mod imp {
9396
info: *mut libc::siginfo_t,
9497
_data: *mut libc::c_void,
9598
) {
96-
let (start, end) = GUARD.get();
9799
// SAFETY: this pointer is provided by the system and will always point to a valid `siginfo_t`.
98-
let addr = unsafe { (*info).si_addr().addr() };
100+
let fault_addr = unsafe { (*info).si_addr().addr() };
101+
102+
// `with_current_info` expects that the process aborts after it is
103+
// called. If the signal was not caused by a memory access, this might
104+
// not be true. We detect this by noticing that the `si_addr` field is
105+
// zero if the signal is synthetic.
106+
if fault_addr != 0 {
107+
with_current_info(|thread_info| {
108+
// If the faulting address is within the guard page, then we print a
109+
// message saying so and abort.
110+
if let Some(thread_info) = thread_info
111+
&& thread_info.guard_page_range.contains(&fault_addr)
112+
{
113+
let name = thread_info.thread_name.as_deref().unwrap_or("<unknown>");
114+
rtprintpanic!("\nthread '{name}' has overflowed its stack\n");
115+
rtabort!("stack overflow");
116+
}
117+
})
118+
}
99119

100-
// If the faulting address is within the guard page, then we print a
101-
// message saying so and abort.
102-
if start <= addr && addr < end {
103-
thread::with_current_name(|name| {
104-
let name = name.unwrap_or("<unknown>");
105-
rtprintpanic!("\nthread '{name}' has overflowed its stack\n");
106-
});
120+
// Unregister ourselves by reverting back to the default behavior.
121+
// SAFETY: assuming all platforms define struct sigaction as "zero-initializable"
122+
let mut action: sigaction = unsafe { mem::zeroed() };
123+
action.sa_sigaction = SIG_DFL;
124+
// SAFETY: pray this is a well-behaved POSIX implementation of fn sigaction
125+
unsafe { sigaction(signum, &action, ptr::null_mut()) };
107126

108-
rtabort!("stack overflow");
109-
} else {
110-
// Unregister ourselves by reverting back to the default behavior.
111-
// SAFETY: assuming all platforms define struct sigaction as "zero-initializable"
112-
let mut action: sigaction = unsafe { mem::zeroed() };
113-
action.sa_sigaction = SIG_DFL;
114-
// SAFETY: pray this is a well-behaved POSIX implementation of fn sigaction
115-
unsafe { sigaction(signum, &action, ptr::null_mut()) };
116-
117-
// See comment above for why this function returns.
118-
}
127+
// See comment above for why this function returns.
119128
}
120129

121130
static PAGE_SIZE: Atomic<usize> = AtomicUsize::new(0);
@@ -128,9 +137,7 @@ mod imp {
128137
pub unsafe fn init() {
129138
PAGE_SIZE.store(os::page_size(), Ordering::Relaxed);
130139

131-
// Always write to GUARD to ensure the TLS variable is allocated.
132-
let guard = unsafe { install_main_guard().unwrap_or(0..0) };
133-
GUARD.set((guard.start, guard.end));
140+
let mut guard_page_range = unsafe { install_main_guard() };
134141

135142
// SAFETY: assuming all platforms define struct sigaction as "zero-initializable"
136143
let mut action: sigaction = unsafe { mem::zeroed() };
@@ -145,7 +152,13 @@ mod imp {
145152
let handler = unsafe { make_handler(true) };
146153
MAIN_ALTSTACK.store(handler.data, Ordering::Relaxed);
147154
mem::forget(handler);
155+
156+
if let Some(guard_page_range) = guard_page_range.take() {
157+
let thread_name = with_current_name(|name| name.map(Box::from));
158+
set_current_info(guard_page_range, thread_name);
159+
}
148160
}
161+
149162
action.sa_flags = SA_SIGINFO | SA_ONSTACK;
150163
action.sa_sigaction = signal_handler as sighandler_t;
151164
// SAFETY: only overriding signals if the default is set
@@ -214,9 +227,10 @@ mod imp {
214227
}
215228

216229
if !main_thread {
217-
// Always write to GUARD to ensure the TLS variable is allocated.
218-
let guard = unsafe { current_guard() }.unwrap_or(0..0);
219-
GUARD.set((guard.start, guard.end));
230+
if let Some(guard_page_range) = unsafe { current_guard() } {
231+
let thread_name = with_current_name(|name| name.map(Box::from));
232+
set_current_info(guard_page_range, thread_name);
233+
}
220234
}
221235

222236
// SAFETY: assuming stack_t is zero-initializable
@@ -261,6 +275,8 @@ mod imp {
261275
// a mapping that started one page earlier, so walk back a page and unmap from there.
262276
unsafe { munmap(data.sub(page_size), sigstack_size + page_size) };
263277
}
278+
279+
delete_current_info();
264280
}
265281

266282
/// Modern kernels on modern hardware can have dynamic signal stack sizes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
//! TLS, but async-signal-safe.
2+
//!
3+
//! Unfortunately, because thread local storage isn't async-signal-safe, we
4+
//! cannot soundly use it in our stack overflow handler. While this works
5+
//! without problems on most platforms, it can lead to undefined behaviour
6+
//! on others (such as GNU/Linux). Luckily, the POSIX specification documents
7+
//! two thread-specific values that can be accessed in asynchronous signal
8+
//! handlers: the value of `pthread_self()` and the address of `errno`. As
9+
//! `pthread_t` is an opaque platform-specific type, we use the address of
10+
//! `errno` here. As it is thread-specific and does not change over the
11+
//! lifetime of a thread, we can use `&errno` as a key for a `BTreeMap`
12+
//! that stores thread-specific data.
13+
//!
14+
//! Concurrent access to this map is synchronized by two locks – an outer
15+
//! [`Mutex`] and an inner spin lock that also remembers the identity of
16+
//! the lock owner:
17+
//! * The spin lock is the primary means of synchronization: since it only
18+
//! uses native atomics, it can be soundly used inside the signal handler,
19+
//! as opposed to [`Mutex`], which might not be async-signal-safe.
20+
//! * The [`Mutex`] prevents busy-waiting in the setup logic, as all accesses
21+
//! there are performed with the [`Mutex`] held, which makes the spin-lock
22+
//! redundant in the common case.
23+
//! * Finally, by using the `errno` address as the locked value of the spin
24+
//! lock, we can detect cases where a SIGSEGV occurred while the thread
25+
//! info is being modified.
26+
27+
use crate::collections::BTreeMap;
28+
use crate::hint::spin_loop;
29+
use crate::ops::Range;
30+
use crate::sync::Mutex;
31+
use crate::sync::atomic::{AtomicUsize, Ordering};
32+
use crate::sys::os::errno_location;
33+
34+
pub struct ThreadInfo {
35+
pub guard_page_range: Range<usize>,
36+
pub thread_name: Option<Box<str>>,
37+
}
38+
39+
static LOCK: Mutex<()> = Mutex::new(());
40+
static SPIN_LOCK: AtomicUsize = AtomicUsize::new(0);
41+
// This uses a `BTreeMap` instead of a hashmap since it supports constant
42+
// initialization and automatically reduces the amount of memory used when
43+
// items are removed.
44+
static mut THREAD_INFO: BTreeMap<usize, ThreadInfo> = BTreeMap::new();
45+
46+
struct UnlockOnDrop;
47+
48+
impl Drop for UnlockOnDrop {
49+
fn drop(&mut self) {
50+
SPIN_LOCK.store(0, Ordering::Release);
51+
}
52+
}
53+
54+
/// Get the current thread's information, if available.
55+
///
56+
/// Calling this function might freeze other threads if they attempt to modify
57+
/// their thread information. Thus, the caller should ensure that the process
58+
/// is aborted shortly after this function is called.
59+
///
60+
/// This function is guaranteed to be async-signal-safe if `f` is too.
61+
pub fn with_current_info<R>(f: impl FnOnce(Option<&ThreadInfo>) -> R) -> R {
62+
let this = errno_location().addr();
63+
let mut attempt = 0;
64+
let _guard = loop {
65+
// If we are just spinning endlessly, it's very likely that the thread
66+
// modifying the thread info map has a lower priority than us and will
67+
// not continue until we stop running. Just give up in that case.
68+
if attempt == 10_000_000 {
69+
rtprintpanic!("deadlock in SIGSEGV handler");
70+
return f(None);
71+
}
72+
73+
match SPIN_LOCK.compare_exchange(0, this, Ordering::Acquire, Ordering::Relaxed) {
74+
Ok(_) => break UnlockOnDrop,
75+
Err(owner) if owner == this => {
76+
rtabort!("a thread received SIGSEGV while modifying its stack overflow information")
77+
}
78+
// Spin until the lock can be acquired – there is nothing better to
79+
// do. This is unfortunately a priority hole, but a stack overflow
80+
// is a fatal error anyway.
81+
Err(_) => {
82+
spin_loop();
83+
attempt += 1;
84+
}
85+
}
86+
};
87+
88+
// SAFETY: we own the spin lock, so `THREAD_INFO` cannot be aliased.
89+
let thread_info = unsafe { &*(&raw const THREAD_INFO) };
90+
f(thread_info.get(&this))
91+
}
92+
93+
fn spin_lock_in_setup(this: usize) -> UnlockOnDrop {
94+
loop {
95+
match SPIN_LOCK.compare_exchange(0, this, Ordering::Acquire, Ordering::Relaxed) {
96+
Ok(_) => return UnlockOnDrop,
97+
Err(owner) if owner == this => {
98+
unreachable!("the thread info setup logic isn't recursive")
99+
}
100+
// This function is always called with the outer lock held,
101+
// meaning the only time locking can fail is if another thread has
102+
// encountered a stack overflow. Since that will abort the process,
103+
// we just stop the current thread until that time. We use `pause`
104+
// instead of spinning to avoid priority inversion.
105+
// SAFETY: this doesn't have any safety preconditions.
106+
Err(_) => drop(unsafe { libc::pause() }),
107+
}
108+
}
109+
}
110+
111+
pub fn set_current_info(guard_page_range: Range<usize>, thread_name: Option<Box<str>>) {
112+
let this = errno_location().addr();
113+
let _lock_guard = LOCK.lock();
114+
let _spin_guard = spin_lock_in_setup(this);
115+
116+
// SAFETY: we own the spin lock, so `THREAD_INFO` cannot be aliased.
117+
let thread_info = unsafe { &mut *(&raw mut THREAD_INFO) };
118+
thread_info.insert(this, ThreadInfo { guard_page_range, thread_name });
119+
}
120+
121+
pub fn delete_current_info() {
122+
let this = errno_location().addr();
123+
let _lock_guard = LOCK.lock();
124+
let _spin_guard = spin_lock_in_setup(this);
125+
126+
// SAFETY: we own the spin lock, so `THREAD_INFO` cannot be aliased.
127+
let thread_info = unsafe { &mut *(&raw mut THREAD_INFO) };
128+
thread_info.remove(&this);
129+
}

0 commit comments

Comments
 (0)