Skip to content

Commit 3feab00

Browse files
authored
Rollup merge of #113939 - the8472:pidfd-from-child, r=Mark-Simulacrum
Open pidfd in child process and send to the parent via SOCK_SEQPACKET+CMSG. This avoids using `clone3` when a pidfd is requested while still getting it in a 100% race-free manner by passing it up from the child process. This should solve most concerns in #82971.
2 parents a946c1e + 8d349c1 commit 3feab00

File tree

2 files changed

+156
-90
lines changed

2 files changed

+156
-90
lines changed

library/std/src/sys/unix/process/process_unix.rs

+131-90
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,6 @@ use core::ffi::NonZero_c_int;
1010
#[cfg(target_os = "linux")]
1111
use crate::os::linux::process::PidFd;
1212

13-
#[cfg(target_os = "linux")]
14-
use crate::sys::weak::raw_syscall;
15-
1613
#[cfg(any(
1714
target_os = "macos",
1815
target_os = "watchos",
@@ -91,6 +88,11 @@ impl Command {
9188
if let Some(ret) = self.posix_spawn(&theirs, envp.as_ref())? {
9289
return Ok((ret, ours));
9390
}
91+
92+
#[cfg(target_os = "linux")]
93+
let (input, output) = sys::net::Socket::new_pair(libc::AF_UNIX, libc::SOCK_SEQPACKET)?;
94+
95+
#[cfg(not(target_os = "linux"))]
9496
let (input, output) = sys::pipe::anon_pipe()?;
9597

9698
// Whatever happens after the fork is almost for sure going to touch or
@@ -104,12 +106,16 @@ impl Command {
104106
// The child calls `mem::forget` to leak the lock, which is crucial because
105107
// releasing a lock is not async-signal-safe.
106108
let env_lock = sys::os::env_read_lock();
107-
let (pid, pidfd) = unsafe { self.do_fork()? };
109+
let pid = unsafe { self.do_fork()? };
108110

109111
if pid == 0 {
110112
crate::panic::always_abort();
111113
mem::forget(env_lock); // avoid non-async-signal-safe unlocking
112114
drop(input);
115+
#[cfg(target_os = "linux")]
116+
if self.get_create_pidfd() {
117+
self.send_pidfd(&output);
118+
}
113119
let Err(err) = unsafe { self.do_exec(theirs, envp.as_ref()) };
114120
let errno = err.raw_os_error().unwrap_or(libc::EINVAL) as u32;
115121
let errno = errno.to_be_bytes();
@@ -133,6 +139,12 @@ impl Command {
133139
drop(env_lock);
134140
drop(output);
135141

142+
#[cfg(target_os = "linux")]
143+
let pidfd = if self.get_create_pidfd() { self.recv_pidfd(&input) } else { -1 };
144+
145+
#[cfg(not(target_os = "linux"))]
146+
let pidfd = -1;
147+
136148
// Safety: On Linux the pidfd is either -1 or was received from the child
137149
// via `recv_pidfd`, so it's valid and otherwise unowned.
138150
let mut p = unsafe { Process::new(pid, pidfd) };
@@ -160,6 +172,7 @@ impl Command {
160172
}
161173
Ok(..) => {
162174
// pipe I/O up to PIPE_BUF bytes should be atomic
175+
// similarly SOCK_SEQPACKET messages should arrive whole
163176
assert!(p.wait().is_ok(), "wait() should either return Ok or panic");
164177
panic!("short read on the CLOEXEC pipe")
165178
}
@@ -185,28 +198,27 @@ impl Command {
185198
);
186199

187200
#[cfg(any(target_os = "tvos", target_os = "watchos"))]
188-
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
201+
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
189202
return Err(Self::ERR_APPLE_TV_WATCH_NO_FORK_EXEC);
190203
}
191204

192205
// Attempts to fork the process. If successful, returns Ok(0)
193206
// in the child, and Ok(child_pid) in the parent.
194207
#[cfg(not(any(
195-
target_os = "linux",
196208
target_os = "watchos",
197209
target_os = "tvos",
198210
all(target_os = "nto", target_env = "nto71"),
199211
)))]
200-
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
201-
cvt(libc::fork()).map(|res| (res, -1))
212+
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
213+
cvt(libc::fork())
202214
}
203215

204216
// On QNX Neutrino, fork can fail with EBADF in case "another thread might have opened
205217
// or closed a file descriptor while the fork() was occurring".
206218
// Documentation says "... or try calling fork() again". This is what we do here.
207219
// See also https://www.qnx.com/developers/docs/7.1/#com.qnx.doc.neutrino.lib_ref/topic/f/fork.html
208220
#[cfg(all(target_os = "nto", target_env = "nto71"))]
209-
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
221+
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
210222
use crate::sys::os::errno;
211223

212224
let mut delay = MIN_FORKSPAWN_SLEEP;
@@ -229,91 +241,11 @@ impl Command {
229241
delay *= 2;
230242
continue;
231243
} else {
232-
return cvt(r).map(|res| (res, -1));
244+
return cvt(r);
233245
}
234246
}
235247
}
236248

237-
// Attempts to fork the process. If successful, returns Ok((0, -1))
238-
// in the child, and Ok((child_pid, child_pidfd)) in the parent.
239-
#[cfg(target_os = "linux")]
240-
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
241-
use crate::sync::atomic::{AtomicBool, Ordering};
242-
243-
static HAS_CLONE3: AtomicBool = AtomicBool::new(true);
244-
const CLONE_PIDFD: u64 = 0x00001000;
245-
246-
#[repr(C)]
247-
struct clone_args {
248-
flags: u64,
249-
pidfd: u64,
250-
child_tid: u64,
251-
parent_tid: u64,
252-
exit_signal: u64,
253-
stack: u64,
254-
stack_size: u64,
255-
tls: u64,
256-
set_tid: u64,
257-
set_tid_size: u64,
258-
cgroup: u64,
259-
}
260-
261-
raw_syscall! {
262-
fn clone3(cl_args: *mut clone_args, len: libc::size_t) -> libc::c_long
263-
}
264-
265-
// Bypassing libc for `clone3` can make further libc calls unsafe,
266-
// so we use it sparingly for now. See #89522 for details.
267-
// Some tools (e.g. sandboxing tools) may also expect `fork`
268-
// rather than `clone3`.
269-
let want_clone3_pidfd = self.get_create_pidfd();
270-
271-
// If we fail to create a pidfd for any reason, this will
272-
// stay as -1, which indicates an error.
273-
let mut pidfd: pid_t = -1;
274-
275-
// Attempt to use the `clone3` syscall, which supports more arguments
276-
// (in particular, the ability to create a pidfd). If this fails,
277-
// we will fall through this block to a call to `fork()`
278-
if want_clone3_pidfd && HAS_CLONE3.load(Ordering::Relaxed) {
279-
let mut args = clone_args {
280-
flags: CLONE_PIDFD,
281-
pidfd: &mut pidfd as *mut pid_t as u64,
282-
child_tid: 0,
283-
parent_tid: 0,
284-
exit_signal: libc::SIGCHLD as u64,
285-
stack: 0,
286-
stack_size: 0,
287-
tls: 0,
288-
set_tid: 0,
289-
set_tid_size: 0,
290-
cgroup: 0,
291-
};
292-
293-
let args_ptr = &mut args as *mut clone_args;
294-
let args_size = crate::mem::size_of::<clone_args>();
295-
296-
let res = cvt(clone3(args_ptr, args_size));
297-
match res {
298-
Ok(n) => return Ok((n as pid_t, pidfd)),
299-
Err(e) => match e.raw_os_error() {
300-
// Multiple threads can race to execute this store,
301-
// but that's fine - that just means that multiple threads
302-
// will have tried and failed to execute the same syscall,
303-
// with no other side effects.
304-
Some(libc::ENOSYS) => HAS_CLONE3.store(false, Ordering::Relaxed),
305-
// Fallback to fork if `EPERM` is returned. (e.g. blocked by seccomp)
306-
Some(libc::EPERM) => {}
307-
_ => return Err(e),
308-
},
309-
}
310-
}
311-
312-
// Generally, we just call `fork`. If we get here after wanting `clone3`,
313-
// then the syscall does not exist or we do not have permission to call it.
314-
cvt(libc::fork()).map(|res| (res, pidfd))
315-
}
316-
317249
pub fn exec(&mut self, default: Stdio) -> io::Error {
318250
let envp = self.capture_env();
319251

@@ -722,6 +654,115 @@ impl Command {
722654
Ok(Some(p))
723655
}
724656
}
657+
658+
#[cfg(target_os = "linux")]
659+
fn send_pidfd(&self, sock: &crate::sys::net::Socket) {
660+
use crate::io::IoSlice;
661+
use crate::os::fd::RawFd;
662+
use crate::sys::cvt_r;
663+
use libc::{CMSG_DATA, CMSG_FIRSTHDR, CMSG_LEN, CMSG_SPACE, SCM_RIGHTS, SOL_SOCKET};
664+
665+
unsafe {
666+
let child_pid = libc::getpid();
667+
// pidfd_open sets CLOEXEC by default
668+
let pidfd = libc::syscall(libc::SYS_pidfd_open, child_pid, 0);
669+
670+
let fds: [c_int; 1] = [pidfd as RawFd];
671+
672+
const SCM_MSG_LEN: usize = mem::size_of::<[c_int; 1]>();
673+
674+
#[repr(C)]
675+
union Cmsg {
676+
buf: [u8; unsafe { CMSG_SPACE(SCM_MSG_LEN as u32) as usize }],
677+
_align: libc::cmsghdr,
678+
}
679+
680+
let mut cmsg: Cmsg = mem::zeroed();
681+
682+
// 0-length message to send through the socket so we can pass along the fd
683+
let mut iov = [IoSlice::new(b"")];
684+
let mut msg: libc::msghdr = mem::zeroed();
685+
686+
msg.msg_iov = &mut iov as *mut _ as *mut _;
687+
msg.msg_iovlen = 1;
688+
msg.msg_controllen = mem::size_of_val(&cmsg.buf) as _;
689+
msg.msg_control = &mut cmsg.buf as *mut _ as *mut _;
690+
691+
// only attach cmsg if we successfully acquired the pidfd
692+
if pidfd >= 0 {
693+
let hdr = CMSG_FIRSTHDR(&mut msg as *mut _ as *mut _);
694+
(*hdr).cmsg_level = SOL_SOCKET;
695+
(*hdr).cmsg_type = SCM_RIGHTS;
696+
(*hdr).cmsg_len = CMSG_LEN(SCM_MSG_LEN as _) as _;
697+
let data = CMSG_DATA(hdr);
698+
crate::ptr::copy_nonoverlapping(
699+
fds.as_ptr().cast::<u8>(),
700+
data as *mut _,
701+
SCM_MSG_LEN,
702+
);
703+
}
704+
705+
// we send the 0-length message even if we failed to acquire the pidfd
706+
// so we get a consistent SEQPACKET order
707+
match cvt_r(|| libc::sendmsg(sock.as_raw(), &msg, 0)) {
708+
Ok(0) => {}
709+
_ => rtabort!("failed to communicate with parent process"),
710+
}
711+
}
712+
}
713+
714+
#[cfg(target_os = "linux")]
715+
fn recv_pidfd(&self, sock: &crate::sys::net::Socket) -> pid_t {
716+
use crate::io::IoSliceMut;
717+
use crate::sys::cvt_r;
718+
719+
use libc::{CMSG_DATA, CMSG_FIRSTHDR, CMSG_LEN, CMSG_SPACE, SCM_RIGHTS, SOL_SOCKET};
720+
721+
unsafe {
722+
const SCM_MSG_LEN: usize = mem::size_of::<[c_int; 1]>();
723+
724+
#[repr(C)]
725+
union Cmsg {
726+
_buf: [u8; unsafe { CMSG_SPACE(SCM_MSG_LEN as u32) as usize }],
727+
_align: libc::cmsghdr,
728+
}
729+
let mut cmsg: Cmsg = mem::zeroed();
730+
// 0-length read to get the fd
731+
let mut iov = [IoSliceMut::new(&mut [])];
732+
733+
let mut msg: libc::msghdr = mem::zeroed();
734+
735+
msg.msg_iov = &mut iov as *mut _ as *mut _;
736+
msg.msg_iovlen = 1;
737+
msg.msg_controllen = mem::size_of::<Cmsg>() as _;
738+
msg.msg_control = &mut cmsg as *mut _ as *mut _;
739+
740+
match cvt_r(|| libc::recvmsg(sock.as_raw(), &mut msg, 0)) {
741+
Err(_) => return -1,
742+
Ok(_) => {}
743+
}
744+
745+
let hdr = CMSG_FIRSTHDR(&mut msg as *mut _ as *mut _);
746+
if hdr.is_null()
747+
|| (*hdr).cmsg_level != SOL_SOCKET
748+
|| (*hdr).cmsg_type != SCM_RIGHTS
749+
|| (*hdr).cmsg_len != CMSG_LEN(SCM_MSG_LEN as _) as _
750+
{
751+
return -1;
752+
}
753+
let data = CMSG_DATA(hdr);
754+
755+
let mut fds = [-1 as c_int];
756+
757+
crate::ptr::copy_nonoverlapping(
758+
data as *const _,
759+
fds.as_mut_ptr().cast::<u8>(),
760+
SCM_MSG_LEN,
761+
);
762+
763+
fds[0]
764+
}
765+
}
725766
}
726767

727768
////////////////////////////////////////////////////////////////////////////////

library/std/src/sys/unix/process/process_unix/tests.rs

+25
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,28 @@ fn test_command_fork_no_unwind() {
6060
|| signal == libc::SIGSEGV
6161
);
6262
}
63+
64+
/// Spawns a child with `create_pidfd(true)` and, when the running kernel
/// supports `pidfd_open`, checks that the resulting `Child` exposes a pidfd.
#[test]
#[cfg(target_os = "linux")]
fn test_command_pidfd() {
    use crate::os::fd::RawFd;
    use crate::os::linux::process::{ChildExt, CommandExt};
    use crate::process::Command;

    // Probe for pidfd support by opening (and immediately closing) a pidfd
    // for our own process.
    let our_pid = crate::process::id();
    let pidfd = unsafe { libc::syscall(libc::SYS_pidfd_open, our_pid, 0) };
    let pidfd_open_available = if pidfd >= 0 {
        unsafe { libc::close(pidfd as RawFd) };
        true
    } else {
        false
    };

    // always exercise creation attempts
    let mut child = Command::new("echo").create_pidfd(true).spawn().unwrap();

    // but only check if we know that the kernel supports pidfds
    if pidfd_open_available {
        assert!(child.pidfd().is_ok());
    }

    // Reap the child so the test does not leave a zombie process behind.
    child.wait().unwrap();
}

0 commit comments

Comments
 (0)