Description
The unbounded channel implementation from `std::sync::mpsc` can segfault due to what looks like a race condition when dropping channel senders and receivers.
You can find a repro at https://github.com/teskje/timely-dataflow/tree/channel-segfault. Unfortunately it is not minimal at all, since it depends on the timely-dataflow crate, which is the actual channel user. I hope it is still self-contained enough that people familiar with the channel implementation can run it and use it as a starting point for debugging.
Run the `channel_segfault` example like this:
$ cargo run --release --example channel_segfault 1 -w4
On some machines* it will segfault after some time, e.g.:
...
worker 0: 730000
worker 1: 737400
worker 3: 737400
worker 2: 679400
Segmentation fault (core dumped)
The segfault happens inside `std::sync::mpmc::list` (the implementation behind `std::sync::mpsc`), as visible in the gdb backtrace below.
The same segfault also happens with `crossbeam_channel`. You can reproduce this by reverting the second-to-last commit on the repro branch. I understand that `std::sync::mpsc` is a port of `crossbeam_channel`, so that's not surprising.
Meta
`rustc --version --verbose`:
rustc 1.68.2 (9eb3afe9e 2023-03-27)
binary: rustc
commit-hash: 9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0
commit-date: 2023-03-27
host: x86_64-unknown-linux-gnu
release: 1.68.2
LLVM version: 15.0.6
I can reproduce the segfault on nightly as well.
*The segfault is not reproducible on all machines! We were able to reproduce it on:
- a c5.2xlarge EC2 instance running Ubuntu
- an AMD Threadripper Thinkpad running Debian
- an Intel i5-8250U Dell XPS 13 running Arch Linux
It doesn't seem to reproduce on:
- an M1 MacBook Pro
Also note that the repro is using more and more memory over time. That is because the different worker threads create and drop dataflows at different speeds and timely-dataflow keeps per-dataflow communication state (including channels) around until the last worker has dropped the dataflow. It is possible to avoid that by adding a `thread::sleep` that lets late workers catch up, but in my experience that also stops the segfault from happening.
Backtrace
#0 std::sync::mpmc::list::Slot<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>::wait_write<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (self=<optimized out>) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/list.rs:48
#1 std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>::discard_all_messages<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (self=0x7ffe5d371300) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/list.rs:560
#2 std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>::disconnect_receivers<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (self=0x7ffe5d371300) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/list.rs:523
#3 0x000055555557a6bf in std::sync::mpmc::{impl#15}::drop::{closure#1}<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (c=0x7ffe5d371300)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/mod.rs:407
#4 std::sync::mpmc::counter::Receiver<std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>>::release<std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>, std::sync::mpmc::{impl#15}::drop::{closure_env#1}<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>> (self=0x7fffe80cd858, disconnect=...)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/counter.rs:116
#5 0x00005555555a4ad6 in core::ptr::drop_in_place<alloc::boxed::Box<dyn timely_communication::Pull<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>, alloc::alloc::Global>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#6 core::ptr::drop_in_place<timely::progress::broadcast::Progcaster<usize>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#7 0x00005555555a57f1 in core::ptr::drop_in_place<timely::progress::subgraph::Subgraph<(), usize>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#8 0x00005555555c24ed in core::ptr::drop_in_place<alloc::boxed::Box<dyn timely::scheduling::Schedule, alloc::alloc::Global>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#9 core::ptr::drop_in_place<core::option::Option<alloc::boxed::Box<dyn timely::scheduling::Schedule, alloc::alloc::Global>>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#10 timely::worker::{impl#8}::drop (self=0x7ffff7b67530) at timely/src/worker.rs:775
#11 0x00005555555a3e3d in core::ptr::drop_in_place<timely::worker::Wrapper> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#12 0x00005555555a75a0 in timely::worker::Worker<timely_communication::allocator::generic::Generic>::drop_dataflow<timely_communication::allocator::generic::Generic> (self=<optimized out>, dataflow_identifier=<optimized out>)
at timely/src/worker.rs:685
#13 0x000055555557dffb in channel_segfault::main::{closure#0} (worker=0x7ffff7b67840) at timely/examples/channel_segfault.rs:25
#14 timely::execute::execute::{closure#1}<(), channel_segfault::main::{closure_env#0}> (allocator=...) at timely/src/execute.rs:287
#15 timely_communication::initialize::initialize_from::{closure#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>> ()
at communication/src/initialize.rs:316
#16 std::sys_common::backtrace::__rust_begin_short_backtrace<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()> (f=...) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sys_common/backtrace.rs:121
#17 0x000055555557eaa7 in std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/thread/mod.rs:558
#18 core::panic::unwind_safe::{impl#23}::call_once<(), std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>> (self=<error reading variable: Cannot access memory at address 0x0>, _args=<optimized out>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/panic/unwind_safe.rs:271
#19 std::panicking::try::do_call<core::panic::unwind_safe::AssertUnwindSafe<std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>>, ()> (data=<optimized out>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:483
#20 std::panicking::try<(), core::panic::unwind_safe::AssertUnwindSafe<std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>>> (f=<error reading variable: Cannot access memory at address 0x0>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:447
#21 std::panic::catch_unwind<core::panic::unwind_safe::AssertUnwindSafe<std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>>, ()> (f=<error reading variable: Cannot access memory at address 0x0>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panic.rs:140
#22 std::thread::{impl#0}::spawn_unchecked_::{closure#1}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/thread/mod.rs:557
#23 core::ops::function::FnOnce::call_once<std::thread::{impl#0}::spawn_unchecked_::{closure_env#1}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>, ()> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ops/function.rs:250
#24 0x0000555555613be3 in alloc::boxed::{impl#45}::call_once<(), dyn core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global> () at library/alloc/src/boxed.rs:1988
#25 alloc::boxed::{impl#45}::call_once<(), alloc::boxed::Box<dyn core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global>, alloc::alloc::Global> () at library/alloc/src/boxed.rs:1988
#26 std::sys::unix::thread::{impl#2}::new::thread_start () at library/std/src/sys/unix/thread.rs:108
#27 0x00007ffff7e01b43 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#28 0x00007ffff7e93a00 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81