Avoid disconnecting all peers if user code is slow

TheBlueMatt · TheBlueMatt · commit 8f5023a006ed · 2022-01-21T00:36:59.000Z
In the sample client (and likely other downstream users), event
processing may block on slow operations (e.g. Bitcoin Core RPCs)
and ChannelManager persistence may take some time. This should be
fine, except that we consider this a case of possible backgrounding
and disconnect all of our peers when it happens.

Instead, we here avoid considering event processing time in the
time between PeerManager events.
diff --git a/lightning-background-processor/src/lib.rs b/lightning-background-processor/src/lib.rs
@@ -219,11 +219,17 @@ impl BackgroundProcessor {
 			let mut have_pruned = false;
 
 			loop {
-				peer_manager.process_events();
+				peer_manager.process_events(); // Note that this may block on ChannelManager's locking
 				channel_manager.process_pending_events(&event_handler);
 				chain_monitor.process_pending_events(&event_handler);
+
+				// We wait up to 100ms, but track how long it takes to detect being put to sleep,
+				// see `await_start`'s use below.
+				let await_start = Instant::now();
 				let updates_available =
 					channel_manager.await_persistable_update_timeout(Duration::from_millis(100));
+				let await_time = await_start.elapsed();
+
 				if updates_available {
 					log_trace!(logger, "Persisting ChannelManager...");
 					persister.persist_manager(&*channel_manager)?;
@@ -239,15 +245,20 @@ impl BackgroundProcessor {
 					channel_manager.timer_tick_occurred();
 					last_freshness_call = Instant::now();
 				}
-				if last_ping_call.elapsed().as_secs() > PING_TIMER * 2 {
+				if await_time > Duration::from_secs(1) {
 					// On various platforms, we may be starved of CPU cycles for several reasons.
 					// E.g. on iOS, if we've been in the background, we will be entirely paused.
 					// Similarly, if we're on a desktop platform and the device has been asleep, we
 					// may not get any cycles.
-					// In any case, if we've been entirely paused for more than double our ping
-					// timer, we should have disconnected all sockets by now (and they're probably
-					// dead anyway), so disconnect them by calling `timer_tick_occurred()` twice.
-					log_trace!(logger, "Awoke after more than double our ping timer, disconnecting peers.");
+					// We detect this by checking if our max-100ms-sleep, above, ran longer than a
+					// full second, at which point we assume sockets may have been killed (they
+					// appear to be at least on some platforms, even if it has only been a second).
+					// Note that we have to take care to not get here just because user event
+					// processing was slow at the top of the loop. For example, the sample client
+					// may call Bitcoin Core RPCs during event handling, which very often takes
+					// more than a handful of seconds to complete, and shouldn't disconnect all our
+					// peers.
+					log_trace!(logger, "100ms sleep took more than a second, disconnecting peers.");
 					peer_manager.disconnect_all_peers();
 					last_ping_call = Instant::now();
 				} else if last_ping_call.elapsed().as_secs() > PING_TIMER {