Commit a1bb7ca

Disallow taking two instances of the same mutex at the same time
Taking two instances of the same mutex may be totally fine, but it requires a total lockorder that we cannot (trivially) check. Thus, it's generally unsafe to do if we can avoid it. To discourage doing this, here we default to panicking on such locks in our lockorder tests, with a separate lock function added that is clearly labeled "unsafe" to allow doing so when we can guarantee a total lockorder. This requires adapting a number of sites to the new API, including fixing a bug this turned up in `ChannelMonitor`'s `PartialEq`, where no lockorder was guaranteed.
1 parent 19bc412 · commit a1bb7ca

9 files changed (+102 -48 lines)
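
To illustrate the hazard this commit guards against, independent of any LDK types: two threads that take two instances of the same mutex type in opposite orders can deadlock, which is why a total lockorder (or an explicit, clearly-labeled opt-in) is required. A minimal, self-contained sketch in plain Rust follows; the names are illustrative only, and `try_lock` is used so the demo terminates instead of hanging:

use std::sync::{Arc, Mutex};
use std::thread;
use std::time::Duration;

fn main() {
	// Two instances of the same mutex type.
	let a = Arc::new(Mutex::new(0u32));
	let b = Arc::new(Mutex::new(0u32));

	let (a2, b2) = (Arc::clone(&a), Arc::clone(&b));
	let other = thread::spawn(move || {
		// This thread takes b first, then tries a...
		let _guard_b = b2.lock().unwrap();
		thread::sleep(Duration::from_millis(50));
		a2.try_lock().is_ok()
	});

	// ...while this thread takes a first, then tries b: a lockorder inversion.
	let _guard_a = a.lock().unwrap();
	thread::sleep(Duration::from_millis(50));
	let got_b = b.try_lock().is_ok();
	let got_a = other.join().unwrap();

	// Typically both `try_lock`s fail here; with blocking `lock()` calls the same
	// interleaving would leave both threads waiting on each other forever.
	println!("main got b: {got_b}, other thread got a: {got_a}");
}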

lightning/src/chain/channelmonitor.rs (+8 -4)

@@ -60,7 +60,7 @@ use core::{cmp, mem};
 use crate::io::{self, Error};
 use core::convert::TryInto;
 use core::ops::Deref;
-use crate::sync::Mutex;
+use crate::sync::{Mutex, LockTestExt};
 
 /// An update generated by the underlying channel itself which contains some new information the
 /// [`ChannelMonitor`] should be made aware of.
@@ -855,9 +855,13 @@ pub type TransactionOutputs = (Txid, Vec<(u32, TxOut)>);
 
 impl<Signer: WriteableEcdsaChannelSigner> PartialEq for ChannelMonitor<Signer> where Signer: PartialEq {
 	fn eq(&self, other: &Self) -> bool {
-		let inner = self.inner.lock().unwrap();
-		let other = other.inner.lock().unwrap();
-		inner.eq(&other)
+		// We need some kind of total lockorder. Absent a better idea, we sort by position in
+		// memory and take locks in that order (assuming that we can't move within memory while a
+		// lock is held).
+		let ord = ((self as *const _) as usize) < ((other as *const _) as usize);
+		let a = if ord { self.inner.unsafe_well_ordered_double_lock_self() } else { other.inner.unsafe_well_ordered_double_lock_self() };
+		let b = if ord { other.inner.unsafe_well_ordered_double_lock_self() } else { self.inner.unsafe_well_ordered_double_lock_self() };
+		a.eq(&b)
 	}
 }
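
The new `PartialEq` body above establishes a total order by comparing the two objects' memory addresses before locking either one. A standalone sketch of the same idiom, using plain `std::sync::Mutex` and a hypothetical `Account` type rather than LDK's instrumented locks:

use std::sync::{Mutex, MutexGuard};

// Hypothetical type, for illustration only.
struct Account { balance: u64 }

// Lock two instances of the same mutex type in a stable, address-based total order:
// whichever instance sits at the lower address is always locked first, so concurrent
// callers passing the arguments in either order agree on the lockorder. As in the
// commit, this assumes the mutexes cannot move in memory while locked, which holds
// here because we hold references to them for the whole call.
fn lock_both<'a>(x: &'a Mutex<Account>, y: &'a Mutex<Account>)
	-> (MutexGuard<'a, Account>, MutexGuard<'a, Account>) {
	if (x as *const _ as usize) < (y as *const _ as usize) {
		let guard_x = x.lock().unwrap();
		let guard_y = y.lock().unwrap();
		(guard_x, guard_y)
	} else {
		let guard_y = y.lock().unwrap();
		let guard_x = x.lock().unwrap();
		(guard_x, guard_y)
	}
}

fn main() {
	let a = Mutex::new(Account { balance: 10 });
	let b = Mutex::new(Account { balance: 20 });
	let (ga, gb) = lock_both(&a, &b);
	assert_eq!(ga.balance + gb.balance, 30);
}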

lightning/src/ln/chanmon_update_fail_tests.rs (+7 -6)

@@ -108,12 +108,13 @@ fn test_monitor_and_persister_update_fail() {
 		blocks: Arc::new(Mutex::new(vec![(genesis_block(Network::Testnet), 200); 200])),
 	};
 	let chain_mon = {
-		let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
-		let mut w = test_utils::TestVecWriter(Vec::new());
-		monitor.write(&mut w).unwrap();
-		let new_monitor = <(BlockHash, ChannelMonitor<EnforcingSigner>)>::read(
-			&mut io::Cursor::new(&w.0), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
-		assert!(new_monitor == *monitor);
+		let new_monitor = {
+			let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
+			let new_monitor = <(BlockHash, ChannelMonitor<EnforcingSigner>)>::read(
+				&mut io::Cursor::new(&monitor.encode()), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
+			assert!(new_monitor == *monitor);
+			new_monitor
+		};
 		let chain_mon = test_utils::TestChainMonitor::new(Some(&chain_source), &tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager);
 		assert_eq!(chain_mon.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed);
 		chain_mon
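
The test now wraps the borrow of the original monitor in an inner block so its guard is dropped before the freshly-deserialized copy is handed to a second `TestChainMonitor`, which takes locks of the same type. A minimal sketch of that scoping pattern, with generic `Mutex`es standing in for the monitor locks:

use std::sync::Mutex;

fn main() {
	// Stand-ins for two monitor sets guarded by the same lock type.
	let original_set = Mutex::new(vec![1u8, 2, 3]);
	let new_set = Mutex::new(Vec::<u8>::new());

	let copied = {
		// The guard on the original set lives only inside this block...
		let original = original_set.lock().unwrap();
		original.clone()
	}; // ...and is dropped here, before we take the same lock type again below.

	new_set.lock().unwrap().extend_from_slice(&copied);
	assert_eq!(*new_set.lock().unwrap(), vec![1, 2, 3]);
}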

lightning/src/ln/channelmanager.rs (+10 -3)

@@ -6930,7 +6930,10 @@ where
 		let mut monitor_update_blocked_actions_per_peer = None;
 		let mut peer_states = Vec::new();
 		for (_, peer_state_mutex) in per_peer_state.iter() {
-			peer_states.push(peer_state_mutex.lock().unwrap());
+			// Because we're holding the owning `per_peer_state` write lock here there's no chance
+			// of a lockorder violation deadlock - no other thread can be holding any
+			// per_peer_state lock at all.
+			peer_states.push(peer_state_mutex.unsafe_well_ordered_double_lock_self());
 		}
 
 		(serializable_peer_count).write(writer)?;
@@ -8275,18 +8278,20 @@ mod tests {
 			let nodes_0_lock = nodes[0].node.id_to_peer.lock().unwrap();
 			assert_eq!(nodes_0_lock.len(), 1);
 			assert!(nodes_0_lock.contains_key(channel_id));
-
-			assert_eq!(nodes[1].node.id_to_peer.lock().unwrap().len(), 0);
 		}
 
+		assert_eq!(nodes[1].node.id_to_peer.lock().unwrap().len(), 0);
+
 		let funding_created_msg = get_event_msg!(nodes[0], MessageSendEvent::SendFundingCreated, nodes[1].node.get_our_node_id());
 
 		nodes[1].node.handle_funding_created(&nodes[0].node.get_our_node_id(), &funding_created_msg);
 		{
 			let nodes_0_lock = nodes[0].node.id_to_peer.lock().unwrap();
 			assert_eq!(nodes_0_lock.len(), 1);
 			assert!(nodes_0_lock.contains_key(channel_id));
+		}
 
+		{
 			// Assert that `nodes[1]`'s `id_to_peer` map is populated with the channel as soon as
 			// as it has the funding transaction.
 			let nodes_1_lock = nodes[1].node.id_to_peer.lock().unwrap();
@@ -8316,7 +8321,9 @@ mod tests {
 			let nodes_0_lock = nodes[0].node.id_to_peer.lock().unwrap();
 			assert_eq!(nodes_0_lock.len(), 1);
 			assert!(nodes_0_lock.contains_key(channel_id));
+		}
 
+		{
 			// At this stage, `nodes[1]` has proposed a fee for the closing transaction in the
 			// `handle_closing_signed` call above. As `nodes[1]` has not yet received the signature
 			// from `nodes[0]` for the closing transaction with the proposed fee, the channel is
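
The comment in the first hunk carries the key safety argument: the per-peer mutexes can only be reached through the outer `per_peer_state` lock, so holding that outer lock in write mode excludes every other locker of the inner mutexes. A standalone sketch of that pattern, using a hypothetical `PeerState` type and plain `std` locks rather than LDK's wrappers:

use std::collections::HashMap;
use std::sync::{Mutex, RwLock};

// Hypothetical per-peer state, for illustration only.
struct PeerState { msgs_queued: usize }

// Assuming every reader/writer of an inner per-peer mutex first takes the outer lock
// (as ChannelManager does), once we hold the outer lock in *write* mode no other
// thread can be holding or acquiring any inner lock, so collecting guards for all of
// them in arbitrary order cannot deadlock.
fn total_queued(per_peer: &RwLock<HashMap<u64, Mutex<PeerState>>>) -> usize {
	let peers = per_peer.write().unwrap();
	let mut guards = Vec::new();
	for peer in peers.values() {
		// Many instances of the same mutex type held at once - safe only under the
		// exclusion provided by the outer write lock.
		guards.push(peer.lock().unwrap());
	}
	guards.iter().map(|g| g.msgs_queued).sum()
}

fn main() {
	let per_peer = RwLock::new(HashMap::from([
		(1u64, Mutex::new(PeerState { msgs_queued: 2 })),
		(2u64, Mutex::new(PeerState { msgs_queued: 3 })),
	]));
	assert_eq!(total_queued(&per_peer), 5);
}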

lightning/src/ln/functional_test_utils.rs (+2 -2)

@@ -466,8 +466,8 @@ impl<'a, 'b, 'c> Drop for Node<'a, 'b, 'c> {
 					panic!();
 				}
 			}
-			assert_eq!(*chain_source.watched_txn.lock().unwrap(), *self.chain_source.watched_txn.lock().unwrap());
-			assert_eq!(*chain_source.watched_outputs.lock().unwrap(), *self.chain_source.watched_outputs.lock().unwrap());
+			assert_eq!(*chain_source.watched_txn.unsafe_well_ordered_double_lock_self(), *self.chain_source.watched_txn.unsafe_well_ordered_double_lock_self());
+			assert_eq!(*chain_source.watched_outputs.unsafe_well_ordered_double_lock_self(), *self.chain_source.watched_outputs.unsafe_well_ordered_double_lock_self());
 		}
 	}
 }

lightning/src/ln/functional_tests.rs (+21 -18)

@@ -8160,12 +8160,13 @@ fn test_update_err_monitor_lockdown() {
 	let logger = test_utils::TestLogger::with_id(format!("node {}", 0));
 	let persister = test_utils::TestPersister::new();
 	let watchtower = {
-		let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
-		let mut w = test_utils::TestVecWriter(Vec::new());
-		monitor.write(&mut w).unwrap();
-		let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingSigner>)>::read(
-			&mut io::Cursor::new(&w.0), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
-		assert!(new_monitor == *monitor);
+		let new_monitor = {
+			let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
+			let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingSigner>)>::read(
+				&mut io::Cursor::new(&monitor.encode()), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
+			assert!(new_monitor == *monitor);
+			new_monitor
+		};
 		let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &chanmon_cfgs[0].tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager);
 		assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed);
 		watchtower
@@ -8227,12 +8228,13 @@ fn test_concurrent_monitor_claim() {
 	let logger = test_utils::TestLogger::with_id(format!("node {}", "Alice"));
 	let persister = test_utils::TestPersister::new();
 	let watchtower_alice = {
-		let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
-		let mut w = test_utils::TestVecWriter(Vec::new());
-		monitor.write(&mut w).unwrap();
-		let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingSigner>)>::read(
-			&mut io::Cursor::new(&w.0), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
-		assert!(new_monitor == *monitor);
+		let new_monitor = {
+			let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
+			let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingSigner>)>::read(
+				&mut io::Cursor::new(&monitor.encode()), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
+			assert!(new_monitor == *monitor);
+			new_monitor
+		};
 		let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &chanmon_cfgs[0].tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager);
 		assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed);
 		watchtower
@@ -8256,12 +8258,13 @@ fn test_concurrent_monitor_claim() {
 	let logger = test_utils::TestLogger::with_id(format!("node {}", "Bob"));
 	let persister = test_utils::TestPersister::new();
 	let watchtower_bob = {
-		let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
-		let mut w = test_utils::TestVecWriter(Vec::new());
-		monitor.write(&mut w).unwrap();
-		let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingSigner>)>::read(
-			&mut io::Cursor::new(&w.0), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
-		assert!(new_monitor == *monitor);
+		let new_monitor = {
+			let monitor = nodes[0].chain_monitor.chain_monitor.get_monitor(outpoint).unwrap();
+			let new_monitor = <(BlockHash, channelmonitor::ChannelMonitor<EnforcingSigner>)>::read(
+				&mut io::Cursor::new(&monitor.encode()), (nodes[0].keys_manager, nodes[0].keys_manager)).unwrap().1;
+			assert!(new_monitor == *monitor);
+			new_monitor
+		};
 		let watchtower = test_utils::TestChainMonitor::new(Some(&chain_source), &chanmon_cfgs[0].tx_broadcaster, &logger, &chanmon_cfgs[0].fee_estimator, &persister, &node_cfgs[0].keys_manager);
 		assert_eq!(watchtower.watch_channel(outpoint, new_monitor), ChannelMonitorUpdateStatus::Completed);
 		watchtower

lightning/src/routing/utxo.rs (+5 -2)

@@ -26,7 +26,7 @@ use crate::util::ser::Writeable;
 use crate::prelude::*;
 
 use alloc::sync::{Arc, Weak};
-use crate::sync::Mutex;
+use crate::sync::{Mutex, LockTestExt};
 use core::ops::Deref;
 
 /// An error when accessing the chain via [`UtxoLookup`].
@@ -404,7 +404,10 @@ impl PendingChecks {
 					// lookup if we haven't gotten that far yet).
 					match Weak::upgrade(&e.get()) {
 						Some(pending_msgs) => {
-							let pending_matches = match &pending_msgs.lock().unwrap().channel_announce {
+							// This may be called with the mutex held on a different UtxoMessages
+							// struct, however in that case we have a global lockorder of new messages
+							// -> old messages, which makes this safe.
+							let pending_matches = match &pending_msgs.unsafe_well_ordered_double_lock_self().channel_announce {
 								Some(ChannelAnnouncement::Full(pending_msg)) => Some(pending_msg) == full_msg,
 								Some(ChannelAnnouncement::Unsigned(pending_msg)) => pending_msg == msg,
 								None => {

lightning/src/sync/debug_sync.rs (+25 -9)

@@ -124,17 +124,21 @@ impl LockMetadata {
 		res
 	}
 
-	fn pre_lock(this: &Arc<LockMetadata>) {
+	fn pre_lock(this: &Arc<LockMetadata>, double_lock_self_allowed: bool) {
 		LOCKS_HELD.with(|held| {
 			// For each lock which is currently locked, check that no lock's locked-before
 			// set includes the lock we're about to lock, which would imply a lockorder
 			// inversion.
 			for (locked_idx, locked) in held.borrow().iter() {
 				if *locked_idx == this.lock_idx {
-					// With `feature = "backtrace"` set, we may be looking at different instances
-					// of the same lock.
-					debug_assert!(cfg!(feature = "backtrace"), "Tried to acquire a lock while it was held!");
+					// Note that with `feature = "backtrace"` set, we may be looking at different
+					// instances of the same lock. Still, doing so is quite risky, a total order
+					// must be maintained, and doing so across a set of otherwise-identical mutexes
+					// is fraught with issues.
+					debug_assert!(cfg!(feature = "backtrace") && double_lock_self_allowed, "Tried to acquire a lock while it was held!");
 				}
+			}
+			for (locked_idx, locked) in held.borrow().iter() {
 				for (locked_dep_idx, _locked_dep) in locked.locked_before.lock().unwrap().iter() {
 					if *locked_dep_idx == this.lock_idx && *locked_dep_idx != locked.lock_idx {
 						#[cfg(feature = "backtrace")]
@@ -236,7 +240,7 @@ impl<T> Mutex<T> {
 	}
 
 	pub fn lock<'a>(&'a self) -> LockResult<MutexGuard<'a, T>> {
-		LockMetadata::pre_lock(&self.deps);
+		LockMetadata::pre_lock(&self.deps, false);
 		self.inner.lock().map(|lock| MutexGuard { mutex: self, lock }).map_err(|_| ())
 	}
 
@@ -249,11 +253,17 @@ impl<T> Mutex<T> {
 	}
 }
 
-impl <T> LockTestExt for Mutex<T> {
+impl<'a, T: 'a> LockTestExt<'a> for Mutex<T> {
 	#[inline]
 	fn held_by_thread(&self) -> LockHeldState {
 		LockMetadata::held_by_thread(&self.deps)
 	}
+	type ExclLock = MutexGuard<'a, T>;
+	#[inline]
+	fn unsafe_well_ordered_double_lock_self(&'a self) -> MutexGuard<T> {
+		LockMetadata::pre_lock(&self.deps, true);
+		self.inner.lock().map(|lock| MutexGuard { mutex: self, lock }).unwrap()
+	}
 }
 
 pub struct RwLock<T: Sized> {
@@ -318,12 +328,12 @@ impl<T> RwLock<T> {
 		// Note that while we could be taking a recursive read lock here, Rust's `RwLock` may
 		// deadlock trying to take a second read lock if another thread is waiting on the write
 		// lock. Its platform dependent (but our in-tree `FairRwLock` guarantees this behavior).
-		LockMetadata::pre_lock(&self.deps);
+		LockMetadata::pre_lock(&self.deps, false);
 		self.inner.read().map(|guard| RwLockReadGuard { lock: self, guard }).map_err(|_| ())
 	}
 
 	pub fn write<'a>(&'a self) -> LockResult<RwLockWriteGuard<'a, T>> {
-		LockMetadata::pre_lock(&self.deps);
+		LockMetadata::pre_lock(&self.deps, false);
 		self.inner.write().map(|guard| RwLockWriteGuard { lock: self, guard }).map_err(|_| ())
 	}
 
@@ -336,11 +346,17 @@ impl<T> RwLock<T> {
 	}
 }
 
-impl <T> LockTestExt for RwLock<T> {
+impl<'a, T: 'a> LockTestExt<'a> for RwLock<T> {
 	#[inline]
 	fn held_by_thread(&self) -> LockHeldState {
		LockMetadata::held_by_thread(&self.deps)
 	}
+	type ExclLock = RwLockWriteGuard<'a, T>;
+	#[inline]
+	fn unsafe_well_ordered_double_lock_self(&'a self) -> RwLockWriteGuard<'a, T> {
+		LockMetadata::pre_lock(&self.deps, true);
+		self.inner.write().map(|guard| RwLockWriteGuard { lock: self, guard }).unwrap()
+	}
 }
 
 pub type FairRwLock<T> = RwLock<T>;
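
For intuition, here is a heavily simplified model of the check `pre_lock` now performs; the real implementation's per-construction-site lock identities, backtraces, lockorder-dependency tracking, and the `backtrace`-feature subtleties are all glossed over:

use std::collections::HashSet;

// Toy tracker: `lock_idx` stands in for LockMetadata's per-lock identity.
struct LockTracker { held: HashSet<usize> }

impl LockTracker {
	fn pre_lock(&mut self, lock_idx: usize, double_lock_self_allowed: bool) {
		// Taking a lock identity that is already held is only tolerated when the caller
		// went through the deliberately-verbose "unsafe" double-lock entry point.
		if self.held.contains(&lock_idx) {
			assert!(double_lock_self_allowed, "Tried to acquire a lock while it was held!");
		}
		self.held.insert(lock_idx);
	}
}

fn main() {
	let mut tracker = LockTracker { held: HashSet::new() };
	tracker.pre_lock(7, false); // first acquisition via plain lock(): fine
	tracker.pre_lock(7, true);  // unsafe_well_ordered_double_lock_self(): allowed
	// tracker.pre_lock(7, false); // a second plain lock() would panic here
}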

lightning/src/sync/fairrwlock.rs (+6 -1)

@@ -50,10 +50,15 @@ impl<T> FairRwLock<T> {
 	}
 }
 
-impl<T> LockTestExt for FairRwLock<T> {
+impl<'a, T: 'a> LockTestExt<'a> for FairRwLock<T> {
 	#[inline]
 	fn held_by_thread(&self) -> LockHeldState {
 		// fairrwlock is only built in non-test modes, so we should never support tests.
 		LockHeldState::Unsupported
 	}
+	type ExclLock = RwLockWriteGuard<'a, T>;
+	#[inline]
+	fn unsafe_well_ordered_double_lock_self(&'a self) -> RwLockWriteGuard<'a, T> {
+		self.write().unwrap()
+	}
 }

lightning/src/sync/mod.rs (+18 -3)

@@ -7,8 +7,17 @@ pub(crate) enum LockHeldState {
 	Unsupported,
 }
 
-pub(crate) trait LockTestExt {
+pub(crate) trait LockTestExt<'a> {
 	fn held_by_thread(&self) -> LockHeldState;
+	type ExclLock;
+	/// If two instances of the same mutex are being taken at the same time, it's very easy to have
+	/// a lockorder inversion and risk deadlock. Thus, we default to disabling such locks.
+	///
+	/// However, sometimes they cannot be avoided. In such cases, this method exists to take a
+	/// mutex while avoiding a test failure. It is deliberately verbose and includes the term
+	/// "unsafe" to indicate that special care needs to be taken to ensure no deadlocks are
+	/// possible.
+	fn unsafe_well_ordered_double_lock_self(&'a self) -> Self::ExclLock;
 }
 
 #[cfg(all(feature = "std", not(feature = "_bench_unstable"), test))]
@@ -27,13 +36,19 @@ pub use {std::sync::{Arc, Mutex, Condvar, MutexGuard, RwLock, RwLockReadGuard, R
 #[cfg(all(feature = "std", any(feature = "_bench_unstable", not(test))))]
 mod ext_impl {
 	use super::*;
-	impl<T> LockTestExt for Mutex<T> {
+	impl<'a, T: 'a> LockTestExt<'a> for Mutex<T> {
 		#[inline]
 		fn held_by_thread(&self) -> LockHeldState { LockHeldState::Unsupported }
+		type ExclLock = MutexGuard<'a, T>;
+		#[inline]
+		fn unsafe_well_ordered_double_lock_self(&'a self) -> MutexGuard<T> { self.lock().unwrap() }
 	}
-	impl<T> LockTestExt for RwLock<T> {
+	impl<'a, T: 'a> LockTestExt<'a> for RwLock<T> {
 		#[inline]
 		fn held_by_thread(&self) -> LockHeldState { LockHeldState::Unsupported }
+		type ExclLock = RwLockWriteGuard<'a, T>;
+		#[inline]
+		fn unsafe_well_ordered_double_lock_self(&'a self) -> RwLockWriteGuard<T> { self.write().unwrap() }
 	}
 }
