
Commit 64a083e

Persist ChannelMonitors after new blocks are connected
We avoid handling events until after the user has confirmed the `ChannelMonitor` has been persisted to disk. This avoids a race where we:

* send a payment/HTLC (persisting the monitor to disk with the HTLC pending),
* force-close the channel, removing the channel entry from the ChannelManager entirely,
* persist the ChannelManager,
* connect a block which contains a fulfill of the HTLC, generating a claim event,
* handle the claim event while the `ChannelMonitor` is being persisted,
* persist the ChannelManager (before the `ChannelMonitor` is persisted fully),
* restart, reloading the HTLC as a pending payment in the ChannelManager, which now has no references to it except from the `ChannelMonitor`, which still has the pending HTLC,
* replay the block connection, generating a duplicate PaymentSent event.
1 parent bc6ddff commit 64a083e

4 files changed: 105 additions & 20 deletions

lightning-persister/src/lib.rs

Lines changed: 12 additions & 0 deletions
@@ -159,6 +159,12 @@ impl FilesystemPersister {
 }
 
 impl<ChannelSigner: Sign> channelmonitor::Persist<ChannelSigner> for FilesystemPersister {
+	// TODO: We really need a way for the persister to inform the user that it's time to crash/shut
+	// down once these start returning failure.
+	// A PermanentFailure implies we need to shut down since we're force-closing channels without
+	// even broadcasting, and sync_persisted_channel's docs are even more explicit that it's time to
+	// shut down!
 	fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>) -> Result<(), ChannelMonitorUpdateErr> {
 		let filename = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index);
 		util::write_to_file(self.path_to_monitor_data(), filename, monitor)
@@ -170,6 +176,12 @@ impl<ChannelSigner: Sign> channelmonitor::Persist<ChannelSigner> for FilesystemP
 		util::write_to_file(self.path_to_monitor_data(), filename, monitor)
 			.map_err(|_| ChannelMonitorUpdateErr::PermanentFailure)
 	}
+
+	fn sync_persisted_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>) -> Result<(), ()> {
+		let filename = format!("{}_{}", funding_txo.txid.to_hex(), funding_txo.index);
+		util::write_to_file(self.path_to_monitor_data(), filename, monitor)
+			.map_err(|_| ())
+	}
 }
 
 #[cfg(test)]
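
The safety argument in this commit rests on the monitor actually reaching disk before `ChainMonitor` releases any events. As a rough, self-contained illustration of what such a write generally involves — not the crate's `util::write_to_file`, and with a made-up `write_durably` helper — the usual pattern is write-to-temp-file, fsync, then atomic rename:

use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

// Hypothetical helper, for illustration only: write `data` to `dir/filename` so that a crash at
// any point leaves either the old file or the complete new file on disk, never a partial one.
fn write_durably(dir: &Path, filename: &str, data: &[u8]) -> std::io::Result<()> {
	fs::create_dir_all(dir)?;
	let tmp_path = dir.join(format!("{}.tmp", filename));
	let final_path = dir.join(filename);
	{
		let mut tmp = File::create(&tmp_path)?;
		tmp.write_all(data)?;
		// Flush the file contents to stable storage before making the rename visible.
		tmp.sync_all()?;
	}
	// Atomically replace any previous version of the file.
	fs::rename(&tmp_path, &final_path)?;
	// On Unix, syncing the directory makes the rename itself durable as well.
	#[cfg(unix)]
	File::open(dir)?.sync_all()?;
	Ok(())
}

The rename step is what guarantees a crash mid-write never leaves a torn monitor file behind.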

lightning/src/chain/chainmonitor.rs

Lines changed: 64 additions & 20 deletions
@@ -39,7 +39,7 @@ use util::events::EventHandler;
 use ln::channelmanager::ChannelDetails;
 
 use prelude::*;
-use sync::RwLock;
+use sync::{RwLock, Mutex};
 use core::ops::Deref;
 
 /// An implementation of [`chain::Watch`] for monitoring channels.
@@ -60,6 +60,18 @@ pub struct ChainMonitor<ChannelSigner: Sign, C: Deref, T: Deref, F: Deref, L: De
 {
 	/// The monitors
 	pub monitors: RwLock<HashMap<OutPoint, ChannelMonitor<ChannelSigner>>>,
+	/// Beyond the synchronization of `monitors` itself, we cannot handle user events until after
+	/// any chain updates have been stored on disk. This mutex is used to provide mutual exclusion
+	/// of event-processing/block-/transaction-connection.
+	/// This avoids the possibility of handling, e.g. an on-chain claim, generating a claim monitor
+	/// event, resulting in the relevant ChannelManager generating a PaymentSent event and dropping
+	/// the pending payment entry, and then reloading before the monitor is persisted, resulting in
+	/// the ChannelManager re-adding the same payment entry, before the same block is replayed,
+	/// resulting in a duplicate PaymentSent event.
+	///
+	/// Note that this is set to true if any persistence fails, at which point *no events must be
+	/// processed* (and the user has indicated they will shut down very very soon).
+	event_mutex: Mutex<bool>,
 	chain_source: Option<C>,
 	broadcaster: T,
 	logger: L,
@@ -88,26 +100,43 @@ where C::Target: chain::Filter,
 		FN: Fn(&ChannelMonitor<ChannelSigner>, &TransactionData) -> Vec<TransactionOutputs>
 	{
 		let mut dependent_txdata = Vec::new();
-		let monitors = self.monitors.read().unwrap();
-		for monitor in monitors.values() {
-			let mut txn_outputs = process(monitor, txdata);
+		{
+			let monitors = self.monitors.write().unwrap();
+			for (funding_outpoint, monitor) in monitors.iter() {
+				let mut txn_outputs;
+				{
+					let mut ev_lock = self.event_mutex.lock().unwrap();
+					txn_outputs = process(monitor, txdata);
+					log_trace!(self.logger, "Syncing Channel Monitor for channel {}", log_funding_info!(monitor));
+					if let Err(()) = self.persister.sync_persisted_channel(*funding_outpoint, monitor) {
+						// If we fail to persist a monitor, stop processing events, assuming we'll
+						// be shutting down soon (and the events can be re-generated on chain
+						// replay).
+						*ev_lock = true;
+						log_error!(self.logger, "Failed to sync Channel Monitor for channel {}!", log_funding_info!(monitor));
+						log_error!(self.logger, " The LDK-based application should now be shutting down!");
+					} else {
+						log_trace!(self.logger, "Finished syncing Channel Monitor for channel {}", log_funding_info!(monitor));
+					}
+				}
 
-			// Register any new outputs with the chain source for filtering, storing any dependent
-			// transactions from within the block that previously had not been included in txdata.
-			if let Some(ref chain_source) = self.chain_source {
-				let block_hash = header.block_hash();
-				for (txid, mut outputs) in txn_outputs.drain(..) {
-					for (idx, output) in outputs.drain(..) {
-						// Register any new outputs with the chain source for filtering and recurse
-						// if it indicates that there are dependent transactions within the block
-						// that had not been previously included in txdata.
-						let output = WatchedOutput {
-							block_hash: Some(block_hash),
-							outpoint: OutPoint { txid, index: idx as u16 },
-							script_pubkey: output.script_pubkey,
-						};
-						if let Some(tx) = chain_source.register_output(output) {
-							dependent_txdata.push(tx);
+				// Register any new outputs with the chain source for filtering, storing any dependent
+				// transactions from within the block that previously had not been included in txdata.
+				if let Some(ref chain_source) = self.chain_source {
+					let block_hash = header.block_hash();
+					for (txid, mut outputs) in txn_outputs.drain(..) {
+						for (idx, output) in outputs.drain(..) {
+							// Register any new outputs with the chain source for filtering and recurse
+							// if it indicates that there are dependent transactions within the block
+							// that had not been previously included in txdata.
+							let output = WatchedOutput {
+								block_hash: Some(block_hash),
+								outpoint: OutPoint { txid, index: idx as u16 },
+								script_pubkey: output.script_pubkey,
+							};
+							if let Some(tx) = chain_source.register_output(output) {
+								dependent_txdata.push(tx);
+							}
 						}
 					}
 				}
@@ -133,6 +162,7 @@ where C::Target: chain::Filter,
 	pub fn new(chain_source: Option<C>, broadcaster: T, logger: L, feeest: F, persister: P) -> Self {
 		Self {
 			monitors: RwLock::new(HashMap::new()),
+			event_mutex: Mutex::new(false),
 			chain_source,
 			broadcaster,
 			logger,
@@ -331,6 +361,13 @@ where C::Target: chain::Filter,
 	}
 
 	fn release_pending_monitor_events(&self) -> Vec<MonitorEvent> {
+		let ev_lock = self.event_mutex.lock().unwrap();
+		if *ev_lock {
+			log_error!(self.logger, "Failed to sync a Channel Monitor, refusing to provide monitor events!");
+			log_error!(self.logger, " The LDK-based application should now be shutting down!");
+			return Vec::new();
+		}
+
 		let mut pending_monitor_events = Vec::new();
 		for monitor in self.monitors.read().unwrap().values() {
 			pending_monitor_events.append(&mut monitor.get_and_clear_pending_monitor_events());
@@ -353,6 +390,13 @@ impl<ChannelSigner: Sign, C: Deref, T: Deref, F: Deref, L: Deref, P: Deref> even
 	///
 	/// [`SpendableOutputs`]: events::Event::SpendableOutputs
 	fn process_pending_events<H: Deref>(&self, handler: H) where H::Target: EventHandler {
+		let ev_lock = self.event_mutex.lock().unwrap();
+		if *ev_lock {
+			log_error!(self.logger, "Failed to sync a Channel Monitor, refusing to provide monitor events!");
+			log_error!(self.logger, " The LDK-based application should now be shutting down!");
+			return;
+		}
+
 		let mut pending_events = Vec::new();
 		for monitor in self.monitors.read().unwrap().values() {
 			pending_events.append(&mut monitor.get_and_clear_pending_events());
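
The locking discipline above is easier to see stripped of the LDK types. The following is a minimal, self-contained sketch (plain Rust with invented names, not LDK code): chain processing and event release contend on one mutex, and the bool inside it becomes a latch that permanently stops event release once a persist fails.

use std::sync::Mutex;

struct MonitorLike {
	// false = healthy, true = a persist failed and events must no longer be released.
	event_latch: Mutex<bool>,
	pending_events: Mutex<Vec<String>>,
}

impl MonitorLike {
	fn block_connected(&self, persist: impl Fn() -> Result<(), ()>) {
		// Hold the latch across both the in-memory update and the persist call so that
		// release_events can never observe state that is not yet on disk.
		let mut latch = self.event_latch.lock().unwrap();
		self.pending_events.lock().unwrap().push("claim".to_string());
		if persist().is_err() {
			*latch = true; // refuse to release events from now on
		}
	}

	fn release_events(&self) -> Vec<String> {
		let latch = self.event_latch.lock().unwrap();
		if *latch { return Vec::new(); }
		std::mem::take(&mut *self.pending_events.lock().unwrap())
	}
}

fn main() {
	let m = MonitorLike { event_latch: Mutex::new(false), pending_events: Mutex::new(Vec::new()) };
	m.block_connected(|| Ok(()));
	assert_eq!(m.release_events(), vec!["claim".to_string()]);
	m.block_connected(|| Err(()));
	assert!(m.release_events().is_empty());
}

Because the latch is held across both the in-memory update and the persist call, an event can only be released after its backing state has been written, which is exactly the duplicate-PaymentSent race the commit message describes.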

lightning/src/chain/channelmonitor.rs

Lines changed: 26 additions & 0 deletions
@@ -685,6 +685,14 @@ pub(crate) struct ChannelMonitorImpl<Signer: Sign> {
 
 	payment_preimages: HashMap<PaymentHash, PaymentPreimage>,
 
+	// Note that events MUST NOT be generated during update processing, only generated during chain
+	// data processing. This prevents a race in ChainMonitor::update_channel (and presumably user
+	// implementations thereof as well) where we update the in-memory channel object, then before
+	// the persistence finishes (as it's all under a read-lock), we return pending events to the
+	// user or to the relevant ChannelManager. This could cause duplicate events.
+	// Note that because the `event_mutex` in `ChainMonitor` is only taken in
+	// block/transaction-connected events and *not* during block/transaction-disconnected events,
+	// we further MUST NOT generate events during block/transaction-disconnection.
 	pending_monitor_events: Vec<MonitorEvent>,
 	pending_events: Vec<Event>,
 
@@ -2991,6 +2999,24 @@ pub trait Persist<ChannelSigner: Sign> {
 	/// [`ChannelMonitorUpdate::write`] for writing out an update, and
 	/// [`ChannelMonitorUpdateErr`] for requirements when returning errors.
 	fn update_persisted_channel(&self, id: OutPoint, update: &ChannelMonitorUpdate, data: &ChannelMonitor<ChannelSigner>) -> Result<(), ChannelMonitorUpdateErr>;
+
+	/// Update one channel's data synchronously without a [`ChannelMonitorUpdate`].
+	///
+	/// This is called during block/transaction connection, and is a good time to synchronously
+	/// remove all pending [`ChannelMonitorUpdate`]s which may have been persisted separately as an
+	/// intent log.
+	///
+	/// Note that returning an error here irrevocably disables some processing in [`ChainMonitor`],
+	/// preventing continued normal operation. Errors here are largely only useful to continue
+	/// operation long enough to shut down.
+	///
+	/// Failures here do not imply the channel will be force-closed; however, any future calls to
+	/// [`update_persisted_channel`] after an error is returned here MUST either persist the full,
+	/// updated [`ChannelMonitor`] provided to [`update_persisted_channel`] or return
+	/// [`ChannelMonitorUpdateErr::PermanentFailure`], force-closing the channel. In other words,
+	/// any future calls to [`update_persisted_channel`] after an error here MUST NOT persist the
+	/// [`ChannelMonitorUpdate`] alone.
+	fn sync_persisted_channel(&self, id: OutPoint, data: &ChannelMonitor<ChannelSigner>) -> Result<(), ()>;
 }
 
 impl<Signer: Sign, T: Deref, F: Deref, L: Deref> chain::Listen for (ChannelMonitor<Signer>, T, F, L)
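
The new documentation's mention of an "intent log" is easiest to picture with a concrete `Persist` implementation. The sketch below is not part of the commit: it is a hypothetical in-memory persister, assuming the trait shape shown above and the `Writeable::encode` helper with module paths as of the LDK version this diff targets. `update_persisted_channel` appends only the update to a per-channel log, and `sync_persisted_channel` rewrites the full monitor and drops the now-redundant log entries.

use std::collections::HashMap;
use std::sync::Mutex;

use lightning::chain::channelmonitor::{self, ChannelMonitor, ChannelMonitorUpdate, ChannelMonitorUpdateErr};
use lightning::chain::keysinterface::Sign;
use lightning::chain::transaction::OutPoint;
use lightning::util::ser::Writeable;

#[derive(Default)]
struct IntentLogPersister {
	// Latest full serialization of each monitor.
	monitors: Mutex<HashMap<OutPoint, Vec<u8>>>,
	// Serialized updates accepted since the last full write: the "intent log".
	pending_updates: Mutex<HashMap<OutPoint, Vec<Vec<u8>>>>,
}

impl<ChannelSigner: Sign> channelmonitor::Persist<ChannelSigner> for IntentLogPersister {
	fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<ChannelSigner>) -> Result<(), ChannelMonitorUpdateErr> {
		self.monitors.lock().unwrap().insert(funding_txo, monitor.encode());
		Ok(())
	}

	fn update_persisted_channel(&self, id: OutPoint, update: &ChannelMonitorUpdate, _data: &ChannelMonitor<ChannelSigner>) -> Result<(), ChannelMonitorUpdateErr> {
		// Record only the delta; cheaper than rewriting the whole monitor on every update.
		self.pending_updates.lock().unwrap().entry(id).or_default().push(update.encode());
		Ok(())
	}

	fn sync_persisted_channel(&self, id: OutPoint, data: &ChannelMonitor<ChannelSigner>) -> Result<(), ()> {
		// Block connection: write the full monitor, then drop the now-redundant intent log.
		self.monitors.lock().unwrap().insert(id, data.encode());
		self.pending_updates.lock().unwrap().remove(&id);
		Ok(())
	}
}

A real implementation would also have to honor the documented failure rule: after `sync_persisted_channel` returns an error, later `update_persisted_channel` calls must persist the full monitor (or return `PermanentFailure`) rather than appending to the log alone.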

lightning/src/util/test_utils.rs

Lines changed: 3 additions & 0 deletions
@@ -203,6 +203,9 @@ impl<Signer: keysinterface::Sign> channelmonitor::Persist<Signer> for TestPersis
 	fn update_persisted_channel(&self, _funding_txo: OutPoint, _update: &channelmonitor::ChannelMonitorUpdate, _data: &channelmonitor::ChannelMonitor<Signer>) -> Result<(), channelmonitor::ChannelMonitorUpdateErr> {
 		self.update_ret.lock().unwrap().clone()
 	}
+	fn sync_persisted_channel(&self, _funding_txo: OutPoint, _data: &channelmonitor::ChannelMonitor<Signer>) -> Result<(), ()> {
+		self.update_ret.lock().unwrap().clone().map_err(|_| ())
+	}
 }
 
 pub struct TestBroadcaster {
