
Commit 23c5308

Drop the ChannelMonitorUpdateStatus::PermanentFailure variant
When a `ChannelMonitorUpdate` fails to apply, it generally means we cannot reach our storage backend. This, in general, is a critical issue, but is often only a transient issue. Sadly, users see the failure variant and return it on any I/O error, resulting in channel force-closures due to transient issues.

Users don't generally expect force-closes in most cases, and luckily with async `ChannelMonitorUpdate`s supported we don't take any risk by "delaying" the `ChannelMonitorUpdate` indefinitely.

Thus, here we drop the `PermanentFailure` variant entirely, making all failures instead be "the update is in progress, but won't ever complete", which is equivalent if we do not close the channel automatically.
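For illustration, a minimal dependency-free sketch of the behavioral change (the names here are stand-ins, not LDK's API):

use std::io;

// Stand-in for LDK's ChannelMonitorUpdateStatus; after this commit only these
// two variants remain.
enum UpdateStatus { Completed, InProgress }

// Previously a persister might have mapped any I/O error to PermanentFailure,
// force-closing the channel even for a transient outage. Under the new model
// the same error is reported as InProgress: the update is simply held until
// storage recovers, and the channel stays open.
fn status_from_persist_result(res: io::Result<()>) -> UpdateStatus {
    match res {
        Ok(()) => UpdateStatus::Completed,
        Err(_transient_or_permanent) => UpdateStatus::InProgress,
    }
}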
1 parent f2bb931 commit 23c5308

12 files changed: +129 −316 lines

fuzz/src/chanmon_consistency.rs

Lines changed: 2 additions & 2 deletions
@@ -138,7 +138,7 @@ impl TestChainMonitor {
     }
 }
 impl chain::Watch<TestChannelSigner> for TestChainMonitor {
-    fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor<TestChannelSigner>) -> chain::ChannelMonitorUpdateStatus {
+    fn watch_channel(&self, funding_txo: OutPoint, monitor: channelmonitor::ChannelMonitor<TestChannelSigner>) -> Result<chain::ChannelMonitorUpdateStatus, ()> {
         let mut ser = VecWriter(Vec::new());
         monitor.write(&mut ser).unwrap();
         if let Some(_) = self.latest_monitors.lock().unwrap().insert(funding_txo, (monitor.get_latest_update_id(), ser.0)) {
@@ -500,7 +500,7 @@ pub fn do_test<Out: Output>(data: &[u8], underlying_out: Out) {
         let res = (<(BlockHash, ChanMan)>::read(&mut Cursor::new(&$ser.0), read_args).expect("Failed to read manager").1, chain_monitor.clone());
         for (funding_txo, mon) in monitors.drain() {
             assert_eq!(chain_monitor.chain_monitor.watch_channel(funding_txo, mon),
-                ChannelMonitorUpdateStatus::Completed);
+                Ok(ChannelMonitorUpdateStatus::Completed));
         }
         res
     } }

lightning-persister/src/fs_store.rs

Lines changed: 3 additions & 3 deletions
@@ -436,7 +436,7 @@ mod tests {
     }

     // Test that if the store's path to channel data is read-only, writing a
-    // monitor to it results in the store returning a PermanentFailure.
+    // monitor to it results in the store returning an InProgress.
     // Windows ignores the read-only flag for folders, so this test is Unix-only.
     #[cfg(not(target_os = "windows"))]
     #[test]
@@ -470,7 +470,7 @@ mod tests {
             index: 0
         };
         match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) {
-            ChannelMonitorUpdateStatus::PermanentFailure => {},
+            ChannelMonitorUpdateStatus::InProgress => {},
             _ => panic!("unexpected result from persisting new channel")
         }

@@ -507,7 +507,7 @@ mod tests {
             index: 0
         };
         match store.persist_new_channel(test_txo, &added_monitors[0].1, update_id.2) {
-            ChannelMonitorUpdateStatus::PermanentFailure => {},
+            ChannelMonitorUpdateStatus::InProgress => {},
             _ => panic!("unexpected result from persisting new channel")
         }

lightning/src/chain/chainmonitor.rs

Lines changed: 16 additions & 58 deletions
@@ -78,7 +78,7 @@ impl MonitorUpdateId {
 /// `Persist` defines behavior for persisting channel monitors: this could mean
 /// writing once to disk, and/or uploading to one or more backup services.
 ///
-/// Each method can return three possible values:
+/// Each method can return two possible values:
 ///  * If persistence (including any relevant `fsync()` calls) happens immediately, the
 ///    implementation should return [`ChannelMonitorUpdateStatus::Completed`], indicating normal
 ///    channel operation should continue.
@@ -91,10 +91,9 @@ impl MonitorUpdateId {
 ///   Note that unlike the direct [`chain::Watch`] interface,
 ///   [`ChainMonitor::channel_monitor_updated`] must be called once for *each* update which occurs.
 ///
-/// * If persistence fails for some reason, implementations should return
-///   [`ChannelMonitorUpdateStatus::PermanentFailure`], in which case the channel will likely be
-///   closed without broadcasting the latest state. See
-///   [`ChannelMonitorUpdateStatus::PermanentFailure`] for more details.
+///   If persistence fails for some reason, implementations should still return
+///   [`ChannelMonitorUpdateStatus::InProgress`] and attempt to shut down or otherwise resolve the
+///   situation ASAP.
 ///
 /// Third-party watchtowers may be built as a part of an implementation of this trait, with the
 /// advantage that you can control whether to resume channel operation depending on if an update
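To make the revised contract concrete, here is a hedged sketch of a custom `Persist` implementation following the docs above. The `write_fn` byte sink is a hypothetical stand-in for a storage backend (not part of LDK); the trait signatures are as of this commit.

use lightning::chain::ChannelMonitorUpdateStatus;
use lightning::chain::chainmonitor::{MonitorUpdateId, Persist};
use lightning::chain::channelmonitor::{ChannelMonitor, ChannelMonitorUpdate};
use lightning::chain::transaction::OutPoint;
use lightning::sign::WriteableEcdsaChannelSigner;
use lightning::util::ser::Writeable;

// Sketch: a persister that reports storage failures as InProgress. The update
// must then be completed later via ChainMonitor::channel_monitor_updated.
struct SketchPersister<F: Fn(OutPoint, Vec<u8>) -> std::io::Result<()>> {
    write_fn: F, // hypothetical fallible byte sink, not an LDK API
}

impl<S: WriteableEcdsaChannelSigner, F: Fn(OutPoint, Vec<u8>) -> std::io::Result<()>>
    Persist<S> for SketchPersister<F>
{
    fn persist_new_channel(&self, funding_txo: OutPoint, monitor: &ChannelMonitor<S>,
        _update_id: MonitorUpdateId) -> ChannelMonitorUpdateStatus
    {
        match (self.write_fn)(funding_txo, monitor.encode()) {
            Ok(()) => ChannelMonitorUpdateStatus::Completed,
            // Per the updated docs: never a hard failure, just "in progress";
            // alert the operator and retry out-of-band.
            Err(_) => ChannelMonitorUpdateStatus::InProgress,
        }
    }

    fn update_persisted_channel(&self, funding_txo: OutPoint, _update: Option<&ChannelMonitorUpdate>,
        monitor: &ChannelMonitor<S>, update_id: MonitorUpdateId) -> ChannelMonitorUpdateStatus
    {
        // Rewrite the whole monitor for simplicity; a real impl might persist the delta.
        self.persist_new_channel(funding_txo, monitor, update_id)
    }
}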
@@ -335,11 +334,6 @@ where C::Target: chain::Filter,
         match self.persister.update_persisted_channel(*funding_outpoint, None, monitor, update_id) {
             ChannelMonitorUpdateStatus::Completed =>
                 log_trace!(self.logger, "Finished syncing Channel Monitor for channel {}", log_funding_info!(monitor)),
-            ChannelMonitorUpdateStatus::PermanentFailure => {
-                monitor_state.channel_perm_failed.store(true, Ordering::Release);
-                self.pending_monitor_events.lock().unwrap().push((*funding_outpoint, vec![MonitorEvent::UpdateFailed(*funding_outpoint)], monitor.get_counterparty_node_id()));
-                self.event_notifier.notify();
-            }
             ChannelMonitorUpdateStatus::InProgress => {
                 log_debug!(self.logger, "Channel Monitor sync for channel {} in progress, holding events until completion!", log_funding_info!(monitor));
                 pending_monitor_updates.push(update_id);
@@ -673,12 +667,12 @@ where C::Target: chain::Filter,
     ///
     /// Note that we persist the given `ChannelMonitor` while holding the `ChainMonitor`
     /// monitors lock.
-    fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor<ChannelSigner>) -> ChannelMonitorUpdateStatus {
+    fn watch_channel(&self, funding_outpoint: OutPoint, monitor: ChannelMonitor<ChannelSigner>) -> Result<ChannelMonitorUpdateStatus, ()> {
         let mut monitors = self.monitors.write().unwrap();
         let entry = match monitors.entry(funding_outpoint) {
             hash_map::Entry::Occupied(_) => {
                 log_error!(self.logger, "Failed to add new channel data: channel monitor for given outpoint is already present");
-                return ChannelMonitorUpdateStatus::PermanentFailure
+                return Err(());
             },
             hash_map::Entry::Vacant(e) => e,
         };
@@ -691,10 +685,6 @@ where C::Target: chain::Filter,
                 log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} in progress", log_funding_info!(monitor));
                 pending_monitor_updates.push(update_id);
             },
-            ChannelMonitorUpdateStatus::PermanentFailure => {
-                log_error!(self.logger, "Persistence of new ChannelMonitor for channel {} failed", log_funding_info!(monitor));
-                return persist_res;
-            },
             ChannelMonitorUpdateStatus::Completed => {
                 log_info!(self.logger, "Persistence of new ChannelMonitor for channel {} completed", log_funding_info!(monitor));
             }
@@ -708,7 +698,7 @@ where C::Target: chain::Filter,
             channel_perm_failed: AtomicBool::new(false),
             last_chain_persist_height: AtomicUsize::new(self.highest_chain_height.load(Ordering::Acquire)),
         });
-        persist_res
+        Ok(persist_res)
     }

     /// Note that we persist the given `ChannelMonitor` update while holding the
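A hedged caller-side sketch of the `Result`-wrapped `watch_channel` signature introduced above (the helper function is hypothetical; only `Watch::watch_channel` and the status enum come from LDK as of this commit):

use lightning::chain::{ChannelMonitorUpdateStatus, Watch};
use lightning::chain::channelmonitor::ChannelMonitor;
use lightning::chain::transaction::OutPoint;
use lightning::sign::WriteableEcdsaChannelSigner;

// Hypothetical helper showing how each outcome should now be interpreted.
fn register_monitor<S: WriteableEcdsaChannelSigner, W: Watch<S>>(
    watcher: &W, funding_txo: OutPoint, monitor: ChannelMonitor<S>,
) -> Result<(), ()> {
    // Err(()) now signals a hard registration failure (e.g. a monitor for this
    // outpoint is already present) instead of a PermanentFailure status.
    match watcher.watch_channel(funding_txo, monitor)? {
        // Persisted synchronously; normal channel operation continues.
        ChannelMonitorUpdateStatus::Completed => {},
        // Persistence deferred; the channel pauses (rather than closes) until
        // ChainMonitor::channel_monitor_updated is called for this update.
        ChannelMonitorUpdateStatus::InProgress => {},
    }
    Ok(())
}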
@@ -723,10 +713,10 @@ where C::Target: chain::Filter,
                 // We should never ever trigger this from within ChannelManager. Technically a
                 // user could use this object with some proxying in between which makes this
                 // possible, but in tests and fuzzing, this should be a panic.
-                #[cfg(any(test, fuzzing))]
+                #[cfg(debug_assertions)]
                 panic!("ChannelManager generated a channel update for a channel that was not yet registered!");
-                #[cfg(not(any(test, fuzzing)))]
-                ChannelMonitorUpdateStatus::PermanentFailure
+                #[cfg(not(debug_assertions))]
+                ChannelMonitorUpdateStatus::InProgress
             },
             Some(monitor_state) => {
                 let monitor = &monitor_state.monitor;
@@ -745,18 +735,14 @@ where C::Target: chain::Filter,
                         pending_monitor_updates.push(update_id);
                         log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} in progress", log_funding_info!(monitor));
                     },
-                    ChannelMonitorUpdateStatus::PermanentFailure => {
-                        monitor_state.channel_perm_failed.store(true, Ordering::Release);
-                        log_error!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} failed", log_funding_info!(monitor));
-                    },
                     ChannelMonitorUpdateStatus::Completed => {
                         log_debug!(self.logger, "Persistence of ChannelMonitorUpdate for channel {} completed", log_funding_info!(monitor));
                     },
                 }
                 if update_res.is_err() {
-                    ChannelMonitorUpdateStatus::PermanentFailure
+                    ChannelMonitorUpdateStatus::InProgress
                 } else if monitor_state.channel_perm_failed.load(Ordering::Acquire) {
-                    ChannelMonitorUpdateStatus::PermanentFailure
+                    ChannelMonitorUpdateStatus::InProgress
                 } else {
                     persist_res
                 }
@@ -831,12 +817,12 @@ impl<ChannelSigner: WriteableEcdsaChannelSigner, C: Deref, T: Deref, F: Deref, L

 #[cfg(test)]
 mod tests {
-    use crate::{check_added_monitors, check_closed_broadcast, check_closed_event};
+    use crate::check_added_monitors;
     use crate::{expect_payment_claimed, expect_payment_path_successful, get_event_msg};
     use crate::{get_htlc_update_msgs, get_local_commitment_txn, get_revoke_commit_msgs, get_route_and_payment_hash, unwrap_send_err};
     use crate::chain::{ChannelMonitorUpdateStatus, Confirm, Watch};
     use crate::chain::channelmonitor::LATENCY_GRACE_PERIOD_BLOCKS;
-    use crate::events::{Event, ClosureReason, MessageSendEvent, MessageSendEventsProvider};
+    use crate::events::{Event, MessageSendEvent, MessageSendEventsProvider};
     use crate::ln::channelmanager::{PaymentSendFailure, PaymentId, RecipientOnionFields};
     use crate::ln::functional_test_utils::*;
     use crate::ln::msgs::ChannelMessageHandler;
@@ -988,12 +974,8 @@ mod tests {
         chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::Completed);
         unwrap_send_err!(nodes[0].node.send_payment_with_route(&route, second_payment_hash,
             RecipientOnionFields::secret_only(second_payment_secret), PaymentId(second_payment_hash.0)
-        ), true, APIError::ChannelUnavailable { ref err },
-            assert!(err.contains("ChannelMonitor storage failure")));
-        check_added_monitors!(nodes[0], 2); // After the failure we generate a close-channel monitor update
-        check_closed_broadcast!(nodes[0], true);
-        check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() },
-            [nodes[1].node.get_our_node_id()], 100000);
+        ), false, APIError::MonitorUpdateInProgress, {});
+        check_added_monitors!(nodes[0], 1);

         // However, as the ChainMonitor is still waiting for the original persistence to complete,
         // it won't yet release the MonitorEvents.
@@ -1020,28 +1002,4 @@ mod tests {
         do_chainsync_pauses_events(false);
         do_chainsync_pauses_events(true);
     }
-
-    #[test]
-    fn update_during_chainsync_fails_channel() {
-        let chanmon_cfgs = create_chanmon_cfgs(2);
-        let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
-        let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
-        let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
-        create_announced_chan_between_nodes(&nodes, 0, 1);
-
-        chanmon_cfgs[0].persister.chain_sync_monitor_persistences.lock().unwrap().clear();
-        chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::PermanentFailure);
-
-        connect_blocks(&nodes[0], 1);
-        // Before processing events, the ChannelManager will still think the Channel is open and
-        // there won't be any ChannelMonitorUpdates
-        assert_eq!(nodes[0].node.list_channels().len(), 1);
-        check_added_monitors!(nodes[0], 0);
-        // ... however once we get events once, the channel will close, creating a channel-closed
-        // ChannelMonitorUpdate.
-        check_closed_broadcast!(nodes[0], true);
-        check_closed_event!(nodes[0], 1, ClosureReason::ProcessingError { err: "Failed to persist ChannelMonitor update during chain sync".to_string() },
-            [nodes[1].node.get_our_node_id()], 100000);
-        check_added_monitors!(nodes[0], 1);
-    }
 }

lightning/src/chain/channelmonitor.rs

Lines changed: 12 additions & 20 deletions
@@ -151,10 +151,7 @@ pub enum MonitorEvent {
         monitor_update_id: u64,
     },

-    /// Indicates a [`ChannelMonitor`] update has failed. See
-    /// [`ChannelMonitorUpdateStatus::PermanentFailure`] for more information on how this is used.
-    ///
-    /// [`ChannelMonitorUpdateStatus::PermanentFailure`]: super::ChannelMonitorUpdateStatus::PermanentFailure
+    /// Indicates a [`ChannelMonitor`] update has failed.
     UpdateFailed(OutPoint),
 }
 impl_writeable_tlv_based_enum_upgradable!(MonitorEvent,
@@ -1488,21 +1485,20 @@ impl<Signer: WriteableEcdsaChannelSigner> ChannelMonitor<Signer> {
         self.inner.lock().unwrap().counterparty_node_id
     }

-    /// Used by ChannelManager deserialization to broadcast the latest holder state if its copy of
-    /// the Channel was out-of-date.
+    /// Used by [`ChannelManager`] deserialization to broadcast the latest holder state if its copy
+    /// of the channel state was out-of-date.
     ///
     /// You may also use this to broadcast the latest local commitment transaction, either because
-    /// a monitor update failed with [`ChannelMonitorUpdateStatus::PermanentFailure`] or because we've
-    /// fallen behind (i.e. we've received proof that our counterparty side knows a revocation
-    /// secret we gave them that they shouldn't know).
+    /// a monitor update failed or because we've fallen behind (i.e. we've received proof that our
+    /// counterparty side knows a revocation secret we gave them that they shouldn't know).
     ///
     /// Broadcasting these transactions in the second case is UNSAFE, as they allow counterparty
     /// side to punish you. Nevertheless you may want to broadcast them if counterparty doesn't
     /// close channel with their commitment transaction after a substantial amount of time. Best
     /// may be to contact the other node operator out-of-band to coordinate other options available
-    /// to you. In any-case, the choice is up to you.
+    /// to you.
     ///
-    /// [`ChannelMonitorUpdateStatus::PermanentFailure`]: super::ChannelMonitorUpdateStatus::PermanentFailure
+    /// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
     pub fn get_latest_holder_commitment_txn<L: Deref>(&self, logger: &L) -> Vec<Transaction>
         where L::Target: Logger {
         self.inner.lock().unwrap().get_latest_holder_commitment_txn(logger)
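For the recovery path those docs describe, a hedged sketch of how a caller might invoke this method; the helper is hypothetical, and the UNSAFE caveat above applies before broadcasting anything:

use lightning::chain::chaininterface::BroadcasterInterface;
use lightning::chain::channelmonitor::ChannelMonitor;
use lightning::sign::WriteableEcdsaChannelSigner;
use lightning::util::logger::Logger;
use std::ops::Deref;

// Hypothetical last-resort helper: broadcast the monitor's view of the latest
// holder commitment transaction(s). Only sound if our state is known-latest;
// if we have fallen behind, these transactions are revoked and punishable.
fn broadcast_latest_holder_txn<S: WriteableEcdsaChannelSigner, B: BroadcasterInterface, L: Deref>(
    monitor: &ChannelMonitor<S>, broadcaster: &B, logger: &L,
) where L::Target: Logger {
    let txn = monitor.get_latest_holder_commitment_txn(logger);
    broadcaster.broadcast_transactions(&txn.iter().collect::<Vec<_>>());
}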
@@ -2599,6 +2595,7 @@ impl<Signer: WriteableEcdsaChannelSigner> ChannelMonitorImpl<Signer> {
             ChannelMonitorUpdateStep::CommitmentSecret { idx, secret } => {
                 log_trace!(logger, "Updating ChannelMonitor with commitment secret");
                 if let Err(e) = self.provide_secret(*idx, *secret) {
+                    debug_assert!(false, "Latest counterparty commitment secret was invalid");
                     log_error!(logger, "Providing latest counterparty commitment secret failed/was refused:");
                     log_error!(logger, " {}", e);
                     ret = Err(());
@@ -4413,13 +4410,12 @@ mod tests {
     use crate::chain::chaininterface::LowerBoundedFeeEstimator;

     use super::ChannelMonitorUpdateStep;
-    use crate::{check_added_monitors, check_closed_broadcast, check_closed_event, check_spends, get_local_commitment_txn, get_monitor, get_route_and_payment_hash, unwrap_send_err};
+    use crate::{check_added_monitors, check_spends, get_local_commitment_txn, get_monitor, get_route_and_payment_hash, unwrap_send_err};
     use crate::chain::{BestBlock, Confirm};
     use crate::chain::channelmonitor::ChannelMonitor;
     use crate::chain::package::{weight_offered_htlc, weight_received_htlc, weight_revoked_offered_htlc, weight_revoked_received_htlc, WEIGHT_REVOKED_OUTPUT};
     use crate::chain::transaction::OutPoint;
     use crate::sign::InMemorySigner;
-    use crate::events::ClosureReason;
     use crate::ln::{PaymentPreimage, PaymentHash};
     use crate::ln::chan_utils;
     use crate::ln::chan_utils::{HTLCOutputInCommitment, ChannelPublicKeys, ChannelTransactionParameters, HolderCommitmentTransaction, CounterpartyChannelTransactionParameters};
@@ -4485,18 +4481,14 @@ mod tests {
         let (route, payment_hash, _, payment_secret) = get_route_and_payment_hash!(nodes[1], nodes[0], 100_000);
         unwrap_send_err!(nodes[1].node.send_payment_with_route(&route, payment_hash,
             RecipientOnionFields::secret_only(payment_secret), PaymentId(payment_hash.0)
-        ), true, APIError::ChannelUnavailable { ref err },
-            assert!(err.contains("ChannelMonitor storage failure")));
-        check_added_monitors!(nodes[1], 2); // After the failure we generate a close-channel monitor update
-        check_closed_broadcast!(nodes[1], true);
-        check_closed_event!(nodes[1], 1, ClosureReason::ProcessingError { err: "ChannelMonitor storage failure".to_string() },
-            [nodes[0].node.get_our_node_id()], 100000);
+        ), false, APIError::MonitorUpdateInProgress, {});
+        check_added_monitors!(nodes[1], 1);

         // Build a new ChannelMonitorUpdate which contains both the failing commitment tx update
         // and provides the claim preimages for the two pending HTLCs. The first update generates
         // an error, but the point of this test is to ensure the later updates are still applied.
         let monitor_updates = nodes[1].chain_monitor.monitor_updates.lock().unwrap();
-        let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().skip(1).next().unwrap().clone();
+        let mut replay_update = monitor_updates.get(&channel.2).unwrap().iter().rev().next().unwrap().clone();
         assert_eq!(replay_update.updates.len(), 1);
         if let ChannelMonitorUpdateStep::LatestCounterpartyCommitmentTXInfo { .. } = replay_update.updates[0] {
         } else { panic!(); }
