
Commit 345f8df

Re-claim forwarded HTLCs on startup
Because `ChannelMonitorUpdate`s can now complete asynchronously and out of order, a `commitment_signed` `ChannelMonitorUpdate` from a downstream channel could complete prior to the preimage `ChannelMonitorUpdate` on the upstream channel. In that case, we may not get an `update_fulfill_htlc` replay on startup. Thus, we have to ensure any payment preimages contained in that downstream update are re-claimed on startup. Here we do this during the existing walk of the `ChannelMonitor` preimages for closed channels.
Parent: 63217bc
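At a high level, the change walks every `ChannelMonitor` during deserialization, picks out forwarded HTLCs that already carry a payment preimage, and replays those claims on the inbound edge once the `ChannelManager` is fully constructed. The sketch below illustrates that collect-then-replay pattern in isolation; all of the types and names in it (`Monitor`, `MonitorHtlc`, `ClaimToReplay`, `collect_claims_to_replay`) are simplified stand-ins for illustration, not LDK's actual API.

```rust
// A minimal, self-contained sketch of the collect-then-replay pattern, using
// simplified stand-in types. None of these names are LDK's real types or methods.

#[derive(Clone, Copy, Debug)]
struct PaymentPreimage([u8; 32]);

#[derive(Clone, Debug)]
#[allow(dead_code)]
enum HtlcSource {
	/// An HTLC we forwarded from an upstream (inbound) channel.
	PreviousHop { upstream_channel_id: [u8; 32] },
	/// An HTLC for a payment we originated ourselves.
	OutboundRoute,
}

struct MonitorHtlc {
	source: HtlcSource,
	amount_msat: u64,
	preimage: Option<PaymentPreimage>,
}

/// Stand-in for a persisted `ChannelMonitor` of a downstream channel.
struct Monitor {
	channel_id: [u8; 32],
	channel_is_closed: bool,
	outbound_htlcs: Vec<MonitorHtlc>,
}

/// A claim that must be re-applied on the upstream edge after startup.
struct ClaimToReplay {
	source: HtlcSource,
	preimage: PaymentPreimage,
	downstream_value_msat: u64,
	downstream_closed: bool,
	downstream_channel_id: [u8; 32],
}

/// Walk every monitor and collect preimages attached to forwarded HTLCs.
/// Our own outbound payments are skipped: the payment-tracking logic already
/// handles those, so only `PreviousHop` sources need an upstream re-claim.
fn collect_claims_to_replay(monitors: &[Monitor]) -> Vec<ClaimToReplay> {
	let mut claims = Vec::new();
	for monitor in monitors {
		for htlc in &monitor.outbound_htlcs {
			if let (HtlcSource::PreviousHop { .. }, Some(preimage)) =
				(&htlc.source, htlc.preimage)
			{
				claims.push(ClaimToReplay {
					source: htlc.source.clone(),
					preimage,
					downstream_value_msat: htlc.amount_msat,
					// If the downstream channel is closed, guess that the preimage
					// came from an on-chain claim (mirroring `downstream_closed`
					// standing in for `from_onchain` in the patch).
					downstream_closed: monitor.channel_is_closed,
					downstream_channel_id: monitor.channel_id,
				});
			}
		}
	}
	claims
}

fn main() {
	let monitors = vec![Monitor {
		channel_id: [1; 32],
		channel_is_closed: false,
		outbound_htlcs: vec![MonitorHtlc {
			source: HtlcSource::PreviousHop { upstream_channel_id: [2; 32] },
			amount_msat: 10_000,
			preimage: Some(PaymentPreimage([42; 32])),
		}],
	}];

	// Replay happens only after the manager is fully constructed; in the real
	// patch each entry is fed back through `claim_funds_internal`.
	for claim in collect_claims_to_replay(&monitors) {
		println!(
			"re-claiming {} msat upstream of channel {:?} (closed: {}) with preimage {:?}",
			claim.downstream_value_msat, &claim.downstream_channel_id[..4],
			claim.downstream_closed, claim.preimage,
		);
		let _ = claim.source;
	}
}
```

Deferring the replay until after the manager is reconstructed matters because monitors may not be loaded yet during init, which is also why the patch gates direct monitor updates behind `during_init` and queues them as `BackgroundEvent`s instead.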

lightning/src/ln/channelmanager.rs (+98 −26)

@@ -1098,7 +1098,6 @@ where
 	/// Notifier the lock contains sends out a notification when the lock is released.
 	total_consistency_lock: RwLock<()>,
 
-	#[cfg(debug_assertions)]
 	background_events_processed_since_startup: AtomicBool,
 
 	persistence_notifier: Notifier,
@@ -1872,9 +1871,7 @@ macro_rules! handle_new_monitor_update {
 		// update_maps_on_chan_removal needs to be able to take id_to_peer, so make sure we can in
 		// any case so that it won't deadlock.
 		debug_assert_ne!($self.id_to_peer.held_by_thread(), LockHeldState::HeldByThread);
-		#[cfg(debug_assertions)] {
-			debug_assert!($self.background_events_processed_since_startup.load(Ordering::Acquire));
-		}
+		debug_assert!($self.background_events_processed_since_startup.load(Ordering::Acquire));
 		match $update_res {
 			ChannelMonitorUpdateStatus::InProgress => {
 				log_debug!($self.logger, "ChannelMonitor update for {} in flight, holding messages until the update completes.",
@@ -2060,7 +2057,6 @@ where
 			pending_events_processor: AtomicBool::new(false),
 			pending_background_events: Mutex::new(Vec::new()),
 			total_consistency_lock: RwLock::new(()),
-			#[cfg(debug_assertions)]
 			background_events_processed_since_startup: AtomicBool::new(false),
 			persistence_notifier: Notifier::new(),
 
@@ -4097,7 +4093,6 @@ where
 	fn process_background_events(&self) -> NotifyOption {
 		debug_assert_ne!(self.total_consistency_lock.held_by_thread(), LockHeldState::NotHeldByThread);
 
-		#[cfg(debug_assertions)]
 		self.background_events_processed_since_startup.store(true, Ordering::Release);
 
 		let mut background_events = Vec::new();
@@ -4688,6 +4683,11 @@ where
 	-> Result<(), (PublicKey, MsgHandleErrInternal)> {
 		//TODO: Delay the claimed_funds relaying just like we do outbound relay!
 
+		// If we haven't yet run background events assume we're still deserializing and shouldn't
+		// actually pass `ChannelMonitorUpdate`s to users yet. Instead, queue them up as
+		// `BackgroundEvent`s.
+		let during_init = !self.background_events_processed_since_startup.load(Ordering::Acquire);
+
 		{
 			let per_peer_state = self.per_peer_state.read().unwrap();
 			let chan_id = prev_hop.outpoint.to_channel_id();
@@ -4714,14 +4714,26 @@ where
 						log_bytes!(chan_id), action);
 					peer_state.monitor_update_blocked_actions.entry(chan_id).or_insert(Vec::new()).push(action);
 				}
-				let res = handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
-					peer_state, per_peer_state, chan);
-				if let Err(e) = res {
-					// TODO: This is a *critical* error - we probably updated the outbound edge
-					// of the HTLC's monitor with a preimage. We should retry this monitor
-					// update over and over again until morale improves.
-					log_error!(self.logger, "Failed to update channel monitor with preimage {:?}", payment_preimage);
-					return Err((counterparty_node_id, e));
+				if !during_init {
+					let res = handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
+						peer_state, per_peer_state, chan);
+					if let Err(e) = res {
+						// TODO: This is a *critical* error - we probably updated the outbound edge
+						// of the HTLC's monitor with a preimage. We should retry this monitor
+						// update over and over again until morale improves.
+						log_error!(self.logger, "Failed to update channel monitor with preimage {:?}", payment_preimage);
+						return Err((counterparty_node_id, e));
+					}
+				} else {
+					// If we're running during init we cannot update a monitor directly -
+					// they probably haven't actually been loaded yet. Instead, push the
+					// monitor update as a background event.
+					self.pending_background_events.lock().unwrap().push(
+						BackgroundEvent::MonitorUpdateRegeneratedOnStartup {
+							counterparty_node_id,
+							funding_txo: prev_hop.outpoint,
+							update: monitor_update.clone(),
+						});
 				}
 			}
 			return Ok(());
@@ -4734,16 +4746,34 @@ where
 				payment_preimage,
 			}],
 		};
-		// We update the ChannelMonitor on the backward link, after
-		// receiving an `update_fulfill_htlc` from the forward link.
-		let update_res = self.chain_monitor.update_channel(prev_hop.outpoint, &preimage_update);
-		if update_res != ChannelMonitorUpdateStatus::Completed {
-			// TODO: This needs to be handled somehow - if we receive a monitor update
-			// with a preimage we *must* somehow manage to propagate it to the upstream
-			// channel, or we must have an ability to receive the same event and try
-			// again on restart.
-			log_error!(self.logger, "Critical error: failed to update channel monitor with preimage {:?}: {:?}",
-				payment_preimage, update_res);
+
+		if !during_init {
+			// We update the ChannelMonitor on the backward link, after
+			// receiving an `update_fulfill_htlc` from the forward link.
+			let update_res = self.chain_monitor.update_channel(prev_hop.outpoint, &preimage_update);
+			if update_res != ChannelMonitorUpdateStatus::Completed {
+				// TODO: This needs to be handled somehow - if we receive a monitor update
+				// with a preimage we *must* somehow manage to propagate it to the upstream
+				// channel, or we must have an ability to receive the same event and try
+				// again on restart.
+				log_error!(self.logger, "Critical error: failed to update channel monitor with preimage {:?}: {:?}",
+					payment_preimage, update_res);
+			}
+		} else {
+			// If we're running during init we cannot update a monitor directly - they probably
+			// haven't actually been loaded yet. Instead, push the monitor update as a background
+			// event.
+			// Note that while its safe to use `ClosingMonitorUpdateRegeneratedOnStartup` here (the
+			// channel is already closed) we need to ultimately handle the monitor update
+			// completion action only after we've completed the monitor update. This is the only
+			// way to guarantee this update *will* be regenerated on startup (otherwise if this was
+			// from a forwarded HTLC the downstream preimage may be deleted before we claim
+			// upstream). Thus, we need to transition to some new `BackgroundEvent` type which will
+			// complete the monitor update completion action from `completion_action`.
+			self.pending_background_events.lock().unwrap().push(
+				BackgroundEvent::ClosingMonitorUpdateRegeneratedOnStartup((
+					prev_hop.outpoint, preimage_update,
+				)));
 		}
 		// Note that we do process the completion action here. This totally could be a
 		// duplicate claim, but we have no way of knowing without interrogating the
@@ -4761,6 +4791,8 @@ where
 	fn claim_funds_internal(&self, source: HTLCSource, payment_preimage: PaymentPreimage, forwarded_htlc_value_msat: Option<u64>, from_onchain: bool, next_channel_id: [u8; 32]) {
 		match source {
 			HTLCSource::OutboundRoute { session_priv, payment_id, path, .. } => {
+				debug_assert!(self.background_events_processed_since_startup.load(Ordering::Acquire),
+					"We don't support claim_htlc claims during startup - monitors may not be available yet");
 				self.pending_outbound_payments.claim_htlc(payment_id, payment_preimage, session_priv, path, from_onchain, &self.pending_events, &self.logger);
 			},
 			HTLCSource::PreviousHopData(hop_data) => {
@@ -8485,6 +8517,11 @@ where
 		// Note that we have to do the above replays before we push new monitor updates.
 		pending_background_events.append(&mut close_background_events);
 
+		// If there's any preimages for forwarded HTLCs hanging around in ChannelMonitors we
+		// should ensure we try them again on the inbound edge. We put them here and do so after we
+		// have a fully-constructed `ChannelManager` at the end.
+		let mut pending_claims_to_replay = Vec::new();
+
 		{
 			// If we're tracking pending payments, ensure we haven't lost any by looking at the
 			// ChannelMonitor data for any channels for which we do not have authorative state
@@ -8495,7 +8532,8 @@ where
 			// We only rebuild the pending payments map if we were most recently serialized by
 			// 0.0.102+
 			for (_, monitor) in args.channel_monitors.iter() {
-				if id_to_peer.get(&monitor.get_funding_txo().0.to_channel_id()).is_none() {
+				let counterparty_opt = id_to_peer.get(&monitor.get_funding_txo().0.to_channel_id());
+				if counterparty_opt.is_none() {
 					for (htlc_source, (htlc, _)) in monitor.get_pending_or_resolved_outbound_htlcs() {
 						if let HTLCSource::OutboundRoute { payment_id, session_priv, path, .. } = htlc_source {
 							if path.hops.is_empty() {
@@ -8589,6 +8627,33 @@ where
 						}
 					}
 				}
+
+				// Whether the downstream channel was closed or not, try to re-apply any payment
+				// preimages from it which may be needed in upstream channels for forwarded
+				// payments.
+				let outbound_claimed_htlcs_iter = monitor.get_all_current_outbound_htlcs()
+					.into_iter()
+					.filter_map(|(htlc_source, (htlc, preimage_opt))| {
+						if let HTLCSource::PreviousHopData(_) = htlc_source {
+							if let Some(payment_preimage) = preimage_opt {
+								Some((htlc_source, payment_preimage, htlc.amount_msat,
+									// Check if `counterparty_opt.is_none()` to see if the
+									// downstream chan is closed (because we don't have a
+									// channel_id -> peer map entry).
+									counterparty_opt.is_none(),
+									monitor.get_funding_txo().0.to_channel_id()))
+							} else { None }
+						} else {
+							// If it was an outbound payment, we've handled it above - if a preimage
+							// came in and we persisted the `ChannelManager` we either handled it and
+							// are good to go or the channel force-closed - we don't have to handle the
+							// channel still live case here.
+							None
+						}
+					});
+				for tuple in outbound_claimed_htlcs_iter {
+					pending_claims_to_replay.push(tuple);
+				}
 			}
 		}
 
@@ -8821,7 +8886,6 @@ where
 			pending_events_processor: AtomicBool::new(false),
 			pending_background_events: Mutex::new(pending_background_events),
 			total_consistency_lock: RwLock::new(()),
-			#[cfg(debug_assertions)]
 			background_events_processed_since_startup: AtomicBool::new(false),
 			persistence_notifier: Notifier::new(),
 
@@ -8840,6 +8904,14 @@ where
 			channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
 		}
 
+		for (source, preimage, downstream_value, downstream_closed, downstream_chan_id) in pending_claims_to_replay {
+			// We use `downstream_closed` in place of `from_onchain` here just as a guess - we
+			// don't remember in the `ChannelMonitor` where we got a preimage from, but if the
+			// channel is closed we just assume that it probably came from an on-chain claim.
+			channel_manager.claim_funds_internal(source, preimage, Some(downstream_value),
+				downstream_closed, downstream_chan_id);
+		}
+
 		//TODO: Broadcast channel update for closed channels, but only after we've made a
 		//connection or two.
 