Commit a6d4676
Immediately unblock channels on duplicate claims
When `MonitorUpdateCompletionAction`s were added, we didn't consider the case of a duplicate claim during normal HTLC processing (as the handling only had an `if let` rather than a `match`, which made the branch easy to miss).

This can lead to a channel freezing indefinitely if an HTLC is claimed (without a `commitment_signed`), the peer disconnects, and then the HTLC is claimed again, leading to a never-completing `MonitorUpdateCompletionAction`.

The fix is simple - if we get back an `UpdateFulfillCommitFetch::DuplicateClaim` when claiming from the inbound edge, immediately unlock the outbound edge channel with a new `MonitorUpdateCompletionAction::FreeOtherChannelImmediately`.

Here we implement this fix by actually generating the new variant when a claim is duplicative.
1 parent 3bd1e41 commit a6d4676
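The `if let`-versus-`match` point in the message is worth seeing concretely: an `if let` on the `NewClaim` variant of `UpdateFulfillCommitFetch` silently skips the `DuplicateClaim` case, so no completion action is ever generated for it. A minimal sketch with simplified stand-in types (the real `NewClaim` also carries a `monitor_update`):

```rust
// Stand-in for LDK's `UpdateFulfillCommitFetch`; only what matters here is kept.
enum UpdateFulfillCommitFetch {
    NewClaim { htlc_value_msat: u64 },
    DuplicateClaim {},
}

// The pre-fix shape: an `if let` handles `NewClaim` and silently ignores
// everything else, so a duplicate claim produces no completion action.
fn handle_claim_old(res: &UpdateFulfillCommitFetch) {
    if let UpdateFulfillCommitFetch::NewClaim { htlc_value_msat } = res {
        println!("new claim of {} msat: queue completion action", htlc_value_msat);
    }
    // `DuplicateClaim` falls through here: nothing ever unblocks the other channel.
}

// The post-fix shape: a `match` forces the duplicate branch to be written,
// which is where this commit frees the downstream channel immediately.
fn handle_claim_new(res: &UpdateFulfillCommitFetch) {
    match res {
        UpdateFulfillCommitFetch::NewClaim { htlc_value_msat } =>
            println!("new claim of {} msat: queue completion action", htlc_value_msat),
        UpdateFulfillCommitFetch::DuplicateClaim {} =>
            println!("duplicate claim: free the other channel immediately"),
    }
}

fn main() {
    let new = UpdateFulfillCommitFetch::NewClaim { htlc_value_msat: 1_000_000 };
    let dup = UpdateFulfillCommitFetch::DuplicateClaim {};
    handle_claim_new(&new);
    handle_claim_old(&dup); // prints nothing - the channel would stay blocked
    handle_claim_new(&dup); // handles the duplicate explicitly
}
```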

2 files changed: +217 −39
lightning/src/ln/chanmon_update_fail_tests.rs

+69
@@ -3431,3 +3431,72 @@ fn test_reload_mon_update_completion_actions() {
 	do_test_reload_mon_update_completion_actions(true);
 	do_test_reload_mon_update_completion_actions(false);
 }
+
+fn do_test_glacial_peer_cant_hang(hold_chan_a: bool) {
+	// Test that if a peer manages to send an `update_fulfill_htlc` message without a
+	// `commitment_signed`, disconnects, then replays the `update_fulfill_htlc` message it doesn't
+	// result in a channel hang. This was previously broken as the `DuplicateClaim` case wasn't
+	// handled when claiming an HTLC, and handling wasn't added when completion actions
+	// (which must always complete at some point) were added.
+	let chanmon_cfgs = create_chanmon_cfgs(3);
+	let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
+
+	let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[None, None, None]);
+	let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
+
+	create_announced_chan_between_nodes(&nodes, 0, 1);
+	create_announced_chan_between_nodes(&nodes, 1, 2);
+
+	// Route a payment from A, through B, to C, then claim it on C. Replay the
+	// `update_fulfill_htlc` twice on B to check that B doesn't hang.
+	let (payment_preimage, payment_hash, ..) = route_payment(&nodes[0], &[&nodes[1], &nodes[2]], 1_000_000);
+
+	nodes[2].node.claim_funds(payment_preimage);
+	check_added_monitors(&nodes[2], 1);
+	expect_payment_claimed!(nodes[2], payment_hash, 1_000_000);
+
+	let cs_updates = get_htlc_update_msgs(&nodes[2], &nodes[1].node.get_our_node_id());
+	if hold_chan_a {
+		// The first update will be on the A <-> B channel, which we hold (the monitor update stays in-progress).
+		chanmon_cfgs[1].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
+	}
+	nodes[1].node.handle_update_fulfill_htlc(&nodes[2].node.get_our_node_id(), &cs_updates.update_fulfill_htlcs[0]);
+	check_added_monitors(&nodes[1], 1);
+
+	if !hold_chan_a {
+		let bs_updates = get_htlc_update_msgs(&nodes[1], &nodes[0].node.get_our_node_id());
+		nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &bs_updates.update_fulfill_htlcs[0]);
+		commitment_signed_dance!(nodes[0], nodes[1], bs_updates.commitment_signed, false);
+		expect_payment_sent!(&nodes[0], payment_preimage);
+	}
+
+	nodes[1].node.peer_disconnected(&nodes[2].node.get_our_node_id());
+	nodes[2].node.peer_disconnected(&nodes[1].node.get_our_node_id());
+
+	let mut reconnect = ReconnectArgs::new(&nodes[1], &nodes[2]);
+	reconnect.pending_htlc_claims = (1, 0);
+	reconnect_nodes(reconnect);
+
+	if !hold_chan_a {
+		expect_payment_forwarded!(nodes[1], nodes[0], nodes[2], Some(1000), false, false);
+		send_payment(&nodes[0], &[&nodes[1], &nodes[2]], 100_000);
+	} else {
+		assert!(nodes[1].node.get_and_clear_pending_events().is_empty());
+		assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty());
+
+		let (route, payment_hash_2, _, payment_secret_2) = get_route_and_payment_hash!(&nodes[1], nodes[2], 1_000_000);
+
+		nodes[1].node.send_payment_with_route(&route, payment_hash_2,
+			RecipientOnionFields::secret_only(payment_secret_2), PaymentId(payment_hash_2.0)).unwrap();
+		check_added_monitors(&nodes[1], 0);
+
+		assert!(nodes[1].node.get_and_clear_pending_events().is_empty());
+		assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty());
+	}
+}
+
+#[test]
+fn test_glacial_peer_cant_hang() {
+	do_test_glacial_peer_cant_hang(false);
+	do_test_glacial_peer_cant_hang(true);
+}

lightning/src/ln/channelmanager.rs

+148 −39

@@ -563,6 +563,7 @@ struct ClaimablePayments {
 /// usually because we're running pre-full-init. They are handled immediately once we detect we are
 /// running normally, and specifically must be processed before any other non-background
 /// [`ChannelMonitorUpdate`]s are applied.
+#[derive(Debug)]
 enum BackgroundEvent {
 	/// Handle a ChannelMonitorUpdate which closes the channel or for an already-closed channel.
 	/// This is only separated from [`Self::MonitorUpdateRegeneratedOnStartup`] as the
@@ -5381,8 +5382,11 @@
 		for htlc in sources.drain(..) {
 			if let Err((pk, err)) = self.claim_funds_from_hop(
 				htlc.prev_hop, payment_preimage,
-				|_| Some(MonitorUpdateCompletionAction::PaymentClaimed { payment_hash }))
-			{
+				|_, definitely_duplicate| {
+					debug_assert!(!definitely_duplicate, "We shouldn't claim duplicatively from a payment");
+					Some(MonitorUpdateCompletionAction::PaymentClaimed { payment_hash })
+				}
+			) {
 				if let msgs::ErrorAction::IgnoreError = err.err.action {
 					// We got a temporary failure updating monitor, but will claim the
 					// HTLC when the monitor updating is restored (or on chain).
@@ -5410,7 +5414,7 @@
 		}
 	}

-	fn claim_funds_from_hop<ComplFunc: FnOnce(Option<u64>) -> Option<MonitorUpdateCompletionAction>>(&self,
+	fn claim_funds_from_hop<ComplFunc: FnOnce(Option<u64>, bool) -> Option<MonitorUpdateCompletionAction>>(&self,
 		prev_hop: HTLCPreviousHopData, payment_preimage: PaymentPreimage, completion_action: ComplFunc)
 	-> Result<(), (PublicKey, MsgHandleErrInternal)> {
 		//TODO: Delay the claimed_funds relaying just like we do outbound relay!
@@ -5420,6 +5424,11 @@
 		// `BackgroundEvent`s.
 		let during_init = !self.background_events_processed_since_startup.load(Ordering::Acquire);

+		// As we may call handle_monitor_update_completion_actions in rather rare cases, check that
+		// the required mutexes are not held before we start.
+		debug_assert_ne!(self.pending_events.held_by_thread(), LockHeldState::HeldByThread);
+		debug_assert_ne!(self.claimable_payments.held_by_thread(), LockHeldState::HeldByThread);
+
 		{
 			let per_peer_state = self.per_peer_state.read().unwrap();
 			let chan_id = prev_hop.outpoint.to_channel_id();
@@ -5441,25 +5450,70 @@
 				let counterparty_node_id = chan.context.get_counterparty_node_id();
 				let fulfill_res = chan.get_update_fulfill_htlc_and_commit(prev_hop.htlc_id, payment_preimage, &self.logger);

-				if let UpdateFulfillCommitFetch::NewClaim { htlc_value_msat, monitor_update } = fulfill_res {
-					if let Some(action) = completion_action(Some(htlc_value_msat)) {
-						log_trace!(self.logger, "Tracking monitor update completion action for channel {}: {:?}",
-							chan_id, action);
-						peer_state.monitor_update_blocked_actions.entry(chan_id).or_insert(Vec::new()).push(action);
+				match fulfill_res {
+					UpdateFulfillCommitFetch::NewClaim { htlc_value_msat, monitor_update } => {
+						if let Some(action) = completion_action(Some(htlc_value_msat), false) {
+							log_trace!(self.logger, "Tracking monitor update completion action for channel {}: {:?}",
+								chan_id, action);
+							peer_state.monitor_update_blocked_actions.entry(chan_id).or_insert(Vec::new()).push(action);
+						}
+						if !during_init {
+							handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
+								peer_state, per_peer_state, chan);
+						} else {
+							// If we're running during init we cannot update a monitor directly -
+							// they probably haven't actually been loaded yet. Instead, push the
+							// monitor update as a background event.
+							self.pending_background_events.lock().unwrap().push(
+								BackgroundEvent::MonitorUpdateRegeneratedOnStartup {
+									counterparty_node_id,
+									funding_txo: prev_hop.outpoint,
+									update: monitor_update.clone(),
+								});
+						}
 					}
-					if !during_init {
-						handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
-							peer_state, per_peer_state, chan);
-					} else {
-						// If we're running during init we cannot update a monitor directly -
-						// they probably haven't actually been loaded yet. Instead, push the
-						// monitor update as a background event.
-						self.pending_background_events.lock().unwrap().push(
-							BackgroundEvent::MonitorUpdateRegeneratedOnStartup {
-								counterparty_node_id,
-								funding_txo: prev_hop.outpoint,
-								update: monitor_update.clone(),
-							});
+					UpdateFulfillCommitFetch::DuplicateClaim {} => {
+						let action = if let Some(action) = completion_action(None, true) {
+							action
+						} else {
+							return Ok(());
+						};
+						mem::drop(peer_state_lock);
+
+						log_trace!(self.logger, "Completing monitor update completion action for channel {} as claim was redundant: {:?}",
+							chan_id, action);
+						let (node_id, funding_outpoint, blocker) =
+							if let MonitorUpdateCompletionAction::FreeOtherChannelImmediately {
+								downstream_counterparty_node_id: node_id,
+								downstream_funding_outpoint: funding_outpoint,
+								blocking_action: blocker,
+							} = action {
+								(node_id, funding_outpoint, blocker)
+							} else {
+								debug_assert!(false,
+									"Duplicate claims should always free another channel immediately");
+								return Ok(());
+							};
+						if let Some(peer_state_mtx) = per_peer_state.get(&node_id) {
+							let mut peer_state = peer_state_mtx.lock().unwrap();
+							if let Some(blockers) = peer_state
+								.actions_blocking_raa_monitor_updates
+								.get_mut(&funding_outpoint.to_channel_id())
+							{
+								let mut found_blocker = false;
+								blockers.retain(|iter| {
+									// Note that we could actually be blocked, in
+									// which case we need to only remove the one
+									// blocker which was added duplicatively.
+									let first_blocker = !found_blocker;
+									if *iter == blocker { found_blocker = true; }
+									*iter != blocker || !first_blocker
+								});
+								debug_assert!(found_blocker);
+							}
+						} else {
+							debug_assert!(false);
+						}
 					}
 				}
 			}
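One detail worth calling out in the `DuplicateClaim` arm above: the `retain` closure removes only the first blocker equal to the completed action, because the same `RAAMonitorUpdateBlockingAction` can legitimately appear in the list more than once. A standalone sketch of that idiom (hypothetical `remove_first_occurrence` helper, with `i32` standing in for the blocker type):

```rust
/// Removes only the first element equal to `target`, returning whether one was found.
/// Later duplicates are kept, mirroring the `blockers.retain(..)` logic above.
fn remove_first_occurrence<T: PartialEq>(items: &mut Vec<T>, target: &T) -> bool {
    let mut found = false;
    items.retain(|item| {
        let first_match = !found;
        if item == target { found = true; }
        // Keep everything except the first match.
        item != target || !first_match
    });
    found
}

fn main() {
    let mut blockers = vec![1, 2, 2, 3];
    assert!(remove_first_occurrence(&mut blockers, &2));
    assert_eq!(blockers, vec![1, 2, 3]); // only the first `2` was removed
    assert!(!remove_first_occurrence(&mut blockers, &9));
}
```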
@@ -5507,7 +5561,7 @@
 		// `ChannelMonitor` we've provided the above update to. Instead, note that `Event`s are
 		// generally always allowed to be duplicative (and it's specifically noted in
 		// `PaymentForwarded`).
-		self.handle_monitor_update_completion_actions(completion_action(None));
+		self.handle_monitor_update_completion_actions(completion_action(None, false));
 		Ok(())
 	}

@@ -5537,13 +5591,74 @@
 			HTLCSource::PreviousHopData(hop_data) => {
 				let prev_outpoint = hop_data.outpoint;
 				let completed_blocker = RAAMonitorUpdateBlockingAction::from_prev_hop_data(&hop_data);
+				#[cfg(debug_assertions)]
+				let claiming_chan_funding_outpoint = hop_data.outpoint;
 				let res = self.claim_funds_from_hop(hop_data, payment_preimage,
-					|htlc_claim_value_msat| {
-						if let Some(forwarded_htlc_value) = forwarded_htlc_value_msat {
-							let fee_earned_msat = if let Some(claimed_htlc_value) = htlc_claim_value_msat {
-								Some(claimed_htlc_value - forwarded_htlc_value)
-							} else { None };
+					|htlc_claim_value_msat, definitely_duplicate| {
+						let chan_to_release =
+							if let Some(node_id) = next_channel_counterparty_node_id {
+								Some((node_id, next_channel_outpoint, completed_blocker))
+							} else {
+								// We can only get `None` here if we are processing a
+								// `ChannelMonitor`-originated event, in which case we
+								// don't care about ensuring we wake the downstream
+								// channel's monitor updating - the channel is already
+								// closed.
+								None
+							};

+						if definitely_duplicate && startup_replay {
+							// On startup we may get redundant claims which are related to
+							// monitor updates still in flight. In that case, we shouldn't
+							// immediately free, but instead let that monitor update complete
+							// in the background.
+							#[cfg(debug_assertions)] {
+								let background_events = self.pending_background_events.lock().unwrap();
+								// There should be a `BackgroundEvent` pending...
+								assert!(background_events.iter().any(|ev| {
+									match ev {
+										// to apply a monitor update that blocked the claiming channel,
+										BackgroundEvent::MonitorUpdateRegeneratedOnStartup {
+											funding_txo, update, ..
+										} => {
+											if *funding_txo == claiming_chan_funding_outpoint {
+												assert!(update.updates.iter().any(|upd|
+													if let ChannelMonitorUpdateStep::PaymentPreimage {
+														payment_preimage: update_preimage
+													} = upd {
+														payment_preimage == *update_preimage
+													} else { false }
+												), "{:?}", update);
+												true
+											} else { false }
+										},
+										// or the channel we'd unblock is already closed,
+										BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup((funding_txo, ..))
+											=> *funding_txo == next_channel_outpoint,
+										// or the monitor update has completed and will unblock
+										// immediately once we get going.
+										BackgroundEvent::MonitorUpdatesComplete {
+											channel_id, ..
+										} =>
+											*channel_id == claiming_chan_funding_outpoint.to_channel_id(),
+									}
+								}), "{:?}", *background_events);
+							}
+							None
+						} else if definitely_duplicate {
+							if let Some(other_chan) = chan_to_release {
+								Some(MonitorUpdateCompletionAction::FreeOtherChannelImmediately {
+									downstream_counterparty_node_id: other_chan.0,
+									downstream_funding_outpoint: other_chan.1,
+									blocking_action: other_chan.2,
+								})
+							} else { None }
+						} else {
+							let fee_earned_msat = if let Some(forwarded_htlc_value) = forwarded_htlc_value_msat {
+								if let Some(claimed_htlc_value) = htlc_claim_value_msat {
+									Some(claimed_htlc_value - forwarded_htlc_value)
+								} else { None }
+							} else { None };
 							Some(MonitorUpdateCompletionAction::EmitEventAndFreeOtherChannel {
 								event: events::Event::PaymentForwarded {
 									fee_earned_msat,
@@ -5552,19 +5667,9 @@
 									next_channel_id: Some(next_channel_outpoint.to_channel_id()),
 									outbound_amount_forwarded_msat: forwarded_htlc_value_msat,
 								},
-								downstream_counterparty_and_funding_outpoint:
-									if let Some(node_id) = next_channel_counterparty_node_id {
-										Some((node_id, next_channel_outpoint, completed_blocker))
-									} else {
-										// We can only get `None` here if we are processing a
-										// `ChannelMonitor`-originated event, in which case we
-										// don't care about ensuring we wake the downstream
-										// channel's monitor updating - the channel is already
-										// closed.
-										None
-									},
+								downstream_counterparty_and_funding_outpoint: chan_to_release,
 							})
-						} else { None }
+						}
 					});
 				if let Err((pk, err)) = res {
 					let result: Result<(), _> = Err(err);
@@ -5580,6 +5685,10 @@
 	}

 	fn handle_monitor_update_completion_actions<I: IntoIterator<Item=MonitorUpdateCompletionAction>>(&self, actions: I) {
+		debug_assert_ne!(self.pending_events.held_by_thread(), LockHeldState::HeldByThread);
+		debug_assert_ne!(self.claimable_payments.held_by_thread(), LockHeldState::HeldByThread);
+		debug_assert_ne!(self.per_peer_state.held_by_thread(), LockHeldState::HeldByThread);
+
 		for action in actions.into_iter() {
 			match action {
 				MonitorUpdateCompletionAction::PaymentClaimed { payment_hash } => {
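The `debug_assert_ne!(.., LockHeldState::HeldByThread)` checks added above rely on LDK's debug lock-tracking wrappers. A rough sketch of how such a "this thread must not already hold the lock" check can be implemented (an illustrative stand-in, not LDK's actual `debug_sync` module): give each wrapper a unique id and record, in a thread-local set, which locks the current thread holds.

```rust
use std::cell::RefCell;
use std::collections::HashSet;
use std::sync::{Mutex, MutexGuard};
use std::sync::atomic::{AtomicUsize, Ordering};

thread_local! {
    // Ids of all tracked locks the current thread holds.
    static LOCKS_HELD: RefCell<HashSet<usize>> = RefCell::new(HashSet::new());
}

#[derive(Debug, PartialEq)]
enum LockHeldState {
    HeldByThread,
    NotHeldByThread,
}

struct TrackedMutex<T> {
    id: usize,
    inner: Mutex<T>,
}

impl<T> TrackedMutex<T> {
    fn new(val: T) -> Self {
        static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
        TrackedMutex { id: NEXT_ID.fetch_add(1, Ordering::Relaxed), inner: Mutex::new(val) }
    }

    fn held_by_thread(&self) -> LockHeldState {
        let held = LOCKS_HELD.with(|set| set.borrow().contains(&self.id));
        if held { LockHeldState::HeldByThread } else { LockHeldState::NotHeldByThread }
    }

    fn lock(&self) -> TrackedGuard<'_, T> {
        let guard = self.inner.lock().unwrap();
        LOCKS_HELD.with(|set| { set.borrow_mut().insert(self.id); });
        TrackedGuard { id: self.id, _guard: guard }
    }
}

struct TrackedGuard<'a, T> {
    id: usize,
    _guard: MutexGuard<'a, T>,
}

impl<T> Drop for TrackedGuard<'_, T> {
    fn drop(&mut self) {
        LOCKS_HELD.with(|set| { set.borrow_mut().remove(&self.id); });
    }
}

fn main() {
    let pending_events = TrackedMutex::new(Vec::<u32>::new());
    // The pattern from the diff: assert the lock is NOT already held by this
    // thread before entering code that may need to take it.
    debug_assert_ne!(pending_events.held_by_thread(), LockHeldState::HeldByThread);
    let _guard = pending_events.lock();
    assert_eq!(pending_events.held_by_thread(), LockHeldState::HeldByThread);
}
```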
