Skip to content

Commit a9ab9b7

Browse files
committed
Rewrite Channel resend tracking to make it much more reliable
Resending revoke_and_ack and commitment_signed (+update) messages after monitor-update-failure or disconnection has been a highly unreliable part of our codebase for some time (as evidenced by the number of bugs caught in the chanmon_fail_consistency fuzz target). This is due to its rather ad-hoc nature: the tracking logic checks a number of different flags to try to deduce which messages were or were not delivered, and goes from there. Instead, this commit rewrites it to simply keep track of the order in which messages were originally generated, as we always resend in the originally-generated order. I anticipate this will be far more robust than the old code, in addition to being simpler.
1 parent 2a19051 commit a9ab9b7

File tree

2 files changed

+41
-68
lines changed

2 files changed

+41
-68
lines changed

src/ln/channel.rs

Lines changed: 40 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -237,19 +237,19 @@ pub(super) struct Channel {
237237
cur_local_commitment_transaction_number: u64,
238238
cur_remote_commitment_transaction_number: u64,
239239
value_to_self_msat: u64, // Excluding all pending_htlcs, excluding fees
240-
/// Upon receipt of a channel_reestablish we have to figure out whether to send a
241-
/// revoke_and_ack first or a commitment update first. Generally, we prefer to send
242-
/// revoke_and_ack first, but if we had a pending commitment update of our own waiting on a
243-
/// remote revoke when we received the latest commitment update from the remote we have to make
244-
/// sure that commitment update gets resent first.
245-
received_commitment_while_awaiting_raa: bool,
246240
pending_inbound_htlcs: Vec<InboundHTLCOutput>,
247241
pending_outbound_htlcs: Vec<OutboundHTLCOutput>,
248242
holding_cell_htlc_updates: Vec<HTLCUpdateAwaitingACK>,
249243

244+
/// When resending CS/RAA messages on channel monitor restoration or on reconnect, we always
245+
/// need to ensure we resend them in the order we originally generated them. Thus, we track
246+
/// that order here by always setting this to the opposite of the message we are generating (ie
247+
/// when we generate a CS, we set this to RAAFirst as we should send any pending RAAs first and
248+
/// vice-versa).
249+
resend_order: RAACommitmentOrder,
250+
250251
monitor_pending_revoke_and_ack: bool,
251252
monitor_pending_commitment_signed: bool,
252-
monitor_pending_order: Option<RAACommitmentOrder>,
253253
monitor_pending_forwards: Vec<(PendingForwardHTLCInfo, u64)>,
254254
monitor_pending_failures: Vec<(HTLCSource, PaymentHash, HTLCFailReason)>,
255255

@@ -457,7 +457,6 @@ impl Channel {
457457
cur_local_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
458458
cur_remote_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
459459
value_to_self_msat: channel_value_satoshis * 1000 - push_msat,
460-
received_commitment_while_awaiting_raa: false,
461460

462461
pending_inbound_htlcs: Vec::new(),
463462
pending_outbound_htlcs: Vec::new(),
@@ -468,9 +467,10 @@ impl Channel {
468467
next_remote_htlc_id: 0,
469468
channel_update_count: 1,
470469

470+
resend_order: RAACommitmentOrder::CommitmentFirst,
471+
471472
monitor_pending_revoke_and_ack: false,
472473
monitor_pending_commitment_signed: false,
473-
monitor_pending_order: None,
474474
monitor_pending_forwards: Vec::new(),
475475
monitor_pending_failures: Vec::new(),
476476

@@ -646,7 +646,6 @@ impl Channel {
646646
cur_local_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
647647
cur_remote_commitment_transaction_number: INITIAL_COMMITMENT_NUMBER,
648648
value_to_self_msat: msg.push_msat,
649-
received_commitment_while_awaiting_raa: false,
650649

651650
pending_inbound_htlcs: Vec::new(),
652651
pending_outbound_htlcs: Vec::new(),
@@ -657,9 +656,10 @@ impl Channel {
657656
next_remote_htlc_id: 0,
658657
channel_update_count: 1,
659658

659+
resend_order: RAACommitmentOrder::CommitmentFirst,
660+
660661
monitor_pending_revoke_and_ack: false,
661662
monitor_pending_commitment_signed: false,
662-
monitor_pending_order: None,
663663
monitor_pending_forwards: Vec::new(),
664664
monitor_pending_failures: Vec::new(),
665665

@@ -1810,12 +1810,6 @@ impl Channel {
18101810
}
18111811
}
18121812

1813-
if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) == 0 {
1814-
// This is a response to our post-monitor-failed unfreeze messages, so we can clear the
1815-
// monitor_pending_order requirement as we won't re-send the monitor_pending messages.
1816-
self.monitor_pending_order = None;
1817-
}
1818-
18191813
self.channel_monitor.provide_latest_local_commitment_tx_info(local_commitment_tx.0, local_keys, self.feerate_per_kw, htlcs_and_sigs);
18201814

18211815
for htlc in self.pending_inbound_htlcs.iter_mut() {
@@ -1838,14 +1832,13 @@ impl Channel {
18381832

18391833
self.cur_local_commitment_transaction_number -= 1;
18401834
self.last_local_commitment_txn = new_local_commitment_txn;
1841-
self.received_commitment_while_awaiting_raa = (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) != 0;
1835+
// Note that if we need_our_commitment & !AwaitingRemoteRevoke we'll call
1836+
// send_commitment_no_status_check() next which will reset this to RAAFirst.
1837+
self.resend_order = RAACommitmentOrder::CommitmentFirst;
18421838

18431839
if (self.channel_state & ChannelState::MonitorUpdateFailed as u32) != 0 {
18441840
// In case we initially failed monitor updating without requiring a response, we need
18451841
// to make sure the RAA gets sent first.
1846-
if !self.monitor_pending_commitment_signed {
1847-
self.monitor_pending_order = Some(RAACommitmentOrder::RevokeAndACKFirst);
1848-
}
18491842
self.monitor_pending_revoke_and_ack = true;
18501843
if need_our_commitment && (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32)) == 0 {
18511844
// If we were going to send a commitment_signed after the RAA, go ahead and do all
@@ -2019,12 +2012,6 @@ impl Channel {
20192012
self.their_prev_commitment_point = self.their_cur_commitment_point;
20202013
self.their_cur_commitment_point = Some(msg.next_per_commitment_point);
20212014
self.cur_remote_commitment_transaction_number -= 1;
2022-
self.received_commitment_while_awaiting_raa = false;
2023-
if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) == 0 {
2024-
// This is a response to our post-monitor-failed unfreeze messages, so we can clear the
2025-
// monitor_pending_order requirement as we won't re-send the monitor_pending messages.
2026-
self.monitor_pending_order = None;
2027-
}
20282015

20292016
log_trace!(self, "Updating HTLCs on receipt of RAA...");
20302017
let mut to_forward_infos = Vec::new();
@@ -2142,7 +2129,7 @@ impl Channel {
21422129
// When the monitor updating is restored we'll call get_last_commitment_update(),
21432130
// which does not update state, but we're definitely now awaiting a remote revoke
21442131
// before we can step forward any more, so set it here.
2145-
self.channel_state |= ChannelState::AwaitingRemoteRevoke as u32;
2132+
self.send_commitment_no_status_check()?;
21462133
}
21472134
self.monitor_pending_forwards.append(&mut to_forward_infos);
21482135
self.monitor_pending_failures.append(&mut revoked_htlcs);
@@ -2290,15 +2277,13 @@ impl Channel {
22902277
/// Indicates that a ChannelMonitor update failed to be stored by the client and further
22912278
/// updates are partially paused.
22922279
/// This must be called immediately after the call which generated the ChannelMonitor update
2293-
/// which failed, with the order argument set to the type of call it represented (ie a
2294-
/// commitment update or a revoke_and_ack generation). The messages which were generated from
2295-
/// that original call must *not* have been sent to the remote end, and must instead have been
2296-
/// dropped. They will be regenerated when monitor_updating_restored is called.
2297-
pub fn monitor_update_failed(&mut self, order: RAACommitmentOrder, resend_raa: bool, resend_commitment: bool, mut pending_forwards: Vec<(PendingForwardHTLCInfo, u64)>, mut pending_fails: Vec<(HTLCSource, PaymentHash, HTLCFailReason)>) {
2280+
/// which failed. The messages generated by the call that triggered the
2281+
/// monitor update failure must *not* have been sent to the remote end, and must instead
2282+
/// have been dropped. They will be regenerated when monitor_updating_restored is called.
2283+
pub fn monitor_update_failed(&mut self, resend_raa: bool, resend_commitment: bool, mut pending_forwards: Vec<(PendingForwardHTLCInfo, u64)>, mut pending_fails: Vec<(HTLCSource, PaymentHash, HTLCFailReason)>) {
22982284
assert_eq!(self.channel_state & ChannelState::MonitorUpdateFailed as u32, 0);
22992285
self.monitor_pending_revoke_and_ack = resend_raa;
23002286
self.monitor_pending_commitment_signed = resend_commitment;
2301-
self.monitor_pending_order = Some(order);
23022287
assert!(self.monitor_pending_forwards.is_empty());
23032288
mem::swap(&mut pending_forwards, &mut self.monitor_pending_forwards);
23042289
assert!(self.monitor_pending_failures.is_empty());
@@ -2319,7 +2304,6 @@ impl Channel {
23192304
mem::swap(&mut failures, &mut self.monitor_pending_failures);
23202305

23212306
if self.channel_state & (ChannelState::PeerDisconnected as u32) != 0 {
2322-
// Leave monitor_pending_order so we can order our channel_reestablish responses
23232307
self.monitor_pending_revoke_and_ack = false;
23242308
self.monitor_pending_commitment_signed = false;
23252309
return (None, None, RAACommitmentOrder::RevokeAndACKFirst, forwards, failures);
@@ -2334,7 +2318,7 @@ impl Channel {
23342318

23352319
self.monitor_pending_revoke_and_ack = false;
23362320
self.monitor_pending_commitment_signed = false;
2337-
let order = self.monitor_pending_order.clone().unwrap();
2321+
let order = self.resend_order.clone();
23382322
log_trace!(self, "Restored monitor updating resulting in {} commitment update and {} RAA, with {} first",
23392323
if commitment_update.is_some() { "a" } else { "no" },
23402324
if raa.is_some() { "an" } else { "no" },
@@ -2497,33 +2481,26 @@ impl Channel {
24972481
})
24982482
} else { None };
24992483

2500-
let order = self.monitor_pending_order.clone().unwrap_or(if self.received_commitment_while_awaiting_raa {
2501-
RAACommitmentOrder::CommitmentFirst
2502-
} else {
2503-
RAACommitmentOrder::RevokeAndACKFirst
2504-
});
2505-
25062484
if msg.next_local_commitment_number == our_next_remote_commitment_number {
25072485
if required_revoke.is_some() {
25082486
log_debug!(self, "Reconnected channel {} with only lost outbound RAA", log_bytes!(self.channel_id()));
25092487
} else {
25102488
log_debug!(self, "Reconnected channel {} with no loss", log_bytes!(self.channel_id()));
25112489
}
25122490

2513-
if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 &&
2514-
self.monitor_pending_order.is_none() { // monitor_pending_order indicates we're waiting on a response to a unfreeze
2491+
if (self.channel_state & (ChannelState::AwaitingRemoteRevoke as u32 | ChannelState::MonitorUpdateFailed as u32)) == 0 {
25152492
// We're up-to-date and not waiting on a remote revoke (if we are our
25162493
// channel_reestablish should result in them sending a revoke_and_ack), but we may
25172494
// have received some updates while we were disconnected. Free the holding cell
25182495
// now!
25192496
match self.free_holding_cell_htlcs() {
25202497
Err(ChannelError::Close(msg)) => return Err(ChannelError::Close(msg)),
25212498
Err(ChannelError::Ignore(_)) => panic!("Got non-channel-failing result from free_holding_cell_htlcs"),
2522-
Ok(Some((commitment_update, channel_monitor))) => return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(channel_monitor), order, shutdown_msg)),
2523-
Ok(None) => return Ok((resend_funding_locked, required_revoke, None, None, order, shutdown_msg)),
2499+
Ok(Some((commitment_update, channel_monitor))) => return Ok((resend_funding_locked, required_revoke, Some(commitment_update), Some(channel_monitor), self.resend_order.clone(), shutdown_msg)),
2500+
Ok(None) => return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg)),
25242501
}
25252502
} else {
2526-
return Ok((resend_funding_locked, required_revoke, None, None, order, shutdown_msg));
2503+
return Ok((resend_funding_locked, required_revoke, None, None, self.resend_order.clone(), shutdown_msg));
25272504
}
25282505
} else if msg.next_local_commitment_number == our_next_remote_commitment_number - 1 {
25292506
if required_revoke.is_some() {
@@ -2534,10 +2511,10 @@ impl Channel {
25342511

25352512
if self.channel_state & (ChannelState::MonitorUpdateFailed as u32) != 0 {
25362513
self.monitor_pending_commitment_signed = true;
2537-
return Ok((resend_funding_locked, None, None, None, order, shutdown_msg));
2514+
return Ok((resend_funding_locked, None, None, None, self.resend_order.clone(), shutdown_msg));
25382515
}
25392516

2540-
return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update()), None, order, shutdown_msg));
2517+
return Ok((resend_funding_locked, required_revoke, Some(self.get_last_commitment_update()), None, self.resend_order.clone(), shutdown_msg));
25412518
} else {
25422519
return Err(ChannelError::Close("Peer attempted to reestablish channel with a very old remote commitment transaction"));
25432520
}
@@ -3358,6 +3335,7 @@ impl Channel {
33583335
htlc.state = OutboundHTLCState::AwaitingRemovedRemoteRevoke(fail_reason);
33593336
}
33603337
}
3338+
self.resend_order = RAACommitmentOrder::RevokeAndACKFirst;
33613339

33623340
let (res, remote_commitment_tx, htlcs) = match self.send_commitment_no_state_update() {
33633341
Ok((res, (remote_commitment_tx, mut htlcs))) => {
@@ -3568,8 +3546,6 @@ impl Writeable for Channel {
35683546
self.cur_remote_commitment_transaction_number.write(writer)?;
35693547
self.value_to_self_msat.write(writer)?;
35703548

3571-
self.received_commitment_while_awaiting_raa.write(writer)?;
3572-
35733549
let mut dropped_inbound_htlcs = 0;
35743550
for htlc in self.pending_inbound_htlcs.iter() {
35753551
if let InboundHTLCState::RemoteAnnounced(_) = htlc.state {
@@ -3669,13 +3645,13 @@ impl Writeable for Channel {
36693645
}
36703646
}
36713647

3648+
match self.resend_order {
3649+
RAACommitmentOrder::CommitmentFirst => 0u8.write(writer)?,
3650+
RAACommitmentOrder::RevokeAndACKFirst => 1u8.write(writer)?,
3651+
}
3652+
36723653
self.monitor_pending_revoke_and_ack.write(writer)?;
36733654
self.monitor_pending_commitment_signed.write(writer)?;
3674-
match self.monitor_pending_order {
3675-
None => 0u8.write(writer)?,
3676-
Some(RAACommitmentOrder::CommitmentFirst) => 1u8.write(writer)?,
3677-
Some(RAACommitmentOrder::RevokeAndACKFirst) => 2u8.write(writer)?,
3678-
}
36793655

36803656
(self.monitor_pending_forwards.len() as u64).write(writer)?;
36813657
for &(ref pending_forward, ref htlc_id) in self.monitor_pending_forwards.iter() {
@@ -3773,8 +3749,6 @@ impl<R : ::std::io::Read> ReadableArgs<R, Arc<Logger>> for Channel {
37733749
let cur_remote_commitment_transaction_number = Readable::read(reader)?;
37743750
let value_to_self_msat = Readable::read(reader)?;
37753751

3776-
let received_commitment_while_awaiting_raa = Readable::read(reader)?;
3777-
37783752
let pending_inbound_htlc_count: u64 = Readable::read(reader)?;
37793753
let mut pending_inbound_htlcs = Vec::with_capacity(cmp::min(pending_inbound_htlc_count as usize, OUR_MAX_HTLCS as usize));
37803754
for _ in 0..pending_inbound_htlc_count {
@@ -3837,16 +3811,15 @@ impl<R : ::std::io::Read> ReadableArgs<R, Arc<Logger>> for Channel {
38373811
});
38383812
}
38393813

3840-
let monitor_pending_revoke_and_ack = Readable::read(reader)?;
3841-
let monitor_pending_commitment_signed = Readable::read(reader)?;
3842-
3843-
let monitor_pending_order = match <u8 as Readable<R>>::read(reader)? {
3844-
0 => None,
3845-
1 => Some(RAACommitmentOrder::CommitmentFirst),
3846-
2 => Some(RAACommitmentOrder::RevokeAndACKFirst),
3814+
let resend_order = match <u8 as Readable<R>>::read(reader)? {
3815+
0 => RAACommitmentOrder::CommitmentFirst,
3816+
1 => RAACommitmentOrder::RevokeAndACKFirst,
38473817
_ => return Err(DecodeError::InvalidValue),
38483818
};
38493819

3820+
let monitor_pending_revoke_and_ack = Readable::read(reader)?;
3821+
let monitor_pending_commitment_signed = Readable::read(reader)?;
3822+
38503823
let monitor_pending_forwards_count: u64 = Readable::read(reader)?;
38513824
let mut monitor_pending_forwards = Vec::with_capacity(cmp::min(monitor_pending_forwards_count as usize, OUR_MAX_HTLCS as usize));
38523825
for _ in 0..monitor_pending_forwards_count {
@@ -3933,14 +3906,14 @@ impl<R : ::std::io::Read> ReadableArgs<R, Arc<Logger>> for Channel {
39333906
cur_remote_commitment_transaction_number,
39343907
value_to_self_msat,
39353908

3936-
received_commitment_while_awaiting_raa,
39373909
pending_inbound_htlcs,
39383910
pending_outbound_htlcs,
39393911
holding_cell_htlc_updates,
39403912

3913+
resend_order,
3914+
39413915
monitor_pending_revoke_and_ack,
39423916
monitor_pending_commitment_signed,
3943-
monitor_pending_order,
39443917
monitor_pending_forwards,
39453918
monitor_pending_failures,
39463919

src/ln/channelmanager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ macro_rules! handle_monitor_err {
494494
if !$resend_raa {
495495
debug_assert!($action_type == RAACommitmentOrder::CommitmentFirst || !$resend_commitment);
496496
}
497-
$entry.get_mut().monitor_update_failed($action_type, $resend_raa, $resend_commitment, $failed_forwards, $failed_fails);
497+
$entry.get_mut().monitor_update_failed($resend_raa, $resend_commitment, $failed_forwards, $failed_fails);
498498
Err(MsgHandleErrInternal::from_chan_no_close(ChannelError::Ignore("Failed to update ChannelMonitor"), *$entry.key()))
499499
},
500500
}

0 commit comments

Comments
 (0)