Commit 6353683
Disconnect peers on timer ticks to unblock channel state machine
At times, we've noticed that channels with `lnd` counterparties do not receive the messages we expect in a timely manner (or at all) after we send them a `ChannelReestablish` upon reconnection or a `CommitmentSigned` message. This can block the channel state machine from making progress, eventually leading to force closes if any pending HTLCs are committed and their expiration is met.

It seems to be common wisdom for `lnd` node operators to periodically restart their node/reconnect to their peers, allowing them to start from a fresh state such that the message we expect to receive hopefully gets sent. We can achieve the same end result by disconnecting peers ourselves (regardless of whether they're `lnd` nodes), which we opt to implement here by disconnecting a peer once two timer ticks elapse without receiving its response.
1 parent d7f6e34 commit 6353683
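To make the new behavior concrete before the diff, here is a standalone sketch of the tick-counting pattern this commit introduces. The `AwaitedState` and `ChannelTimer` types are simplified stand-ins invented for illustration; in the actual change the counter lives on `Channel` as `sent_message_awaiting_response` and is advanced from `ChannelManager::timer_tick_occurred`.

const DISCONNECT_PEER_AWAITING_RESPONSE_TICKS: usize = 2;

#[allow(dead_code)]
#[derive(Copy, Clone, PartialEq)]
enum AwaitedState { PeerDisconnected, AwaitingRemoteRevoke }

struct ChannelTimer {
	// Set when we send a message that requires a response: the state we expect to have
	// left once the response arrives, plus the number of ticks elapsed since sending.
	sent_message_awaiting_response: Option<(AwaitedState, usize)>,
	current_state: AwaitedState,
}

impl ChannelTimer {
	// Called once per timer tick; returns true once the peer should be disconnected.
	fn should_disconnect_peer_awaiting_response(&mut self) -> bool {
		let (state, ticks_elapsed) = match self.sent_message_awaiting_response.as_mut() {
			Some(entry) => entry,
			// Not waiting on any response, so never disconnect.
			None => return false,
		};
		*ticks_elapsed += 1;
		if *ticks_elapsed < DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
			return false;
		}
		// Still stuck in the state the response should have cleared: disconnect.
		self.current_state == *state
	}
}

fn main() {
	let mut chan = ChannelTimer {
		sent_message_awaiting_response: Some((AwaitedState::AwaitingRemoteRevoke, 0)),
		current_state: AwaitedState::AwaitingRemoteRevoke,
	};
	assert!(!chan.should_disconnect_peer_awaiting_response()); // tick 1: keep waiting
	assert!(chan.should_disconnect_peer_awaiting_response()); // tick 2: disconnect
}

Requiring two ticks guarantees the peer at least one full timer interval to respond before we give up and disconnect.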

3 files changed (+206 -2 lines)


lightning/src/ln/channel.rs

+68 -1
@@ -251,6 +251,7 @@ enum HTLCUpdateAwaitingACK {
 /// Note that PeerDisconnected can be set on both ChannelReady and FundingSent.
 /// ChannelReady can then get all remaining flags set on it, until we finish shutdown, then we
 /// move on to ShutdownComplete, at which point most calls into this channel are disallowed.
+#[derive(Copy, Clone)]
 enum ChannelState {
 	/// Implies we have (or are prepared to) send our open_channel/accept_channel message
 	OurInitSent = 1 << 0,
@@ -479,6 +480,13 @@ pub(crate) const MIN_AFFORDABLE_HTLC_COUNT: usize = 4;
 /// * `EXPIRE_PREV_CONFIG_TICKS` = convergence_delay / tick_interval
 pub(crate) const EXPIRE_PREV_CONFIG_TICKS: usize = 5;
 
+/// The number of ticks that may elapse while we're waiting for a response to a
+/// [`msgs::RevokeAndACK`] or [`msgs::ChannelReestablish`] message before we attempt to disconnect
+/// them.
+///
+/// See [`Channel::sent_message_awaiting_response`] for more information.
+pub(crate) const DISCONNECT_PEER_AWAITING_RESPONSE_TICKS: usize = 2;
+
 struct PendingChannelMonitorUpdate {
 	update: ChannelMonitorUpdate,
 	/// In some cases we need to delay letting the [`ChannelMonitorUpdate`] go until after an
@@ -715,6 +723,18 @@ pub(super) struct Channel<Signer: ChannelSigner> {
 	/// See-also <https://github.com/lightningnetwork/lnd/issues/4006>
 	pub workaround_lnd_bug_4006: Option<msgs::ChannelReady>,
 
+	/// An option set when we wish to track how many ticks we've remained in the same state for
+	/// after sending a message which requires a response. The first element in the tuple tracks the
+	/// state which we should no longer be in after receiving the response to the message we sent.
+	/// The second element tracks the number of ticks that have elapsed since the message was sent.
+	/// If the peer has yet to respond after reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`, a
+	/// reconnection should be attempted to try to unblock the state machine.
+	///
+	/// This behavior is mostly motivated by a few lnd bugs (such as the one outlined in
+	/// `workaround_lnd_bug_4006`), in which we don't receive a message we expect to in a timely
+	/// manner, which may lead to channels becoming unusable and/or force-closed.
+	sent_message_awaiting_response: Option<(ChannelState, usize)>,
+
 	#[cfg(any(test, fuzzing))]
 	// When we receive an HTLC fulfill on an outbound path, we may immediately fulfill the
 	// corresponding HTLC on the inbound path. If, then, the outbound path channel is
@@ -1130,6 +1150,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 			next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
 
 			workaround_lnd_bug_4006: None,
+			sent_message_awaiting_response: None,
 
 			latest_inbound_scid_alias: None,
 			outbound_scid_alias,
@@ -1489,6 +1510,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 			next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
 
 			workaround_lnd_bug_4006: None,
+			sent_message_awaiting_response: None,
 
 			latest_inbound_scid_alias: None,
 			outbound_scid_alias,
@@ -3526,6 +3548,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 		// OK, we step the channel here and *then* if the new generation fails we can fail the
 		// channel based on that, but stepping stuff here should be safe either way.
 		self.channel_state &= !(ChannelState::AwaitingRemoteRevoke as u32);
+		self.sent_message_awaiting_response = None;
 		self.counterparty_prev_commitment_point = self.counterparty_cur_commitment_point;
 		self.counterparty_cur_commitment_point = Some(msg.next_per_commitment_point);
 		self.cur_counterparty_commitment_transaction_number -= 1;
@@ -3841,6 +3864,8 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 			}
 		}
 
+		self.sent_message_awaiting_response = None;
+
 		self.channel_state |= ChannelState::PeerDisconnected as u32;
 		log_trace!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
 	}
@@ -3943,6 +3968,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 			Some(self.get_last_revoke_and_ack())
 		} else { None };
 		let commitment_update = if self.monitor_pending_commitment_signed {
+			self.mark_awaiting_remote_revoke();
 			Some(self.get_last_commitment_update(logger))
 		} else { None };
 
@@ -4132,6 +4158,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 		// Go ahead and unmark PeerDisconnected as various calls we may make check for it (and all
 		// remaining cases either succeed or ErrorMessage-fail).
 		self.channel_state &= !(ChannelState::PeerDisconnected as u32);
+		self.sent_message_awaiting_response = None;
 
 		let shutdown_msg = if self.channel_state & (ChannelState::LocalShutdownSent as u32) != 0 {
 			assert!(self.shutdown_scriptpubkey.is_some());
@@ -4192,7 +4219,11 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 		// revoke_and_ack, not on sending commitment_signed, so we add one if have
 		// AwaitingRemoteRevoke set, which indicates we sent a commitment_signed but haven't gotten
 		// the corresponding revoke_and_ack back yet.
-		let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if (self.channel_state & ChannelState::AwaitingRemoteRevoke as u32) != 0 { 1 } else { 0 };
+		let is_awaiting_remote_revoke = self.channel_state & ChannelState::AwaitingRemoteRevoke as u32 != 0;
+		if is_awaiting_remote_revoke {
+			self.mark_awaiting_remote_revoke();
+		}
+		let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if is_awaiting_remote_revoke { 1 } else { 0 };
 
 		let channel_ready = if msg.next_local_commitment_number == 1 && INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number == 1 {
 			// We should never have to worry about MonitorUpdateInProgress resending ChannelReady
@@ -4361,6 +4392,41 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
 		}), None))
 	}
 
+	// Marks the channels as waiting for the counterparty's [`msgs::ChannelReestablish`] message.
+	// If it's not received [`DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`] after sending our own
+	// to them, then we'll attempt a reconnection.
+	pub fn mark_awaiting_channel_reestablish(&mut self) {
+		self.sent_message_awaiting_response = Some((ChannelState::PeerDisconnected, 0));
+	}
+
+	// Marks the channels as waiting for the counterparty's [`msgs::RevokeAndACK`] message.
+	// If it's not received [`DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`] after sending our own
+	// to them, then we'll attempt a reconnection.
+	fn mark_awaiting_remote_revoke(&mut self) {
+		self.sent_message_awaiting_response = Some((ChannelState::AwaitingRemoteRevoke, 0));
+	}
+
+	/// Determines whether we should disconnect the counterparty due to not receiving a response
+	/// within our expected timeframe.
+	///
+	/// This should be called on every [`super::channelmanager::ChannelManager::timer_tick_occurred`].
+	pub fn should_disconnect_peer_awaiting_response(&mut self) -> bool {
+		let (state, ticks_elapsed) = if let Some(ticks_elapsed) = self.sent_message_awaiting_response.as_mut() {
+			ticks_elapsed
+		} else {
+			// Don't disconnect when we're not waiting on a response.
+			return false;
+		};
+		*ticks_elapsed += 1;
+		if *ticks_elapsed < DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
+			// Don't disconnect yet, they still have time left.
+			return false;
+		}
+		// `state` represents the `ChannelState` variant that should no longer be present after
+		// processing the counterparty's response.
+		self.channel_state & *state as u32 == *state as u32
+	}
+
 	pub fn shutdown<SP: Deref>(
 		&mut self, signer_provider: &SP, their_features: &InitFeatures, msg: &msgs::Shutdown
 	) -> Result<(Option<msgs::Shutdown>, Option<&ChannelMonitorUpdate>, Vec<(HTLCSource, PaymentHash)>), ChannelError>
@@ -7090,6 +7156,7 @@ impl<'a, 'b, 'c, ES: Deref, SP: Deref> ReadableArgs<(&'a ES, &'b SP, u32, &'c Ch
 			next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
 
 			workaround_lnd_bug_4006: None,
+			sent_message_awaiting_response: None,
 
 			latest_inbound_scid_alias,
 			// Later in the ChannelManager deserialization phase we scan for channels and assign scid aliases if its missing

lightning/src/ln/channelmanager.rs

+11
@@ -3869,6 +3869,7 @@ where
 			for (counterparty_node_id, peer_state_mutex) in per_peer_state.iter() {
 				let mut peer_state_lock = peer_state_mutex.lock().unwrap();
 				let peer_state = &mut *peer_state_lock;
+				let is_peer_connected = peer_state.is_connected;
 				let pending_msg_events = &mut peer_state.pending_msg_events;
 				let counterparty_node_id = *counterparty_node_id;
 				peer_state.channel_by_id.retain(|chan_id, chan| {
@@ -3921,6 +3922,15 @@ where
 
 					chan.maybe_expire_prev_config();
 
+					if is_peer_connected && chan.should_disconnect_peer_awaiting_response() {
+						log_debug!(self.logger, "Disconnecting peer {} due to not making any progress on channel {}",
+							counterparty_node_id, log_bytes!(*chan_id));
+						pending_msg_events.push(MessageSendEvent::HandleError {
+							node_id: counterparty_node_id,
+							action: msgs::ErrorAction::DisconnectPeer { msg: None },
+						});
+					}
+
 					true
 				});
 				if peer_state.ok_to_remove(true) {
@@ -6734,6 +6744,7 @@ where
 							node_id: chan.get_counterparty_node_id(),
 							msg: chan.get_channel_reestablish(&self.logger),
 						});
+						chan.mark_awaiting_channel_reestablish();
 						true
 					}
 				} else { true };
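Note that this check only fires when `ChannelManager::timer_tick_occurred` is actually called. Below is a minimal sketch of a timer loop that could drive it; `spawn_timer_loop` is a hypothetical helper, the one-minute cadence follows that method's documented guidance, and in practice `lightning-background-processor` already does this for you (the closure would simply call `channel_manager.timer_tick_occurred()` on the application's `ChannelManager`).

use std::thread;
use std::time::Duration;

// Hypothetical helper: drives a tick callback roughly once per minute on a
// background thread. In a real node the callback calls timer_tick_occurred(),
// which advances the per-channel counters added in this commit and queues a
// DisconnectPeer event after two ticks without the awaited response.
fn spawn_timer_loop<F: FnMut() + Send + 'static>(mut tick: F) -> thread::JoinHandle<()> {
	thread::spawn(move || loop {
		tick();
		thread::sleep(Duration::from_secs(60));
	})
}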

lightning/src/ln/functional_tests.rs

+127 -1
@@ -22,7 +22,7 @@ use crate::events::{Event, MessageSendEvent, MessageSendEventsProvider, PathFail
 use crate::ln::{PaymentPreimage, PaymentSecret, PaymentHash};
 use crate::ln::channel::{commitment_tx_base_weight, COMMITMENT_TX_WEIGHT_PER_HTLC, CONCURRENT_INBOUND_HTLC_FEE_BUFFER, FEE_SPIKE_BUFFER_FEE_INCREASE_MULTIPLE, MIN_AFFORDABLE_HTLC_COUNT};
 use crate::ln::channelmanager::{self, PaymentId, RAACommitmentOrder, PaymentSendFailure, RecipientOnionFields, BREAKDOWN_TIMEOUT, ENABLE_GOSSIP_TICKS, DISABLE_GOSSIP_TICKS, MIN_CLTV_EXPIRY_DELTA};
-use crate::ln::channel::{Channel, ChannelError};
+use crate::ln::channel::{DISCONNECT_PEER_AWAITING_RESPONSE_TICKS, Channel, ChannelError};
 use crate::ln::{chan_utils, onion_utils};
 use crate::ln::chan_utils::{OFFERED_HTLC_SCRIPT_WEIGHT, htlc_success_tx_weight, htlc_timeout_tx_weight, HTLCOutputInCommitment};
 use crate::routing::gossip::{NetworkGraph, NetworkUpdate};
@@ -9992,3 +9992,129 @@ fn test_payment_with_custom_min_cltv_expiry_delta() {
 	do_payment_with_custom_min_final_cltv_expiry(true, false);
 	do_payment_with_custom_min_final_cltv_expiry(true, true);
 }
+
+#[test]
+fn test_disconnects_peer_awaiting_response_ticks() {
+	// Tests that nodes which are awaiting on a response critical for channel responsiveness
+	// disconnect their counterparty after `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+	let mut chanmon_cfgs = create_chanmon_cfgs(2);
+	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+	let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+
+	// Asserts a disconnect event is queued to the user.
+	let check_disconnect_event = |node: &Node, should_disconnect: bool| {
+		let disconnect_event = node.node.get_and_clear_pending_msg_events().iter().find_map(|event|
+			if let MessageSendEvent::HandleError { action, .. } = event {
+				if let msgs::ErrorAction::DisconnectPeer { msg } = action {
+					assert!(msg.is_none());
+					Some(())
+				} else {
+					None
+				}
+			} else {
+				None
+			}
+		);
+		assert_eq!(disconnect_event.is_some(), should_disconnect);
+	};
+
+	// Fires timer ticks ensuring we only attempt to disconnect peers after reaching
+	// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+	let check_disconnect = |node: &Node| {
+		// No disconnect without any timer ticks.
+		check_disconnect_event(node, false);
+
+		// No disconnect with 1 timer tick less than required.
+		for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS - 1 {
+			node.node.timer_tick_occurred();
+			check_disconnect_event(node, false);
+		}
+
+		// Disconnect after reaching the required ticks.
+		node.node.timer_tick_occurred();
+		check_disconnect_event(node, true);
+
+		// Disconnect again on the next tick if the peer hasn't been disconnected yet.
+		node.node.timer_tick_occurred();
+		check_disconnect_event(node, true);
+	};
+
+	create_chan_between_nodes(&nodes[0], &nodes[1]);
+
+	// We'll start by performing a fee update with Alice (nodes[0]) on the channel.
+	*nodes[0].fee_estimator.sat_per_kw.lock().unwrap() *= 2;
+	nodes[0].node.timer_tick_occurred();
+	check_added_monitors!(&nodes[0], 1);
+	let alice_fee_update = get_htlc_update_msgs(&nodes[0], &nodes[1].node.get_our_node_id());
+	nodes[1].node.handle_update_fee(&nodes[0].node.get_our_node_id(), alice_fee_update.update_fee.as_ref().unwrap());
+	nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &alice_fee_update.commitment_signed);
+	check_added_monitors!(&nodes[1], 1);
+
+	// This will prompt Bob (nodes[1]) to respond with his `CommitmentSigned` and `RevokeAndACK`.
+	let (bob_revoke_and_ack, bob_commitment_signed) = get_revoke_commit_msgs!(&nodes[1], nodes[0].node.get_our_node_id());
+	nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bob_revoke_and_ack);
+	check_added_monitors!(&nodes[0], 1);
+	nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bob_commitment_signed);
+	check_added_monitors(&nodes[0], 1);
+
+	// Alice then needs to send her final `RevokeAndACK` to complete the commitment dance. We
+	// pretend Bob hasn't received the message and check whether he'll disconnect Alice after
+	// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+	let alice_revoke_and_ack = get_event_msg!(nodes[0], MessageSendEvent::SendRevokeAndACK, nodes[1].node.get_our_node_id());
+	check_disconnect(&nodes[1]);
+
+	// Now, we'll reconnect them to test awaiting a `ChannelReestablish` message.
+	//
+	// Note that since the commitment dance didn't complete above, Alice is expected to resend her
+	// final `RevokeAndACK` to Bob to complete it.
+	nodes[0].node.peer_disconnected(&nodes[1].node.get_our_node_id());
+	nodes[1].node.peer_disconnected(&nodes[0].node.get_our_node_id());
+	let bob_init = msgs::Init { features: nodes[1].node.init_features(), remote_network_address: None };
+	nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &bob_init, true).unwrap();
+	let alice_init = msgs::Init { features: nodes[0].node.init_features(), remote_network_address: None };
+	nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &alice_init, true).unwrap();
+
+	// Upon reconnection, Alice sends her `ChannelReestablish` to Bob. Alice, however, hasn't
+	// received Bob's yet, so she should disconnect him after reaching
+	// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+	let alice_channel_reestablish = get_event_msg!(
+		nodes[0], MessageSendEvent::SendChannelReestablish, nodes[1].node.get_our_node_id()
+	);
+	nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &alice_channel_reestablish);
+	check_disconnect(&nodes[0]);
+
+	// Bob now sends his `ChannelReestablish` to Alice to resume the channel and consider it "live".
+	let bob_channel_reestablish = nodes[1].node.get_and_clear_pending_msg_events().iter().find_map(|event|
+		if let MessageSendEvent::SendChannelReestablish { node_id, msg } = event {
+			assert_eq!(*node_id, nodes[0].node.get_our_node_id());
+			Some(msg.clone())
+		} else {
+			None
+		}
+	).unwrap();
+	nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &bob_channel_reestablish);
+
+	// Sanity check that Alice won't disconnect Bob since she's no longer waiting for any messages.
+	for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
+		nodes[0].node.timer_tick_occurred();
+		check_disconnect_event(&nodes[0], false);
+	}
+
+	// However, Bob is still waiting on Alice's `RevokeAndACK`, so he should disconnect her after
+	// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
+	check_disconnect(&nodes[1]);
+
+	// Finally, have Bob process the last message.
+	nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &alice_revoke_and_ack);
+	check_added_monitors(&nodes[1], 1);
+
+	// At this point, neither node should attempt to disconnect each other, since they aren't
+	// waiting on any messages.
+	for node in &nodes {
+		for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
+			node.node.timer_tick_occurred();
+			check_disconnect_event(node, false);
+		}
+	}
+}
