Skip to content

Commit 09c051f

Browse files
committed
Disconnect peers on timer ticks to unblock channel state machine
At times, we've noticed that channels with `lnd` counterparties do not receive messages we expect to in a timely manner (or at all) after sending them a `ChannelReestablish` upon reconnection, or a `CommitmentSigned` message. This can block the channel state machine from making progress, eventually leading to force closes, if any pending HTLCs are committed and their expiration is met. It seems to be common wisdom for `lnd` node operators to periodically restart their node/reconnect to their peers, allowing them to start from a fresh state such that the message we expect to receive hopefully gets sent. We can achieve the same end result by disconnecting peers ourselves (regardless of whether they're a `lnd` node), which we opt to implement here by disconnecting a peer if its expected response has not arrived within two timer ticks of sending our message.
1 parent 98768f2 commit 09c051f

File tree

3 files changed

+197
-3
lines changed

3 files changed

+197
-3
lines changed

lightning/src/ln/channel.rs

+57-2
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,13 @@ pub(crate) const MIN_AFFORDABLE_HTLC_COUNT: usize = 4;
479479
/// * `EXPIRE_PREV_CONFIG_TICKS` = convergence_delay / tick_interval
480480
pub(crate) const EXPIRE_PREV_CONFIG_TICKS: usize = 5;
481481

482+
/// The number of ticks that may elapse while we're waiting for a response to a
483+
/// [`msgs::RevokeAndACK`] or [`msgs::ChannelReestablish`] message before we attempt to disconnect
484+
/// them.
485+
///
486+
/// See [`Channel::sent_message_awaiting_response`] for more information.
487+
pub(crate) const DISCONNECT_PEER_AWAITING_RESPONSE_TICKS: usize = 2;
488+
482489
struct PendingChannelMonitorUpdate {
483490
update: ChannelMonitorUpdate,
484491
/// In some cases we need to delay letting the [`ChannelMonitorUpdate`] go until after an
@@ -715,6 +722,19 @@ pub(super) struct Channel<Signer: ChannelSigner> {
715722
/// See-also <https://github.com/lightningnetwork/lnd/issues/4006>
716723
pub workaround_lnd_bug_4006: Option<msgs::ChannelReady>,
717724

725+
/// An option set when we wish to track how many ticks have elapsed while waiting for a response
726+
/// from our counterparty after sending a message. If the peer has yet to respond after reaching
727+
/// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`, a reconnection should be attempted to try to
728+
/// unblock the state machine.
729+
///
730+
/// This behavior is mostly motivated by a lnd bug in which we don't receive a message we expect
731+
/// to in a timely manner, which may lead to channels becoming unusable and/or force-closed. An
732+
/// example of such can be found at <https://github.com/lightningnetwork/lnd/issues/7682>.
733+
///
734+
/// This is currently only used when waiting for a [`msgs::ChannelReestablish`] or
735+
/// [`msgs::RevokeAndACK`] message from the counterparty.
736+
sent_message_awaiting_response: Option<usize>,
737+
718738
#[cfg(any(test, fuzzing))]
719739
// When we receive an HTLC fulfill on an outbound path, we may immediately fulfill the
720740
// corresponding HTLC on the inbound path. If, then, the outbound path channel is
@@ -1130,6 +1150,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
11301150
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
11311151

11321152
workaround_lnd_bug_4006: None,
1153+
sent_message_awaiting_response: None,
11331154

11341155
latest_inbound_scid_alias: None,
11351156
outbound_scid_alias,
@@ -1489,6 +1510,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
14891510
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
14901511

14911512
workaround_lnd_bug_4006: None,
1513+
sent_message_awaiting_response: None,
14921514

14931515
latest_inbound_scid_alias: None,
14941516
outbound_scid_alias,
@@ -3526,6 +3548,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
35263548
// OK, we step the channel here and *then* if the new generation fails we can fail the
35273549
// channel based on that, but stepping stuff here should be safe either way.
35283550
self.channel_state &= !(ChannelState::AwaitingRemoteRevoke as u32);
3551+
self.sent_message_awaiting_response = None;
35293552
self.counterparty_prev_commitment_point = self.counterparty_cur_commitment_point;
35303553
self.counterparty_cur_commitment_point = Some(msg.next_per_commitment_point);
35313554
self.cur_counterparty_commitment_transaction_number -= 1;
@@ -3841,6 +3864,8 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
38413864
}
38423865
}
38433866

3867+
self.sent_message_awaiting_response = None;
3868+
38443869
self.channel_state |= ChannelState::PeerDisconnected as u32;
38453870
log_trace!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
38463871
}
@@ -3943,6 +3968,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
39433968
Some(self.get_last_revoke_and_ack())
39443969
} else { None };
39453970
let commitment_update = if self.monitor_pending_commitment_signed {
3971+
self.mark_awaiting_response();
39463972
Some(self.get_last_commitment_update(logger))
39473973
} else { None };
39483974

@@ -4132,6 +4158,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
41324158
// Go ahead and unmark PeerDisconnected as various calls we may make check for it (and all
41334159
// remaining cases either succeed or ErrorMessage-fail).
41344160
self.channel_state &= !(ChannelState::PeerDisconnected as u32);
4161+
self.sent_message_awaiting_response = None;
41354162

41364163
let shutdown_msg = if self.channel_state & (ChannelState::LocalShutdownSent as u32) != 0 {
41374164
assert!(self.shutdown_scriptpubkey.is_some());
@@ -4192,7 +4219,11 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
41924219
// revoke_and_ack, not on sending commitment_signed, so we add one if have
41934220
// AwaitingRemoteRevoke set, which indicates we sent a commitment_signed but haven't gotten
41944221
// the corresponding revoke_and_ack back yet.
4195-
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if (self.channel_state & ChannelState::AwaitingRemoteRevoke as u32) != 0 { 1 } else { 0 };
4222+
let is_awaiting_remote_revoke = self.channel_state & ChannelState::AwaitingRemoteRevoke as u32 != 0;
4223+
if is_awaiting_remote_revoke && !self.is_awaiting_monitor_update() {
4224+
self.mark_awaiting_response();
4225+
}
4226+
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if is_awaiting_remote_revoke { 1 } else { 0 };
41964227

41974228
let channel_ready = if msg.next_local_commitment_number == 1 && INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number == 1 {
41984229
// We should never have to worry about MonitorUpdateInProgress resending ChannelReady
@@ -4361,6 +4392,28 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
43614392
}), None))
43624393
}
43634394

4395+
// Marks a channel as waiting for a response from the counterparty. If it's not received
4396+
// [`DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`] after sending our own to them, then we'll attempt
4397+
// a reconnection.
4398+
fn mark_awaiting_response(&mut self) {
4399+
self.sent_message_awaiting_response = Some(0);
4400+
}
4401+
4402+
/// Determines whether we should disconnect the counterparty due to not receiving a response
4403+
/// within our expected timeframe.
4404+
///
4405+
/// This should be called on every [`super::channelmanager::ChannelManager::timer_tick_occurred`].
4406+
pub fn should_disconnect_peer_awaiting_response(&mut self) -> bool {
4407+
let ticks_elapsed = if let Some(ticks_elapsed) = self.sent_message_awaiting_response.as_mut() {
4408+
ticks_elapsed
4409+
} else {
4410+
// Don't disconnect when we're not waiting on a response.
4411+
return false;
4412+
};
4413+
*ticks_elapsed += 1;
4414+
*ticks_elapsed >= DISCONNECT_PEER_AWAITING_RESPONSE_TICKS
4415+
}
4416+
43644417
pub fn shutdown<SP: Deref>(
43654418
&mut self, signer_provider: &SP, their_features: &InitFeatures, msg: &msgs::Shutdown
43664419
) -> Result<(Option<msgs::Shutdown>, Option<&ChannelMonitorUpdate>, Vec<(HTLCSource, PaymentHash)>), ChannelError>
@@ -5733,7 +5786,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
57335786

57345787
/// May panic if called on a channel that wasn't immediately-previously
57355788
/// self.remove_uncommitted_htlcs_and_mark_paused()'d
5736-
pub fn get_channel_reestablish<L: Deref>(&self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
5789+
pub fn get_channel_reestablish<L: Deref>(&mut self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
57375790
assert_eq!(self.channel_state & ChannelState::PeerDisconnected as u32, ChannelState::PeerDisconnected as u32);
57385791
assert_ne!(self.cur_counterparty_commitment_transaction_number, INITIAL_COMMITMENT_NUMBER);
57395792
// Prior to static_remotekey, my_current_per_commitment_point was critical to claiming
@@ -5752,6 +5805,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
57525805
log_info!(logger, "Sending a data_loss_protect with no previous remote per_commitment_secret for channel {}", log_bytes!(self.channel_id()));
57535806
[0;32]
57545807
};
5808+
self.mark_awaiting_response();
57555809
msgs::ChannelReestablish {
57565810
channel_id: self.channel_id(),
57575811
// The protocol has two different commitment number concepts - the "commitment
@@ -7090,6 +7144,7 @@ impl<'a, 'b, 'c, ES: Deref, SP: Deref> ReadableArgs<(&'a ES, &'b SP, u32, &'c Ch
70907144
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
70917145

70927146
workaround_lnd_bug_4006: None,
7147+
sent_message_awaiting_response: None,
70937148

70947149
latest_inbound_scid_alias,
70957150
// Later in the ChannelManager deserialization phase we scan for channels and assign scid aliases if its missing

lightning/src/ln/channelmanager.rs

+14
Original file line numberDiff line numberDiff line change
@@ -3921,6 +3921,20 @@ where
39213921

39223922
chan.maybe_expire_prev_config();
39233923

3924+
if chan.should_disconnect_peer_awaiting_response() {
3925+
log_debug!(self.logger, "Disconnecting peer {} due to not making any progress on channel {}",
3926+
counterparty_node_id, log_bytes!(*chan_id));
3927+
pending_msg_events.push(MessageSendEvent::HandleError {
3928+
node_id: counterparty_node_id,
3929+
action: msgs::ErrorAction::DisconnectPeerWithWarning {
3930+
msg: msgs::WarningMessage {
3931+
channel_id: *chan_id,
3932+
data: "Disconnecting due to timeout awaiting response".to_owned(),
3933+
},
3934+
},
3935+
});
3936+
}
3937+
39243938
true
39253939
});
39263940
if peer_state.ok_to_remove(true) {

lightning/src/ln/functional_tests.rs

+126-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use crate::events::{Event, MessageSendEvent, MessageSendEventsProvider, PathFail
2222
use crate::ln::{PaymentPreimage, PaymentSecret, PaymentHash};
2323
use crate::ln::channel::{commitment_tx_base_weight, COMMITMENT_TX_WEIGHT_PER_HTLC, CONCURRENT_INBOUND_HTLC_FEE_BUFFER, FEE_SPIKE_BUFFER_FEE_INCREASE_MULTIPLE, MIN_AFFORDABLE_HTLC_COUNT};
2424
use crate::ln::channelmanager::{self, PaymentId, RAACommitmentOrder, PaymentSendFailure, RecipientOnionFields, BREAKDOWN_TIMEOUT, ENABLE_GOSSIP_TICKS, DISABLE_GOSSIP_TICKS, MIN_CLTV_EXPIRY_DELTA};
25-
use crate::ln::channel::{Channel, ChannelError};
25+
use crate::ln::channel::{DISCONNECT_PEER_AWAITING_RESPONSE_TICKS, Channel, ChannelError};
2626
use crate::ln::{chan_utils, onion_utils};
2727
use crate::ln::chan_utils::{OFFERED_HTLC_SCRIPT_WEIGHT, htlc_success_tx_weight, htlc_timeout_tx_weight, HTLCOutputInCommitment};
2828
use crate::routing::gossip::{NetworkGraph, NetworkUpdate};
@@ -9955,3 +9955,128 @@ fn test_payment_with_custom_min_cltv_expiry_delta() {
99559955
do_payment_with_custom_min_final_cltv_expiry(true, false);
99569956
do_payment_with_custom_min_final_cltv_expiry(true, true);
99579957
}
9958+
9959+
#[test]
9960+
fn test_disconnects_peer_awaiting_response_ticks() {
9961+
// Tests that nodes which are awaiting on a response critical for channel responsiveness
9962+
// disconnect their counterparty after `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
9963+
let mut chanmon_cfgs = create_chanmon_cfgs(2);
9964+
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
9965+
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
9966+
let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
9967+
9968+
// Asserts a disconnect event is queued to the user.
9969+
let check_disconnect_event = |node: &Node, should_disconnect: bool| {
9970+
let disconnect_event = node.node.get_and_clear_pending_msg_events().iter().find_map(|event|
9971+
if let MessageSendEvent::HandleError { action, .. } = event {
9972+
if let msgs::ErrorAction::DisconnectPeerWithWarning { .. } = action {
9973+
Some(())
9974+
} else {
9975+
None
9976+
}
9977+
} else {
9978+
None
9979+
}
9980+
);
9981+
assert_eq!(disconnect_event.is_some(), should_disconnect);
9982+
};
9983+
9984+
// Fires timer ticks ensuring we only attempt to disconnect peers after reaching
9985+
// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
9986+
let check_disconnect = |node: &Node| {
9987+
// No disconnect without any timer ticks.
9988+
check_disconnect_event(node, false);
9989+
9990+
// No disconnect with 1 timer tick less than required.
9991+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS - 1 {
9992+
node.node.timer_tick_occurred();
9993+
check_disconnect_event(node, false);
9994+
}
9995+
9996+
// Disconnect after reaching the required ticks.
9997+
node.node.timer_tick_occurred();
9998+
check_disconnect_event(node, true);
9999+
10000+
// Disconnect again on the next tick if the peer hasn't been disconnected yet.
10001+
node.node.timer_tick_occurred();
10002+
check_disconnect_event(node, true);
10003+
};
10004+
10005+
create_chan_between_nodes(&nodes[0], &nodes[1]);
10006+
10007+
// We'll start by performing a fee update with Alice (nodes[0]) on the channel.
10008+
*nodes[0].fee_estimator.sat_per_kw.lock().unwrap() *= 2;
10009+
nodes[0].node.timer_tick_occurred();
10010+
check_added_monitors!(&nodes[0], 1);
10011+
let alice_fee_update = get_htlc_update_msgs(&nodes[0], &nodes[1].node.get_our_node_id());
10012+
nodes[1].node.handle_update_fee(&nodes[0].node.get_our_node_id(), alice_fee_update.update_fee.as_ref().unwrap());
10013+
nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &alice_fee_update.commitment_signed);
10014+
check_added_monitors!(&nodes[1], 1);
10015+
10016+
// This will prompt Bob (nodes[1]) to respond with his `CommitmentSigned` and `RevokeAndACK`.
10017+
let (bob_revoke_and_ack, bob_commitment_signed) = get_revoke_commit_msgs!(&nodes[1], nodes[0].node.get_our_node_id());
10018+
nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bob_revoke_and_ack);
10019+
check_added_monitors!(&nodes[0], 1);
10020+
nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bob_commitment_signed);
10021+
check_added_monitors(&nodes[0], 1);
10022+
10023+
// Alice then needs to send her final `RevokeAndACK` to complete the commitment dance. We
10024+
// pretend Bob hasn't received the message and check whether he'll disconnect Alice after
10025+
// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
10026+
let alice_revoke_and_ack = get_event_msg!(nodes[0], MessageSendEvent::SendRevokeAndACK, nodes[1].node.get_our_node_id());
10027+
check_disconnect(&nodes[1]);
10028+
10029+
// Now, we'll reconnect them to test awaiting a `ChannelReestablish` message.
10030+
//
10031+
// Note that since the commitment dance didn't complete above, Alice is expected to resend her
10032+
// final `RevokeAndACK` to Bob to complete it.
10033+
nodes[0].node.peer_disconnected(&nodes[1].node.get_our_node_id());
10034+
nodes[1].node.peer_disconnected(&nodes[0].node.get_our_node_id());
10035+
let bob_init = msgs::Init { features: nodes[1].node.init_features(), remote_network_address: None };
10036+
nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &bob_init, true).unwrap();
10037+
let alice_init = msgs::Init { features: nodes[0].node.init_features(), remote_network_address: None };
10038+
nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &alice_init, true).unwrap();
10039+
10040+
// Upon reconnection, Alice sends her `ChannelReestablish` to Bob. Alice, however, hasn't
10041+
// received Bob's yet, so she should disconnect him after reaching
10042+
// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
10043+
let alice_channel_reestablish = get_event_msg!(
10044+
nodes[0], MessageSendEvent::SendChannelReestablish, nodes[1].node.get_our_node_id()
10045+
);
10046+
nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &alice_channel_reestablish);
10047+
check_disconnect(&nodes[0]);
10048+
10049+
// Bob now sends his `ChannelReestablish` to Alice to resume the channel and consider it "live".
10050+
let bob_channel_reestablish = nodes[1].node.get_and_clear_pending_msg_events().iter().find_map(|event|
10051+
if let MessageSendEvent::SendChannelReestablish { node_id, msg } = event {
10052+
assert_eq!(*node_id, nodes[0].node.get_our_node_id());
10053+
Some(msg.clone())
10054+
} else {
10055+
None
10056+
}
10057+
).unwrap();
10058+
nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &bob_channel_reestablish);
10059+
10060+
// Sanity check that Alice won't disconnect Bob since she's no longer waiting for any messages.
10061+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
10062+
nodes[0].node.timer_tick_occurred();
10063+
check_disconnect_event(&nodes[0], false);
10064+
}
10065+
10066+
// However, Bob is still waiting on Alice's `RevokeAndACK`, so he should disconnect her after
10067+
// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
10068+
check_disconnect(&nodes[1]);
10069+
10070+
// Finally, have Bob process the last message.
10071+
nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &alice_revoke_and_ack);
10072+
check_added_monitors(&nodes[1], 1);
10073+
10074+
// At this point, neither node should attempt to disconnect each other, since they aren't
10075+
// waiting on any messages.
10076+
for node in &nodes {
10077+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
10078+
node.node.timer_tick_occurred();
10079+
check_disconnect_event(node, false);
10080+
}
10081+
}
10082+
}

0 commit comments

Comments
 (0)