Skip to content

Commit fe3a962

Browse files
committed
Disconnect peers on timer ticks to unblock channel state machine
At times, we've noticed that channels with `lnd` counterparties do not receive messages we expect to in a timely manner (or at all) after sending them a `ChannelReestablish` upon reconnection, or a `CommitmentSigned` message. This can block the channel state machine from making progress, eventually leading to force closes, if any pending HTLCs are committed and their expiration is met. It seems common wisdom for `lnd` node operators to periodically restart their node/reconnect to their peers, allowing them to start from a fresh state such that the message we expect to receive hopefully gets sent. We can achieve the same end result by disconnecting peers ourselves (regardless of whether they're a `lnd` node), which we opt to implement here by disconnecting a peer if we have not received the expected response within two timer ticks of sending our own message.
1 parent 7ea4d39 commit fe3a962

File tree

3 files changed

+198
-2
lines changed

3 files changed

+198
-2
lines changed

lightning/src/ln/channel.rs

+56-1
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ enum HTLCUpdateAwaitingACK {
251251
/// Note that PeerDisconnected can be set on both ChannelReady and FundingSent.
252252
/// ChannelReady can then get all remaining flags set on it, until we finish shutdown, then we
253253
/// move on to ShutdownComplete, at which point most calls into this channel are disallowed.
254+
#[derive(Copy, Clone)]
254255
enum ChannelState {
255256
/// Implies we have (or are prepared to) send our open_channel/accept_channel message
256257
OurInitSent = 1 << 0,
@@ -479,6 +480,13 @@ pub(crate) const MIN_AFFORDABLE_HTLC_COUNT: usize = 4;
479480
/// * `EXPIRE_PREV_CONFIG_TICKS` = convergence_delay / tick_interval
480481
pub(crate) const EXPIRE_PREV_CONFIG_TICKS: usize = 5;
481482

483+
/// The number of ticks that may elapse while we're waiting for a response to a
/// [`msgs::RevokeAndACK`] or [`msgs::ChannelReestablish`] message before we attempt to disconnect
/// the peer.
///
/// See [`Channel::sent_message_awaiting_response`] for more information.
pub(crate) const DISCONNECT_PEER_AWAITING_RESPONSE_TICKS: usize = 2;
489+
482490
struct PendingChannelMonitorUpdate {
483491
update: ChannelMonitorUpdate,
484492
/// In some cases we need to delay letting the [`ChannelMonitorUpdate`] go until after an
@@ -715,6 +723,19 @@ pub(super) struct Channel<Signer: ChannelSigner> {
715723
/// See-also <https://github.com/lightningnetwork/lnd/issues/4006>
716724
pub workaround_lnd_bug_4006: Option<msgs::ChannelReady>,
717725

726+
/// An option set when we wish to track how many ticks have elapsed while waiting for a response
727+
/// from our counterparty after sending a message. If the peer has yet to respond after reaching
728+
/// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`, a reconnection should be attempted to try to
729+
/// unblock the state machine.
730+
///
731+
/// This behavior is mostly motivated by a few lnd bugs (such as the one outlined in
732+
/// `workaround_lnd_bug_4006`), in which we don't receive a message we expect to in a timely
733+
/// manner, which may lead to channels becoming unusable and/or force-closed.
734+
///
735+
/// This is currently only used when waiting for a [`msgs::ChannelReestablish`] or
736+
/// [`msgs::RevokeAndACK`] message from the counterparty.
737+
sent_message_awaiting_response: Option<usize>,
738+
718739
#[cfg(any(test, fuzzing))]
719740
// When we receive an HTLC fulfill on an outbound path, we may immediately fulfill the
720741
// corresponding HTLC on the inbound path. If, then, the outbound path channel is
@@ -1130,6 +1151,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
11301151
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
11311152

11321153
workaround_lnd_bug_4006: None,
1154+
sent_message_awaiting_response: None,
11331155

11341156
latest_inbound_scid_alias: None,
11351157
outbound_scid_alias,
@@ -1489,6 +1511,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
14891511
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
14901512

14911513
workaround_lnd_bug_4006: None,
1514+
sent_message_awaiting_response: None,
14921515

14931516
latest_inbound_scid_alias: None,
14941517
outbound_scid_alias,
@@ -3526,6 +3549,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
35263549
// OK, we step the channel here and *then* if the new generation fails we can fail the
35273550
// channel based on that, but stepping stuff here should be safe either way.
35283551
self.channel_state &= !(ChannelState::AwaitingRemoteRevoke as u32);
3552+
self.sent_message_awaiting_response = None;
35293553
self.counterparty_prev_commitment_point = self.counterparty_cur_commitment_point;
35303554
self.counterparty_cur_commitment_point = Some(msg.next_per_commitment_point);
35313555
self.cur_counterparty_commitment_transaction_number -= 1;
@@ -3841,6 +3865,8 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
38413865
}
38423866
}
38433867

3868+
self.sent_message_awaiting_response = None;
3869+
38443870
self.channel_state |= ChannelState::PeerDisconnected as u32;
38453871
log_trace!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
38463872
}
@@ -3943,6 +3969,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
39433969
Some(self.get_last_revoke_and_ack())
39443970
} else { None };
39453971
let commitment_update = if self.monitor_pending_commitment_signed {
3972+
self.mark_awaiting_response();
39463973
Some(self.get_last_commitment_update(logger))
39473974
} else { None };
39483975

@@ -4132,6 +4159,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
41324159
// Go ahead and unmark PeerDisconnected as various calls we may make check for it (and all
41334160
// remaining cases either succeed or ErrorMessage-fail).
41344161
self.channel_state &= !(ChannelState::PeerDisconnected as u32);
4162+
self.sent_message_awaiting_response = None;
41354163

41364164
let shutdown_msg = if self.channel_state & (ChannelState::LocalShutdownSent as u32) != 0 {
41374165
assert!(self.shutdown_scriptpubkey.is_some());
@@ -4192,7 +4220,11 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
41924220
// revoke_and_ack, not on sending commitment_signed, so we add one if have
41934221
// AwaitingRemoteRevoke set, which indicates we sent a commitment_signed but haven't gotten
41944222
// the corresponding revoke_and_ack back yet.
4195-
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if (self.channel_state & ChannelState::AwaitingRemoteRevoke as u32) != 0 { 1 } else { 0 };
4223+
let is_awaiting_remote_revoke = self.channel_state & ChannelState::AwaitingRemoteRevoke as u32 != 0;
4224+
if is_awaiting_remote_revoke && !self.is_awaiting_monitor_update() {
4225+
self.mark_awaiting_response();
4226+
}
4227+
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if is_awaiting_remote_revoke { 1 } else { 0 };
41964228

41974229
let channel_ready = if msg.next_local_commitment_number == 1 && INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number == 1 {
41984230
// We should never have to worry about MonitorUpdateInProgress resending ChannelReady
@@ -4361,6 +4393,28 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
43614393
}), None))
43624394
}
43634395

4396+
// Marks a channel as waiting for a response from the counterparty. If it's not received
4397+
// [`DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`] after sending our own to them, then we'll attempt
4398+
// a reconnection.
4399+
pub fn mark_awaiting_response(&mut self) {
4400+
self.sent_message_awaiting_response = Some(0);
4401+
}
4402+
4403+
/// Determines whether we should disconnect the counterparty due to not receiving a response
4404+
/// within our expected timeframe.
4405+
///
4406+
/// This should be called on every [`super::channelmanager::ChannelManager::timer_tick_occurred`].
4407+
pub fn should_disconnect_peer_awaiting_response(&mut self) -> bool {
4408+
let ticks_elapsed = if let Some(ticks_elapsed) = self.sent_message_awaiting_response.as_mut() {
4409+
ticks_elapsed
4410+
} else {
4411+
// Don't disconnect when we're not waiting on a response.
4412+
return false;
4413+
};
4414+
*ticks_elapsed += 1;
4415+
*ticks_elapsed >= DISCONNECT_PEER_AWAITING_RESPONSE_TICKS
4416+
}
4417+
43644418
pub fn shutdown<SP: Deref>(
43654419
&mut self, signer_provider: &SP, their_features: &InitFeatures, msg: &msgs::Shutdown
43664420
) -> Result<(Option<msgs::Shutdown>, Option<&ChannelMonitorUpdate>, Vec<(HTLCSource, PaymentHash)>), ChannelError>
@@ -7090,6 +7144,7 @@ impl<'a, 'b, 'c, ES: Deref, SP: Deref> ReadableArgs<(&'a ES, &'b SP, u32, &'c Ch
70907144
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
70917145

70927146
workaround_lnd_bug_4006: None,
7147+
sent_message_awaiting_response: None,
70937148

70947149
latest_inbound_scid_alias,
70957150
// Later in the ChannelManager deserialization phase we scan for channels and assign scid aliases if its missing

lightning/src/ln/channelmanager.rs

+16
Original file line numberDiff line numberDiff line change
@@ -3869,6 +3869,7 @@ where
38693869
for (counterparty_node_id, peer_state_mutex) in per_peer_state.iter() {
38703870
let mut peer_state_lock = peer_state_mutex.lock().unwrap();
38713871
let peer_state = &mut *peer_state_lock;
3872+
let is_peer_connected = peer_state.is_connected;
38723873
let pending_msg_events = &mut peer_state.pending_msg_events;
38733874
let counterparty_node_id = *counterparty_node_id;
38743875
peer_state.channel_by_id.retain(|chan_id, chan| {
@@ -3921,6 +3922,20 @@ where
39213922

39223923
chan.maybe_expire_prev_config();
39233924

3925+
if is_peer_connected && chan.should_disconnect_peer_awaiting_response() {
3926+
log_debug!(self.logger, "Disconnecting peer {} due to not making any progress on channel {}",
3927+
counterparty_node_id, log_bytes!(*chan_id));
3928+
pending_msg_events.push(MessageSendEvent::HandleError {
3929+
node_id: counterparty_node_id,
3930+
action: msgs::ErrorAction::DisconnectPeerWithWarning {
3931+
msg: msgs::WarningMessage {
3932+
channel_id: *chan_id,
3933+
data: "Disconnecting due to timeout awaiting response".to_owned(),
3934+
},
3935+
},
3936+
});
3937+
}
3938+
39243939
true
39253940
});
39263941
if peer_state.ok_to_remove(true) {
@@ -6734,6 +6749,7 @@ where
67346749
node_id: chan.get_counterparty_node_id(),
67356750
msg: chan.get_channel_reestablish(&self.logger),
67366751
});
6752+
chan.mark_awaiting_response();
67376753
true
67386754
}
67396755
} else { true };

lightning/src/ln/functional_tests.rs

+126-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use crate::events::{Event, MessageSendEvent, MessageSendEventsProvider, PathFail
2222
use crate::ln::{PaymentPreimage, PaymentSecret, PaymentHash};
2323
use crate::ln::channel::{commitment_tx_base_weight, COMMITMENT_TX_WEIGHT_PER_HTLC, CONCURRENT_INBOUND_HTLC_FEE_BUFFER, FEE_SPIKE_BUFFER_FEE_INCREASE_MULTIPLE, MIN_AFFORDABLE_HTLC_COUNT};
2424
use crate::ln::channelmanager::{self, PaymentId, RAACommitmentOrder, PaymentSendFailure, RecipientOnionFields, BREAKDOWN_TIMEOUT, ENABLE_GOSSIP_TICKS, DISABLE_GOSSIP_TICKS, MIN_CLTV_EXPIRY_DELTA};
25-
use crate::ln::channel::{Channel, ChannelError};
25+
use crate::ln::channel::{DISCONNECT_PEER_AWAITING_RESPONSE_TICKS, Channel, ChannelError};
2626
use crate::ln::{chan_utils, onion_utils};
2727
use crate::ln::chan_utils::{OFFERED_HTLC_SCRIPT_WEIGHT, htlc_success_tx_weight, htlc_timeout_tx_weight, HTLCOutputInCommitment};
2828
use crate::routing::gossip::{NetworkGraph, NetworkUpdate};
@@ -9955,3 +9955,128 @@ fn test_payment_with_custom_min_cltv_expiry_delta() {
99559955
do_payment_with_custom_min_final_cltv_expiry(true, false);
99569956
do_payment_with_custom_min_final_cltv_expiry(true, true);
99579957
}
9958+
9959+
#[test]
fn test_disconnects_peer_awaiting_response_ticks() {
	// Tests that nodes which are awaiting on a response critical for channel responsiveness
	// disconnect their counterparty after `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	let mut chanmon_cfgs = create_chanmon_cfgs(2);
	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
	let nodes = create_network(2, &node_cfgs, &node_chanmgrs);

	// Asserts a disconnect event is queued to the user.
	let check_disconnect_event = |node: &Node, should_disconnect: bool| {
		let disconnect_event = node.node.get_and_clear_pending_msg_events().iter().find_map(|event|
			if let MessageSendEvent::HandleError { action, .. } = event {
				if let msgs::ErrorAction::DisconnectPeerWithWarning { .. } = action {
					Some(())
				} else {
					None
				}
			} else {
				None
			}
		);
		assert_eq!(disconnect_event.is_some(), should_disconnect);
	};

	// Fires timer ticks ensuring we only attempt to disconnect peers after reaching
	// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	let check_disconnect = |node: &Node| {
		// No disconnect without any timer ticks.
		check_disconnect_event(node, false);

		// No disconnect with 1 timer tick less than required.
		for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS - 1 {
			node.node.timer_tick_occurred();
			check_disconnect_event(node, false);
		}

		// Disconnect after reaching the required ticks.
		node.node.timer_tick_occurred();
		check_disconnect_event(node, true);

		// Disconnect again on the next tick if the peer hasn't been disconnected yet.
		node.node.timer_tick_occurred();
		check_disconnect_event(node, true);
	};

	create_chan_between_nodes(&nodes[0], &nodes[1]);

	// We'll start by performing a fee update with Alice (nodes[0]) on the channel.
	// Doubling the fee estimator's rate makes the next timer tick queue an `update_fee`.
	*nodes[0].fee_estimator.sat_per_kw.lock().unwrap() *= 2;
	nodes[0].node.timer_tick_occurred();
	check_added_monitors!(&nodes[0], 1);
	let alice_fee_update = get_htlc_update_msgs(&nodes[0], &nodes[1].node.get_our_node_id());
	nodes[1].node.handle_update_fee(&nodes[0].node.get_our_node_id(), alice_fee_update.update_fee.as_ref().unwrap());
	nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &alice_fee_update.commitment_signed);
	check_added_monitors!(&nodes[1], 1);

	// This will prompt Bob (nodes[1]) to respond with his `CommitmentSigned` and `RevokeAndACK`.
	let (bob_revoke_and_ack, bob_commitment_signed) = get_revoke_commit_msgs!(&nodes[1], nodes[0].node.get_our_node_id());
	nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bob_revoke_and_ack);
	check_added_monitors!(&nodes[0], 1);
	nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bob_commitment_signed);
	check_added_monitors(&nodes[0], 1);

	// Alice then needs to send her final `RevokeAndACK` to complete the commitment dance. We
	// pretend Bob hasn't received the message and check whether he'll disconnect Alice after
	// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	let alice_revoke_and_ack = get_event_msg!(nodes[0], MessageSendEvent::SendRevokeAndACK, nodes[1].node.get_our_node_id());
	check_disconnect(&nodes[1]);

	// Now, we'll reconnect them to test awaiting a `ChannelReestablish` message.
	//
	// Note that since the commitment dance didn't complete above, Alice is expected to resend her
	// final `RevokeAndACK` to Bob to complete it.
	nodes[0].node.peer_disconnected(&nodes[1].node.get_our_node_id());
	nodes[1].node.peer_disconnected(&nodes[0].node.get_our_node_id());
	let bob_init = msgs::Init { features: nodes[1].node.init_features(), remote_network_address: None };
	nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &bob_init, true).unwrap();
	let alice_init = msgs::Init { features: nodes[0].node.init_features(), remote_network_address: None };
	nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &alice_init, true).unwrap();

	// Upon reconnection, Alice sends her `ChannelReestablish` to Bob. Alice, however, hasn't
	// received Bob's yet, so she should disconnect him after reaching
	// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	let alice_channel_reestablish = get_event_msg!(
		nodes[0], MessageSendEvent::SendChannelReestablish, nodes[1].node.get_our_node_id()
	);
	nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &alice_channel_reestablish);
	check_disconnect(&nodes[0]);

	// Bob now sends his `ChannelReestablish` to Alice to resume the channel and consider it "live".
	let bob_channel_reestablish = nodes[1].node.get_and_clear_pending_msg_events().iter().find_map(|event|
		if let MessageSendEvent::SendChannelReestablish { node_id, msg } = event {
			assert_eq!(*node_id, nodes[0].node.get_our_node_id());
			Some(msg.clone())
		} else {
			None
		}
	).unwrap();
	nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &bob_channel_reestablish);

	// Sanity check that Alice won't disconnect Bob since she's no longer waiting for any messages.
	for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
		nodes[0].node.timer_tick_occurred();
		check_disconnect_event(&nodes[0], false);
	}

	// However, Bob is still waiting on Alice's `RevokeAndACK`, so he should disconnect her after
	// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	check_disconnect(&nodes[1]);

	// Finally, have Bob process the last message.
	nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &alice_revoke_and_ack);
	check_added_monitors(&nodes[1], 1);

	// At this point, neither node should attempt to disconnect each other, since they aren't
	// waiting on any messages.
	for node in &nodes {
		for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
			node.node.timer_tick_occurred();
			check_disconnect_event(node, false);
		}
	}
}

0 commit comments

Comments
 (0)