Skip to content

Commit eec5ec6

Browse files
authored
Merge pull request #2293 from wpaulino/disconnect-peers-timer-tick
Disconnect peers on timer ticks to unblock channel state machine
2 parents a9d49ae + 5bf7fac commit eec5ec6

File tree

6 files changed

+299
-23
lines changed

6 files changed

+299
-23
lines changed

lightning/src/ln/channel.rs

+57-2
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,13 @@ pub(crate) const MIN_AFFORDABLE_HTLC_COUNT: usize = 4;
479479
/// * `EXPIRE_PREV_CONFIG_TICKS` = convergence_delay / tick_interval
480480
pub(crate) const EXPIRE_PREV_CONFIG_TICKS: usize = 5;
481481

482+
/// The number of ticks that may elapse while we're waiting for a response to a
483+
/// [`msgs::RevokeAndACK`] or [`msgs::ChannelReestablish`] message before we attempt to disconnect
484+
/// them.
485+
///
486+
/// See [`Channel::sent_message_awaiting_response`] for more information.
487+
pub(crate) const DISCONNECT_PEER_AWAITING_RESPONSE_TICKS: usize = 2;
488+
482489
struct PendingChannelMonitorUpdate {
483490
update: ChannelMonitorUpdate,
484491
/// In some cases we need to delay letting the [`ChannelMonitorUpdate`] go until after an
@@ -715,6 +722,19 @@ pub(super) struct Channel<Signer: ChannelSigner> {
715722
/// See-also <https://github.com/lightningnetwork/lnd/issues/4006>
716723
pub workaround_lnd_bug_4006: Option<msgs::ChannelReady>,
717724

725+
/// An option set when we wish to track how many ticks have elapsed while waiting for a response
726+
/// from our counterparty after sending a message. If the peer has yet to respond after reaching
727+
/// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`, a reconnection should be attempted to try to
728+
/// unblock the state machine.
729+
///
730+
/// This behavior is mostly motivated by a lnd bug in which we don't receive a message we expect
731+
/// to in a timely manner, which may lead to channels becoming unusable and/or force-closed. An
732+
/// example of such can be found at <https://github.com/lightningnetwork/lnd/issues/7682>.
733+
///
734+
/// This is currently only used when waiting for a [`msgs::ChannelReestablish`] or
735+
/// [`msgs::RevokeAndACK`] message from the counterparty.
736+
sent_message_awaiting_response: Option<usize>,
737+
718738
#[cfg(any(test, fuzzing))]
719739
// When we receive an HTLC fulfill on an outbound path, we may immediately fulfill the
720740
// corresponding HTLC on the inbound path. If, then, the outbound path channel is
@@ -1130,6 +1150,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
11301150
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
11311151

11321152
workaround_lnd_bug_4006: None,
1153+
sent_message_awaiting_response: None,
11331154

11341155
latest_inbound_scid_alias: None,
11351156
outbound_scid_alias,
@@ -1489,6 +1510,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
14891510
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
14901511

14911512
workaround_lnd_bug_4006: None,
1513+
sent_message_awaiting_response: None,
14921514

14931515
latest_inbound_scid_alias: None,
14941516
outbound_scid_alias,
@@ -3526,6 +3548,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
35263548
// OK, we step the channel here and *then* if the new generation fails we can fail the
35273549
// channel based on that, but stepping stuff here should be safe either way.
35283550
self.channel_state &= !(ChannelState::AwaitingRemoteRevoke as u32);
3551+
self.sent_message_awaiting_response = None;
35293552
self.counterparty_prev_commitment_point = self.counterparty_cur_commitment_point;
35303553
self.counterparty_cur_commitment_point = Some(msg.next_per_commitment_point);
35313554
self.cur_counterparty_commitment_transaction_number -= 1;
@@ -3841,6 +3864,8 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
38413864
}
38423865
}
38433866

3867+
self.sent_message_awaiting_response = None;
3868+
38443869
self.channel_state |= ChannelState::PeerDisconnected as u32;
38453870
log_trace!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
38463871
}
@@ -3943,6 +3968,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
39433968
Some(self.get_last_revoke_and_ack())
39443969
} else { None };
39453970
let commitment_update = if self.monitor_pending_commitment_signed {
3971+
self.mark_awaiting_response();
39463972
Some(self.get_last_commitment_update(logger))
39473973
} else { None };
39483974

@@ -4132,6 +4158,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
41324158
// Go ahead and unmark PeerDisconnected as various calls we may make check for it (and all
41334159
// remaining cases either succeed or ErrorMessage-fail).
41344160
self.channel_state &= !(ChannelState::PeerDisconnected as u32);
4161+
self.sent_message_awaiting_response = None;
41354162

41364163
let shutdown_msg = if self.channel_state & (ChannelState::LocalShutdownSent as u32) != 0 {
41374164
assert!(self.shutdown_scriptpubkey.is_some());
@@ -4192,7 +4219,11 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
41924219
// revoke_and_ack, not on sending commitment_signed, so we add one if have
41934220
// AwaitingRemoteRevoke set, which indicates we sent a commitment_signed but haven't gotten
41944221
// the corresponding revoke_and_ack back yet.
4195-
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if (self.channel_state & ChannelState::AwaitingRemoteRevoke as u32) != 0 { 1 } else { 0 };
4222+
let is_awaiting_remote_revoke = self.channel_state & ChannelState::AwaitingRemoteRevoke as u32 != 0;
4223+
if is_awaiting_remote_revoke && !self.is_awaiting_monitor_update() {
4224+
self.mark_awaiting_response();
4225+
}
4226+
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if is_awaiting_remote_revoke { 1 } else { 0 };
41964227

41974228
let channel_ready = if msg.next_local_commitment_number == 1 && INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number == 1 {
41984229
// We should never have to worry about MonitorUpdateInProgress resending ChannelReady
@@ -4361,6 +4392,28 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
43614392
}), None))
43624393
}
43634394

4395+
// Marks a channel as waiting for a response from the counterparty. If it's not received
4396+
// [`DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`] after sending our own to them, then we'll attempt
4397+
// a reconnection.
4398+
fn mark_awaiting_response(&mut self) {
4399+
self.sent_message_awaiting_response = Some(0);
4400+
}
4401+
4402+
/// Determines whether we should disconnect the counterparty due to not receiving a response
4403+
/// within our expected timeframe.
4404+
///
4405+
/// This should be called on every [`super::channelmanager::ChannelManager::timer_tick_occurred`].
4406+
pub fn should_disconnect_peer_awaiting_response(&mut self) -> bool {
4407+
let ticks_elapsed = if let Some(ticks_elapsed) = self.sent_message_awaiting_response.as_mut() {
4408+
ticks_elapsed
4409+
} else {
4410+
// Don't disconnect when we're not waiting on a response.
4411+
return false;
4412+
};
4413+
*ticks_elapsed += 1;
4414+
*ticks_elapsed >= DISCONNECT_PEER_AWAITING_RESPONSE_TICKS
4415+
}
4416+
43644417
pub fn shutdown<SP: Deref>(
43654418
&mut self, signer_provider: &SP, their_features: &InitFeatures, msg: &msgs::Shutdown
43664419
) -> Result<(Option<msgs::Shutdown>, Option<&ChannelMonitorUpdate>, Vec<(HTLCSource, PaymentHash)>), ChannelError>
@@ -5733,7 +5786,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
57335786

57345787
/// May panic if called on a channel that wasn't immediately-previously
57355788
/// self.remove_uncommitted_htlcs_and_mark_paused()'d
5736-
pub fn get_channel_reestablish<L: Deref>(&self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
5789+
pub fn get_channel_reestablish<L: Deref>(&mut self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
57375790
assert_eq!(self.channel_state & ChannelState::PeerDisconnected as u32, ChannelState::PeerDisconnected as u32);
57385791
assert_ne!(self.cur_counterparty_commitment_transaction_number, INITIAL_COMMITMENT_NUMBER);
57395792
// Prior to static_remotekey, my_current_per_commitment_point was critical to claiming
@@ -5752,6 +5805,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
57525805
log_info!(logger, "Sending a data_loss_protect with no previous remote per_commitment_secret for channel {}", log_bytes!(self.channel_id()));
57535806
[0;32]
57545807
};
5808+
self.mark_awaiting_response();
57555809
msgs::ChannelReestablish {
57565810
channel_id: self.channel_id(),
57575811
// The protocol has two different commitment number concepts - the "commitment
@@ -7090,6 +7144,7 @@ impl<'a, 'b, 'c, ES: Deref, SP: Deref> ReadableArgs<(&'a ES, &'b SP, u32, &'c Ch
70907144
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),
70917145

70927146
workaround_lnd_bug_4006: None,
7147+
sent_message_awaiting_response: None,
70937148

70947149
latest_inbound_scid_alias,
70957150
// Later in the ChannelManager deserialization phase we scan for channels and assign scid aliases if its missing

lightning/src/ln/channelmanager.rs

+14
Original file line numberDiff line numberDiff line change
@@ -3921,6 +3921,20 @@ where
39213921

39223922
chan.maybe_expire_prev_config();
39233923

3924+
if chan.should_disconnect_peer_awaiting_response() {
3925+
log_debug!(self.logger, "Disconnecting peer {} due to not making any progress on channel {}",
3926+
counterparty_node_id, log_bytes!(*chan_id));
3927+
pending_msg_events.push(MessageSendEvent::HandleError {
3928+
node_id: counterparty_node_id,
3929+
action: msgs::ErrorAction::DisconnectPeerWithWarning {
3930+
msg: msgs::WarningMessage {
3931+
channel_id: *chan_id,
3932+
data: "Disconnecting due to timeout awaiting response".to_owned(),
3933+
},
3934+
},
3935+
});
3936+
}
3937+
39243938
true
39253939
});
39263940
if peer_state.ok_to_remove(true) {

lightning/src/ln/functional_tests.rs

+126-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use crate::events::{Event, MessageSendEvent, MessageSendEventsProvider, PathFail
2222
use crate::ln::{PaymentPreimage, PaymentSecret, PaymentHash};
2323
use crate::ln::channel::{commitment_tx_base_weight, COMMITMENT_TX_WEIGHT_PER_HTLC, CONCURRENT_INBOUND_HTLC_FEE_BUFFER, FEE_SPIKE_BUFFER_FEE_INCREASE_MULTIPLE, MIN_AFFORDABLE_HTLC_COUNT};
2424
use crate::ln::channelmanager::{self, PaymentId, RAACommitmentOrder, PaymentSendFailure, RecipientOnionFields, BREAKDOWN_TIMEOUT, ENABLE_GOSSIP_TICKS, DISABLE_GOSSIP_TICKS, MIN_CLTV_EXPIRY_DELTA};
25-
use crate::ln::channel::{Channel, ChannelError};
25+
use crate::ln::channel::{DISCONNECT_PEER_AWAITING_RESPONSE_TICKS, Channel, ChannelError};
2626
use crate::ln::{chan_utils, onion_utils};
2727
use crate::ln::chan_utils::{OFFERED_HTLC_SCRIPT_WEIGHT, htlc_success_tx_weight, htlc_timeout_tx_weight, HTLCOutputInCommitment};
2828
use crate::routing::gossip::{NetworkGraph, NetworkUpdate};
@@ -9955,3 +9955,128 @@ fn test_payment_with_custom_min_cltv_expiry_delta() {
99559955
do_payment_with_custom_min_final_cltv_expiry(true, false);
99569956
do_payment_with_custom_min_final_cltv_expiry(true, true);
99579957
}
9958+
9959+
#[test]
9960+
fn test_disconnects_peer_awaiting_response_ticks() {
9961+
// Tests that nodes which are awaiting on a response critical for channel responsiveness
9962+
// disconnect their counterparty after `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
9963+
let mut chanmon_cfgs = create_chanmon_cfgs(2);
9964+
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
9965+
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
9966+
let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
9967+
9968+
// Asserts a disconnect event is queued to the user.
9969+
let check_disconnect_event = |node: &Node, should_disconnect: bool| {
9970+
let disconnect_event = node.node.get_and_clear_pending_msg_events().iter().find_map(|event|
9971+
if let MessageSendEvent::HandleError { action, .. } = event {
9972+
if let msgs::ErrorAction::DisconnectPeerWithWarning { .. } = action {
9973+
Some(())
9974+
} else {
9975+
None
9976+
}
9977+
} else {
9978+
None
9979+
}
9980+
);
9981+
assert_eq!(disconnect_event.is_some(), should_disconnect);
9982+
};
9983+
9984+
// Fires timer ticks ensuring we only attempt to disconnect peers after reaching
9985+
// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
9986+
let check_disconnect = |node: &Node| {
9987+
// No disconnect without any timer ticks.
9988+
check_disconnect_event(node, false);
9989+
9990+
// No disconnect with 1 timer tick less than required.
9991+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS - 1 {
9992+
node.node.timer_tick_occurred();
9993+
check_disconnect_event(node, false);
9994+
}
9995+
9996+
// Disconnect after reaching the required ticks.
9997+
node.node.timer_tick_occurred();
9998+
check_disconnect_event(node, true);
9999+
10000+
// Disconnect again on the next tick if the peer hasn't been disconnected yet.
10001+
node.node.timer_tick_occurred();
10002+
check_disconnect_event(node, true);
10003+
};
10004+
10005+
create_chan_between_nodes(&nodes[0], &nodes[1]);
10006+
10007+
// We'll start by performing a fee update with Alice (nodes[0]) on the channel.
10008+
*nodes[0].fee_estimator.sat_per_kw.lock().unwrap() *= 2;
10009+
nodes[0].node.timer_tick_occurred();
10010+
check_added_monitors!(&nodes[0], 1);
10011+
let alice_fee_update = get_htlc_update_msgs(&nodes[0], &nodes[1].node.get_our_node_id());
10012+
nodes[1].node.handle_update_fee(&nodes[0].node.get_our_node_id(), alice_fee_update.update_fee.as_ref().unwrap());
10013+
nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &alice_fee_update.commitment_signed);
10014+
check_added_monitors!(&nodes[1], 1);
10015+
10016+
// This will prompt Bob (nodes[1]) to respond with his `CommitmentSigned` and `RevokeAndACK`.
10017+
let (bob_revoke_and_ack, bob_commitment_signed) = get_revoke_commit_msgs!(&nodes[1], nodes[0].node.get_our_node_id());
10018+
nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bob_revoke_and_ack);
10019+
check_added_monitors!(&nodes[0], 1);
10020+
nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bob_commitment_signed);
10021+
check_added_monitors(&nodes[0], 1);
10022+
10023+
// Alice then needs to send her final `RevokeAndACK` to complete the commitment dance. We
10024+
// pretend Bob hasn't received the message and check whether he'll disconnect Alice after
10025+
// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
10026+
let alice_revoke_and_ack = get_event_msg!(nodes[0], MessageSendEvent::SendRevokeAndACK, nodes[1].node.get_our_node_id());
10027+
check_disconnect(&nodes[1]);
10028+
10029+
// Now, we'll reconnect them to test awaiting a `ChannelReestablish` message.
10030+
//
10031+
// Note that since the commitment dance didn't complete above, Alice is expected to resend her
10032+
// final `RevokeAndACK` to Bob to complete it.
10033+
nodes[0].node.peer_disconnected(&nodes[1].node.get_our_node_id());
10034+
nodes[1].node.peer_disconnected(&nodes[0].node.get_our_node_id());
10035+
let bob_init = msgs::Init { features: nodes[1].node.init_features(), remote_network_address: None };
10036+
nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &bob_init, true).unwrap();
10037+
let alice_init = msgs::Init { features: nodes[0].node.init_features(), remote_network_address: None };
10038+
nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &alice_init, true).unwrap();
10039+
10040+
// Upon reconnection, Alice sends her `ChannelReestablish` to Bob. Alice, however, hasn't
10041+
// received Bob's yet, so she should disconnect him after reaching
10042+
// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
10043+
let alice_channel_reestablish = get_event_msg!(
10044+
nodes[0], MessageSendEvent::SendChannelReestablish, nodes[1].node.get_our_node_id()
10045+
);
10046+
nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &alice_channel_reestablish);
10047+
check_disconnect(&nodes[0]);
10048+
10049+
// Bob now sends his `ChannelReestablish` to Alice to resume the channel and consider it "live".
10050+
let bob_channel_reestablish = nodes[1].node.get_and_clear_pending_msg_events().iter().find_map(|event|
10051+
if let MessageSendEvent::SendChannelReestablish { node_id, msg } = event {
10052+
assert_eq!(*node_id, nodes[0].node.get_our_node_id());
10053+
Some(msg.clone())
10054+
} else {
10055+
None
10056+
}
10057+
).unwrap();
10058+
nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &bob_channel_reestablish);
10059+
10060+
// Sanity check that Alice won't disconnect Bob since she's no longer waiting for any messages.
10061+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
10062+
nodes[0].node.timer_tick_occurred();
10063+
check_disconnect_event(&nodes[0], false);
10064+
}
10065+
10066+
// However, Bob is still waiting on Alice's `RevokeAndACK`, so he should disconnect her after
10067+
// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
10068+
check_disconnect(&nodes[1]);
10069+
10070+
// Finally, have Bob process the last message.
10071+
nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &alice_revoke_and_ack);
10072+
check_added_monitors(&nodes[1], 1);
10073+
10074+
// At this point, neither node should attempt to disconnect each other, since they aren't
10075+
// waiting on any messages.
10076+
for node in &nodes {
10077+
for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
10078+
node.node.timer_tick_occurred();
10079+
check_disconnect_event(node, false);
10080+
}
10081+
}
10082+
}

lightning/src/ln/msgs.rs

+5
Original file line numberDiff line numberDiff line change
@@ -1137,6 +1137,11 @@ pub enum ErrorAction {
11371137
/// An error message which we should make an effort to send before we disconnect.
11381138
msg: Option<ErrorMessage>
11391139
},
1140+
/// The peer did something incorrect. Tell them without closing any channels and disconnect them.
1141+
DisconnectPeerWithWarning {
1142+
/// A warning message which we should make an effort to send before we disconnect.
1143+
msg: WarningMessage,
1144+
},
11401145
/// The peer did something harmless that we weren't able to process, just log and ignore
11411146
// New code should *not* use this. New code must use IgnoreAndLog, below!
11421147
IgnoreError,

0 commit comments

Comments
 (0)