Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 2fba107

Browse files
authored
Faster room joins: Try other destinations when resyncing the state of a partial-state room (#12812)
Signed-off-by: Sean Quah <[email protected]>
1 parent 3594f6c commit 2fba107

File tree

4 files changed

+94
-9
lines changed

4 files changed

+94
-9
lines changed

Diff for: changelog.d/12812.misc

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Try other homeservers when re-syncing state for rooms with partial state.

Diff for: synapse/federation/federation_client.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,9 @@ async def get_room_state_ids(
405405
406406
Returns:
407407
a tuple of (state event_ids, auth event_ids)
408+
409+
Raises:
410+
InvalidResponseError: if fields in the response have the wrong type.
408411
"""
409412
result = await self.transport_layer.get_room_state_ids(
410413
destination, room_id, event_id=event_id
@@ -416,7 +419,7 @@ async def get_room_state_ids(
416419
if not isinstance(state_event_ids, list) or not isinstance(
417420
auth_event_ids, list
418421
):
419-
raise Exception("invalid response from /state_ids")
422+
raise InvalidResponseError("invalid response from /state_ids")
420423

421424
return state_event_ids, auth_event_ids
422425

Diff for: synapse/handlers/federation.py

+78-8
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,16 @@
2020
import logging
2121
from enum import Enum
2222
from http import HTTPStatus
23-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
23+
from typing import (
24+
TYPE_CHECKING,
25+
Collection,
26+
Dict,
27+
Iterable,
28+
List,
29+
Optional,
30+
Tuple,
31+
Union,
32+
)
2433

2534
import attr
2635
from signedjson.key import decode_verify_key_bytes
@@ -34,6 +43,7 @@
3443
CodeMessageException,
3544
Codes,
3645
FederationDeniedError,
46+
FederationError,
3747
HttpResponseException,
3848
NotFoundError,
3949
RequestSendFailed,
@@ -545,7 +555,8 @@ async def do_invite_join(
545555
run_as_background_process(
546556
desc="sync_partial_state_room",
547557
func=self._sync_partial_state_room,
548-
destination=origin,
558+
initial_destination=origin,
559+
other_destinations=ret.servers_in_room,
549560
room_id=room_id,
550561
)
551562

@@ -1454,13 +1465,16 @@ async def get_room_complexity(
14541465

14551466
async def _sync_partial_state_room(
14561467
self,
1457-
destination: str,
1468+
initial_destination: Optional[str],
1469+
other_destinations: Collection[str],
14581470
room_id: str,
14591471
) -> None:
14601472
"""Background process to resync the state of a partial-state room
14611473
14621474
Args:
1463-
destination: homeserver to pull the state from
1475+
initial_destination: the initial homeserver to pull the state from
1476+
other_destinations: other homeservers to try to pull the state from, if
1477+
`initial_destination` is unavailable
14641478
room_id: room to be resynced
14651479
"""
14661480

@@ -1472,8 +1486,29 @@ async def _sync_partial_state_room(
14721486
# really leave, that might mean we have difficulty getting the room state over
14731487
# federation.
14741488
#
1475-
# TODO(faster_joins): try other destinations if the one we have fails
1489+
# TODO(faster_joins): we need some way of prioritising which homeservers in
1490+
# `other_destinations` to try first, otherwise we'll spend ages trying dead
1491+
# homeservers for large rooms.
1492+
1493+
if initial_destination is None and len(other_destinations) == 0:
1494+
raise ValueError(
1495+
f"Cannot resync state of {room_id}: no destinations provided"
1496+
)
14761497

1498+
# Make an infinite iterator of destinations to try. Once we find a working
1499+
# destination, we'll stick with it until it flakes.
1500+
if initial_destination is not None:
1501+
# Move `initial_destination` to the front of the list.
1502+
destinations = list(other_destinations)
1503+
if initial_destination in destinations:
1504+
destinations.remove(initial_destination)
1505+
destinations = [initial_destination] + destinations
1506+
destination_iter = itertools.cycle(destinations)
1507+
else:
1508+
destination_iter = itertools.cycle(other_destinations)
1509+
1510+
# `destination` is the current remote homeserver we're pulling from.
1511+
destination = next(destination_iter)
14771512
logger.info("Syncing state for room %s via %s", room_id, destination)
14781513

14791514
# we work through the queue in order of increasing stream ordering.
@@ -1511,6 +1546,41 @@ async def _sync_partial_state_room(
15111546
allow_rejected=True,
15121547
)
15131548
for event in events:
1514-
await self._federation_event_handler.update_state_for_partial_state_event(
1515-
destination, event
1516-
)
1549+
for attempt in itertools.count():
1550+
try:
1551+
await self._federation_event_handler.update_state_for_partial_state_event(
1552+
destination, event
1553+
)
1554+
break
1555+
except FederationError as e:
1556+
if attempt == len(destinations) - 1:
1557+
# We have tried every remote server for this event. Give up.
1558+
# TODO(faster_joins) giving up isn't the right thing to do
1559+
# if there's a temporary network outage. retrying
1560+
# indefinitely is also not the right thing to do if we can
1561+
# reach all homeservers and they all claim they don't have
1562+
# the state we want.
1563+
logger.error(
1564+
"Failed to get state for %s at %s from %s because %s, "
1565+
"giving up!",
1566+
room_id,
1567+
event,
1568+
destination,
1569+
e,
1570+
)
1571+
raise
1572+
1573+
# Try the next remote server.
1574+
logger.info(
1575+
"Failed to get state for %s at %s from %s because %s",
1576+
room_id,
1577+
event,
1578+
destination,
1579+
e,
1580+
)
1581+
destination = next(destination_iter)
1582+
logger.info(
1583+
"Syncing state for room %s via %s instead",
1584+
room_id,
1585+
destination,
1586+
)

Diff for: synapse/handlers/federation_event.py

+11
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,9 @@ async def update_state_for_partial_state_event(
505505
Args:
506506
destination: server to request full state from
507507
event: partial-state event to be de-partial-stated
508+
509+
Raises:
510+
FederationError if we fail to request state from the remote server.
508511
"""
509512
logger.info("Updating state for %s", event.event_id)
510513
with nested_logging_context(suffix=event.event_id):
@@ -815,6 +818,10 @@ async def _resolve_state_at_missing_prevs(
815818
Returns:
816819
if we already had all the prev events, `None`. Otherwise, returns
817820
the event ids of the state at `event`.
821+
822+
Raises:
823+
FederationError if we fail to get the state from the remote server after any
824+
missing `prev_event`s.
818825
"""
819826
room_id = event.room_id
820827
event_id = event.event_id
@@ -901,6 +908,10 @@ async def _get_state_ids_after_missing_prev_event(
901908
902909
Returns:
903910
The event ids of the state *after* the given event.
911+
912+
Raises:
913+
InvalidResponseError: if the remote homeserver's response contains fields
914+
of the wrong type.
904915
"""
905916
(
906917
state_event_ids,

0 commit comments

Comments
 (0)