-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Faster room joins: Try other destinations when resyncing the state of a partial-state room #12812
Changes from 4 commits
dd3e926
2d63f45
83a9310
808ad43
356f0c3
c713c47
ab3ccf7
55460da
972a021
ed290b2
4363bd3
3efe1df
dd6bf3f
af79717
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Try other homeservers when re-syncing state for rooms with partial state. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,16 @@ | |
import logging | ||
from enum import Enum | ||
from http import HTTPStatus | ||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Collection, | ||
Dict, | ||
Iterable, | ||
List, | ||
Optional, | ||
Tuple, | ||
Union, | ||
) | ||
|
||
import attr | ||
from signedjson.key import decode_verify_key_bytes | ||
|
@@ -34,6 +43,7 @@ | |
CodeMessageException, | ||
Codes, | ||
FederationDeniedError, | ||
FederationError, | ||
HttpResponseException, | ||
NotFoundError, | ||
RequestSendFailed, | ||
|
@@ -545,6 +555,7 @@ async def do_invite_join( | |
desc="sync_partial_state_room", | ||
func=self._sync_partial_state_room, | ||
destination=origin, | ||
destinations=ret.servers_in_room, | ||
room_id=room_id, | ||
) | ||
|
||
|
@@ -1443,13 +1454,16 @@ async def get_room_complexity( | |
|
||
async def _sync_partial_state_room( | ||
self, | ||
destination: str, | ||
destination: Optional[str], | ||
destinations: Collection[str], | ||
richvdh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
room_id: str, | ||
) -> None: | ||
"""Background process to resync the state of a partial-state room | ||
|
||
Args: | ||
destination: homeserver to pull the state from | ||
destination: the initial homeserver to pull the state from | ||
destinations: other homeservers to try to pull the state from, if | ||
`destination` is unavailable | ||
room_id: room to be resynced | ||
""" | ||
|
||
|
@@ -1460,9 +1474,19 @@ async def _sync_partial_state_room( | |
# TODO(faster_joins): what happens if we leave the room during a resync? if we | ||
# really leave, that might mean we have difficulty getting the room state over | ||
# federation. | ||
# | ||
# TODO(faster_joins): try other destinations if the one we have fails | ||
|
||
# Make an infinite iterator of destinations to try. Once we find a working | ||
# destination, we'll stick with it until it flakes. | ||
if destination is not None: | ||
# Move `destination` to the front of the list. | ||
destinations = list(destinations) | ||
if destination in destinations: | ||
destinations.remove(destination) | ||
destinations = [destination] + destinations | ||
destination_iter = itertools.cycle(destinations) | ||
richvdh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# `destination` is now the current remote homeserver we're pulling from. | ||
destination = next(destination_iter) | ||
logger.info("Syncing state for room %s via %s", room_id, destination) | ||
|
||
# we work through the queue in order of increasing stream ordering. | ||
|
@@ -1498,6 +1522,36 @@ async def _sync_partial_state_room( | |
allow_rejected=True, | ||
) | ||
for event in events: | ||
await self._federation_event_handler.update_state_for_partial_state_event( | ||
destination, event | ||
) | ||
for attempt in range(len(destinations)): | ||
richvdh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
try: | ||
await self._federation_event_handler.update_state_for_partial_state_event( | ||
destination, event | ||
) | ||
break | ||
except FederationError as e: | ||
if attempt == len(destinations) - 1: | ||
# We have tried every remote server for this event. Give up. | ||
logger.error( | ||
"Failed to get state for %s at %s from %s because %s, " | ||
"giving up!", | ||
room_id, | ||
event, | ||
destination, | ||
e, | ||
) | ||
raise | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might not be the best thing to do. We might reach here in the event of a temporary network outage. Then the resync would be stuck until the next restart. Retrying all destinations indefinitely is also not the best thing to do, eg. if we can successfully reach all homeservers and they all claim they don't have the data we want. Nonetheless, this PR is an improvement and doesn't introduce any failure modes that didn't exist before. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agreed. This is one of my "big unanswered questions" for faster joins. Let's leave it like this for now. could you add a |
||
|
||
# Try the next remote server. | ||
logger.info( | ||
"Failed to get state for %s at %s from %s because %s", | ||
room_id, | ||
event, | ||
destination, | ||
e, | ||
) | ||
destination = next(destination_iter) | ||
logger.info( | ||
"Syncing state for room %s via %s instead", | ||
room_id, | ||
destination, | ||
) |
Uh oh!
There was an error while loading. Please reload this page.