This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 17d99f7

Optimise backfill calculation (#12522)
Try to avoid an OOM by checking fewer extremities. Generally this is a big rewrite of _maybe_backfill, to try and fix some of the TODOs and other problems in it. It's best reviewed commit-by-commit.
1 parent e75c7e3 commit 17d99f7
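The core of the change, in rough outline: instead of loading and visibility-filtering the successors of *every* backwards extremity up front (which is what could exhaust memory in a large room), the new code sorts candidate backfill points by depth and checks them one at a time, stopping once five usable ones are found. A minimal, self-contained sketch of that strategy — not Synapse's actual code; the helper and predicate below are invented for illustration:

```python
from typing import Callable, List, Tuple


def pick_backfill_points(
    candidates: List[Tuple[str, int]],  # (event_id, depth) pairs
    is_visible: Callable[[str], bool],  # hypothetical visibility predicate
    max_points: int = 5,
) -> List[str]:
    """Pick at most `max_points` visible candidates, deepest first."""
    chosen: List[str] = []
    # Sort by depth descending so the most recent extremities are preferred.
    for event_id, _depth in sorted(candidates, key=lambda c: -c[1]):
        if len(chosen) >= max_points:
            break  # lazy: later candidates are never even checked
        if is_visible(event_id):
            chosen.append(event_id)
    return chosen
```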

File tree

5 files changed: +168 -106 lines

changelog.d/12522.bugfix (+1)

```diff
@@ -0,0 +1 @@
+Fix a bug introduced in Synapse 0.99.3 which could cause Synapse to consume large amounts of RAM when back-paginating in a large room.
```

synapse/handlers/federation.py (+145 -89)
```diff
@@ -1,4 +1,4 @@
-# Copyright 2014-2021 The Matrix.org Foundation C.I.C.
+# Copyright 2014-2022 The Matrix.org Foundation C.I.C.
 # Copyright 2020 Sorunome
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
```
```diff
@@ -15,10 +15,14 @@
 
 """Contains handlers for federation events."""
 
+import enum
+import itertools
 import logging
+from enum import Enum
 from http import HTTPStatus
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
 
+import attr
 from signedjson.key import decode_verify_key_bytes
 from signedjson.sign import verify_signed_json
 from unpaddedbase64 import decode_base64
```
```diff
@@ -92,6 +96,24 @@ def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]:
     return sorted(joined_domains.items(), key=lambda d: d[1])
 
 
+class _BackfillPointType(Enum):
+    # a regular backwards extremity (ie, an event which we don't yet have, but which
+    # is referred to by other events in the DAG)
+    BACKWARDS_EXTREMITY = enum.auto()
+
+    # an MSC2716 "insertion event"
+    INSERTION_PONT = enum.auto()
+
+
+@attr.s(slots=True, auto_attribs=True, frozen=True)
+class _BackfillPoint:
+    """A potential point we might backfill from"""
+
+    event_id: str
+    depth: int
+    type: _BackfillPointType
+
+
 class FederationHandler:
     """Handles general incoming federation requests
 
```
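As an illustration of how the new types fit together (the event IDs and depths below are invented), `_BackfillPoint` gives every candidate a uniform shape regardless of whether it came from the backwards-extremity query or the MSC2716 insertion-event query:

```python
points = [
    _BackfillPoint("$event_a", 10, _BackfillPointType.BACKWARDS_EXTREMITY),
    _BackfillPoint("$insert_b", 42, _BackfillPointType.INSERTION_PONT),
    _BackfillPoint("$event_c", 7, _BackfillPointType.BACKWARDS_EXTREMITY),
]
# Deepest (most recent) first, mirroring the sort in _maybe_backfill_inner:
points.sort(key=lambda p: -p.depth)
assert [p.event_id for p in points] == ["$insert_b", "$event_a", "$event_c"]
# frozen=True means mutating a point raises attr.exceptions.FrozenInstanceError.
```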
```diff
@@ -157,89 +179,51 @@ async def maybe_backfill(
     async def _maybe_backfill_inner(
         self, room_id: str, current_depth: int, limit: int
     ) -> bool:
-        oldest_events_with_depth = (
-            await self.store.get_oldest_event_ids_with_depth_in_room(room_id)
-        )
+        backwards_extremities = [
+            _BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY)
+            for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room(
+                room_id
+            )
+        ]
 
-        insertion_events_to_be_backfilled: Dict[str, int] = {}
+        insertion_events_to_be_backfilled: List[_BackfillPoint] = []
         if self.hs.config.experimental.msc2716_enabled:
-            insertion_events_to_be_backfilled = (
-                await self.store.get_insertion_event_backward_extremities_in_room(
+            insertion_events_to_be_backfilled = [
+                _BackfillPoint(event_id, depth, _BackfillPointType.INSERTION_PONT)
+                for event_id, depth in await self.store.get_insertion_event_backward_extremities_in_room(
                     room_id
                 )
-            )
+            ]
         logger.debug(
-            "_maybe_backfill_inner: extremities oldest_events_with_depth=%s insertion_events_to_be_backfilled=%s",
-            oldest_events_with_depth,
+            "_maybe_backfill_inner: backwards_extremities=%s insertion_events_to_be_backfilled=%s",
+            backwards_extremities,
             insertion_events_to_be_backfilled,
         )
 
-        if not oldest_events_with_depth and not insertion_events_to_be_backfilled:
+        if not backwards_extremities and not insertion_events_to_be_backfilled:
             logger.debug("Not backfilling as no extremeties found.")
             return False
 
-        # We only want to paginate if we can actually see the events we'll get,
-        # as otherwise we'll just spend a lot of resources to get redacted
-        # events.
-        #
-        # We do this by filtering all the backwards extremities and seeing if
-        # any remain. Given we don't have the extremity events themselves, we
-        # need to actually check the events that reference them.
-        #
-        # *Note*: the spec wants us to keep backfilling until we reach the start
-        # of the room in case we are allowed to see some of the history. However
-        # in practice that causes more issues than its worth, as a) its
-        # relatively rare for there to be any visible history and b) even when
-        # there is its often sufficiently long ago that clients would stop
-        # attempting to paginate before backfill reached the visible history.
-        #
-        # TODO: If we do do a backfill then we should filter the backwards
-        # extremities to only include those that point to visible portions of
-        # history.
-        #
-        # TODO: Correctly handle the case where we are allowed to see the
-        # forward event but not the backward extremity, e.g. in the case of
-        # initial join of the server where we are allowed to see the join
-        # event but not anything before it. This would require looking at the
-        # state *before* the event, ignoring the special casing certain event
-        # types have.
-
-        forward_event_ids = await self.store.get_successor_events(
-            list(oldest_events_with_depth)
-        )
-
-        extremities_events = await self.store.get_events(
-            forward_event_ids,
-            redact_behaviour=EventRedactBehaviour.AS_IS,
-            get_prev_content=False,
+        # we now have a list of potential places to backpaginate from. We prefer to
+        # start with the most recent (ie, max depth), so let's sort the list.
+        sorted_backfill_points: List[_BackfillPoint] = sorted(
+            itertools.chain(
+                backwards_extremities,
+                insertion_events_to_be_backfilled,
+            ),
+            key=lambda e: -int(e.depth),
         )
 
-        # We set `check_history_visibility_only` as we might otherwise get false
-        # positives from users having been erased.
-        filtered_extremities = await filter_events_for_server(
-            self.storage,
-            self.server_name,
-            list(extremities_events.values()),
-            redact=False,
-            check_history_visibility_only=True,
-        )
         logger.debug(
-            "_maybe_backfill_inner: filtered_extremities %s", filtered_extremities
+            "_maybe_backfill_inner: room_id: %s: current_depth: %s, limit: %s, "
+            "backfill points (%d): %s",
+            room_id,
+            current_depth,
+            limit,
+            len(sorted_backfill_points),
+            sorted_backfill_points,
         )
 
-        if not filtered_extremities and not insertion_events_to_be_backfilled:
-            return False
-
-        extremities = {
-            **oldest_events_with_depth,
-            # TODO: insertion_events_to_be_backfilled is currently skipping the filtered_extremities checks
-            **insertion_events_to_be_backfilled,
-        }
-
-        # Check if we reached a point where we should start backfilling.
-        sorted_extremeties_tuple = sorted(extremities.items(), key=lambda e: -int(e[1]))
-        max_depth = sorted_extremeties_tuple[0][1]
-
         # If we're approaching an extremity we trigger a backfill, otherwise we
         # no-op.
         #
```
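The merge-and-sort step at the end of this hunk can be exercised standalone; a short sketch with invented values:

```python
import itertools

# Hypothetical (event_id, depth) candidates from the two sources:
backwards = [("$a", 5), ("$b", 12)]
insertions = [("$c", 9)]

# itertools.chain concatenates both lists lazily; the negative-depth key
# yields a descending (deepest-first) ordering, as in the diff above.
merged = sorted(itertools.chain(backwards, insertions), key=lambda e: -e[1])
assert merged == [("$b", 12), ("$c", 9), ("$a", 5)]
```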
```diff
@@ -249,6 +233,11 @@ async def _maybe_backfill_inner(
         # chose more than one times the limit in case of failure, but choosing a
         # much larger factor will result in triggering a backfill request much
         # earlier than necessary.
+        #
+        # XXX: shouldn't we do this *after* the filter by depth below? Again, we don't
+        # care about events that have happened after our current position.
+        #
+        max_depth = sorted_backfill_points[0].depth
         if current_depth - 2 * limit > max_depth:
             logger.debug(
                 "Not backfilling as we don't need to. %d < %d - 2 * %d",
```
```diff
@@ -265,31 +254,98 @@ async def _maybe_backfill_inner(
         #    2. we have likely previously tried and failed to backfill from that
         #       extremity, so to avoid getting "stuck" requesting the same
         #       backfill repeatedly we drop those extremities.
-        filtered_sorted_extremeties_tuple = [
-            t for t in sorted_extremeties_tuple if int(t[1]) <= current_depth
-        ]
-
-        logger.debug(
-            "room_id: %s, backfill: current_depth: %s, limit: %s, max_depth: %s, extrems (%d): %s filtered_sorted_extremeties_tuple: %s",
-            room_id,
-            current_depth,
-            limit,
-            max_depth,
-            len(sorted_extremeties_tuple),
-            sorted_extremeties_tuple,
-            filtered_sorted_extremeties_tuple,
-        )
-
+        #
         # However, we need to check that the filtered extremities are non-empty.
         # If they are empty then either we can a) bail or b) still attempt to
         # backfill. We opt to try backfilling anyway just in case we do get
         # relevant events.
-        if filtered_sorted_extremeties_tuple:
-            sorted_extremeties_tuple = filtered_sorted_extremeties_tuple
+        #
+        filtered_sorted_backfill_points = [
+            t for t in sorted_backfill_points if t.depth <= current_depth
+        ]
+        if filtered_sorted_backfill_points:
+            logger.debug(
+                "_maybe_backfill_inner: backfill points before current depth: %s",
+                filtered_sorted_backfill_points,
+            )
+            sorted_backfill_points = filtered_sorted_backfill_points
+        else:
+            logger.debug(
+                "_maybe_backfill_inner: all backfill points are *after* current depth. Backfilling anyway."
+            )
+
+        # For performance's sake, we only want to paginate from a particular extremity
+        # if we can actually see the events we'll get. Otherwise, we'd just spend a lot
+        # of resources to get redacted events. We check each extremity in turn and
+        # ignore those which users on our server wouldn't be able to see.
+        #
+        # Additionally, we limit ourselves to backfilling from at most 5 extremities,
+        # for two reasons:
+        #
+        # - The check which determines if we can see an extremity's events can be
+        #   expensive (we load the full state for the room at each of the backfill
+        #   points, or (worse) their successors)
+        # - We want to avoid the server-server API request URI becoming too long.
+        #
+        # *Note*: the spec wants us to keep backfilling until we reach the start
+        # of the room in case we are allowed to see some of the history. However,
+        # in practice that causes more issues than its worth, as (a) it's
+        # relatively rare for there to be any visible history and (b) even when
+        # there is it's often sufficiently long ago that clients would stop
+        # attempting to paginate before backfill reached the visible history.
 
-        # We don't want to specify too many extremities as it causes the backfill
-        # request URI to be too long.
-        extremities = dict(sorted_extremeties_tuple[:5])
+        extremities_to_request: List[str] = []
+        for bp in sorted_backfill_points:
+            if len(extremities_to_request) >= 5:
+                break
+
+            # For regular backwards extremities, we don't have the extremity events
+            # themselves, so we need to actually check the events that reference them -
+            # their "successor" events.
+            #
+            # TODO: Correctly handle the case where we are allowed to see the
+            # successor event but not the backward extremity, e.g. in the case of
+            # initial join of the server where we are allowed to see the join
+            # event but not anything before it. This would require looking at the
+            # state *before* the event, ignoring the special casing certain event
+            # types have.
+            if bp.type == _BackfillPointType.INSERTION_PONT:
+                event_ids_to_check = [bp.event_id]
+            else:
+                event_ids_to_check = await self.store.get_successor_events(bp.event_id)
+
+            events_to_check = await self.store.get_events_as_list(
+                event_ids_to_check,
+                redact_behaviour=EventRedactBehaviour.AS_IS,
+                get_prev_content=False,
+            )
+
+            # We set `check_history_visibility_only` as we might otherwise get false
+            # positives from users having been erased.
+            filtered_extremities = await filter_events_for_server(
+                self.storage,
+                self.server_name,
+                events_to_check,
+                redact=False,
+                check_history_visibility_only=True,
+            )
+            if filtered_extremities:
+                extremities_to_request.append(bp.event_id)
+            else:
+                logger.debug(
+                    "_maybe_backfill_inner: skipping extremity %s as it would not be visible",
+                    bp,
+                )
+
+        if not extremities_to_request:
+            logger.debug(
+                "_maybe_backfill_inner: found no extremities which would be visible"
+            )
+            return False
+
+        logger.debug(
+            "_maybe_backfill_inner: extremities_to_request %s", extremities_to_request
+        )
 
         # Now we need to decide which hosts to hit first.
 
```
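One detail worth drawing out of the loop above: the two kinds of backfill point are visibility-checked differently. An insertion event is already stored locally, so it can be checked directly; a backwards extremity is by definition an event we do not yet have, so the check runs against its stored successor events instead. A sketch of just that branch, with a hypothetical `store` stub standing in for Synapse's datastore:

```python
from typing import List


async def events_to_visibility_check(bp: _BackfillPoint, store) -> List[str]:
    """Return the locally stored event IDs whose visibility decides whether
    backfilling from `bp` is worthwhile."""
    if bp.type == _BackfillPointType.INSERTION_PONT:
        # MSC2716 insertion events exist in our DB, so check them directly.
        return [bp.event_id]
    # A backwards extremity is an event we don't have; check the events
    # that reference it (its successors) instead.
    return await store.get_successor_events(bp.event_id)
```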
```diff
@@ -309,7 +365,7 @@ async def try_backfill(domains: List[str]) -> bool:
             for dom in domains:
                 try:
                     await self._federation_event_handler.backfill(
-                        dom, room_id, limit=100, extremities=extremities
+                        dom, room_id, limit=100, extremities=extremities_to_request
                     )
                     # If this succeeded then we probably already have the
                     # appropriate stuff.
```

synapse/handlers/room_batch.py (+1 -1)

```diff
@@ -54,7 +54,7 @@ async def inherit_depth_from_prev_ids(self, prev_event_ids: List[str]) -> int:
         # it has a larger `depth` but before the successor event because the `stream_ordering`
         # is negative before the successor event.
         successor_event_ids = await self.store.get_successor_events(
-            [most_recent_prev_event_id]
+            most_recent_prev_event_id
         )
 
         # If we can't find any successor events, then it's a forward extremity of
```
