@@ -113,7 +113,7 @@ class UrlPreviewer:
113
113
1. Checks URL and timestamp against the database cache and returns the result if it
114
114
has not expired and was successful (a 2xx return code).
115
115
2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
116
- does, update the URL to download.
116
+ does and the new URL is not blocked, update the URL to download.
117
117
3. Downloads the URL and stores it into a file via the media storage provider
118
118
and saves the local media metadata.
119
119
4. If the media is an image:
@@ -127,14 +127,14 @@ class UrlPreviewer:
127
127
and saves the local media metadata.
128
128
2. Convert the oEmbed response to an Open Graph response.
129
129
3. Override any Open Graph data from the HTML with data from oEmbed.
130
- 4. If an image exists in the Open Graph response:
130
+ 4. If an image URL exists in the Open Graph response:
131
131
1. Downloads the URL and stores it into a file via the media storage
132
132
provider and saves the local media metadata.
133
133
2. Generates thumbnails.
134
134
3. Updates the Open Graph response based on image properties.
135
- 6. If the media is JSON and an oEmbed URL was found:
135
+ 6. If an oEmbed URL was found and the media is JSON:
136
136
1. Convert the oEmbed response to an Open Graph response.
137
- 2. If a thumbnail or image is in the oEmbed response:
137
+ 2. If an image URL is in the oEmbed response:
138
138
1. Downloads the URL and stores it into a file via the media storage
139
139
provider and saves the local media metadata.
140
140
2. Generates thumbnails.
@@ -144,7 +144,8 @@ class UrlPreviewer:
144
144
145
145
If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
146
146
image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole
147
- does not fail. As much information as possible is returned.
147
+ does not fail. If any of them are blocked, then those additional requests
148
+ are skipped. As much information as possible is returned.
148
149
149
150
The in-memory cache expires after 1 hour.
150
151
@@ -203,48 +204,14 @@ def __init__(
203
204
)
204
205
205
206
async def preview (self , url : str , user : UserID , ts : int ) -> bytes :
206
- # XXX: we could move this into _do_preview if we wanted.
207
- url_tuple = urlsplit (url )
208
- for entry in self .url_preview_url_blacklist :
209
- match = True
210
- for attrib in entry :
211
- pattern = entry [attrib ]
212
- value = getattr (url_tuple , attrib )
213
- logger .debug (
214
- "Matching attrib '%s' with value '%s' against pattern '%s'" ,
215
- attrib ,
216
- value ,
217
- pattern ,
218
- )
219
-
220
- if value is None :
221
- match = False
222
- continue
223
-
224
- # Some attributes might not be parsed as strings by urlsplit (such as the
225
- # port, which is parsed as an int). Because we use match functions that
226
- # expect strings, we want to make sure that's what we give them.
227
- value_str = str (value )
228
-
229
- if pattern .startswith ("^" ):
230
- if not re .match (pattern , value_str ):
231
- match = False
232
- continue
233
- else :
234
- if not fnmatch .fnmatch (value_str , pattern ):
235
- match = False
236
- continue
237
- if match :
238
- logger .warning ("URL %s blocked by url_blacklist entry %s" , url , entry )
239
- raise SynapseError (
240
- 403 , "URL blocked by url pattern blacklist entry" , Codes .UNKNOWN
241
- )
242
-
243
207
# the in-memory cache:
244
- # * ensures that only one request is active at a time
208
+ # * ensures that only one request to a URL is active at a time
245
209
# * takes load off the DB for the thundering herds
246
210
# * also caches any failures (unlike the DB) so we don't keep
247
- # requesting the same endpoint
211
+ # requesting the same endpoint
212
+ #
213
+ # Note that autodiscovered oEmbed URLs and pre-caching of images
214
+ # are not captured in the in-memory cache.
248
215
249
216
observable = self ._cache .get (url )
250
217
@@ -283,7 +250,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
283
250
og = og .encode ("utf8" )
284
251
return og
285
252
286
- # If this URL can be accessed via oEmbed, use that instead.
253
+ # If this URL can be accessed via an allowed oEmbed, use that instead.
287
254
url_to_download = url
288
255
oembed_url = self ._oembed .get_oembed_url (url )
289
256
if oembed_url :
@@ -329,6 +296,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
329
296
# defer to that.
330
297
oembed_url = self ._oembed .autodiscover_from_html (tree )
331
298
og_from_oembed : JsonDict = {}
299
+ # Only download to the oEmbed URL if it is allowed.
332
300
if oembed_url :
333
301
try :
334
302
oembed_info = await self ._handle_url (
@@ -411,6 +379,59 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
411
379
412
380
return jsonog .encode ("utf8" )
413
381
382
+ def _is_url_blocked (self , url : str ) -> bool :
383
+ """
384
+ Check whether the URL is allowed to be previewed (according to the homeserver
385
+ configuration).
386
+
387
+ Args:
388
+ url: The requested URL.
389
+
390
+ Return:
391
+ True if the URL is blocked, False if it is allowed.
392
+ """
393
+ url_tuple = urlsplit (url )
394
+ for entry in self .url_preview_url_blacklist :
395
+ match = True
396
+ # Iterate over each entry. If *all* attributes of that entry match
397
+ # the current URL, then reject it.
398
+ for attrib , pattern in entry .items ():
399
+ value = getattr (url_tuple , attrib )
400
+ logger .debug (
401
+ "Matching attrib '%s' with value '%s' against pattern '%s'" ,
402
+ attrib ,
403
+ value ,
404
+ pattern ,
405
+ )
406
+
407
+ if value is None :
408
+ match = False
409
+ break
410
+
411
+ # Some attributes might not be parsed as strings by urlsplit (such as the
412
+ # port, which is parsed as an int). Because we use match functions that
413
+ # expect strings, we want to make sure that's what we give them.
414
+ value_str = str (value )
415
+
416
+ # Check the value against the pattern as either a regular expression or
417
+ # a glob. If it doesn't match, the entry doesn't match.
418
+ if pattern .startswith ("^" ):
419
+ if not re .match (pattern , value_str ):
420
+ match = False
421
+ break
422
+ else :
423
+ if not fnmatch .fnmatch (value_str , pattern ):
424
+ match = False
425
+ break
426
+
427
+ # All fields matched, return true (the URL is blocked).
428
+ if match :
429
+ logger .warning ("URL %s blocked by url_blacklist entry %s" , url , entry )
430
+ return match
431
+
432
+ # No matches were found, the URL is allowed.
433
+ return False
434
+
414
435
async def _download_url (self , url : str , output_stream : BinaryIO ) -> DownloadResult :
415
436
"""
416
437
Fetches a remote URL and parses the headers.
@@ -547,8 +568,16 @@ async def _handle_url(
547
568
548
569
Returns:
549
570
A MediaInfo object describing the fetched content.
571
+
572
+ Raises:
573
+ SynapseError if the URL is blocked.
550
574
"""
551
575
576
+ if self ._is_url_blocked (url ):
577
+ raise SynapseError (
578
+ 403 , "URL blocked by url pattern blacklist entry" , Codes .UNKNOWN
579
+ )
580
+
552
581
# TODO: we should probably honour robots.txt... except in practice
553
582
# we're most likely being explicitly triggered by a human rather than a
554
583
# bot, so are we really a robot?
@@ -624,7 +653,7 @@ async def _precache_image_url(
624
653
return
625
654
626
655
# The image URL from the HTML might be relative to the previewed page,
627
- # convert it to an URL which can be requested directly.
656
+ # convert it to a URL which can be requested directly.
628
657
url_parts = urlparse (image_url )
629
658
if url_parts .scheme != "data" :
630
659
image_url = urljoin (media_info .uri , image_url )
0 commit comments