This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 4ee82c0

Apply url_preview_url_blacklist to oEmbed and pre-cached images (#15601)
There are two situations which were previously not properly checked:

1. If the requested URL was replaced with an oEmbed URL, then the oEmbed URL was not checked against url_preview_url_blacklist.
2. Follow-up URLs (either via autodiscovery of oEmbed or to pre-cache images) were not checked against url_preview_url_blacklist.
1 parent 375b0a8 commit 4ee82c0
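For reference, each url_preview_url_blacklist entry is a dictionary whose keys name components of the split URL (scheme, netloc, username, port, and so on) and whose values are glob patterns, or regular expressions when they start with `^`; a URL is blocked only if every component named by an entry matches. Below is a minimal, self-contained sketch of that matching rule. It mirrors the `_is_url_blocked` helper added in this commit, but `BLOCKLIST` and `is_url_blocked` are illustrative stand-ins rather than Synapse's API:

import fnmatch
import re
from urllib.parse import urlsplit

# Illustrative blocklist: each entry maps urlsplit() attribute names to a
# glob pattern, or to a regular expression when the pattern starts with "^".
BLOCKLIST = [
    {"username": "user"},
    {"scheme": "http", "netloc": "matrix.org"},
]

def is_url_blocked(url: str) -> bool:
    """Return True if *all* attributes named by any entry match the URL."""
    parts = urlsplit(url)
    for entry in BLOCKLIST:
        for attrib, pattern in entry.items():
            value = getattr(parts, attrib)
            if value is None:
                break  # component missing from the URL: this entry cannot match
            value_str = str(value)  # e.g. the port is parsed as an int
            if pattern.startswith("^"):
                if not re.match(pattern, value_str):
                    break
            elif not fnmatch.fnmatch(value_str, pattern):
                break
        else:
            return True  # every attribute named by this entry matched
    return False

print(is_url_blocked("http://matrix.org"))                  # True
print(is_url_blocked("https://matrix.org"))                 # False (scheme differs)
print(is_url_blocked("http://user:[email protected]"))  # True (username matches)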

4 files changed (+379, -50 lines)

changelog.d/15601.bugfix (+1)

@@ -0,0 +1 @@
+Fix a long-standing bug where the `url_preview_url_blacklist` configuration setting was not applied to oEmbed or image URLs found while previewing a URL.

synapse/media/url_previewer.py (+75, -46)
@@ -113,7 +113,7 @@ class UrlPreviewer:
     1. Checks URL and timestamp against the database cache and returns the result if it
        has not expired and was successful (a 2xx return code).
     2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
-       does, update the URL to download.
+       does and the new URL is not blocked, update the URL to download.
     3. Downloads the URL and stores it into a file via the media storage provider
        and saves the local media metadata.
     4. If the media is an image:
@@ -127,14 +127,14 @@ class UrlPreviewer:
             and saves the local media metadata.
          2. Convert the oEmbed response to an Open Graph response.
          3. Override any Open Graph data from the HTML with data from oEmbed.
-       4. If an image exists in the Open Graph response:
+       4. If an image URL exists in the Open Graph response:
          1. Downloads the URL and stores it into a file via the media storage
             provider and saves the local media metadata.
          2. Generates thumbnails.
          3. Updates the Open Graph response based on image properties.
-    6. If the media is JSON and an oEmbed URL was found:
+    6. If an oEmbed URL was found and the media is JSON:
       1. Convert the oEmbed response to an Open Graph response.
-      2. If a thumbnail or image is in the oEmbed response:
+      2. If an image URL is in the oEmbed response:
         1. Downloads the URL and stores it into a file via the media storage
            provider and saves the local media metadata.
         2. Generates thumbnails.
@@ -144,7 +144,8 @@ class UrlPreviewer:

    If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
    image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole
-   does not fail. As much information as possible is returned.
+   does not fail. If any of them are blocked, then those additional requests
+   are skipped. As much information as possible is returned.

    The in-memory cache expires after 1 hour.

@@ -203,48 +204,14 @@ def __init__(
         )

     async def preview(self, url: str, user: UserID, ts: int) -> bytes:
-        # XXX: we could move this into _do_preview if we wanted.
-        url_tuple = urlsplit(url)
-        for entry in self.url_preview_url_blacklist:
-            match = True
-            for attrib in entry:
-                pattern = entry[attrib]
-                value = getattr(url_tuple, attrib)
-                logger.debug(
-                    "Matching attrib '%s' with value '%s' against pattern '%s'",
-                    attrib,
-                    value,
-                    pattern,
-                )
-
-                if value is None:
-                    match = False
-                    continue
-
-                # Some attributes might not be parsed as strings by urlsplit (such as the
-                # port, which is parsed as an int). Because we use match functions that
-                # expect strings, we want to make sure that's what we give them.
-                value_str = str(value)
-
-                if pattern.startswith("^"):
-                    if not re.match(pattern, value_str):
-                        match = False
-                        continue
-                else:
-                    if not fnmatch.fnmatch(value_str, pattern):
-                        match = False
-                        continue
-            if match:
-                logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
-                raise SynapseError(
-                    403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
-                )
-
         # the in-memory cache:
-        # * ensures that only one request is active at a time
+        # * ensures that only one request to a URL is active at a time
         # * takes load off the DB for the thundering herds
         # * also caches any failures (unlike the DB) so we don't keep
-        #   requesting the same endpoint
+        #   requesting the same endpoint
+        #
+        # Note that autodiscovered oEmbed URLs and pre-caching of images
+        # are not captured in the in-memory cache.

         observable = self._cache.get(url)

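The in-memory cache comment above is about request coalescing: concurrent previews of the same URL share a single in-flight request, and failures are remembered as well. A rough, runnable asyncio sketch of that idea follows; it is not Synapse's actual cache (which is Deferred-based and expires entries after an hour), and the names are illustrative:

import asyncio
from typing import Dict

_inflight: Dict[str, asyncio.Future] = {}

async def _really_fetch(url: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for the actual network round trip
    return f"preview of {url}"

async def fetch_once(url: str) -> str:
    fut = _inflight.get(url)
    if fut is None:
        # First caller kicks off the real work; later callers reuse it.
        # (A failure stored in the future is re-raised for them too.)
        fut = asyncio.ensure_future(_really_fetch(url))
        _inflight[url] = fut
    return await fut

async def main() -> None:
    urls = ["http://matrix.org"] * 3
    results = await asyncio.gather(*(fetch_once(u) for u in urls))
    print(results)  # three identical results from a single fetch

asyncio.run(main())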
@@ -283,7 +250,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
             og = og.encode("utf8")
             return og

-        # If this URL can be accessed via oEmbed, use that instead.
+        # If this URL can be accessed via an allowed oEmbed, use that instead.
         url_to_download = url
         oembed_url = self._oembed.get_oembed_url(url)
         if oembed_url:
@@ -329,6 +296,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
             # defer to that.
             oembed_url = self._oembed.autodiscover_from_html(tree)
             og_from_oembed: JsonDict = {}
+            # Only download to the oEmbed URL if it is allowed.
             if oembed_url:
                 try:
                     oembed_info = await self._handle_url(
@@ -411,6 +379,59 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

         return jsonog.encode("utf8")

+    def _is_url_blocked(self, url: str) -> bool:
+        """
+        Check whether the URL is allowed to be previewed (according to the homeserver
+        configuration).
+
+        Args:
+            url: The requested URL.
+
+        Return:
+            True if the URL is blocked, False if it is allowed.
+        """
+        url_tuple = urlsplit(url)
+        for entry in self.url_preview_url_blacklist:
+            match = True
+            # Iterate over each entry. If *all* attributes of that entry match
+            # the current URL, then reject it.
+            for attrib, pattern in entry.items():
+                value = getattr(url_tuple, attrib)
+                logger.debug(
+                    "Matching attrib '%s' with value '%s' against pattern '%s'",
+                    attrib,
+                    value,
+                    pattern,
+                )
+
+                if value is None:
+                    match = False
+                    break
+
+                # Some attributes might not be parsed as strings by urlsplit (such as the
+                # port, which is parsed as an int). Because we use match functions that
+                # expect strings, we want to make sure that's what we give them.
+                value_str = str(value)
+
+                # Check the value against the pattern as either a regular expression or
+                # a glob. If it doesn't match, the entry doesn't match.
+                if pattern.startswith("^"):
+                    if not re.match(pattern, value_str):
+                        match = False
+                        break
+                else:
+                    if not fnmatch.fnmatch(value_str, pattern):
+                        match = False
+                        break
+
+            # All fields matched, return true (the URL is blocked).
+            if match:
+                logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
+                return match
+
+        # No matches were found, the URL is allowed.
+        return False
+
     async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
         """
         Fetches a remote URL and parses the headers.
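As the comments in `_is_url_blocked` note, each entry's keys are looked up as attributes of `urlsplit()`'s result, and some of those attributes (notably the port) are not strings. A quick look at the components available for matching:

from urllib.parse import urlsplit

parts = urlsplit("http://user:[email protected]:8448/path?q=1#frag")
print(parts.scheme)    # 'http'
print(parts.netloc)    # 'user:[email protected]:8448'
print(parts.username)  # 'user'
print(parts.hostname)  # 'matrix.org'
print(parts.port)      # 8448 (an int, hence the str(value) before matching)
print(parts.path)      # '/path'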
@@ -547,8 +568,16 @@ async def _handle_url(

         Returns:
             A MediaInfo object describing the fetched content.
+
+        Raises:
+            SynapseError if the URL is blocked.
         """

+        if self._is_url_blocked(url):
+            raise SynapseError(
+                403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
+            )
+
         # TODO: we should probably honour robots.txt... except in practice
         # we're most likely being explicitly triggered by a human rather than a
         # bot, so are we really a robot?
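With the check centralised in `_handle_url`, a blocked oEmbed or image URL now surfaces as a SynapseError that the calling code can catch, so (as the updated class docstring says) the follow-up request is skipped rather than failing the preview as a whole. A toy, self-contained illustration of that pattern; none of these names are Synapse's:

class UrlBlockedError(Exception):
    """Stand-in for the SynapseError(403, ...) raised by _handle_url."""

BLOCKED = {"http://blocked.example/img.png"}

def fetch(url: str) -> str:
    # Refuse blocked URLs up front, as _handle_url now does.
    if url in BLOCKED:
        raise UrlBlockedError(url)
    return f"<contents of {url}>"

def build_preview(page_url: str, image_url: str) -> dict:
    preview = {"og:url": page_url, "og:title": "Example"}
    try:
        # The image pre-cache is a follow-up request: if it is blocked,
        # skip it and keep whatever preview data was already gathered.
        preview["og:image"] = fetch(image_url)
    except UrlBlockedError:
        pass
    return preview

print(build_preview("http://ok.example/page", "http://blocked.example/img.png"))
# {'og:url': 'http://ok.example/page', 'og:title': 'Example'}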
@@ -624,7 +653,7 @@ async def _precache_image_url(
             return

         # The image URL from the HTML might be relative to the previewed page,
-        # convert it to an URL which can be requested directly.
+        # convert it to a URL which can be requested directly.
         url_parts = urlparse(image_url)
         if url_parts.scheme != "data":
             image_url = urljoin(media_info.uri, image_url)
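The urljoin call above is what turns a relative image URL from the HTML into an absolute one against the previewed page (data: URLs are left untouched). For example:

from urllib.parse import urljoin

# A relative og:image path is resolved against the previewed page's URI.
print(urljoin("https://example.com/blog/post.html", "/images/cover.png"))
# https://example.com/images/cover.png
print(urljoin("https://example.com/blog/post.html", "cover.png"))
# https://example.com/blog/cover.png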

tests/media/test_url_previewer.py (+113)
@@ -0,0 +1,113 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+from twisted.test.proto_helpers import MemoryReactor
+
+from synapse.server import HomeServer
+from synapse.util import Clock
+
+from tests import unittest
+from tests.unittest import override_config
+
+try:
+    import lxml
+except ImportError:
+    lxml = None
+
+
+class URLPreviewTests(unittest.HomeserverTestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
+        config = self.default_config()
+        config["url_preview_enabled"] = True
+        config["max_spider_size"] = 9999999
+        config["url_preview_ip_range_blacklist"] = (
+            "192.168.1.1",
+            "1.0.0.0/8",
+            "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
+            "2001:800::/21",
+        )
+
+        self.storage_path = self.mktemp()
+        self.media_store_path = self.mktemp()
+        os.mkdir(self.storage_path)
+        os.mkdir(self.media_store_path)
+        config["media_store_path"] = self.media_store_path
+
+        provider_config = {
+            "module": "synapse.media.storage_provider.FileStorageProviderBackend",
+            "store_local": True,
+            "store_synchronous": False,
+            "store_remote": True,
+            "config": {"directory": self.storage_path},
+        }
+
+        config["media_storage_providers"] = [provider_config]
+
+        return self.setup_test_homeserver(config=config)
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        media_repo_resource = hs.get_media_repository_resource()
+        preview_url = media_repo_resource.children[b"preview_url"]
+        self.url_previewer = preview_url._url_previewer
+
+    def test_all_urls_allowed(self) -> None:
+        self.assertFalse(self.url_previewer._is_url_blocked("http://matrix.org"))
+        self.assertFalse(self.url_previewer._is_url_blocked("https://matrix.org"))
+        self.assertFalse(self.url_previewer._is_url_blocked("http://localhost:8000"))
+        self.assertFalse(
+            self.url_previewer._is_url_blocked("http://user:[email protected]")
+        )
+
+    @override_config(
+        {
+            "url_preview_url_blacklist": [
+                {"username": "user"},
+                {"scheme": "http", "netloc": "matrix.org"},
+            ]
+        }
+    )
+    def test_blocked_url(self) -> None:
+        # Blocked via scheme and URL.
+        self.assertTrue(self.url_previewer._is_url_blocked("http://matrix.org"))
+        # Not blocked because all components must match.
+        self.assertFalse(self.url_previewer._is_url_blocked("https://matrix.org"))
+
+        # Blocked due to the user.
+        self.assertTrue(
+            self.url_previewer._is_url_blocked("http://user:[email protected]")
+        )
+        self.assertTrue(self.url_previewer._is_url_blocked("http://[email protected]"))
+
+    @override_config({"url_preview_url_blacklist": [{"netloc": "*.example.com"}]})
+    def test_glob_blocked_url(self) -> None:
+        # All subdomains are blocked.
+        self.assertTrue(self.url_previewer._is_url_blocked("http://foo.example.com"))
+        self.assertTrue(self.url_previewer._is_url_blocked("http://.example.com"))
+
+        # The TLD is not blocked.
+        self.assertFalse(self.url_previewer._is_url_blocked("https://example.com"))
+
+    @override_config({"url_preview_url_blacklist": [{"netloc": "^.+\\.example\\.com"}]})
+    def test_regex_blocked_urL(self) -> None:
+        # All subdomains are blocked.
+        self.assertTrue(self.url_previewer._is_url_blocked("http://foo.example.com"))
+        # Requires a non-empty subdomain.
+        self.assertFalse(self.url_previewer._is_url_blocked("http://.example.com"))
+
+        # The TLD is not blocked.
+        self.assertFalse(self.url_previewer._is_url_blocked("https://example.com"))
