@@ -113,7 +113,7 @@ class UrlPreviewer:
113
113
1. Checks URL and timestamp against the database cache and returns the result if it
114
114
has not expired and was successful (a 2xx return code).
115
115
2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
116
- does, update the URL to download.
116
+ does and the new URL is not blocked, update the URL to download.
117
117
3. Downloads the URL and stores it into a file via the media storage provider
118
118
and saves the local media metadata.
119
119
4. If the media is an image:
@@ -127,14 +127,14 @@ class UrlPreviewer:
127
127
and saves the local media metadata.
128
128
2. Convert the oEmbed response to an Open Graph response.
129
129
3. Override any Open Graph data from the HTML with data from oEmbed.
130
- 4. If an image exists in the Open Graph response:
130
+ 4. If an image URL exists in the Open Graph response:
131
131
1. Downloads the URL and stores it into a file via the media storage
132
132
provider and saves the local media metadata.
133
133
2. Generates thumbnails.
134
134
3. Updates the Open Graph response based on image properties.
135
- 6. If the media is JSON and an oEmbed URL was found:
135
+ 6. If an oEmbed URL was found and the media is JSON:
136
136
1. Convert the oEmbed response to an Open Graph response.
137
- 2. If a thumbnail or image is in the oEmbed response:
137
+ 2. If an image URL is in the oEmbed response:
138
138
1. Downloads the URL and stores it into a file via the media storage
139
139
provider and saves the local media metadata.
140
140
2. Generates thumbnails.
@@ -144,7 +144,8 @@ class UrlPreviewer:
144
144
145
145
If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
146
146
image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole
147
- does not fail. As much information as possible is returned.
147
+ does not fail. If any of them are blocked, then those additional requests
148
+ are skipped. As much information as possible is returned.
148
149
149
150
The in-memory cache expires after 1 hour.
150
151
@@ -203,48 +204,14 @@ def __init__(
203
204
)
204
205
205
206
async def preview (self , url : str , user : UserID , ts : int ) -> bytes :
206
- # XXX: we could move this into _do_preview if we wanted.
207
- url_tuple = urlsplit (url )
208
- for entry in self .url_preview_url_blacklist :
209
- match = True
210
- for attrib in entry :
211
- pattern = entry [attrib ]
212
- value = getattr (url_tuple , attrib )
213
- logger .debug (
214
- "Matching attrib '%s' with value '%s' against pattern '%s'" ,
215
- attrib ,
216
- value ,
217
- pattern ,
218
- )
219
-
220
- if value is None :
221
- match = False
222
- continue
223
-
224
- # Some attributes might not be parsed as strings by urlsplit (such as the
225
- # port, which is parsed as an int). Because we use match functions that
226
- # expect strings, we want to make sure that's what we give them.
227
- value_str = str (value )
228
-
229
- if pattern .startswith ("^" ):
230
- if not re .match (pattern , value_str ):
231
- match = False
232
- continue
233
- else :
234
- if not fnmatch .fnmatch (value_str , pattern ):
235
- match = False
236
- continue
237
- if match :
238
- logger .warning ("URL %s blocked by url_blacklist entry %s" , url , entry )
239
- raise SynapseError (
240
- 403 , "URL blocked by url pattern blacklist entry" , Codes .UNKNOWN
241
- )
242
-
243
207
# the in-memory cache:
244
- # * ensures that only one request is active at a time
208
+ # * ensures that only one request to a URL is active at a time
245
209
# * takes load off the DB for the thundering herds
246
210
# * also caches any failures (unlike the DB) so we don't keep
247
- # requesting the same endpoint
211
+ # requesting the same endpoint
212
+ #
213
+ # Note that autodiscovered oEmbed URLs and pre-caching of images
214
+ # are not captured in the in-memory cache.
248
215
249
216
observable = self ._cache .get (url )
250
217
@@ -283,7 +250,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
283
250
og = og .encode ("utf8" )
284
251
return og
285
252
286
- # If this URL can be accessed via oEmbed, use that instead.
253
+ # If this URL can be accessed via an allowed oEmbed, use that instead.
287
254
url_to_download = url
288
255
oembed_url = self ._oembed .get_oembed_url (url )
289
256
if oembed_url :
@@ -329,6 +296,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
329
296
# defer to that.
330
297
oembed_url = self ._oembed .autodiscover_from_html (tree )
331
298
og_from_oembed : JsonDict = {}
299
+ # Only download to the oEmbed URL if it is allowed.
332
300
if oembed_url :
333
301
try :
334
302
oembed_info = await self ._handle_url (
@@ -411,6 +379,59 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
411
379
412
380
return jsonog .encode ("utf8" )
413
381
382
+ def _is_url_blocked (self , url : str ) -> bool :
383
+ """
384
+ Check whether the URL is allowed to be previewed (according to the homeserver
385
+ configuration).
386
+
387
+ Args:
388
+ url: The requested URL.
389
+
390
+ Return:
391
+ True if the URL is blocked, False if it is allowed.
392
+ """
393
+ url_tuple = urlsplit (url )
394
+ for entry in self .url_preview_url_blacklist :
395
+ match = True
396
+ # Iterate over each entry. If *all* attributes of that entry match
397
+ # the current URL, then reject it.
398
+ for attrib , pattern in entry .items ():
399
+ value = getattr (url_tuple , attrib )
400
+ logger .debug (
401
+ "Matching attrib '%s' with value '%s' against pattern '%s'" ,
402
+ attrib ,
403
+ value ,
404
+ pattern ,
405
+ )
406
+
407
+ if value is None :
408
+ match = False
409
+ break
410
+
411
+ # Some attributes might not be parsed as strings by urlsplit (such as the
412
+ # port, which is parsed as an int). Because we use match functions that
413
+ # expect strings, we want to make sure that's what we give them.
414
+ value_str = str (value )
415
+
416
+ # Check the value against the pattern as either a regular expression or
417
+ # a glob. If it doesn't match, the entry doesn't match.
418
+ if pattern .startswith ("^" ):
419
+ if not re .match (pattern , value_str ):
420
+ match = False
421
+ break
422
+ else :
423
+ if not fnmatch .fnmatch (value_str , pattern ):
424
+ match = False
425
+ break
426
+
427
+ # All fields matched, return true (the URL is blocked).
428
+ if match :
429
+ logger .warning ("URL %s blocked by url_blacklist entry %s" , url , entry )
430
+ return match
431
+
432
+ # No matches were found, the URL is allowed.
433
+ return False
434
+
414
435
async def _download_url (self , url : str , output_stream : BinaryIO ) -> DownloadResult :
415
436
"""
416
437
Fetches a remote URL and parses the headers.
@@ -547,8 +568,16 @@ async def _handle_url(
547
568
548
569
Returns:
549
570
A MediaInfo object describing the fetched content.
571
+
572
+ Raises:
573
+ SynapseError if the URL is blocked.
550
574
"""
551
575
576
+ if self ._is_url_blocked (url ):
577
+ raise SynapseError (
578
+ 403 , "URL blocked by url pattern blacklist entry" , Codes .UNKNOWN
579
+ )
580
+
552
581
# TODO: we should probably honour robots.txt... except in practice
553
582
# we're most likely being explicitly triggered by a human rather than a
554
583
# bot, so are we really a robot?
@@ -624,7 +653,7 @@ async def _precache_image_url(
624
653
return
625
654
626
655
# The image URL from the HTML might be relative to the previewed page,
627
- # convert it to an URL which can be requested directly.
656
+ # convert it to a URL which can be requested directly.
628
657
url_parts = urlparse (image_url )
629
658
if url_parts .scheme != "data" :
630
659
image_url = urljoin (media_info .uri , image_url )
0 commit comments