Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 15ffc41

Browse files
authored
Fix preview of imgur and Tenor URLs. (#11669)
By scraping Open Graph information from the HTML even when an autodiscovery endpoint is found. The results are then combined to capture as much information as possible from the page.
1 parent 9eab71a commit 15ffc41

File tree

4 files changed

+39
-14
lines changed

4 files changed

+39
-14
lines changed

changelog.d/11669.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix preview of some GIF URLs (like tenor.com). Contributed by Philippe Daouadi.

docs/development/url_previews.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,12 @@ When Synapse is asked to preview a URL it does the following:
3535
5. If the media is HTML:
3636
1. Decodes the HTML via the stored file.
3737
2. Generates an Open Graph response from the HTML.
38-
3. If an image exists in the Open Graph response:
38+
3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
39+
1. Downloads the URL and stores it into a file via the media storage provider
40+
and saves the local media metadata.
41+
2. Converts the oEmbed response to an Open Graph response.
42+
3. Overrides any Open Graph data from the HTML with data from oEmbed.
43+
4. If an image exists in the Open Graph response:
3944
1. Downloads the URL and stores it into a file via the media storage
4045
provider and saves the local media metadata.
4146
2. Generates thumbnails.

synapse/rest/media/v1/oembed.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
class OEmbedResult:
3434
# The Open Graph result (converted from the oEmbed result).
3535
open_graph_result: JsonDict
36+
# The author_name of the oEmbed result
37+
author_name: Optional[str]
3638
# Number of milliseconds to cache the content, according to the oEmbed response.
3739
#
3840
# This will be None if no cache-age is provided in the oEmbed response (or
@@ -154,11 +156,12 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
154156
"og:url": url,
155157
}
156158

157-
# Use either title or author's name as the title.
158-
title = oembed.get("title") or oembed.get("author_name")
159+
title = oembed.get("title")
159160
if title:
160161
open_graph_response["og:title"] = title
161162

163+
author_name = oembed.get("author_name")
164+
162165
# Use the provider name as the site.
163166
provider_name = oembed.get("provider_name")
164167
if provider_name:
@@ -193,9 +196,10 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
193196
# Trap any exception and let the code follow as usual.
194197
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
195198
open_graph_response = {}
199+
author_name = None
196200
cache_age = None
197201

198-
return OEmbedResult(open_graph_response, cache_age)
202+
return OEmbedResult(open_graph_response, author_name, cache_age)
199203

200204

201205
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:

synapse/rest/media/v1/preview_url_resource.py

+25-10
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
262262

263263
# The number of milliseconds that the response should be considered valid.
264264
expiration_ms = media_info.expires
265+
author_name: Optional[str] = None
265266

266267
if _is_media(media_info.media_type):
267268
file_id = media_info.filesystem_id
@@ -294,25 +295,33 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
294295
# Check if this HTML document points to oEmbed information and
295296
# defer to that.
296297
oembed_url = self._oembed.autodiscover_from_html(tree)
297-
og = {}
298+
og_from_oembed: JsonDict = {}
298299
if oembed_url:
299300
oembed_info = await self._download_url(oembed_url, user)
300-
og, expiration_ms = await self._handle_oembed_response(
301+
(
302+
og_from_oembed,
303+
author_name,
304+
expiration_ms,
305+
) = await self._handle_oembed_response(
301306
url, oembed_info, expiration_ms
302307
)
303308

304-
# If there was no oEmbed URL (or oEmbed parsing failed), attempt
305-
# to generate the Open Graph information from the HTML.
306-
if not oembed_url or not og:
307-
og = parse_html_to_open_graph(tree, media_info.uri)
309+
# Parse Open Graph information from the HTML in case the oEmbed
310+
# response failed or is incomplete.
311+
og_from_html = parse_html_to_open_graph(tree, media_info.uri)
312+
313+
# Compile the Open Graph response by using the scraped
314+
# information from the HTML and overlaying any information
315+
# from the oEmbed response.
316+
og = {**og_from_html, **og_from_oembed}
308317

309318
await self._precache_image_url(user, media_info, og)
310319
else:
311320
og = {}
312321

313322
elif oembed_url:
314323
# Handle the oEmbed information.
315-
og, expiration_ms = await self._handle_oembed_response(
324+
og, author_name, expiration_ms = await self._handle_oembed_response(
316325
url, media_info, expiration_ms
317326
)
318327
await self._precache_image_url(user, media_info, og)
@@ -321,6 +330,11 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
321330
logger.warning("Failed to find any OG data in %s", url)
322331
og = {}
323332

333+
# If we don't have a title but we have author_name, copy it as
334+
# title
335+
if not og.get("og:title") and author_name:
336+
og["og:title"] = author_name
337+
324338
# filter out any stupidly long values
325339
keys_to_remove = []
326340
for k, v in og.items():
@@ -484,7 +498,7 @@ async def _precache_image_url(
484498

485499
async def _handle_oembed_response(
486500
self, url: str, media_info: MediaInfo, expiration_ms: int
487-
) -> Tuple[JsonDict, int]:
501+
) -> Tuple[JsonDict, Optional[str], int]:
488502
"""
489503
Parse the downloaded oEmbed info.
490504
@@ -497,11 +511,12 @@ async def _handle_oembed_response(
497511
Returns:
498512
A tuple of:
499513
The Open Graph dictionary, if the oEmbed info can be parsed.
514+
The author name if it could be retrieved from oEmbed.
500515
The (possibly updated) length of time, in milliseconds, the media is valid for.
501516
"""
502517
# If JSON was not returned, there's nothing to do.
503518
if not _is_json(media_info.media_type):
504-
return {}, expiration_ms
519+
return {}, None, expiration_ms
505520

506521
with open(media_info.filename, "rb") as file:
507522
body = file.read()
@@ -513,7 +528,7 @@ async def _handle_oembed_response(
513528
if open_graph_result and oembed_response.cache_age is not None:
514529
expiration_ms = oembed_response.cache_age
515530

516-
return open_graph_result, expiration_ms
531+
return open_graph_result, oembed_response.author_name, expiration_ms
517532

518533
def _start_expire_url_cache_data(self) -> Deferred:
519534
return run_as_background_process(

0 commit comments

Comments
 (0)