Skip to content

Commit af7f726

Browse files
committed
Add support for MIME block list instead of allow list (#7)
I also document and test the (existing) behaviour with generic text and binary files. Closes element-hq/customer-success#197
1 parent 912ebff commit af7f726

File tree

5 files changed

+117
-3
lines changed

5 files changed

+117
-3
lines changed

Diff for: config.sample.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,20 @@ scan:
3636

3737
# List of allowed MIME types. If a file has a MIME type that's not in this list, its
3838
# scan is considered failed.
39+
# Unrecognised binary files are considered to be `application/octet-stream`.
40+
# Unrecognised text files are considered to be `text/plain`.
3941
# Optional, defaults to allowing all MIME types.
4042
allowed_mimetypes: ["image/jpeg"]
4143

44+
# List of blocked MIME types.
45+
# If specified, `allowed_mimetypes` must not be specified as well.
46+
# If specified, a file whose MIME type is on this list will produce a scan that is
47+
# considered failed.
48+
# Unrecognised binary files are considered to be `application/octet-stream`.
49+
# Unrecognised text files are considered to be `text/plain`.
50+
# Optional.
51+
# blocked_mimetypes: ["image/jpeg"]
52+
4253
# Configuration of scan result caching.
4354
#
4455
# Results are stored in a cache to avoid having to download and scan a file twice. There

Diff for: src/matrix_content_scanner/config.py

+14
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def _parse_size(size: Optional[Union[str, float]]) -> Optional[float]:
7575
"temp_directory": {"type": "string"},
7676
"removal_command": {"type": "string"},
7777
"allowed_mimetypes": {"type": "array", "items": {"type": "string"}},
78+
"blocked_mimetypes": {"type": "array", "items": {"type": "string"}},
7879
},
7980
},
8081
"download": {
@@ -131,6 +132,7 @@ class ScanConfig:
131132
temp_directory: str
132133
removal_command: str = "rm"
133134
allowed_mimetypes: Optional[List[str]] = None
135+
blocked_mimetypes: Optional[List[str]] = None
134136

135137

136138
@attr.s(auto_attribs=True, frozen=True, slots=True)
@@ -175,3 +177,15 @@ def __init__(self, config_dict: Dict[str, Any]):
175177
self.crypto = CryptoConfig(**(config_dict.get("crypto") or {}))
176178
self.download = DownloadConfig(**(config_dict.get("download") or {}))
177179
self.result_cache = ResultCacheConfig(**(config_dict.get("result_cache") or {}))
180+
181+
# Don't allow both allowlist and blocklist for MIME types, since we do not document
182+
# the semantics for that and it is in any case pointless.
183+
# This could have been expressed in JSONSchema but I suspect the error message would be poor
184+
# in that case.
185+
if (
186+
self.scan.allowed_mimetypes is not None
187+
and self.scan.blocked_mimetypes is not None
188+
):
189+
raise ConfigError(
190+
"Both `scan.allowed_mimetypes` and `scan.blocked_mimetypes` are specified, which is not allowed!"
191+
)

Diff for: src/matrix_content_scanner/scanner/scanner.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,18 @@ def __init__(self, mcs: "MatrixContentScanner"):
7676

7777
self._max_size_to_cache = mcs.config.result_cache.max_file_size
7878

79-
# List of MIME types we should allow. If None, we don't fail files based on their
79+
# List of MIME types we should allow.
80+
# If None, we fall back to `_blocked_mimetypes`.
81+
# If that's also None, we don't fail files based on their
8082
# MIME types (besides comparing it with the Content-Type header from the server
8183
# for unencrypted files).
8284
self._allowed_mimetypes = mcs.config.scan.allowed_mimetypes
8385

86+
# List of MIME types we should block.
87+
# Must not be specified at the same time as `_allowed_mimetypes`.
88+
# See the comment for `_allowed_mimetypes` for the semantics.
89+
self._blocked_mimetypes = mcs.config.scan.blocked_mimetypes
90+
8491
# Cache of futures for files that are currently scanning and downloading, so that
8592
# concurrent requests don't cause a file to be downloaded and scanned twice.
8693
self._current_scans: Dict[str, Future[MediaDescription]] = {}
@@ -505,3 +512,17 @@ def _check_mimetype(self, media_content: bytes) -> None:
505512
raise FileMimeTypeForbiddenError(
506513
f"File type: {detected_mimetype} not allowed"
507514
)
515+
516+
# If there's a block list for MIME types, check that the MIME type detected for
517+
# this file is NOT in it.
518+
if (
519+
self._blocked_mimetypes is not None
520+
and detected_mimetype in self._blocked_mimetypes
521+
):
522+
logger.error(
523+
"MIME type for file is forbidden: %s",
524+
detected_mimetype,
525+
)
526+
raise FileMimeTypeForbiddenError(
527+
f"File type: {detected_mimetype} not allowed"
528+
)

Diff for: tests/scanner/test_scanner.py

+64-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222
from tests.testutils import (
2323
ENCRYPTED_FILE_METADATA,
2424
MEDIA_PATH,
25+
SMALL_BINARY_FILE,
2526
SMALL_PNG,
2627
SMALL_PNG_ENCRYPTED,
28+
SMALL_TEXT_FILE,
2729
get_content_scanner,
2830
to_thumbnail_params,
2931
)
@@ -219,7 +221,7 @@ async def test_different_encryption_key(self) -> None:
219221
# But it also causes it to be downloaded again because its metadata have changed.
220222
self.assertEqual(self.downloader_mock.call_count, 2)
221223

222-
async def test_mimetype(self) -> None:
224+
async def test_allowlist_mimetype(self) -> None:
223225
"""Tests that, if there's an allow list for MIME types and the file's MIME type
224226
isn't in it, the file's scan fails.
225227
"""
@@ -230,7 +232,7 @@ async def test_mimetype(self) -> None:
230232
with self.assertRaises(FileMimeTypeForbiddenError):
231233
await self.scanner.scan_file(MEDIA_PATH)
232234

233-
async def test_mimetype_encrypted(self) -> None:
235+
async def test_allowlist_mimetype_encrypted(self) -> None:
234236
"""Tests that the file's MIME type is correctly detected and compared with the
235237
allow list (if set), even if it's encrypted.
236238
"""
@@ -243,6 +245,66 @@ async def test_mimetype_encrypted(self) -> None:
243245
with self.assertRaises(FileMimeTypeForbiddenError):
244246
await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA)
245247

248+
async def test_blocklist_mimetype(self) -> None:
249+
"""Tests that, if there's an allow list for MIME types and the file's MIME type
250+
isn't in it, the file's scan fails.
251+
"""
252+
# Set a block list that blocks PNG images.
253+
self.scanner._blocked_mimetypes = ["image/png"]
254+
255+
# Check that the scan fails since the file is a PNG.
256+
with self.assertRaises(FileMimeTypeForbiddenError):
257+
await self.scanner.scan_file(MEDIA_PATH)
258+
259+
async def test_blocklist_mimetype_encrypted(self) -> None:
260+
"""Tests that the file's MIME type is correctly detected and compared with the
261+
allow list (if set), even if it's encrypted.
262+
"""
263+
self._setup_encrypted()
264+
265+
# Set a block list that blocks PNG images.
266+
self.scanner._blocked_mimetypes = ["image/png"]
267+
268+
# Check that the scan fails since the file is a PNG.
269+
with self.assertRaises(FileMimeTypeForbiddenError):
270+
await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA)
271+
272+
async def test_blocklist_mimetype_fallback_binary_file(self) -> None:
273+
"""Tests that unrecognised binary files' MIME type is assumed to be
274+
`application/octet-stream` and that they can be blocked in this way.
275+
"""
276+
277+
self.downloader_res = MediaDescription(
278+
# This is the *claimed* content-type by the uploader
279+
content_type="application/vnd.io.element.generic_binary_file",
280+
content=SMALL_BINARY_FILE,
281+
response_headers=CIMultiDictProxy(CIMultiDict()),
282+
)
283+
284+
# Set a block list that blocks uncategorised binary files.
285+
self.scanner._blocked_mimetypes = ["application/octet-stream"]
286+
287+
with self.assertRaises(FileMimeTypeForbiddenError):
288+
await self.scanner.scan_file(MEDIA_PATH)
289+
290+
async def test_blocklist_mimetype_fallback_text_file(self) -> None:
291+
"""Tests that unrecognised text files' MIME type is assumed to be
292+
`text/plain` and that they can be blocked in this way.
293+
"""
294+
295+
self.downloader_res = MediaDescription(
296+
# This is the *claimed* content-type by the uploader
297+
content_type="application/vnd.io.element.generic_file",
298+
content=SMALL_TEXT_FILE,
299+
response_headers=CIMultiDictProxy(CIMultiDict()),
300+
)
301+
302+
# Set a block list that blocks uncategorised text files.
303+
self.scanner._blocked_mimetypes = ["text/plain"]
304+
305+
with self.assertRaises(FileMimeTypeForbiddenError):
306+
await self.scanner.scan_file(MEDIA_PATH)
307+
246308
async def test_dont_cache_exit_codes(self) -> None:
247309
"""Tests that if the configuration specifies exit codes to ignore when running
248310
the scanning script, we don't cache them.

Diff for: tests/testutils.py

+6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222
b"0a2db40000000049454e44ae426082"
2323
)
2424

25+
# A small binary file without any specific format.
26+
SMALL_BINARY_FILE = unhexlify(b"010203")
27+
28+
# A small text file without any specific format.
29+
SMALL_TEXT_FILE = b"Hello world\nThis is a tiny text file"
30+
2531
# A small, encrypted PNG.
2632
SMALL_PNG_ENCRYPTED = unhexlify(
2733
b"9fd28dd7a1d845a04948f13af104e39402c888f7b601bce313ad"

0 commit comments

Comments
 (0)