From 71f0401f3a83784f649e80403456f822f2532db9 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 16 Mar 2022 12:29:12 +0100 Subject: [PATCH 1/6] bpo-45150: Add hashlib.file_digest() for efficient file hashing --- Doc/library/hashlib.rst | 41 +++++++++++++++ Lib/hashlib.py | 50 +++++++++++++++++- Lib/test/test_hashlib.py | 51 +++++++++++++++++++ .../2022-03-16-11-52-52.bpo-45150.kYbIME.rst | 1 + 4 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst index aa24131f8bf444..1802a738cbc002 100644 --- a/Doc/library/hashlib.rst +++ b/Doc/library/hashlib.rst @@ -228,6 +228,47 @@ by the SHAKE algorithm. exchange the value safely in email or other non-binary environments. +File hashing +------------ + +The hashlib module provides a helper function for efficient hashing of +a file or file-like object. + +.. function:: file_digest(fileobj, digest, /) + + Return a digest object that has been updated with contents of file object. + + *fileobj* must be a file-like object opened for reading in binary mode. + It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO` + instances, SocketIO objects from :meth:`socket.socket.makefile`, and + similar. The function may bypass Python's I/O and use the file descriptor + from :meth:`~io.IOBase.fileno` directly. + + *digest* must either be a hash algorithm name as a *str*, a hash + constructor, or a callable that returns a hash object. + + Example: + + >>> import io, hashlib, hmac + >>> with open(hashlib.__file__, "rb") as f: + ... digest = hashlib.file_digest(f, "sha256") + ... + >>> digest.hexdigest() + ... 
# doctest: +ELLIPSIS + + >>> buf = io.BytesIO(b"somedata") + >>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512) + >>> digest = hashlib.file_digest(buf, lambda: mac1) + + >>> digest is mac1 + True + >>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512) + >>> mac1.digest() == mac2.digest() + True + + .. versionadded:: 3.11 + + Key derivation -------------- diff --git a/Lib/hashlib.py b/Lib/hashlib.py index 562501860a72b3..5fd606ff9d2c10 100644 --- a/Lib/hashlib.py +++ b/Lib/hashlib.py @@ -65,7 +65,7 @@ algorithms_available = set(__always_supported) __all__ = __always_supported + ('new', 'algorithms_guaranteed', - 'algorithms_available', 'pbkdf2_hmac') + 'algorithms_available', 'pbkdf2_hmac', 'file_digest') __builtin_constructor_cache = {} @@ -254,6 +254,54 @@ def prf(msg, inner=inner, outer=outer): pass +def file_digest(fileobj, digest, /, *, _bufsize=2**18): + """Efficient hashing of file object + + *fileobj* must be a file-like object opened for reading in binary mode. + It accepts file objects from open(), io.BytesIO(), and SocketIO objects. + The function may bypass Python's I/O and use the file descriptor *fileno* + directly. + + *digest* must either be a hash algorithm name as a *str*, a hash + constructor, or a callable that returns a hash object. + """ + # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy + # hashing with hardware acceleration. + if isinstance(digest, str): + digestobj = new(digest) + else: + digestobj = digest() + + if hasattr(fileobj, "getbuffer"): + # io.BytesIO object, use zero-copy buffer + digestobj.update(fileobj.getbuffer()) + return digestobj + + # check for file-like object in binary mode + if not all( + hasattr(fileobj, name) + for name in ("fileno", "mode", "readable", "readinto") + ): + raise TypeError( + f"fileobj must be a file-like object, not {fileobj!r}."
+ ) + if not fileobj.readable() or not "b" in fileobj.mode: + raise ValueError("fileobj must be opened for reading in binary mode.") + + # binary file, socket.SocketIO object + # Note: socket I/O uses different syscalls than file I/O. + fileobj.fileno() # so we can rely on working fileno() in the future. + buf = bytearray(_bufsize) # Reusable buffer to reduce allocations. + view = memoryview(buf) + while True: + size = fileobj.readinto(buf) + if size == 0: + break # EOF + digestobj.update(view[:size]) + + return digestobj + + for __func_name in __always_supported: # try them all, some may not work due to the OpenSSL # version not supporting that algorithm. diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py index ea31f8be2cb82b..daa187220987ce 100644 --- a/Lib/test/test_hashlib.py +++ b/Lib/test/test_hashlib.py @@ -10,6 +10,7 @@ from binascii import unhexlify import hashlib import importlib +import io import itertools import os import sys @@ -20,6 +21,7 @@ from test import support from test.support import _4G, bigmemtest from test.support.import_helper import import_fresh_module +from test.support import os_helper from test.support import threading_helper from test.support import warnings_helper from http.client import HTTPException @@ -371,6 +373,32 @@ def check(self, name, data, hexdigest, shake=False, **kwargs): if not shake: self.assertEqual(len(digest), m.digest_size) + if not shake and kwargs.get("key") is None: + # skip shake and blake2 extended parameter tests + self.check_file_digest(name, data, hexdigest) + + def check_file_digest(self, name, data, hexdigest): + hexdigest = hexdigest.lower() + digests = [name] + digests.extend(self.constructors_to_test[name]) + + for digest in digests: + with self.subTest(digest=digest): + buf = io.BytesIO(data) + buf.seek(0) + self.assertEqual( + hashlib.file_digest(buf, digest).hexdigest(), hexdigest + ) + with open(os_helper.TESTFN, "wb") as f: + f.write(data) + try: + with open(os_helper.TESTFN, "rb") as 
f: + digestobj = hashlib.file_digest(f, digest) + finally: + os.unlink(os_helper.TESTFN) + self.assertEqual(digestobj.hexdigest(), hexdigest) + + def check_no_unicode(self, algorithm_name): # Unicode objects are not allowed as input. constructors = self.constructors_to_test[algorithm_name] @@ -1117,6 +1145,29 @@ def test_normalized_name(self): self.assertNotIn("blake2b512", hashlib.algorithms_available) self.assertNotIn("sha3-512", hashlib.algorithms_available) + def test_file_digest(self): + data = b'a' * 65536 + d1 = hashlib.sha256() + self.addCleanup(os.unlink, os_helper.TESTFN) + with open(os_helper.TESTFN, "wb") as f: + for _ in range(10): + d1.update(data) + f.write(data) + + with open(os_helper.TESTFN, "rb") as f: + d2 = hashlib.file_digest(f, hashlib.sha256) + + self.assertEqual(d1.hexdigest(), d2.hexdigest()) + self.assertEqual(d1.name, d2.name) + self.assertIs(type(d1), type(d2)) + + with self.assertRaises(TypeError): + hashlib.file_digest(None, "sha256") + + with self.assertRaises(ValueError): + with open(os_helper.TESTFN, "wb") as f: + hashlib.file_digest(f, "sha256") + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst new file mode 100644 index 00000000000000..1c6ea5a8e43bcc --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst @@ -0,0 +1 @@ +Add :func:`hashlib.file_digest` helper for efficient hashing of a file object. From 85cee7f698361d69d44a7d26852ced411fd4dfaf Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 16 Mar 2022 13:50:59 +0100 Subject: [PATCH 2/6] Remove whitespace --- Doc/library/hashlib.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst index 1802a738cbc002..e89d01a00d3111 100644 --- a/Doc/library/hashlib.rst +++ b/Doc/library/hashlib.rst @@ -255,11 +255,11 @@ a file or file-like object.
... >>> digest.hexdigest() ... # doctest: +ELLIPSIS - + >>> buf = io.BytesIO(b"somedata") >>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512) >>> digest = hashlib.file_digest(buf, lambda: mac1) - + >>> digest is mac1 True >>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512) From 77df20d15553daee1478189a72e81eb324bca864 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 16 Mar 2022 14:19:39 +0100 Subject: [PATCH 3/6] Doctests are annoying --- Doc/library/hashlib.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst index e89d01a00d3111..616a6ac55e39ac 100644 --- a/Doc/library/hashlib.rst +++ b/Doc/library/hashlib.rst @@ -253,8 +253,8 @@ a file or file-like object. >>> with open(hashlib.__file__, "rb") as f: ... digest = hashlib.file_digest(f, "sha256") ... - >>> digest.hexdigest() - ... # doctest: +ELLIPSIS + >>> digest.hexdigest() # doctest: +ELLIPSIS + '...' >>> buf = io.BytesIO(b"somedata") >>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512) From 67b8d7ff4a258d5f1c9161fa9cd07e5b544d3227 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Thu, 17 Mar 2022 09:26:27 +0100 Subject: [PATCH 4/6] Address Greg's review comment --- Doc/library/hashlib.rst | 4 +++- Lib/hashlib.py | 18 ++++++++---------- Lib/test/test_hashlib.py | 6 +++++- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst index 616a6ac55e39ac..da97b0e9a74d15 100644 --- a/Doc/library/hashlib.rst +++ b/Doc/library/hashlib.rst @@ -242,7 +242,9 @@ a file or file-like object. It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO` instances, SocketIO objects from :meth:`socket.socket.makefile`, and similar. The function may bypass Python's I/O and use the file descriptor - from :meth:`~io.IOBase.fileno` directly. + from :meth:`~io.IOBase.fileno` directly. 
*fileobj* must be assumed to be + in an unknown state after this function returns or raises. It is up to + the caller to close *fileobj*. *digest* must either be a hash algorithm name as a *str*, a hash constructor, or a callable that returns a hash object. diff --git a/Lib/hashlib.py b/Lib/hashlib.py index 5fd606ff9d2c10..b546a3fd795311 100644 --- a/Lib/hashlib.py +++ b/Lib/hashlib.py @@ -255,7 +255,7 @@ def prf(msg, inner=inner, outer=outer): def file_digest(fileobj, digest, /, *, _bufsize=2**18): - """Efficient hashing of file object + """Hash the contents of a file-like object. Returns a digest object. *fileobj* must be a file-like object opened for reading in binary mode. It accepts file objects from open(), io.BytesIO(), and SocketIO objects. @@ -277,20 +277,18 @@ def file_digest(fileobj, digest, /, *, _bufsize=2**18): digestobj.update(fileobj.getbuffer()) return digestobj - # check for file-like object in binary mode - if not all( - hasattr(fileobj, name) - for name in ("fileno", "mode", "readable", "readinto") + # Only binary files implement readinto(). + if not ( + hasattr(fileobj, "readinto") + and hasattr(fileobj, "readable") + and fileobj.readable() ): - raise TypeError( - f"fileobj must be a file-like object, not {fileobj!r}." + raise ValueError( + f"'{fileobj!r}' is not a file-like object in binary reading mode." ) - if not fileobj.readable() or not "b" in fileobj.mode: - raise ValueError("fileobj must be opened for reading in binary mode.") # binary file, socket.SocketIO object # Note: socket I/O uses different syscalls than file I/O. - fileobj.fileno() # so we can rely on working fileno() in the future. buf = bytearray(_bufsize) # Reusable buffer to reduce allocations. 
view = memoryview(buf) while True: diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py index daa187220987ce..db2f4b272e7d97 100644 --- a/Lib/test/test_hashlib.py +++ b/Lib/test/test_hashlib.py @@ -1161,9 +1161,13 @@ def test_file_digest(self): self.assertEqual(d1.name, d2.name) self.assertIs(type(d1), type(d2)) - with self.assertRaises(TypeError): + with self.assertRaises(ValueError): hashlib.file_digest(None, "sha256") + with self.assertRaises(ValueError): + with open(os_helper.TESTFN, "r") as f: + hashlib.file_digest(f, "sha256") + with self.assertRaises(ValueError): with open(os_helper.TESTFN, "wb") as f: hashlib.file_digest(f, "sha256") From 884310285022e37199d7e89eab52b37d510ff8f8 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Mon, 21 Mar 2022 22:20:31 +0100 Subject: [PATCH 5/6] Write file once --- Lib/test/test_hashlib.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py index db2f4b272e7d97..319a5118bc65c1 100644 --- a/Lib/test/test_hashlib.py +++ b/Lib/test/test_hashlib.py @@ -382,22 +382,24 @@ def check_file_digest(self, name, data, hexdigest): digests = [name] digests.extend(self.constructors_to_test[name]) - for digest in digests: - with self.subTest(digest=digest): + with open(os_helper.TESTFN, "wb") as f: + f.write(data) + + try: + for digest in digests: buf = io.BytesIO(data) buf.seek(0) self.assertEqual( hashlib.file_digest(buf, digest).hexdigest(), hexdigest ) - with open(os_helper.TESTFN, "wb") as f: - f.write(data) try: with open(os_helper.TESTFN, "rb") as f: digestobj = hashlib.file_digest(f, digest) finally: os.unlink(os_helper.TESTFN) self.assertEqual(digestobj.hexdigest(), hexdigest) - + finally: + os.unlink(os_helper.TESTFN) def check_no_unicode(self, algorithm_name): # Unicode objects are not allowed as input. 
From 689e5ec35c426af08a6e77f02f1a38f6c44c6733 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Mon, 21 Mar 2022 23:31:35 +0100 Subject: [PATCH 6/6] Remove inner try/finally --- Lib/test/test_hashlib.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py index 319a5118bc65c1..daf6e3862a24f7 100644 --- a/Lib/test/test_hashlib.py +++ b/Lib/test/test_hashlib.py @@ -392,11 +392,8 @@ def check_file_digest(self, name, data, hexdigest): self.assertEqual( hashlib.file_digest(buf, digest).hexdigest(), hexdigest ) - try: - with open(os_helper.TESTFN, "rb") as f: - digestobj = hashlib.file_digest(f, digest) - finally: - os.unlink(os_helper.TESTFN) + with open(os_helper.TESTFN, "rb") as f: + digestobj = hashlib.file_digest(f, digest) self.assertEqual(digestobj.hexdigest(), hexdigest) finally: os.unlink(os_helper.TESTFN)