Skip to content

Commit 71f0401

Browse files
committed
bpo-45150: Add hashlib.file_digest() for efficient file hashing
1 parent 7c77652 commit 71f0401

File tree

4 files changed

+142
-1
lines changed

4 files changed

+142
-1
lines changed

Doc/library/hashlib.rst

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,47 @@ by the SHAKE algorithm.
228228
exchange the value safely in email or other non-binary environments.
229229

230230

231+
File hashing
232+
------------
233+
234+
The hashlib module provides a helper function for efficient hashing of
235+
a file or file-like object.
236+
237+
.. function:: file_digest(fileobj, digest, /)
238+
239+
Return a digest object that has been updated with the contents of the file object.
240+
241+
*fileobj* must be a file-like object opened for reading in binary mode.
242+
It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO`
243+
instances, SocketIO objects from :meth:`socket.socket.makefile`, and
244+
similar. The function may bypass Python's I/O and use the file descriptor
245+
from :meth:`~io.IOBase.fileno` directly.
246+
247+
*digest* must either be a hash algorithm name as a *str*, a hash
248+
constructor, or a callable that returns a hash object.
249+
250+
Example:
251+
252+
>>> import io, hashlib, hmac
253+
>>> with open(hashlib.__file__, "rb") as f:
254+
... digest = hashlib.file_digest(f, "sha256")
255+
...
256+
>>> digest.hexdigest()
257+
... # doctest: +ELLIPSIS
258+
259+
>>> buf = io.BytesIO(b"somedata")
260+
>>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
261+
>>> digest = hashlib.file_digest(buf, lambda: mac1)
262+
263+
>>> digest is mac1
264+
True
265+
>>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
266+
>>> mac1.digest() == mac2.digest()
267+
True
268+
269+
.. versionadded:: 3.11
270+
271+
231272
Key derivation
232273
--------------
233274

Lib/hashlib.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
algorithms_available = set(__always_supported)
6666

6767
__all__ = __always_supported + ('new', 'algorithms_guaranteed',
68-
'algorithms_available', 'pbkdf2_hmac')
68+
'algorithms_available', 'pbkdf2_hmac', 'file_digest')
6969

7070

7171
__builtin_constructor_cache = {}
@@ -254,6 +254,54 @@ def prf(msg, inner=inner, outer=outer):
254254
pass
255255

256256

257+
def file_digest(fileobj, digest, /, *, _bufsize=2**18):
    """Hash the contents of a file object and return the digest object.

    *fileobj* must be a file-like object opened for reading in binary mode.
    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
    The function may bypass Python's I/O and use the file descriptor *fileno*
    directly.

    *digest* must either be a hash algorithm name as a *str*, a hash
    constructor, or a callable that returns a hash object.

    Raises TypeError if *fileobj* lacks the file-like attributes checked
    below, ValueError if it is not opened for reading in binary mode, and
    BlockingIOError if *fileobj* is non-blocking and has no data available.
    """
    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
    # hashing with hardware acceleration.
    if isinstance(digest, str):
        digestobj = new(digest)
    else:
        digestobj = digest()

    if hasattr(fileobj, "getbuffer"):
        # io.BytesIO object, use zero-copy buffer
        digestobj.update(fileobj.getbuffer())
        return digestobj

    # check for file-like object in binary mode
    if not all(
        hasattr(fileobj, name)
        for name in ("fileno", "mode", "readable", "readinto")
    ):
        raise TypeError(
            f"fileobj must be a file-like object, not {fileobj!r}."
        )
    if not fileobj.readable() or "b" not in fileobj.mode:
        raise ValueError("fileobj must be opened for reading in binary mode.")

    # binary file, socket.SocketIO object
    # Note: socket I/O uses different syscalls than file I/O.
    fileobj.fileno()  # so we can rely on working fileno() in the future.
    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
    view = memoryview(buf)
    while True:
        size = fileobj.readinto(buf)
        if size is None:
            # readinto() returned no byte count (non-blocking stream with
            # nothing to read). view[:None] would be the *whole* buffer, so
            # continuing would hash stale bytes and never reach EOF.
            raise BlockingIOError("I/O operation would block.")
        if size == 0:
            break  # EOF
        digestobj.update(view[:size])

    return digestobj
303+
304+
257305
for __func_name in __always_supported:
258306
# try them all, some may not work due to the OpenSSL
259307
# version not supporting that algorithm.

Lib/test/test_hashlib.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from binascii import unhexlify
1111
import hashlib
1212
import importlib
13+
import io
1314
import itertools
1415
import os
1516
import sys
@@ -20,6 +21,7 @@
2021
from test import support
2122
from test.support import _4G, bigmemtest
2223
from test.support.import_helper import import_fresh_module
24+
from test.support import os_helper
2325
from test.support import threading_helper
2426
from test.support import warnings_helper
2527
from http.client import HTTPException
@@ -371,6 +373,32 @@ def check(self, name, data, hexdigest, shake=False, **kwargs):
371373
if not shake:
372374
self.assertEqual(len(digest), m.digest_size)
373375

376+
if not shake and kwargs.get("key") is None:
377+
# skip shake and blake2 extended parameter tests
378+
self.check_file_digest(name, data, hexdigest)
379+
380+
def check_file_digest(self, name, data, hexdigest):
    # Verify hashlib.file_digest() yields *hexdigest* for *data*, selecting
    # the algorithm both by *name* and by every constructor registered for
    # it in self.constructors_to_test, from an in-memory buffer and from a
    # file on disk.
    hexdigest = hexdigest.lower()
    digests = [name, *self.constructors_to_test[name]]

    for digest in digests:
        with self.subTest(digest=digest):
            # In-memory source; a new BytesIO already starts at offset 0.
            buf = io.BytesIO(data)
            self.assertEqual(
                hashlib.file_digest(buf, digest).hexdigest(), hexdigest
            )

            # On-disk source. The write is inside the try block so the
            # scratch file is removed even when creating it fails part-way;
            # os_helper.unlink() tolerates a file that was never created.
            try:
                with open(os_helper.TESTFN, "wb") as f:
                    f.write(data)
                with open(os_helper.TESTFN, "rb") as f:
                    digestobj = hashlib.file_digest(f, digest)
            finally:
                os_helper.unlink(os_helper.TESTFN)
            self.assertEqual(digestobj.hexdigest(), hexdigest)
400+
401+
374402
def check_no_unicode(self, algorithm_name):
375403
# Unicode objects are not allowed as input.
376404
constructors = self.constructors_to_test[algorithm_name]
@@ -1117,6 +1145,29 @@ def test_normalized_name(self):
11171145
self.assertNotIn("blake2b512", hashlib.algorithms_available)
11181146
self.assertNotIn("sha3-512", hashlib.algorithms_available)
11191147

1148+
def test_file_digest(self):
    # Hash ten 64 KiB chunks incrementally while writing the same bytes to
    # a scratch file, then check that file_digest() on that file agrees.
    chunk = b'a' * 65536
    reference = hashlib.sha256()
    self.addCleanup(os.unlink, os_helper.TESTFN)
    with open(os_helper.TESTFN, "wb") as sink:
        for _ in range(10):
            reference.update(chunk)
            sink.write(chunk)

    with open(os_helper.TESTFN, "rb") as source:
        result = hashlib.file_digest(source, hashlib.sha256)

    self.assertEqual(reference.hexdigest(), result.hexdigest())
    self.assertEqual(reference.name, result.name)
    self.assertIs(type(reference), type(result))

    # Something that is not file-like at all is rejected with TypeError.
    self.assertRaises(TypeError, hashlib.file_digest, None, "sha256")

    # A file opened write-only is not readable, hence ValueError.
    with self.assertRaises(ValueError), \
            open(os_helper.TESTFN, "wb") as sink:
        hashlib.file_digest(sink, "sha256")
1170+
11201171

11211172
if __name__ == "__main__":
11221173
unittest.main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add :func:`hashlib.file_digest` helper for efficient hashing of a file object.

0 commit comments

Comments
 (0)