Skip to content

Commit 4f97d64

Browse files
authored
bpo-45150: Add hashlib.file_digest() for efficient file hashing (GH-31930)
1 parent 3751b6b commit 4f97d64

File tree

4 files changed

+145
-1
lines changed

4 files changed

+145
-1
lines changed

Doc/library/hashlib.rst

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,49 @@ by the SHAKE algorithm.
228228
exchange the value safely in email or other non-binary environments.
229229

230230

231+
File hashing
232+
------------
233+
234+
The hashlib module provides a helper function for efficient hashing of
235+
a file or file-like object.
236+
237+
.. function:: file_digest(fileobj, digest, /)
238+
239+
Return a digest object that has been updated with the contents of the file object.
240+
241+
*fileobj* must be a file-like object opened for reading in binary mode.
242+
It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO`
243+
instances, SocketIO objects from :meth:`socket.socket.makefile`, and
244+
similar. The function may bypass Python's I/O and use the file descriptor
245+
from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be
246+
in an unknown state after this function returns or raises. It is up to
247+
the caller to close *fileobj*.
248+
249+
*digest* must either be a hash algorithm name as a *str*, a hash
250+
constructor, or a callable that returns a hash object.
251+
252+
Example:
253+
254+
>>> import io, hashlib, hmac
255+
>>> with open(hashlib.__file__, "rb") as f:
256+
... digest = hashlib.file_digest(f, "sha256")
257+
...
258+
>>> digest.hexdigest() # doctest: +ELLIPSIS
259+
'...'
260+
261+
>>> buf = io.BytesIO(b"somedata")
262+
>>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
263+
>>> digest = hashlib.file_digest(buf, lambda: mac1)
264+
265+
>>> digest is mac1
266+
True
267+
>>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
268+
>>> mac1.digest() == mac2.digest()
269+
True
270+
271+
.. versionadded:: 3.11
272+
273+
231274
Key derivation
232275
--------------
233276

Lib/hashlib.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
algorithms_available = set(__always_supported)
6666

6767
__all__ = __always_supported + ('new', 'algorithms_guaranteed',
68-
'algorithms_available', 'pbkdf2_hmac')
68+
'algorithms_available', 'pbkdf2_hmac', 'file_digest')
6969

7070

7171
__builtin_constructor_cache = {}
@@ -254,6 +254,52 @@ def prf(msg, inner=inner, outer=outer):
254254
pass
255255

256256

257+
def file_digest(fileobj, digest, /, *, _bufsize=2**18):
    """Hash the contents of a file-like object. Returns a digest object.

    *fileobj* must be a file-like object opened for reading in binary mode.
    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
    The function may bypass Python's I/O and use the file descriptor *fileno*
    directly.

    *digest* must either be a hash algorithm name as a *str*, a hash
    constructor, or a callable that returns a hash object.

    Raises ValueError if *fileobj* is not a binary file-like object opened
    for reading, and BlockingIOError if *fileobj* is in non-blocking mode
    and a read would block.
    """
    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
    # hashing with hardware acceleration.
    if isinstance(digest, str):
        digestobj = new(digest)
    else:
        digestobj = digest()

    if hasattr(fileobj, "getbuffer"):
        # io.BytesIO object, use zero-copy buffer
        digestobj.update(fileobj.getbuffer())
        return digestobj

    # Only binary files implement readinto().
    if not (
        hasattr(fileobj, "readinto")
        and hasattr(fileobj, "readable")
        and fileobj.readable()
    ):
        raise ValueError(
            f"'{fileobj!r}' is not a file-like object in binary reading mode."
        )

    # binary file, socket.SocketIO object
    # Note: socket I/O uses different syscalls than file I/O.
    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
    view = memoryview(buf)
    while True:
        size = fileobj.readinto(buf)
        if size is None:
            # Non-blocking stream with no data available right now. Slicing
            # with None would hash the whole (stale) buffer and the loop
            # would never terminate, so report the condition instead.
            raise BlockingIOError("I/O operation would block.")
        if size == 0:
            break  # EOF
        digestobj.update(view[:size])

    return digestobj
301+
302+
257303
for __func_name in __always_supported:
258304
# try them all, some may not work due to the OpenSSL
259305
# version not supporting that algorithm.

Lib/test/test_hashlib.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from binascii import unhexlify
1111
import hashlib
1212
import importlib
13+
import io
1314
import itertools
1415
import os
1516
import sys
@@ -20,6 +21,7 @@
2021
from test import support
2122
from test.support import _4G, bigmemtest
2223
from test.support.import_helper import import_fresh_module
24+
from test.support import os_helper
2325
from test.support import threading_helper
2426
from test.support import warnings_helper
2527
from http.client import HTTPException
@@ -371,6 +373,31 @@ def check(self, name, data, hexdigest, shake=False, **kwargs):
371373
if not shake:
372374
self.assertEqual(len(digest), m.digest_size)
373375

376+
if not shake and kwargs.get("key") is None:
377+
# skip shake and blake2 extended parameter tests
378+
self.check_file_digest(name, data, hexdigest)
379+
380+
def check_file_digest(self, name, data, hexdigest):
    """Check hashlib.file_digest() against a known hex digest.

    The algorithm *name* and every constructor listed under it in
    self.constructors_to_test are each used as the *digest* argument, once
    with an in-memory io.BytesIO and once with a real file holding *data*.
    """
    expected = hexdigest.lower()
    candidates = [name, *self.constructors_to_test[name]]

    with open(os_helper.TESTFN, "wb") as out:
        out.write(data)

    try:
        for candidate in candidates:
            # In-memory stream.
            stream = io.BytesIO(data)
            stream.seek(0)
            in_memory = hashlib.file_digest(stream, candidate)
            self.assertEqual(in_memory.hexdigest(), expected)

            # File on disk.
            with open(os_helper.TESTFN, "rb") as src:
                on_disk = hashlib.file_digest(src, candidate)
            self.assertEqual(on_disk.hexdigest(), expected)
    finally:
        os.unlink(os_helper.TESTFN)
400+
374401
def check_no_unicode(self, algorithm_name):
375402
# Unicode objects are not allowed as input.
376403
constructors = self.constructors_to_test[algorithm_name]
@@ -1117,6 +1144,33 @@ def test_normalized_name(self):
11171144
self.assertNotIn("blake2b512", hashlib.algorithms_available)
11181145
self.assertNotIn("sha3-512", hashlib.algorithms_available)
11191146

1147+
def test_file_digest(self):
    """file_digest() matches incremental hashing and rejects bad inputs."""
    chunk = b'a' * 65536
    expected = hashlib.sha256()
    self.addCleanup(os.unlink, os_helper.TESTFN)
    # Write ten chunks (640 KiB) while feeding the reference hash object.
    with open(os_helper.TESTFN, "wb") as out:
        for _ in range(10):
            expected.update(chunk)
            out.write(chunk)

    with open(os_helper.TESTFN, "rb") as src:
        actual = hashlib.file_digest(src, hashlib.sha256)

    self.assertEqual(expected.hexdigest(), actual.hexdigest())
    self.assertEqual(expected.name, actual.name)
    self.assertIs(type(expected), type(actual))

    # Not a file object at all.
    with self.assertRaises(ValueError):
        hashlib.file_digest(None, "sha256")

    # Opened in text mode.
    with self.assertRaises(ValueError), open(os_helper.TESTFN, "r") as text_file:
        hashlib.file_digest(text_file, "sha256")

    # Binary mode, but opened for writing only.
    with self.assertRaises(ValueError), open(os_helper.TESTFN, "wb") as write_only:
        hashlib.file_digest(write_only, "sha256")
1173+
11201174

11211175
if __name__ == "__main__":
11221176
unittest.main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add :func:`hashlib.file_digest` helper for efficient hashing of a file object.

0 commit comments

Comments
 (0)