Skip to content

bpo-45150: Add hashlib.file_digest() for efficient file hashing #31930

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 22, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions Doc/library/hashlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,49 @@ by the SHAKE algorithm.
exchange the value safely in email or other non-binary environments.


File hashing
------------

The hashlib module provides a helper function for efficient hashing of
a file or file-like object.

.. function:: file_digest(fileobj, digest, /)

Return a digest object that has been updated with the contents of the file object.

*fileobj* must be a file-like object opened for reading in binary mode.
It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO`
instances, SocketIO objects from :meth:`socket.socket.makefile`, and
similar. The function may bypass Python's I/O and use the file descriptor
from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be
in an unknown state after this function returns or raises. It is up to
the caller to close *fileobj*.

*digest* must be a hash algorithm name as a *str*, a hash constructor,
or a callable that returns a hash object.

Example:

>>> import io, hashlib, hmac
>>> with open(hashlib.__file__, "rb") as f:
... digest = hashlib.file_digest(f, "sha256")
...
>>> digest.hexdigest() # doctest: +ELLIPSIS
'...'

>>> buf = io.BytesIO(b"somedata")
>>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
>>> digest = hashlib.file_digest(buf, lambda: mac1)

>>> digest is mac1
True
>>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
>>> mac1.digest() == mac2.digest()
True

.. versionadded:: 3.11


Key derivation
--------------

Expand Down
48 changes: 47 additions & 1 deletion Lib/hashlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
algorithms_available = set(__always_supported)

__all__ = __always_supported + ('new', 'algorithms_guaranteed',
'algorithms_available', 'pbkdf2_hmac')
'algorithms_available', 'pbkdf2_hmac', 'file_digest')


__builtin_constructor_cache = {}
Expand Down Expand Up @@ -254,6 +254,52 @@ def prf(msg, inner=inner, outer=outer):
pass


def file_digest(fileobj, digest, /, *, _bufsize=2**18):
    """Hash the contents of a file-like object. Returns a digest object.

    *fileobj* must be a file-like object opened for reading in binary mode.
    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
    The function may bypass Python's I/O and use the file descriptor *fileno*
    directly.

    *digest* must be a hash algorithm name as a *str*, a hash constructor,
    or a callable that returns a hash object.

    *_bufsize* is a private, keyword-only knob: the size in bytes of the
    reusable buffer that readinto() fills on each iteration.

    Raises ValueError if *fileobj* is not readable in binary mode, and
    BlockingIOError if a non-blocking *fileobj* has no data available
    (its readinto() returned None).
    """
    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
    # hashing with hardware acceleration.
    if isinstance(digest, str):
        digestobj = new(digest)
    else:
        digestobj = digest()

    if hasattr(fileobj, "getbuffer"):
        # io.BytesIO object, use zero-copy buffer
        digestobj.update(fileobj.getbuffer())
        return digestobj

    # Only binary files implement readinto().
    if not (
        hasattr(fileobj, "readinto")
        and hasattr(fileobj, "readable")
        and fileobj.readable()
    ):
        raise ValueError(
            f"'{fileobj!r}' is not a file-like object in binary reading mode."
        )

    # binary file, socket.SocketIO object
    # Note: socket I/O uses different syscalls than file I/O.
    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
    view = memoryview(buf)
    while True:
        size = fileobj.readinto(buf)
        if size is None:
            # Non-blocking stream with nothing to read right now. Without
            # this check view[:None] would hash the entire stale buffer and
            # the loop would never terminate.
            raise BlockingIOError("I/O operation would block.")
        if size == 0:
            break  # EOF
        digestobj.update(view[:size])

    return digestobj


for __func_name in __always_supported:
# try them all, some may not work due to the OpenSSL
# version not supporting that algorithm.
Expand Down
57 changes: 57 additions & 0 deletions Lib/test/test_hashlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from binascii import unhexlify
import hashlib
import importlib
import io
import itertools
import os
import sys
Expand All @@ -20,6 +21,7 @@
from test import support
from test.support import _4G, bigmemtest
from test.support.import_helper import import_fresh_module
from test.support import os_helper
from test.support import threading_helper
from test.support import warnings_helper
from http.client import HTTPException
Expand Down Expand Up @@ -371,6 +373,34 @@ def check(self, name, data, hexdigest, shake=False, **kwargs):
if not shake:
self.assertEqual(len(digest), m.digest_size)

if not shake and kwargs.get("key") is None:
# skip shake and blake2 extended parameter tests
self.check_file_digest(name, data, hexdigest)

def check_file_digest(self, name, data, hexdigest):
    """Check hashlib.file_digest() against a known digest of *data*.

    For the algorithm *name* and for every constructor registered under
    self.constructors_to_test[name], hash *data* once from an io.BytesIO
    and once from a real file opened in binary mode, and compare each
    result with *hexdigest* (compared in lower case).
    """
    hexdigest = hexdigest.lower()
    digests = [name]
    digests.extend(self.constructors_to_test[name])

    with open(os_helper.TESTFN, "wb") as f:
        f.write(data)

    # The temporary file must survive every loop iteration, so it is
    # removed exactly once, in the finally clause below. Unlinking it inside
    # the loop makes the next open() and the final unlink fail.
    try:
        for digest in digests:
            buf = io.BytesIO(data)
            buf.seek(0)
            self.assertEqual(
                hashlib.file_digest(buf, digest).hexdigest(), hexdigest
            )
            with open(os_helper.TESTFN, "rb") as f:
                digestobj = hashlib.file_digest(f, digest)
            self.assertEqual(digestobj.hexdigest(), hexdigest)
    finally:
        os.unlink(os_helper.TESTFN)

def check_no_unicode(self, algorithm_name):
# Unicode objects are not allowed as input.
constructors = self.constructors_to_test[algorithm_name]
Expand Down Expand Up @@ -1117,6 +1147,33 @@ def test_normalized_name(self):
self.assertNotIn("blake2b512", hashlib.algorithms_available)
self.assertNotIn("sha3-512", hashlib.algorithms_available)

def test_file_digest(self):
    """Exercise hashlib.file_digest() on a real file and on bad inputs."""
    chunk = b'a' * 65536
    expected = hashlib.sha256()
    self.addCleanup(os.unlink, os_helper.TESTFN)
    # Write ten chunks to the file while hashing the same bytes
    # incrementally to get the reference digest.
    with open(os_helper.TESTFN, "wb") as out:
        for _ in range(10):
            expected.update(chunk)
            out.write(chunk)

    with open(os_helper.TESTFN, "rb") as src:
        actual = hashlib.file_digest(src, hashlib.sha256)

    # Same digest, same algorithm name, same concrete hash type.
    self.assertEqual(expected.hexdigest(), actual.hexdigest())
    self.assertEqual(expected.name, actual.name)
    self.assertIs(type(expected), type(actual))

    # Not a file object at all.
    with self.assertRaises(ValueError):
        hashlib.file_digest(None, "sha256")

    # File opened in text mode.
    with self.assertRaises(ValueError), open(os_helper.TESTFN, "r") as src:
        hashlib.file_digest(src, "sha256")

    # Binary file opened for writing only.
    with self.assertRaises(ValueError), open(os_helper.TESTFN, "wb") as src:
        hashlib.file_digest(src, "sha256")


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add :func:`hashlib.file_digest` helper for efficient hashing of file objects.