From ae0acff5d19f2ef04fd25afcda84556b3a00b696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Bidoul?= Date: Mon, 18 Apr 2022 13:54:59 +0200 Subject: [PATCH] Hash checking of cached wheels --- news/5037.feature.rst | 1 + src/pip/_internal/cache.py | 6 ++++ src/pip/_internal/exceptions.py | 16 +++++++--- src/pip/_internal/operations/prepare.py | 32 ++++++++++++++++--- src/pip/_internal/req/req_install.py | 6 ++++ .../_internal/resolution/legacy/resolver.py | 13 ++++---- .../resolution/resolvelib/candidates.py | 5 +++ .../resolution/resolvelib/factory.py | 11 ++----- src/pip/_internal/utils/hashes.py | 27 ++++++++++++---- src/pip/_internal/wheel_builder.py | 14 ++++++++ 10 files changed, 100 insertions(+), 31 deletions(-) create mode 100644 news/5037.feature.rst diff --git a/news/5037.feature.rst b/news/5037.feature.rst new file mode 100644 index 00000000000..fe4637b6cf2 --- /dev/null +++ b/news/5037.feature.rst @@ -0,0 +1 @@ +Support wheel cache when using ``--require-hashes``. diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index 1d6df220118..63acb186f34 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -5,12 +5,14 @@ import json import logging import os +from pathlib import Path from typing import Any, Dict, List, Optional, Set from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version from pip._vendor.packaging.utils import canonicalize_name from pip._internal.exceptions import InvalidWheelFilename +from pip._internal.models.direct_url import DirectUrl from pip._internal.models.format_control import FormatControl from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel @@ -204,6 +206,10 @@ def __init__( ): self.link = link self.persistent = persistent + self.origin: Optional[DirectUrl] = None + origin_direct_url_path = Path(self.link.file_path).parent / "origin.json" + if origin_direct_url_path.exists(): + self.origin = 
DirectUrl.from_json(origin_direct_url_path.read_text()) class WheelCache(Cache): diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 97b9612a187..ccdc47c689d 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -8,7 +8,7 @@ import configparser import re from itertools import chain, groupby, repeat -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Mapping, Optional, Union from pip._vendor.requests.models import Request, Response from pip._vendor.rich.console import Console, ConsoleOptions, RenderResult @@ -16,11 +16,12 @@ from pip._vendor.rich.text import Text if TYPE_CHECKING: - from hashlib import _Hash - from typing import Literal + from typing import Literal, Protocol from pip._internal.metadata import BaseDistribution from pip._internal.req.req_install import InstallRequirement +else: + Protocol = object # @@ -570,6 +571,11 @@ class HashUnpinned(HashError): ) +class SupportsHexDigest(Protocol): + def hexdigest(self) -> str: + ... + + class HashMismatch(HashError): """ Distribution file hash values don't match. @@ -588,7 +594,9 @@ class HashMismatch(HashError): "someone may have tampered with them." 
) - def __init__(self, allowed: Dict[str, List[str]], gots: Dict[str, "_Hash"]) -> None: + def __init__( + self, allowed: Dict[str, List[str]], gots: Mapping[str, SupportsHexDigest] + ) -> None: """ :param allowed: A dict of algorithm names pointing to lists of allowed hex digests diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 7550d3a90d1..0bb5d82afa1 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -37,7 +37,12 @@ from pip._internal.req.req_install import InstallRequirement from pip._internal.utils.hashes import Hashes, MissingHashes from pip._internal.utils.logging import indent_log -from pip._internal.utils.misc import display_path, hide_url, is_installable_dir +from pip._internal.utils.misc import ( + display_path, + hash_file, + hide_url, + is_installable_dir, +) from pip._internal.utils.temp_dir import TempDirectory from pip._internal.utils.unpacking import unpack_file from pip._internal.vcs import vcs @@ -98,7 +103,10 @@ def get_http_url( def get_file_url( - link: Link, download_dir: Optional[str] = None, hashes: Optional[Hashes] = None + link: Link, + download_dir: Optional[str] = None, + hashes: Optional[Hashes] = None, + archive_hash: Optional[str] = None, ) -> File: """Get file and optionally check its hash.""" # If a download dir is specified, is the file already there and valid? @@ -117,7 +125,13 @@ def get_file_url( # hash in `hashes` matching: a URL-based or an option-based # one; no internet-sourced hash will be in `hashes`. if hashes: - hashes.check_against_path(from_path) + if archive_hash: + # When we get a wheel from the cache, we don't check the file hash but + # rather compare expected hash against the hash of the original archive + # that was downloaded to build the cached wheel. 
+ hashes.check_against_hash(archive_hash) + else: + hashes.check_against_path(from_path) return File(from_path, None) @@ -128,6 +142,7 @@ def unpack_url( verbosity: int, download_dir: Optional[str] = None, hashes: Optional[Hashes] = None, + archive_hash: Optional[str] = None, ) -> Optional[File]: """Unpack link into location, downloading if required. @@ -145,7 +160,9 @@ # file urls if link.is_file: - file = get_file_url(link, download_dir, hashes=hashes) + file = get_file_url( + link, download_dir, hashes=hashes, archive_hash=archive_hash + ) # http urls else: @@ -470,6 +487,7 @@ def _prepare_linked_requirement( self.verbosity, self.download_dir, hashes, + req.archive_hash, ) except NetworkConnectionError as exc: raise InstallationError( @@ -486,6 +504,12 @@ # preserve the file path on the requirement. if local_file: req.local_file_path = local_file.path + # Also compute and preserve the hash of the file we downloaded. + # Note: as an optimization we may use link.hash if it is a sha256, + # as we verify elsewhere that it matches the downloaded content. + # TODO Should we use hashes.FAVORITE_HASH type? + hash = hash_file(local_file.path)[0].hexdigest() + req.archive_hash = f"sha256={hash}" dist = _get_prepared_distribution( req, diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index 637b6ce10af..1255e7ed8c3 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -108,6 +108,12 @@ def __init__( # PEP 508 URL requirement link = Link(req.url) self.link = self.original_link = link + + # The locally computed hash of the (source) archive we downloaded. If no + # download occurred because a corresponding wheel was found in the local wheel + # cache, this is the hash that was recorded in the cache entry. + self.archive_hash: Optional[str] = None + self.original_link_is_in_wheel_cache = False # Path to any downloaded or already-existing package.
diff --git a/src/pip/_internal/resolution/legacy/resolver.py b/src/pip/_internal/resolution/legacy/resolver.py index 8c149d437d7..a9d3dc3d5ed 100644 --- a/src/pip/_internal/resolution/legacy/resolver.py +++ b/src/pip/_internal/resolution/legacy/resolver.py @@ -33,6 +33,7 @@ ) from pip._internal.index.package_finder import PackageFinder from pip._internal.metadata import BaseDistribution +from pip._internal.models import direct_url from pip._internal.models.link import Link from pip._internal.operations.prepare import RequirementPreparer from pip._internal.req.req_install import ( @@ -289,17 +290,11 @@ def _populate_link(self, req: InstallRequirement) -> None: Note that req.link may still be None - if the requirement is already installed and not needed to be upgraded based on the return value of _is_upgrade_allowed(). - - If preparer.require_hashes is True, don't use the wheel cache, because - cached wheels, always built locally, have different hashes than the - files downloaded from the index server and thus throw false hash - mismatches. Furthermore, cached wheels at present have undeterministic - contents due to file modification times. 
""" if req.link is None: req.link = self._find_requirement_link(req) - if self.wheel_cache is None or self.preparer.require_hashes: + if self.wheel_cache is None: return cache_entry = self.wheel_cache.get_cache_entry( link=req.link, @@ -310,6 +305,10 @@ def _populate_link(self, req: InstallRequirement) -> None: logger.debug("Using cached wheel link: %s", cache_entry.link) if req.link is req.original_link and cache_entry.persistent: req.original_link_is_in_wheel_cache = True + if cache_entry.origin is not None and isinstance( + cache_entry.origin.info, direct_url.ArchiveInfo + ): + req.archive_hash = cache_entry.origin.info.hash req.link = cache_entry.link def _get_dist_for(self, req: InstallRequirement) -> BaseDistribution: diff --git a/src/pip/_internal/resolution/resolvelib/candidates.py b/src/pip/_internal/resolution/resolvelib/candidates.py index 9b8450e86b8..2e1ebd5ec6e 100644 --- a/src/pip/_internal/resolution/resolvelib/candidates.py +++ b/src/pip/_internal/resolution/resolvelib/candidates.py @@ -11,6 +11,7 @@ MetadataInconsistent, ) from pip._internal.metadata import BaseDistribution +from pip._internal.models import direct_url from pip._internal.models.link import Link, links_equivalent from pip._internal.models.wheel import Wheel from pip._internal.req.constructors import ( @@ -284,6 +285,10 @@ def __init__( and template.link is template.original_link ): ireq.original_link_is_in_wheel_cache = True + if cache_entry.origin is not None and isinstance( + cache_entry.origin.info, direct_url.ArchiveInfo + ): + ireq.archive_hash = cache_entry.origin.info.hash super().__init__( link=link, diff --git a/src/pip/_internal/resolution/resolvelib/factory.py b/src/pip/_internal/resolution/resolvelib/factory.py index 3cfcac865ff..45b13d11e4f 100644 --- a/src/pip/_internal/resolution/resolvelib/factory.py +++ b/src/pip/_internal/resolution/resolvelib/factory.py @@ -536,15 +536,8 @@ def make_requires_python_requirement( def get_wheel_cache_entry( self, link: Link, name: 
Optional[str] ) -> Optional[CacheEntry]: - """Look up the link in the wheel cache. - - If ``preparer.require_hashes`` is True, don't use the wheel cache, - because cached wheels, always built locally, have different hashes - than the files downloaded from the index server and thus throw false - hash mismatches. Furthermore, cached wheels at present have - nondeterministic contents due to file modification times. - """ - if self._wheel_cache is None or self.preparer.require_hashes: + """Look up the link in the wheel cache.""" + if self._wheel_cache is None: return None return self._wheel_cache.get_cache_entry( link=link, diff --git a/src/pip/_internal/utils/hashes.py b/src/pip/_internal/utils/hashes.py index 609d981cd35..6be9b7262dc 100644 --- a/src/pip/_internal/utils/hashes.py +++ b/src/pip/_internal/utils/hashes.py @@ -1,12 +1,15 @@ import hashlib -from typing import TYPE_CHECKING, BinaryIO, Dict, Iterable, List - -from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError +from typing import TYPE_CHECKING, BinaryIO, Dict, Iterable, List, Mapping + +from pip._internal.exceptions import ( + HashMismatch, + HashMissing, + InstallationError, + SupportsHexDigest, +) from pip._internal.utils.misc import read_chunks if TYPE_CHECKING: - from hashlib import _Hash - # NoReturn introduced in 3.6.2; imported only for type checking to maintain # pip compatibility with older patch versions of Python 3.6 from typing import NoReturn @@ -22,6 +25,11 @@ STRONG_HASHES = ["sha256", "sha384", "sha512"] +class HexDigestStr(str, SupportsHexDigest): + def hexdigest(self) -> str: + return self + + class Hashes: """A wrapper that builds multiple hashes at once and checks them against known-good values @@ -94,7 +102,7 @@ def check_against_chunks(self, chunks: Iterable[bytes]) -> None: return self._raise(gots) - def _raise(self, gots: Dict[str, "_Hash"]) -> "NoReturn": + def _raise(self, gots: Mapping[str, SupportsHexDigest]) -> "NoReturn": raise 
HashMismatch(self._allowed, gots) def check_against_file(self, file: BinaryIO) -> None: @@ -109,6 +117,11 @@ def check_against_path(self, path: str) -> None: with open(path, "rb") as file: return self.check_against_file(file) + def check_against_hash(self, hash: str) -> None: + alg, value = hash.split("=", 1) + if value not in self._allowed.get(alg, []): + self._raise({alg: HexDigestStr(value)}) + def __bool__(self) -> bool: """Return whether I know any known-good hashes.""" return bool(self._allowed) @@ -144,5 +157,5 @@ def __init__(self) -> None: # empty list, it will never match, so an error will always raise. super().__init__(hashes={FAVORITE_HASH: []}) - def _raise(self, gots: Dict[str, "_Hash"]) -> "NoReturn": + def _raise(self, gots: Mapping[str, SupportsHexDigest]) -> "NoReturn": raise HashMissing(gots[FAVORITE_HASH].hexdigest()) diff --git a/src/pip/_internal/wheel_builder.py b/src/pip/_internal/wheel_builder.py index d0663443b22..c9c19df9cec 100644 --- a/src/pip/_internal/wheel_builder.py +++ b/src/pip/_internal/wheel_builder.py @@ -5,6 +5,7 @@ import os.path import re import shutil +from pathlib import Path from typing import Any, Callable, Iterable, List, Optional, Tuple from pip._vendor.packaging.utils import canonicalize_name, canonicalize_version @@ -13,12 +14,14 @@ from pip._internal.cache import WheelCache from pip._internal.exceptions import InvalidWheelFilename, UnsupportedWheel from pip._internal.metadata import FilesystemWheel, get_wheel_distribution +from pip._internal.models import direct_url from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel from pip._internal.operations.build.wheel import build_wheel_pep517 from pip._internal.operations.build.wheel_editable import build_wheel_editable from pip._internal.operations.build.wheel_legacy import build_wheel_legacy from pip._internal.req.req_install import InstallRequirement +from pip._internal.utils.direct_url_helpers import direct_url_from_link from 
pip._internal.utils.logging import indent_log from pip._internal.utils.misc import ensure_dir, hash_file, is_wheel_installed from pip._internal.utils.setuptools_build import make_setuptools_clean_args @@ -344,6 +347,7 @@ def build( build_successes, build_failures = [], [] for req in requirements: assert req.name + assert req.link cache_dir = _get_cache_dir(req, wheel_cache) wheel_file = _build_one( req, @@ -354,6 +358,16 @@ def build( req.editable and req.permit_editable_wheels, ) if wheel_file: + # Store the origin URL of this cache entry + # TODO move this to cache.py / refactor + origin_direct_url = direct_url_from_link(req.link, req.source_dir) + if isinstance(origin_direct_url.info, direct_url.ArchiveInfo): + # Record the hash of the file that was downloaded. + assert req.archive_hash + origin_direct_url.info.hash = req.archive_hash + Path(cache_dir).joinpath("origin.json").write_text( + origin_direct_url.to_json() + ) # Update the link for this. req.link = Link(path_to_url(wheel_file)) req.local_file_path = req.link.file_path