From 3a847b90aa7fd29fae422e9da11f17a05e60b2c9 Mon Sep 17 00:00:00 2001 From: Richard Si Date: Fri, 27 Dec 2024 14:04:43 -0500 Subject: [PATCH] perf: Avoid unnecessary URL processing while parsing links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are three optimizations in this commit, in descending order of impact: - If the file URL in the "project detail" response is already absolute, then avoid calling urljoin() as it's expensive (mostly because it calls urlparse() on both of its URL arguments) and does nothing. While it'd be more correct to check whether the file URL has a scheme, we'd need to parse the URL which is what we're trying to avoid in the first place. Anyway, by simply checking if the URL starts with http[s]://, we can avoid slow urljoin() calls for PyPI responses. - Replacing urllib.parse.urlparse() with urllib.parse.urlsplit() in _ensure_quoted_url(). The URL parsing functions are equivalent for our needs[^1]. However, urlsplit() is faster, and we achieve better cache utilization of its internal cache if we call it directly[^2]. - Calculating the Link.path property in advance as it's very hot. [^1]: we don't care about URL parameters AFAIK (which are different than the query component!) [^2]: urlparse() calls urlsplit() internally, but it passes the authority parameter (unlike any of our calls) so it bypasses the cache. Co-authored-by: Stéphane Bidoul --- news/13132.feature.rst | 1 + src/pip/_internal/models/link.py | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) create mode 100644 news/13132.feature.rst diff --git a/news/13132.feature.rst b/news/13132.feature.rst new file mode 100644 index 00000000000..d8cd57e7c56 --- /dev/null +++ b/news/13132.feature.rst @@ -0,0 +1 @@ +Optimize package collection by avoiding unnecessary URL parsing and other processing. 
diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 2f41f2f6a09..612a42f95c1 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -170,12 +170,23 @@ def _ensure_quoted_url(url: str) -> str: and without double-quoting other characters. """ # Split the URL into parts according to the general structure - # `scheme://netloc/path;parameters?query#fragment`. - result = urllib.parse.urlparse(url) + # `scheme://netloc/path?query#fragment`. + result = urllib.parse.urlsplit(url) # If the netloc is empty, then the URL refers to a local filesystem path. is_local_path = not result.netloc path = _clean_url_path(result.path, is_local_path=is_local_path) - return urllib.parse.urlunparse(result._replace(path=path)) + return urllib.parse.urlunsplit(result._replace(path=path)) + + +def _absolute_link_url(base_url: str, url: str) -> str: + """ + A faster implementation of urllib.parse.urljoin with a shortcut + for absolute http/https URLs. + """ + if url.startswith(("https://", "http://")): + return url + else: + return urllib.parse.urljoin(base_url, url) @functools.total_ordering @@ -185,6 +196,7 @@ class Link: __slots__ = [ "_parsed_url", "_url", + "_path", "_hashes", "comes_from", "requires_python", @@ -241,6 +253,8 @@ def __init__( # Store the url as a private attribute to prevent accidentally # trying to set a new value. self._url = url + # The .path property is hot, so calculate its value ahead of time. 
+ self._path = urllib.parse.unquote(self._parsed_url.path) link_hash = LinkHash.find_hash_url_fragment(url) hashes_from_link = {} if link_hash is None else link_hash.as_dict() @@ -270,7 +284,7 @@ def from_json( if file_url is None: return None - url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url)) + url = _ensure_quoted_url(_absolute_link_url(page_url, file_url)) pyrequire = file_data.get("requires-python") yanked_reason = file_data.get("yanked") hashes = file_data.get("hashes", {}) @@ -322,7 +336,7 @@ def from_element( if not href: return None - url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href)) + url = _ensure_quoted_url(_absolute_link_url(base_url, href)) pyrequire = anchor_attribs.get("data-requires-python") yanked_reason = anchor_attribs.get("data-yanked") @@ -421,7 +435,7 @@ def netloc(self) -> str: @property def path(self) -> str: - return urllib.parse.unquote(self._parsed_url.path) + return self._path def splitext(self) -> Tuple[str, str]: return splitext(posixpath.basename(self.path.rstrip("/")))