Skip to content

Commit 83f9a08

Browse files
committed
perf: Avoid unnecessary URL processing while parsing links
There are three optimizations in this commit, in descending order of impact: - If the file URL in the "project detail" response is already absolute, then avoid calling urljoin() as it's expensive (mostly because it calls urlparse() on both of its URL arguments) and does nothing. While it'd be more correct to check whether the file URL has a scheme, we'd need to parse the URL which is what we're trying to avoid in the first place. Anyway, by simply checking if the URL starts with http[s]://, we can avoid slow urljoin() calls for PyPI responses. - Replacing urllib.parse.urlparse() with urllib.parse.urlsplit() in _ensure_quoted_url(). The URL parsing functions are equivalent for our needs[^1]. However, urlsplit() is faster, and we achieve better cache utilization of its internal cache if we call it directly[^2]. - Calculating the Link.path property in advance as it's very hot. [^1]: we don't care about URL parameters AFAIK (which are different from the query component!) [^2]: urlparse() calls urlsplit() internally, but it passes the authority parameter (unlike any of our calls) so it bypasses the cache.
1 parent c10dda5 commit 83f9a08

File tree

2 files changed

+28
-6
lines changed

2 files changed

+28
-6
lines changed

news/13132.feature.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Optimize package collection by avoiding unnecessary URL parsing and other processing.

src/pip/_internal/models/link.py

+27-6
Original file line numberDiff line numberDiff line change
@@ -170,12 +170,30 @@ def _ensure_quoted_url(url: str) -> str:
170170
and without double-quoting other characters.
171171
"""
172172
# Split the URL into parts according to the general structure
173-
# `scheme://netloc/path;parameters?query#fragment`.
174-
result = urllib.parse.urlparse(url)
173+
# `scheme://netloc/path?query#fragment`.
174+
result = urllib.parse.urlsplit(url)
175175
# If the netloc is empty, then the URL refers to a local filesystem path.
176176
is_local_path = not result.netloc
177177
path = _clean_url_path(result.path, is_local_path=is_local_path)
178-
return urllib.parse.urlunparse(result._replace(path=path))
178+
return urllib.parse.urlunsplit(result._replace(path=path))
179+
180+
181+
def _absolute_link_url(base_url: str, file_url: str) -> str:
182+
"""Return an absolute file URL from a simple response.
183+
184+
If the file URL is already absolute, the function does nothing.
185+
If the file URL is relative, it is joined with the base URL.
186+
"""
187+
# If the file URL is already absolute, joining it with the page URL
188+
# will do nothing. PyPI returns absolute URLs so we can avoid expensive
189+
# urljoin() calls in the common case. (It would be technically more
190+
# correct to parse the file URL and check if it has a scheme, but the
191+
# slow URL parsing urljoin() does is what we're trying to avoid in the
192+
# first place, so we only check for the http[s]:// prefix.)
193+
if file_url.startswith(("https://", "http://")):
194+
return file_url
195+
else:
196+
return urllib.parse.urljoin(base_url, file_url)
179197

180198

181199
@functools.total_ordering
@@ -185,6 +203,7 @@ class Link:
185203
__slots__ = [
186204
"_parsed_url",
187205
"_url",
206+
"_path",
188207
"_hashes",
189208
"comes_from",
190209
"requires_python",
@@ -241,6 +260,8 @@ def __init__(
241260
# Store the url as a private attribute to prevent accidentally
242261
# trying to set a new value.
243262
self._url = url
263+
# The .path property is hot, so calculate its value ahead of time.
264+
self._path = urllib.parse.unquote(self._parsed_url.path)
244265

245266
link_hash = LinkHash.find_hash_url_fragment(url)
246267
hashes_from_link = {} if link_hash is None else link_hash.as_dict()
@@ -270,7 +291,7 @@ def from_json(
270291
if file_url is None:
271292
return None
272293

273-
url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url))
294+
url = _ensure_quoted_url(_absolute_link_url(page_url, file_url))
274295
pyrequire = file_data.get("requires-python")
275296
yanked_reason = file_data.get("yanked")
276297
hashes = file_data.get("hashes", {})
@@ -322,7 +343,7 @@ def from_element(
322343
if not href:
323344
return None
324345

325-
url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
346+
url = _ensure_quoted_url(_absolute_link_url(base_url, href))
326347
pyrequire = anchor_attribs.get("data-requires-python")
327348
yanked_reason = anchor_attribs.get("data-yanked")
328349

@@ -421,7 +442,7 @@ def netloc(self) -> str:
421442

422443
    @property
    def path(self) -> str:
        # Hot property: the unquoted path is precomputed in __init__ as
        # self._path, so this is a plain attribute read.
        return self._path
425446

426447
    def splitext(self) -> Tuple[str, str]:
        """Split the basename of the link's path into (root, extension).

        Trailing slashes are stripped before taking the basename.
        """
        # NOTE(review): `splitext` here resolves to a module-level helper
        # imported elsewhere in this file (not os.path.splitext) — presumably
        # so multi-part extensions such as .tar.gz are handled; confirm.
        return splitext(posixpath.basename(self.path.rstrip("/")))

0 commit comments

Comments
 (0)