Commit c779621

PoC of PEP 691

Parent: 713e00f

2 files changed: +106, -54 lines

src/pip/_internal/index/collector.py (+103, -51)
```diff
@@ -6,6 +6,7 @@
 import email.message
 import functools
 import itertools
+import json
 import logging
 import os
 import re
```
```diff
@@ -65,32 +66,44 @@ def _match_vcs_scheme(url: str) -> Optional[str]:
     return None
 
 
-class _NotHTML(Exception):
+class _NotAPIContent(Exception):
     def __init__(self, content_type: str, request_desc: str) -> None:
         super().__init__(content_type, request_desc)
         self.content_type = content_type
         self.request_desc = request_desc
 
 
-def _ensure_html_header(response: Response) -> None:
-    """Check the Content-Type header to ensure the response contains HTML.
+def _ensure_api_header(response: Response) -> None:
+    """
+    Check the Content-Type header to ensure the response contains a Simple
+    API Response.
 
-    Raises `_NotHTML` if the content type is not text/html.
+    Raises `_NotAPIContent` if the content type is not a valid content-type.
     """
     content_type = response.headers.get("Content-Type", "")
-    if not content_type.lower().startswith("text/html"):
-        raise _NotHTML(content_type, response.request.method)
+
+    content_type_l = content_type.lower()
+    if content_type_l.startswith("text/html"):
+        return
+    elif content_type_l.startswith("application/vnd.pypi.simple.v1+html"):
+        return
+    elif content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
+        return
+
+    raise _NotAPIContent(content_type, response.request.method)
 
 
 class _NotHTTP(Exception):
     pass
 
 
-def _ensure_html_response(url: str, session: PipSession) -> None:
-    """Send a HEAD request to the URL, and ensure the response contains HTML.
+def _ensure_api_response(url: str, session: PipSession) -> None:
+    """
+    Send a HEAD request to the URL, and ensure the response contains a simple
+    API Response.
 
     Raises `_NotHTTP` if the URL is not available for a HEAD request, or
-    `_NotHTML` if the content type is not text/html.
+    `_NotAPIContent` if the content type is not a valid content type.
     """
     scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
     if scheme not in {"http", "https"}:
```
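PEP 691 layers a JSON serialization over the existing HTML simple index, distinguished only by Content-Type, which is why `_ensure_api_header` now accepts three media types instead of one. A minimal standalone sketch of the same check (the constant and function names here are illustrative, not pip's API):

```python
# Standalone sketch of the accept/reject logic above; names are
# illustrative, not pip's actual API.
ACCEPTED_CONTENT_TYPES = (
    "text/html",
    "application/vnd.pypi.simple.v1+html",
    "application/vnd.pypi.simple.v1+json",
)


def is_simple_api_content_type(content_type: str) -> bool:
    # startswith() tolerates parameters appended to the media type,
    # e.g. "text/html; charset=utf-8", just like the check in the diff.
    return content_type.lower().startswith(ACCEPTED_CONTENT_TYPES)


assert is_simple_api_content_type("application/vnd.pypi.simple.v1+json")
assert is_simple_api_content_type("Text/HTML; charset=utf-8")
assert not is_simple_api_content_type("application/octet-stream")
```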
```diff
@@ -99,31 +112,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None:
     resp = session.head(url, allow_redirects=True)
     raise_for_status(resp)
 
-    _ensure_html_header(resp)
+    _ensure_api_header(resp)
 
 
-def _get_html_response(url: str, session: PipSession) -> Response:
-    """Access an HTML page with GET, and return the response.
+def _get_simple_response(url: str, session: PipSession) -> Response:
+    """Access an Simple API response with GET, and return the response.
 
     This consists of three parts:
 
     1. If the URL looks suspiciously like an archive, send a HEAD first to
-       check the Content-Type is HTML, to avoid downloading a large file.
-       Raise `_NotHTTP` if the content type cannot be determined, or
-       `_NotHTML` if it is not HTML.
+       check the Content-Type is HTML or Simple API, to avoid downloading a
+       large file. Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotAPIContent` if it is not HTML or a Simple API.
     2. Actually perform the request. Raise HTTP exceptions on network failures.
-    3. Check the Content-Type header to make sure we got HTML, and raise
-       `_NotHTML` otherwise.
+    3. Check the Content-Type header to make sure we got a Simple API response,
+       and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_html_response(url, session=session)
+        _ensure_api_response(url, session=session)
 
     logger.debug("Getting page %s", redact_auth_from_url(url))
 
     resp = session.get(
         url,
         headers={
-            "Accept": "text/html",
+            "Accept": ", ".join(
+                [
+                    "application/vnd.pypi.simple.v1+json",
+                    "application/vnd.pypi.simple.v1+html; q=0.2",
+                    "text/html; q=0.1",
+                ]
+            ),
             # We don't want to blindly returned cached data for
             # /simple/, because authors generally expecting that
             # twine upload && pip install will function, but if
```
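The rewritten Accept header is where the actual content negotiation happens: the JSON serialization is requested outright, the versioned HTML serialization at quality 0.2, and legacy text/html at 0.1. Evaluating the `", ".join(...)` expression from the diff shows the exact header a server receives:

```python
# The header value produced by the ", ".join(...) in the diff above.
accept = ", ".join(
    [
        "application/vnd.pypi.simple.v1+json",
        "application/vnd.pypi.simple.v1+html; q=0.2",
        "text/html; q=0.1",
    ]
)
print(accept)
# application/vnd.pypi.simple.v1+json, application/vnd.pypi.simple.v1+html; q=0.2, text/html; q=0.1
```

An index that implements PEP 691 answers with the +json type; one that ignores Accept keeps serving text/html, which the low q-value marks as acceptable but least preferred, so legacy servers keep working unchanged.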
```diff
@@ -145,9 +164,10 @@ def _get_html_response(url: str, session: PipSession) -> Response:
     # The check for archives above only works if the url ends with
     # something that looks like an archive. However that is not a
     # requirement of an url. Unless we issue a HEAD request on every
-    # url we cannot know ahead of time for sure if something is HTML
-    # or not. However we can check after we've downloaded it.
-    _ensure_html_header(resp)
+    # url we cannot know ahead of time for sure if something is a
+    # Simple API response or not. However we can check after we've
+    # downloaded it.
+    _ensure_api_header(resp)
 
     return resp
 
```
```diff
@@ -273,7 +293,7 @@ def _create_link_from_element(
 
 
 class CacheablePageContent:
-    def __init__(self, page: "HTMLPage") -> None:
+    def __init__(self, page: "IndexContent") -> None:
         assert page.cache_link_parsing
         self.page = page
 
@@ -286,15 +306,15 @@ def __hash__(self) -> int:
 
 class ParseLinks(Protocol):
     def __call__(
-        self, page: "HTMLPage", use_deprecated_html5lib: bool
+        self, page: "IndexContent", use_deprecated_html5lib: bool
     ) -> Iterable[Link]:
         ...
 
 
-def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
+def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
     """
-    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
-    function's result (keyed by CacheablePageContent), unless the HTMLPage
+    Given a function that parses an Iterable[Link] from an IndexContent, cache the
+    function's result (keyed by CacheablePageContent), unless the IndexContent
     `page` has `page.cache_link_parsing == False`.
     """
 
```
```diff
@@ -305,15 +325,17 @@ def wrapper(
         return list(fn(cacheable_page.page, use_deprecated_html5lib))
 
     @functools.wraps(fn)
-    def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
+    def wrapper_wrapper(
+        page: "IndexContent", use_deprecated_html5lib: bool
+    ) -> List[Link]:
         if page.cache_link_parsing:
             return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
         return list(fn(page, use_deprecated_html5lib))
 
     return wrapper_wrapper
 
 
-def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
+def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
     """
     Parse an HTML document, and yield its anchor elements as Link objects.
 
```
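The caching decorator is otherwise unchanged by this commit: it memoizes link parsing on a hashable wrapper so the same index response is only parsed once per process. A self-contained sketch of that pattern, with generic names standing in for pip's internals:

```python
import functools
from dataclasses import dataclass, field
from typing import Callable, List


@dataclass
class FakePage:
    url: str
    links: List[str] = field(default_factory=list)
    cache_link_parsing: bool = True


class _CacheableKey:
    """Hashable wrapper keyed on the page URL, mirroring CacheablePageContent."""

    def __init__(self, page: FakePage) -> None:
        self.page = page

    def __eq__(self, other: object) -> bool:
        return isinstance(other, _CacheableKey) and self.page.url == other.page.url

    def __hash__(self) -> int:
        return hash(self.page.url)


def with_cached_parse(
    fn: Callable[[FakePage], List[str]]
) -> Callable[[FakePage], List[str]]:
    @functools.lru_cache(maxsize=None)
    def cached(key: _CacheableKey) -> List[str]:
        return fn(key.page)

    @functools.wraps(fn)
    def wrapper(page: FakePage) -> List[str]:
        # Only pages that opt in hit the cache; in pip, find-links
        # pages opt out via cache_link_parsing=False.
        if page.cache_link_parsing:
            return cached(_CacheableKey(page))
        return fn(page)

    return wrapper


@with_cached_parse
def parse(page: FakePage) -> List[str]:
    print("parsing", page.url)
    return page.links


page = FakePage("https://example.org/simple/frob/", ["frob-1.0.tar.gz"])
parse(page)  # prints "parsing ..."
parse(page)  # served from the cache, no print
```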
```diff
@@ -338,12 +360,35 @@ def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
         yield link
 
 
-@with_cached_html_pages
-def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
+@with_cached_index_content
+def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]:
     """
-    Parse an HTML document, and yield its anchor elements as Link objects.
+    Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
     """
 
+    content_type_l = page.content_type.lower()
+    if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
+        data = json.loads(page.content)
+        for file in data.get("files", []):
+            file_url = file.get("url")
+            if file_url is None:
+                continue
+
+            # The Link.yanked_reason expects an empty string instead of a boolean.
+            yanked_reason = file.get("yanked")
+            if yanked_reason and not isinstance(yanked_reason, str):
+                yanked_reason = ""
+            # The Link.yanked_reason expects None instead of False
+            elif not yanked_reason:
+                yanked_reason = None
+
+            yield Link(
+                _clean_link(urllib.parse.urljoin(page.url, file_url)),
+                comes_from=page.url,
+                requires_python=file.get("requires-python"),
+                yanked_reason=yanked_reason,
+            )
+
     if use_deprecated_html5lib:
         yield from _parse_links_html5lib(page)
         return
```
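This new branch is the heart of the PoC: when the stored content type is the +json serialization, links come from `json.loads` instead of an HTML parser. Below is a runnable sketch against a trimmed, hypothetical PEP 691 project response, reproducing the `yanked` normalization outside pip (`Link` and `_clean_link` are pip internals, so a plain print stands in for them):

```python
import json
import urllib.parse

# A trimmed, hypothetical PEP 691 project response.
content = json.dumps({
    "meta": {"api-version": "1.0"},
    "name": "frob",
    "files": [
        {"url": "frob-1.0.tar.gz", "hashes": {}, "yanked": False},
        {"url": "frob-1.1.tar.gz", "hashes": {}, "yanked": "broken metadata"},
        {"url": "frob-1.2.tar.gz", "hashes": {}, "yanked": True},
    ],
})

page_url = "https://example.org/simple/frob/"
for file in json.loads(content).get("files", []):
    file_url = file.get("url")
    if file_url is None:
        continue
    # Same normalization as the diff: PEP 691 allows `yanked` to be a
    # bool or a string reason, while Link wants None (not yanked),
    # "" (yanked, no reason given), or the reason string.
    yanked_reason = file.get("yanked")
    if yanked_reason and not isinstance(yanked_reason, str):
        yanked_reason = ""
    elif not yanked_reason:
        yanked_reason = None
    print(urllib.parse.urljoin(page_url, file_url), repr(yanked_reason))
# https://example.org/simple/frob/frob-1.0.tar.gz None
# https://example.org/simple/frob/frob-1.1.tar.gz 'broken metadata'
# https://example.org/simple/frob/frob-1.2.tar.gz ''
```

Note that, as written, the branch yields the JSON-derived links and then still falls through to the HTML parsing below; a JSON body contains no anchor tags, so the fall-through is wasteful rather than wrong, which is fair for a proof of concept.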
```diff
@@ -365,12 +410,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
         yield link
 
 
-class HTMLPage:
-    """Represents one page, along with its URL"""
+class IndexContent:
+    """Represents one response (or page), along with its URL"""
 
     def __init__(
         self,
         content: bytes,
+        content_type: str,
         encoding: Optional[str],
         url: str,
         cache_link_parsing: bool = True,
@@ -383,6 +429,7 @@ def __init__(
             have this set to False, for example.
         """
         self.content = content
+        self.content_type = content_type
        self.encoding = encoding
        self.url = url
        self.cache_link_parsing = cache_link_parsing
```
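Storing the Content-Type on the object is what lets `parse_links` dispatch later without access to the HTTP response. A hypothetical construction, assuming only the signature shown in the diff:

```python
# Hypothetical IndexContent values; parse_links() dispatches on
# content_type, so both serializations share one container type.
json_page = IndexContent(
    b'{"meta": {"api-version": "1.0"}, "name": "frob", "files": []}',
    "application/vnd.pypi.simple.v1+json",
    encoding="utf-8",
    url="https://example.org/simple/frob/",
)
html_page = IndexContent(
    b'<html><body><a href="frob-1.0.tar.gz">frob-1.0.tar.gz</a></body></html>',
    "text/html",
    encoding="utf-8",
    url="https://example.org/simple/frob/",
)
```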
```diff
@@ -419,7 +466,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
         return None
 
 
-def _handle_get_page_fail(
+def _handle_get_simple_fail(
     link: Link,
     reason: Union[str, Exception],
     meth: Optional[Callable[..., None]] = None,
@@ -429,19 +476,22 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
+def _make_index_content(
+    response: Response, cache_link_parsing: bool = True
+) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(
+    return IndexContent(
         response.content,
+        response.headers["Content-Type"],
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
     )
 
 
-def _get_html_page(
+def _get_index_content(
     link: Link, session: Optional[PipSession] = None
-) -> Optional["HTMLPage"]:
+) -> Optional["IndexContent"]:
     if session is None:
         raise TypeError(
             "_get_html_page() missing 1 required keyword argument: 'session'"
```
```diff
@@ -468,37 +518,39 @@ def _get_html_page(
             url += "/"
         url = urllib.parse.urljoin(url, "index.html")
         logger.debug(" file: URL is directory, getting %s", url)
+        # TODO: index.json?
 
     try:
-        resp = _get_html_response(url, session=session)
+        resp = _get_simple_response(url, session=session)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
             "be checked by a HTTP HEAD request.",
             link,
         )
-    except _NotHTML as exc:
+    except _NotAPIContent as exc:
         logger.warning(
-            "Skipping page %s because the %s request got Content-Type: %s."
-            "The only supported Content-Type is text/html",
+            "Skipping page %s because the %s request got Content-Type: %s. "
+            "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
+            "application/vnd.pypi.simple.v1+html, and text/html",
             link,
             exc.request_desc,
             exc.content_type,
         )
     except NetworkConnectionError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except RetryError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
         reason += str(exc)
-        _handle_get_page_fail(link, reason, meth=logger.info)
+        _handle_get_simple_fail(link, reason, meth=logger.info)
     except requests.ConnectionError as exc:
-        _handle_get_page_fail(link, f"connection error: {exc}")
+        _handle_get_simple_fail(link, f"connection error: {exc}")
     except requests.Timeout:
-        _handle_get_page_fail(link, "timed out")
+        _handle_get_simple_fail(link, "timed out")
     else:
-        return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing)
+        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
     return None
 
```
```diff
@@ -561,11 +613,11 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links
 
-    def fetch_page(self, location: Link) -> Optional[HTMLPage]:
+    def fetch_response(self, location: Link) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_html_page(location, session=self.session)
+        return _get_index_content(location, session=self.session)
 
     def collect_sources(
         self,
```

src/pip/_internal/index/package_finder.py (+3, -3)
```diff
@@ -792,11 +792,11 @@ def process_project_url(
             "Fetching project page and analyzing links: %s",
             project_url,
         )
-        html_page = self._link_collector.fetch_page(project_url)
-        if html_page is None:
+        index_response = self._link_collector.fetch_response(project_url)
+        if index_response is None:
             return []
 
-        page_links = list(parse_links(html_page, self._use_deprecated_html5lib))
+        page_links = list(parse_links(index_response, self._use_deprecated_html5lib))
 
         with indent_log():
             package_links = self.evaluate_links(
```
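Taken together, the two files give pip end-to-end negotiation for the JSON serialization. The sketch below reproduces that flow with plain requests rather than pip's session; PyPI has since implemented PEP 691, so this should work against the live index (the field accesses follow the PEP, not pip's internals):

```python
import requests

# Mirror the Accept header built in collector.py above.
resp = requests.get(
    "https://pypi.org/simple/pip/",
    headers={
        "Accept": (
            "application/vnd.pypi.simple.v1+json, "
            "application/vnd.pypi.simple.v1+html; q=0.2, "
            "text/html; q=0.1"
        )
    },
)
resp.raise_for_status()

if resp.headers["Content-Type"].startswith("application/vnd.pypi.simple.v1+json"):
    data = resp.json()
    print(data["name"], "has", len(data["files"]), "files")
else:
    print("index fell back to HTML:", resp.headers["Content-Type"])
```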
