 import email.message
 import functools
 import itertools
+import json
 import logging
 import os
 import re
@@ -65,32 +66,44 @@ def _match_vcs_scheme(url: str) -> Optional[str]:
     return None


-class _NotHTML(Exception):
+class _NotAPIContent(Exception):
     def __init__(self, content_type: str, request_desc: str) -> None:
         super().__init__(content_type, request_desc)
         self.content_type = content_type
         self.request_desc = request_desc


-def _ensure_html_header(response: Response) -> None:
-    """Check the Content-Type header to ensure the response contains HTML.
+def _ensure_api_header(response: Response) -> None:
+    """
+    Check the Content-Type header to ensure the response contains a Simple
+    API response.

-    Raises `_NotHTML` if the content type is not text/html.
+    Raises `_NotAPIContent` if the content type is not one of the supported
+    types.
     """
     content_type = response.headers.get("Content-Type", "")
-    if not content_type.lower().startswith("text/html"):
-        raise _NotHTML(content_type, response.request.method)
+
+    content_type_l = content_type.lower()
+    if content_type_l.startswith("text/html"):
+        return
+    elif content_type_l.startswith("application/vnd.pypi.simple.v1+html"):
+        return
+    elif content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
+        return
+
+    raise _NotAPIContent(content_type, response.request.method)


 class _NotHTTP(Exception):
     pass


-def _ensure_html_response(url: str, session: PipSession) -> None:
-    """Send a HEAD request to the URL, and ensure the response contains HTML.
+def _ensure_api_response(url: str, session: PipSession) -> None:
+    """
+    Send a HEAD request to the URL, and ensure the response contains a Simple
+    API response.

     Raises `_NotHTTP` if the URL is not available for a HEAD request, or
-    `_NotHTML` if the content type is not text/html.
+    `_NotAPIContent` if the content type is not one of the supported types.
     """
     scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
     if scheme not in {"http", "https"}:
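For context, the gate above accepts exactly three media-type prefixes. A standalone sketch of the same check (the helper name is ours, not pip's; the prefixes come from the hunk above):

```python
# Sketch of the check _ensure_api_header now performs.
ACCEPTED_PREFIXES = (
    "text/html",
    "application/vnd.pypi.simple.v1+html",
    "application/vnd.pypi.simple.v1+json",
)

def looks_like_simple_api(content_type: str) -> bool:
    # str.startswith accepts a tuple of prefixes, so trailing parameters
    # such as "; charset=utf-8" are tolerated.
    return content_type.lower().startswith(ACCEPTED_PREFIXES)

assert looks_like_simple_api("TEXT/HTML; charset=utf-8")
assert looks_like_simple_api("application/vnd.pypi.simple.v1+json")
assert not looks_like_simple_api("application/octet-stream")
```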
@@ -99,31 +112,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None:
     resp = session.head(url, allow_redirects=True)
     raise_for_status(resp)

-    _ensure_html_header(resp)
+    _ensure_api_header(resp)


-def _get_html_response(url: str, session: PipSession) -> Response:
-    """Access an HTML page with GET, and return the response.
+def _get_simple_response(url: str, session: PipSession) -> Response:
+    """Access a Simple API response with GET, and return the response.

     This consists of three parts:

     1. If the URL looks suspiciously like an archive, send a HEAD first to
-       check the Content-Type is HTML, to avoid downloading a large file.
-       Raise `_NotHTTP` if the content type cannot be determined, or
-       `_NotHTML` if it is not HTML.
+       check the Content-Type is HTML or a Simple API type, to avoid
+       downloading a large file. Raise `_NotHTTP` if the content type cannot
+       be determined, or `_NotAPIContent` if it is neither.
     2. Actually perform the request. Raise HTTP exceptions on network failures.
-    3. Check the Content-Type header to make sure we got HTML, and raise
-       `_NotHTML` otherwise.
+    3. Check the Content-Type header to make sure we got a Simple API
+       response, and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_html_response(url, session=session)
+        _ensure_api_response(url, session=session)

     logger.debug("Getting page %s", redact_auth_from_url(url))

     resp = session.get(
         url,
         headers={
-            "Accept": "text/html",
+            "Accept": ", ".join(
+                [
+                    "application/vnd.pypi.simple.v1+json",
+                    "application/vnd.pypi.simple.v1+html; q=0.2",
+                    "text/html; q=0.1",
+                ]
+            ),
             # We don't want to blindly return cached data for
             # /simple/, because authors generally expect that
             # twine upload && pip install will function, but if
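The joined `Accept` value implements PEP 691 content negotiation: JSON is preferred, and the two HTML flavors remain as lower-quality fallbacks so older indexes keep working. A quick check of the exact header this produces (values copied from the hunk above):

```python
# The Accept header built by the change above; the q-values let PEP 691
# servers pick JSON while plain text/html servers still match.
accept = ", ".join(
    [
        "application/vnd.pypi.simple.v1+json",
        "application/vnd.pypi.simple.v1+html; q=0.2",
        "text/html; q=0.1",
    ]
)
assert accept == (
    "application/vnd.pypi.simple.v1+json, "
    "application/vnd.pypi.simple.v1+html; q=0.2, "
    "text/html; q=0.1"
)
```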
@@ -145,9 +164,10 @@ def _get_html_response(url: str, session: PipSession) -> Response:
     # The check for archives above only works if the url ends with
     # something that looks like an archive. However, that is not a
     # requirement of a url. Unless we issue a HEAD request on every
-    # url we cannot know ahead of time for sure if something is HTML
-    # or not. However we can check after we've downloaded it.
-    _ensure_html_header(resp)
+    # url we cannot know ahead of time for sure if something is a
+    # Simple API response or not. However, we can check after we've
+    # downloaded it.
+    _ensure_api_header(resp)

     return resp
@@ -273,7 +293,7 @@ def _create_link_from_element(


 class CacheablePageContent:
-    def __init__(self, page: "HTMLPage") -> None:
+    def __init__(self, page: "IndexContent") -> None:
         assert page.cache_link_parsing
         self.page = page

@@ -286,15 +306,15 @@ def __hash__(self) -> int:

 class ParseLinks(Protocol):
     def __call__(
-        self, page: "HTMLPage", use_deprecated_html5lib: bool
+        self, page: "IndexContent", use_deprecated_html5lib: bool
     ) -> Iterable[Link]:
         ...


-def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
+def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
     """
-    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
-    function's result (keyed by CacheablePageContent), unless the HTMLPage
+    Given a function that parses an Iterable[Link] from an IndexContent, cache the
+    function's result (keyed by CacheablePageContent), unless the IndexContent
     `page` has `page.cache_link_parsing == False`.
     """
@@ -305,15 +325,17 @@ def wrapper(
         return list(fn(cacheable_page.page, use_deprecated_html5lib))

     @functools.wraps(fn)
-    def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
+    def wrapper_wrapper(
+        page: "IndexContent", use_deprecated_html5lib: bool
+    ) -> List[Link]:
         if page.cache_link_parsing:
             return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
         return list(fn(page, use_deprecated_html5lib))

     return wrapper_wrapper


-def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
+def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
     """
     Parse an HTML document, and yield its anchor elements as Link objects.

@@ -338,12 +360,37 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
             yield link


-@with_cached_html_pages
-def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
+@with_cached_index_content
+def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]:
     """
-    Parse an HTML document, and yield its anchor elements as Link objects.
+    Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
     """

+    content_type_l = page.content_type.lower()
+    if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
+        data = json.loads(page.content)
+        for file in data.get("files", []):
+            file_url = file.get("url")
+            if file_url is None:
+                continue
+
+            # Link.yanked_reason expects an empty string instead of a boolean.
+            yanked_reason = file.get("yanked")
+            if yanked_reason and not isinstance(yanked_reason, str):
+                yanked_reason = ""
+            # Link.yanked_reason expects None instead of False.
+            elif not yanked_reason:
+                yanked_reason = None
+
+            yield Link(
+                _clean_link(urllib.parse.urljoin(page.url, file_url)),
+                comes_from=page.url,
+                requires_python=file.get("requires-python"),
+                yanked_reason=yanked_reason,
+            )
+        # Return early so the JSON body is not also fed to the HTML parsers.
+        return
+
     if use_deprecated_html5lib:
         yield from _parse_links_html5lib(page)
         return
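To illustrate what the new JSON branch consumes, here is a hypothetical PEP 691 project page (project name, files, and reason string are invented) with the same `yanked` normalization run standalone:

```python
import json

# Hypothetical PEP 691 body: "yanked" may be false, true, or a reason
# string; the loop above maps these to None, "", or the string itself.
payload = json.loads("""
{
  "meta": {"api-version": "1.0"},
  "name": "example-project",
  "files": [
    {"filename": "example_project-1.0-py3-none-any.whl",
     "url": "example_project-1.0-py3-none-any.whl",
     "requires-python": ">=3.7",
     "yanked": false},
    {"filename": "example_project-0.9.tar.gz",
     "url": "example_project-0.9.tar.gz",
     "yanked": "sdist is missing files"}
  ]
}
""")

for file in payload["files"]:
    yanked_reason = file.get("yanked")
    if yanked_reason and not isinstance(yanked_reason, str):
        yanked_reason = ""  # yanked: true with no reason given
    elif not yanked_reason:
        yanked_reason = None  # not yanked
    print(file["url"], repr(yanked_reason))
# example_project-1.0-py3-none-any.whl None
# example_project-0.9.tar.gz 'sdist is missing files'
```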
@@ -365,12 +410,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
         yield link


-class HTMLPage:
-    """Represents one page, along with its URL"""
+class IndexContent:
+    """Represents one response (or page), along with its URL"""

     def __init__(
         self,
         content: bytes,
+        content_type: str,
         encoding: Optional[str],
         url: str,
         cache_link_parsing: bool = True,
@@ -383,6 +429,7 @@ def __init__(
         have this set to False, for example.
         """
         self.content = content
+        self.content_type = content_type
         self.encoding = encoding
         self.url = url
         self.cache_link_parsing = cache_link_parsing
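For reference, a hypothetical construction of the updated class, showing where the new positional argument sits (URL and body are invented; in pip itself `_make_index_content` below builds this from a live response):

```python
# Illustrative only: content bytes come first, then the new content_type,
# matching the updated __init__ signature above.
page = IndexContent(
    b'{"meta": {"api-version": "1.0"}, "files": []}',
    "application/vnd.pypi.simple.v1+json",
    encoding=None,
    url="https://pypi.org/simple/example-project/",
)
assert page.content_type.startswith("application/vnd.pypi.simple.v1+json")
```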
@@ -419,7 +466,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
         return None


-def _handle_get_page_fail(
+def _handle_get_simple_fail(
     link: Link,
     reason: Union[str, Exception],
     meth: Optional[Callable[..., None]] = None,
@@ -429,19 +476,22 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)


-def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
+def _make_index_content(
+    response: Response, cache_link_parsing: bool = True
+) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(
+    return IndexContent(
         response.content,
+        response.headers["Content-Type"],
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
     )


-def _get_html_page(
+def _get_index_content(
     link: Link, session: Optional[PipSession] = None
-) -> Optional["HTMLPage"]:
+) -> Optional["IndexContent"]:
     if session is None:
         raise TypeError(
-            "_get_html_page() missing 1 required keyword argument: 'session'"
+            "_get_index_content() missing 1 required keyword argument: 'session'"
@@ -468,37 +518,39 @@ def _get_html_page(
             url += "/"
         url = urllib.parse.urljoin(url, "index.html")
         logger.debug(" file: URL is directory, getting %s", url)
+        # TODO: index.json?

     try:
-        resp = _get_html_response(url, session=session)
+        resp = _get_simple_response(url, session=session)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
             "be checked by an HTTP HEAD request.",
             link,
         )
-    except _NotHTML as exc:
+    except _NotAPIContent as exc:
         logger.warning(
-            "Skipping page %s because the %s request got Content-Type: %s."
-            "The only supported Content-Type is text/html",
+            "Skipping page %s because the %s request got Content-Type: %s. "
+            "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
+            "application/vnd.pypi.simple.v1+html, and text/html",
             link,
             exc.request_desc,
             exc.content_type,
         )
     except NetworkConnectionError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except RetryError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
         reason += str(exc)
-        _handle_get_page_fail(link, reason, meth=logger.info)
+        _handle_get_simple_fail(link, reason, meth=logger.info)
     except requests.ConnectionError as exc:
-        _handle_get_page_fail(link, f"connection error: {exc}")
+        _handle_get_simple_fail(link, f"connection error: {exc}")
     except requests.Timeout:
-        _handle_get_page_fail(link, "timed out")
+        _handle_get_simple_fail(link, "timed out")
     else:
-        return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing)
+        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
     return None
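Aside from the renames, this hunk also fixes a logging bug: the old message relied on implicit string concatenation without a separating space, so the two literals fused in the output. A minimal demonstration:

```python
# The old pair of adjacent literals, exactly as removed above: Python
# joins them with no space, garbling the log message.
old = (
    "Skipping page %s because the %s request got Content-Type: %s."
    "The only supported Content-Type is text/html"
)
assert "%s.The only supported" in old  # hence the added trailing space
```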
@@ -561,11 +613,11 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links

-    def fetch_page(self, location: Link) -> Optional[HTMLPage]:
+    def fetch_response(self, location: Link) -> Optional[IndexContent]:
         """
-        Fetch an HTML page containing package links.
+        Fetch a Simple API response or HTML page containing package links.
         """
-        return _get_html_page(location, session=self.session)
+        return _get_index_content(location, session=self.session)

     def collect_sources(
         self,