src/pip/_internal/index/collector.py

@@ -1,28 +1,27 @@
 """
-The main purpose of this module is to expose LinkCollector.collect_links().
+The main purpose of this module is to expose LinkCollector.collect_sources().
 """
 
 import cgi
+import collections
 import functools
 import html
 import itertools
 import logging
-import mimetypes
 import os
 import re
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree
-from collections import OrderedDict
 from optparse import Values
 from typing import (
     Callable,
     Iterable,
     List,
     MutableMapping,
+    NamedTuple,
     Optional,
     Sequence,
-    Tuple,
     Union,
 )
 
@@ -37,8 +36,9 @@
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
 from pip._internal.utils.misc import pairwise, redact_auth_from_url
-from pip._internal.utils.urls import path_to_url, url_to_path
-from pip._internal.vcs import is_url, vcs
+from pip._internal.vcs import vcs
+
+from .sources import CandidatesFromPage, LinkSource, build_source
 
 
 logger = logging.getLogger(__name__)
@@ -449,107 +449,9 @@ def _get_html_page(link, session=None):
     return None
 
 
-def _remove_duplicate_links(links):
-    # type: (Iterable[Link]) -> List[Link]
-    """
-    Return a list of links, with duplicates removed and ordering preserved.
-    """
-    # We preserve the ordering when removing duplicates because we can.
-    return list(OrderedDict.fromkeys(links))
-
-
-def group_locations(locations, expand_dir=False):
-    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
-    """
-    Divide a list of locations into two groups: "files" (archives) and "urls."
-
-    :return: A pair of lists (files, urls).
-    """
-    files = []
-    urls = []
-
-    # puts the url for the given file path into the appropriate list
-    def sort_path(path):
-        # type: (str) -> None
-        url = path_to_url(path)
-        if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
-            urls.append(url)
-        else:
-            files.append(url)
-
-    for url in locations:
-
-        is_local_path = os.path.exists(url)
-        is_file_url = url.startswith('file:')
-
-        if is_local_path or is_file_url:
-            if is_local_path:
-                path = url
-            else:
-                path = url_to_path(url)
-            if os.path.isdir(path):
-                if expand_dir:
-                    path = os.path.realpath(path)
-                    for item in os.listdir(path):
-                        sort_path(os.path.join(path, item))
-                elif is_file_url:
-                    urls.append(url)
-                else:
-                    logger.warning(
-                        "Path '%s' is ignored: it is a directory.", path,
-                    )
-            elif os.path.isfile(path):
-                sort_path(path)
-            else:
-                logger.warning(
-                    "Url '%s' is ignored: it is neither a file "
-                    "nor a directory.", url,
-                )
-        elif is_url(url):
-            # Only add url with clear scheme
-            urls.append(url)
-        else:
-            logger.warning(
-                "Url '%s' is ignored. It is either a non-existing "
-                "path or lacks a specific scheme.", url,
-            )
-
-    return files, urls
-
-
-class CollectedLinks:
-
-    """
-    Encapsulates the return value of a call to LinkCollector.collect_links().
-
-    The return value includes both URLs to project pages containing package
-    links, as well as individual package Link objects collected from other
-    sources.
-
-    This info is stored separately as:
-
-    (1) links from the configured file locations,
-    (2) links from the configured find_links, and
-    (3) urls to HTML project pages, as described by the PEP 503 simple
-        repository API.
-    """
-
-    def __init__(
-        self,
-        files,         # type: List[Link]
-        find_links,    # type: List[Link]
-        project_urls,  # type: List[Link]
-    ):
-        # type: (...) -> None
-        """
-        :param files: Links from file locations.
-        :param find_links: Links from find_links.
-        :param project_urls: URLs to HTML project pages, as described by
-            the PEP 503 simple repository API.
-        """
-        self.files = files
-        self.find_links = find_links
-        self.project_urls = project_urls
+class CollectedSources(NamedTuple):
+    find_links: Sequence[Optional[LinkSource]]
+    index_urls: Sequence[Optional[LinkSource]]
 
 
 class LinkCollector:
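
Aside (an editor's illustration, not part of the commit): the hand-written
CollectedLinks class above is replaced by a two-line typing.NamedTuple, which
keeps keyword construction and attribute access while also behaving as a
plain tuple. A minimal self-contained sketch of that trade-off; StubSource is
a hypothetical stand-in for pip's LinkSource, not a real pip type:

    from typing import NamedTuple, Optional, Sequence

    class StubSource:
        """Hypothetical stand-in for pip._internal.index.sources.LinkSource."""
        link: Optional[str] = "https://pypi.org/simple/pip/"

    class CollectedSources(NamedTuple):
        find_links: Sequence[Optional[StubSource]]
        index_urls: Sequence[Optional[StubSource]]

    collected = CollectedSources(find_links=[], index_urls=[StubSource()])
    assert collected.index_urls[0].link is not None   # attribute access
    find_links, index_urls = collected                # tuple unpacking for free
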
@@ -558,7 +460,7 @@ class LinkCollector:
     Responsible for collecting Link objects from all configured locations,
     making network requests as needed.
 
-    The class's main method is its collect_links() method.
+    The class's main method is its collect_sources() method.
     """
 
     def __init__(
@@ -609,51 +511,46 @@ def fetch_page(self, location):
         """
         return _get_html_page(location, session=self.session)
 
-    def collect_links(self, project_name):
-        # type: (str) -> CollectedLinks
-        """Find all available links for the given project name.
-
-        :return: All the Link objects (unfiltered), as a CollectedLinks object.
-        """
-        search_scope = self.search_scope
-        index_locations = search_scope.get_index_urls_locations(project_name)
-        index_file_loc, index_url_loc = group_locations(index_locations)
-        fl_file_loc, fl_url_loc = group_locations(
-            self.find_links, expand_dir=True,
-        )
-
-        file_links = [
-            Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
-        ]
-
-        # We trust every directly linked archive in find_links
-        find_link_links = [Link(url, '-f') for url in self.find_links]
-
-        # We trust every url that the user has given us whether it was given
-        # via --index-url or --find-links.
-        # We want to filter out anything that does not have a secure origin.
-        url_locations = [
-            link for link in itertools.chain(
-                # Mark PyPI indices as "cache_link_parsing == False" -- this
-                # will avoid caching the result of parsing the page for links.
-                (Link(url, cache_link_parsing=False) for url in index_url_loc),
-                (Link(url) for url in fl_url_loc),
+    def collect_sources(
+        self,
+        project_name: str,
+        candidates_from_page: CandidatesFromPage,
+    ) -> CollectedSources:
+        # The OrderedDict calls deduplicate sources by URL.
+        index_url_sources = collections.OrderedDict(
+            build_source(
+                loc,
+                candidates_from_page=candidates_from_page,
+                page_validator=self.session.is_secure_origin,
+                expand_dir=False,
+                cache_link_parsing=False,
+            )
+            for loc in self.search_scope.get_index_urls_locations(project_name)
+        ).values()
+        find_links_sources = collections.OrderedDict(
+            build_source(
+                loc,
+                candidates_from_page=candidates_from_page,
+                page_validator=self.session.is_secure_origin,
+                expand_dir=True,
+                cache_link_parsing=True,
             )
-            if self.session.is_secure_origin(link)
-        ]
-
-        url_locations = _remove_duplicate_links(url_locations)
-        lines = [
-            '{} location(s) to search for versions of {}:'.format(
-                len(url_locations), project_name,
-            ),
-        ]
-        for link in url_locations:
-            lines.append(f'* {link}')
-        logger.debug('\n'.join(lines))
-
-        return CollectedLinks(
-            files=file_links,
-            find_links=find_link_links,
-            project_urls=url_locations,
+            for loc in self.find_links
+        ).values()
+
+        if logger.isEnabledFor(logging.DEBUG):
+            lines = [
+                f"* {s.link}"
+                for s in itertools.chain(find_links_sources, index_url_sources)
+                if s is not None and s.link is not None
+            ]
+            lines = [
+                f"{len(lines)} location(s) to search "
+                f"for versions of {project_name}:"
+            ] + lines
+            logger.debug("\n".join(lines))
+
+        return CollectedSources(
+            find_links=list(find_links_sources),
+            index_urls=list(index_url_sources),
         )
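
Two implementation notes on the new collect_sources() (editor's sketch below,
not part of the commit). First, for the collections.OrderedDict(...) calls to
work, build_source() evidently yields (URL, source) pairs, and the mapping
keeps one source per URL in first-seen order; the subtlety is that a repeated
URL keeps its original position while the later source replaces the earlier
value. A toy demonstration with hypothetical stand-in pairs:

    import collections

    # Hypothetical (url, source) pairs standing in for build_source() output.
    pairs = [
        ("https://pypi.org/simple/", "index-a"),
        ("file:///srv/wheels/", "dir-b"),
        ("https://pypi.org/simple/", "index-c"),  # duplicate URL
    ]
    deduped = list(collections.OrderedDict(pairs).values())
    # The duplicate keeps slot 0, but the later value wins:
    assert deduped == ["index-c", "dir-b"]

Since Python 3.7 a plain dict would order keys the same way; OrderedDict just
makes the de-duplication intent explicit. Second, the
logger.isEnabledFor(logging.DEBUG) guard means the per-source f-string list is
only built when debug logging is actually enabled, rather than unconditionally
as the old collect_links() did.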