Commit a912c55

Rewrite collect_links
This introduces a collect_sources() method that does the same thing, but instead of flattening links eagerly, it returns each repository entry separately (and None for invalid repository options), so subsequent code can better distinguish which link comes from which repository.
1 parent a0f6041 commit a912c55
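
In short, collect_sources() pairs each configured location with its own entry instead of returning one flattened link list. A minimal consumption sketch (the collector variable, project name, and candidates_from_page callback are placeholders; the CollectedSources/LinkSource names and methods come from the diff below):

    # Hypothetical, already-configured LinkCollector and page callback.
    sources = collector.collect_sources(
        project_name="example-project",
        candidates_from_page=candidates_from_page,
    )

    # Each repository option yields its own entry; an invalid option comes
    # back as None, so a caller can tell which links belong to which
    # configured repository instead of getting one merged list.
    for source in (*sources.index_urls, *sources.find_links):
        if source is None:
            continue
        page_candidates = list(source.page_candidates())
        file_links = list(source.file_links())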

5 files changed: 389 additions & 218 deletions

docs/html/development/architecture/package-finding.rst

Lines changed: 2 additions & 2 deletions
@@ -101,7 +101,7 @@ One of ``PackageFinder``'s main top-level methods is
 1. Calls its ``find_all_candidates()`` method, which gathers all
    possible package links by reading and parsing the index URL's and
    locations provided by the user (the :ref:`LinkCollector
-   <link-collector-class>` class's ``collect_links()`` method), constructs a
+   <link-collector-class>` class's ``collect_sources()`` method), constructs a
    :ref:`LinkEvaluator <link-evaluator-class>` object to filter out some of
    those links, and then returns a list of ``InstallationCandidates`` (aka
    candidates for install). This corresponds to steps 1-3 of the
@@ -131,7 +131,7 @@ responsible for collecting the raw list of "links" to package files
 The ``LinkCollector`` class takes into account the user's :ref:`--find-links
 <install_--find-links>`, :ref:`--extra-index-url <install_--extra-index-url>`,
 and related options when deciding which locations to collect links from. The
-class's main method is the ``collect_links()`` method. The :ref:`PackageFinder
+class's main method is the ``collect_sources()`` method. The :ref:`PackageFinder
 <package-finder-class>` class invokes this method as the first step of its
 ``find_all_candidates()`` method.

src/pip/_internal/index/collector.py

Lines changed: 51 additions & 154 deletions
@@ -1,28 +1,27 @@
 """
-The main purpose of this module is to expose LinkCollector.collect_links().
+The main purpose of this module is to expose LinkCollector.collect_sources().
 """
 
 import cgi
+import collections
 import functools
 import html
 import itertools
 import logging
-import mimetypes
 import os
 import re
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree
-from collections import OrderedDict
 from optparse import Values
 from typing import (
     Callable,
     Iterable,
     List,
     MutableMapping,
+    NamedTuple,
     Optional,
     Sequence,
-    Tuple,
     Union,
 )
@@ -37,8 +36,9 @@
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
 from pip._internal.utils.misc import pairwise, redact_auth_from_url
-from pip._internal.utils.urls import path_to_url, url_to_path
-from pip._internal.vcs import is_url, vcs
+from pip._internal.vcs import vcs
+
+from .sources import CandidatesFromPage, LinkSource, build_source
 
 logger = logging.getLogger(__name__)
 
@@ -449,107 +449,9 @@ def _get_html_page(link, session=None):
     return None
 
 
-def _remove_duplicate_links(links):
-    # type: (Iterable[Link]) -> List[Link]
-    """
-    Return a list of links, with duplicates removed and ordering preserved.
-    """
-    # We preserve the ordering when removing duplicates because we can.
-    return list(OrderedDict.fromkeys(links))
-
-
-def group_locations(locations, expand_dir=False):
-    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
-    """
-    Divide a list of locations into two groups: "files" (archives) and "urls."
-
-    :return: A pair of lists (files, urls).
-    """
-    files = []
-    urls = []
-
-    # puts the url for the given file path into the appropriate list
-    def sort_path(path):
-        # type: (str) -> None
-        url = path_to_url(path)
-        if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
-            urls.append(url)
-        else:
-            files.append(url)
-
-    for url in locations:
-
-        is_local_path = os.path.exists(url)
-        is_file_url = url.startswith('file:')
-
-        if is_local_path or is_file_url:
-            if is_local_path:
-                path = url
-            else:
-                path = url_to_path(url)
-            if os.path.isdir(path):
-                if expand_dir:
-                    path = os.path.realpath(path)
-                    for item in os.listdir(path):
-                        sort_path(os.path.join(path, item))
-                elif is_file_url:
-                    urls.append(url)
-                else:
-                    logger.warning(
-                        "Path '%s' is ignored: it is a directory.", path,
-                    )
-            elif os.path.isfile(path):
-                sort_path(path)
-            else:
-                logger.warning(
-                    "Url '%s' is ignored: it is neither a file "
-                    "nor a directory.", url,
-                )
-        elif is_url(url):
-            # Only add url with clear scheme
-            urls.append(url)
-        else:
-            logger.warning(
-                "Url '%s' is ignored. It is either a non-existing "
-                "path or lacks a specific scheme.", url,
-            )
-
-    return files, urls
-
-
-class CollectedLinks:
-
-    """
-    Encapsulates the return value of a call to LinkCollector.collect_links().
-
-    The return value includes both URLs to project pages containing package
-    links, as well as individual package Link objects collected from other
-    sources.
-
-    This info is stored separately as:
-
-    (1) links from the configured file locations,
-    (2) links from the configured find_links, and
-    (3) urls to HTML project pages, as described by the PEP 503 simple
-        repository API.
-    """
-
-    def __init__(
-        self,
-        files,         # type: List[Link]
-        find_links,    # type: List[Link]
-        project_urls,  # type: List[Link]
-    ):
-        # type: (...) -> None
-        """
-        :param files: Links from file locations.
-        :param find_links: Links from find_links.
-        :param project_urls: URLs to HTML project pages, as described by
-            the PEP 503 simple repository API.
-        """
-        self.files = files
-        self.find_links = find_links
-        self.project_urls = project_urls
+class CollectedSources(NamedTuple):
+    find_links: Sequence[Optional[LinkSource]]
+    index_urls: Sequence[Optional[LinkSource]]
 
 
 class LinkCollector:
@@ -558,7 +460,7 @@ class LinkCollector:
     Responsible for collecting Link objects from all configured locations,
     making network requests as needed.
 
-    The class's main method is its collect_links() method.
+    The class's main method is its collect_sources() method.
     """
 
     def __init__(
@@ -609,51 +511,46 @@ def fetch_page(self, location):
         """
         return _get_html_page(location, session=self.session)
 
-    def collect_links(self, project_name):
-        # type: (str) -> CollectedLinks
-        """Find all available links for the given project name.
-
-        :return: All the Link objects (unfiltered), as a CollectedLinks object.
-        """
-        search_scope = self.search_scope
-        index_locations = search_scope.get_index_urls_locations(project_name)
-        index_file_loc, index_url_loc = group_locations(index_locations)
-        fl_file_loc, fl_url_loc = group_locations(
-            self.find_links, expand_dir=True,
-        )
-
-        file_links = [
-            Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
-        ]
-
-        # We trust every directly linked archive in find_links
-        find_link_links = [Link(url, '-f') for url in self.find_links]
-
-        # We trust every url that the user has given us whether it was given
-        # via --index-url or --find-links.
-        # We want to filter out anything that does not have a secure origin.
-        url_locations = [
-            link for link in itertools.chain(
-                # Mark PyPI indices as "cache_link_parsing == False" -- this
-                # will avoid caching the result of parsing the page for links.
-                (Link(url, cache_link_parsing=False) for url in index_url_loc),
-                (Link(url) for url in fl_url_loc),
+    def collect_sources(
+        self,
+        project_name: str,
+        candidates_from_page: CandidatesFromPage,
+    ) -> CollectedSources:
+        # The OrderedDict calls deduplicate sources by URL.
+        index_url_sources = collections.OrderedDict(
+            build_source(
+                loc,
+                candidates_from_page=candidates_from_page,
+                page_validator=self.session.is_secure_origin,
+                expand_dir=False,
+                cache_link_parsing=False,
+            )
+            for loc in self.search_scope.get_index_urls_locations(project_name)
+        ).values()
+        find_links_sources = collections.OrderedDict(
+            build_source(
+                loc,
+                candidates_from_page=candidates_from_page,
+                page_validator=self.session.is_secure_origin,
+                expand_dir=True,
+                cache_link_parsing=True,
             )
-            if self.session.is_secure_origin(link)
-        ]
-
-        url_locations = _remove_duplicate_links(url_locations)
-        lines = [
-            '{} location(s) to search for versions of {}:'.format(
-                len(url_locations), project_name,
-            ),
-        ]
-        for link in url_locations:
-            lines.append(f'* {link}')
-        logger.debug('\n'.join(lines))
-
-        return CollectedLinks(
-            files=file_links,
-            find_links=find_link_links,
-            project_urls=url_locations,
+            for loc in self.find_links
+        ).values()
+
+        if logger.isEnabledFor(logging.DEBUG):
+            lines = [
+                f"* {s.link}"
+                for s in itertools.chain(find_links_sources, index_url_sources)
+                if s is not None and s.link is not None
+            ]
+            lines = [
+                f"{len(lines)} location(s) to search "
+                f"for versions of {project_name}:"
+            ] + lines
+            logger.debug("\n".join(lines))
+
+        return CollectedSources(
+            find_links=list(find_links_sources),
+            index_urls=list(index_url_sources),
         )
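
A note on the deduplication idiom in collect_sources() above: the OrderedDict constructor consumes key/value pairs, so build_source evidently yields a (key, source) pair per location, and building the dict keeps one source per key while preserving first-seen key order. A self-contained sketch of that idiom (URLs and values made up):

    import collections

    pairs = [
        ("https://pypi.org/simple/", "index source"),
        ("/srv/wheels/", "find-links source"),
        ("https://pypi.org/simple/", "duplicate index source"),
    ]
    # A repeated key keeps its first-seen position but only its latest value,
    # so .values() yields exactly one entry per distinct key, in order.
    deduped = list(collections.OrderedDict(pairs).values())
    # -> ['duplicate index source', 'find-links source']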

src/pip/_internal/index/package_finder.py

Lines changed: 27 additions & 23 deletions
@@ -4,6 +4,7 @@
 # mypy: strict-optional=False
 
 import functools
+import itertools
 import logging
 import re
 from typing import FrozenSet, Iterable, List, Optional, Set, Tuple, Union
@@ -804,38 +805,41 @@ def find_all_candidates(self, project_name):
         See LinkEvaluator.evaluate_link() for details on which files
         are accepted.
         """
-        collected_links = self._link_collector.collect_links(project_name)
-
         link_evaluator = self.make_link_evaluator(project_name)
 
-        find_links_versions = self.evaluate_links(
-            link_evaluator,
-            links=collected_links.find_links,
+        collected_sources = self._link_collector.collect_sources(
+            project_name=project_name,
+            candidates_from_page=functools.partial(
+                self.process_project_url,
+                link_evaluator=link_evaluator,
+            ),
         )
 
-        page_versions = []
-        for project_url in collected_links.project_urls:
-            package_links = self.process_project_url(
-                project_url, link_evaluator=link_evaluator,
-            )
-            page_versions.extend(package_links)
+        page_candidates_it = itertools.chain.from_iterable(
+            source.page_candidates()
+            for sources in collected_sources
+            for source in sources
+            if source is not None
+        )
+        page_candidates = list(page_candidates_it)
 
-        file_versions = self.evaluate_links(
+        file_links_it = itertools.chain.from_iterable(
+            source.file_links()
+            for sources in collected_sources
+            for source in sources
+            if source is not None
+        )
+        file_candidates = self.evaluate_links(
             link_evaluator,
-            links=collected_links.files,
+            sorted(file_links_it, reverse=True),
         )
-        if file_versions:
-            file_versions.sort(reverse=True)
-            logger.debug(
-                'Local files found: %s',
-                ', '.join([
-                    url_to_path(candidate.link.url)
-                    for candidate in file_versions
-                ])
-            )
+
+        if logger.isEnabledFor(logging.DEBUG) and file_candidates:
+            paths = [url_to_path(c.link.url) for c in file_candidates]
+            logger.debug("Local files found: %s", ", ".join(paths))
 
         # This is an intentional priority ordering
-        return file_versions + find_links_versions + page_versions
+        return file_candidates + page_candidates
 
     def make_candidate_evaluator(
         self,
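
The consuming side above leans on the fact that a NamedTuple is iterable: looping over collected_sources yields its two sequences, and itertools.chain.from_iterable flattens the per-source iterables while the guard drops None entries. A stand-alone sketch of that pattern (the Sources type and data here are made up, mirroring CollectedSources):

    import itertools
    from typing import NamedTuple, Optional, Sequence

    class Sources(NamedTuple):
        find_links: Sequence[Optional[str]]
        index_urls: Sequence[Optional[str]]

    collected = Sources(find_links=["a", None], index_urls=["b", "c"])

    flat = list(
        itertools.chain.from_iterable(
            [item.upper()]  # stand-in for source.page_candidates()
            for group in collected  # the two fields, in declaration order
            for item in group
            if item is not None  # skip invalid repository options
        )
    )
    # -> ['A', 'B', 'C']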
