Change CollectedLinks to store project page URLs #7073

Merged: 4 commits, Nov 11, 2019
39 changes: 22 additions & 17 deletions docs/html/development/architecture/package-finding.rst
@@ -15,17 +15,16 @@ Overview
Here is a rough description of the process that pip uses to choose what
file to download for a package, given a requirement:

1. Access the various network and file system locations configured for pip
that contain package files. These locations can include, for example,
pip's :ref:`--index-url <--index-url>` (with default
https://pypi.org/simple/ ) and any configured
:ref:`--extra-index-url <--extra-index-url>` locations.
Each of these locations is a `PEP 503`_ "simple repository" page, which
is an HTML page of anchor links.
2. Collect together all of the links (e.g. by parsing the anchor links
from the HTML pages) and create ``Link`` objects from each of these.
The :ref:`LinkCollector <link-collector-class>` class is responsible
for both this step and the previous.
1. Collect together the various network and file system locations containing
project package files. These locations are derived, for example, from pip's
:ref:`--index-url <--index-url>` (with default https://pypi.org/simple/ )
setting and any configured :ref:`--extra-index-url <--extra-index-url>`
locations. Each of the project page URLs is an HTML page of anchor links,
as defined in `PEP 503`_, the "Simple Repository API."
2. For each project page URL, fetch the HTML and parse out the anchor links,
creating a ``Link`` object from each one. The :ref:`LinkCollector
<link-collector-class>` class is responsible for both the previous step
and fetching the HTML over the network.
3. Determine which of the links are minimally relevant, using the
:ref:`LinkEvaluator <link-evaluator-class>` class. Create an
``InstallationCandidate`` object (aka candidate for install) for each
@@ -111,6 +110,12 @@ One of ``PackageFinder``'s main top-level methods is
class's ``compute_best_candidate()`` method on the return value of
``find_all_candidates()``. This corresponds to steps 4-5 of the Overview.

``PackageFinder`` also has a ``process_project_url()`` method (called by
``find_all_candidates()``) to process a `PEP 503`_ "simple repository"
project page. This method fetches and parses the HTML from a PEP 503 project
page URL, extracts the anchor elements and creates ``Link`` objects from
them, and then evaluates those links.
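
For orientation, a minimal sketch of how these top-level methods relate,
assuming an already-configured ``PackageFinder`` instance named ``finder``
(illustrative only, not the exact implementation):

    # Steps 1-3 of the Overview: collect links, evaluate them, and
    # build InstallationCandidate objects.
    candidates = finder.find_all_candidates('twine')

    # Steps 4-5: filter and sort those candidates to pick the best one.
    # Internally this calls find_all_candidates(), which in turn calls
    # process_project_url() for each PEP 503 project page URL.
    result = finder.find_best_candidate('twine')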


.. _link-collector-class:

@@ -119,12 +124,8 @@ The ``LinkCollector`` class

The :ref:`LinkCollector <link-collector-class>` class is the class
responsible for collecting the raw list of "links" to package files
(represented as ``Link`` objects). An instance of the class accesses the
various `PEP 503`_ HTML "simple repository" pages, parses their HTML,
extracts the links from the anchor elements, and creates ``Link`` objects
from that information. The ``LinkCollector`` class is "unintelligent" in that
it doesn't do any evaluation of whether the links are relevant to the
original requirement; it just collects them.
(represented as ``Link`` objects) from file system locations, as well as the
`PEP 503`_ project page URLs that ``PackageFinder`` should access.

The ``LinkCollector`` class takes into account the user's :ref:`--find-links
<--find-links>`, :ref:`--extra-index-url <--extra-index-url>`, and related
@@ -133,6 +134,10 @@ method is the ``collect_links()`` method. The :ref:`PackageFinder
<package-finder-class>` class invokes this method as the first step of its
``find_all_candidates()`` method.

``LinkCollector`` also has a ``fetch_page()`` method to fetch the HTML from a
project page URL. This method is "unintelligent" in that it doesn't parse the
HTML.
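
A short sketch of the division of labor described in this section, assuming
an already-constructed ``LinkCollector`` named ``link_collector`` (attribute
names are taken from this PR's diff):

    # collect_links() gathers link sources but does not fetch or parse
    # any PEP 503 project page.
    collected = link_collector.collect_links('twine')
    collected.files          # List[Link]: links from file locations
    collected.find_links     # List[Link]: links from --find-links
    collected.project_urls   # List[Link]: project pages, not yet fetched

    # fetch_page() fetches one page's HTML; parsing happens elsewhere
    # (in parse_links(), called by PackageFinder.process_project_url()).
    page = link_collector.fetch_page(collected.project_urls[0])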

The ``LinkCollector`` class is the only class in the ``index`` sub-package that
makes network requests and is the only class in the sub-package that depends
directly on ``PipSession``, which stores pip's configuration options and
47 changes: 22 additions & 25 deletions src/pip/_internal/index/collector.py
@@ -27,8 +27,8 @@

if MYPY_CHECK_RUNNING:
from typing import (
Callable, Dict, Iterable, List, MutableMapping, Optional, Sequence,
Tuple, Union,
Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
Union,
)
import xml.etree.ElementTree

@@ -435,29 +435,36 @@ def sort_path(path):
class CollectedLinks(object):

"""
Encapsulates all the Link objects collected by a call to
LinkCollector.collect_links(), stored separately as--
Encapsulates the return value of a call to LinkCollector.collect_links().

The return value includes both URLs to project pages containing package
links, as well as individual package Link objects collected from other
sources.

This info is stored separately as:

(1) links from the configured file locations,
(2) links from the configured find_links, and
(3) a dict mapping HTML page url to links from that page.
(3) URLs to HTML project pages, as described by the PEP 503 simple
repository API.
"""

def __init__(
self,
files, # type: List[Link]
find_links, # type: List[Link]
pages, # type: Dict[str, List[Link]]
files, # type: List[Link]
find_links, # type: List[Link]
project_urls, # type: List[Link]
):
# type: (...) -> None
"""
:param files: Links from file locations.
:param find_links: Links from find_links.
:param pages: A dict mapping HTML page url to links from that page.
:param project_urls: URLs to HTML project pages, as described by
the PEP 503 simple repository API.
"""
self.files = files
self.find_links = find_links
self.pages = pages
self.project_urls = project_urls


class LinkCollector(object):
@@ -483,18 +490,12 @@ def find_links(self):
# type: () -> List[str]
return self.search_scope.find_links

def _get_pages(self, locations):
# type: (Iterable[Link]) -> Iterable[HTMLPage]
def fetch_page(self, location):
# type: (Link) -> Optional[HTMLPage]
"""
Yields (page, page_url) from the given locations, skipping
locations that have errors.
Fetch an HTML page containing package links.
"""
for location in locations:
page = _get_html_page(location, session=self.session)
if page is None:
continue

yield page
return _get_html_page(location, session=self.session)

def collect_links(self, project_name):
# type: (str) -> CollectedLinks
@@ -537,12 +538,8 @@ def collect_links(self, project_name):
lines.append('* {}'.format(link))
logger.debug('\n'.join(lines))

pages_links = {}
for page in self._get_pages(url_locations):
pages_links[page.url] = list(parse_links(page))

return CollectedLinks(
files=file_links,
find_links=find_link_links,
pages=pages_links,
project_urls=url_locations,
)
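
To make the behavioral change concrete from the caller's side, an
illustrative before/after sketch (``collected`` stands for the
``CollectedLinks`` return value, and ``handle()`` is a hypothetical
placeholder):

    # Before this PR: collect_links() fetched and parsed every project
    # page itself, so callers iterated over already-parsed links:
    #
    #     for page_url, page_links in collected.pages.items():
    #         handle(page_links)

    # After this PR: callers receive unfetched project page URLs and
    # decide when to fetch and parse each one.
    for project_url in collected.project_urls:
        page = link_collector.fetch_page(project_url)  # None on error
        if page is not None:
            handle(list(parse_links(page)))
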
33 changes: 25 additions & 8 deletions src/pip/_internal/index/package_finder.py
@@ -19,6 +19,7 @@
InvalidWheelFilename,
UnsupportedWheel,
)
from pip._internal.index.collector import parse_links
from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.format_control import FormatControl
from pip._internal.models.link import Link
@@ -778,6 +779,25 @@ def evaluate_links(self, link_evaluator, links):

return candidates

def process_project_url(self, project_url, link_evaluator):
# type: (Link, LinkEvaluator) -> List[InstallationCandidate]
logger.debug(
'Fetching project page and analyzing links: %s', project_url,
)
html_page = self._link_collector.fetch_page(project_url)
if html_page is None:
return []

page_links = list(parse_links(html_page))

with indent_log():
package_links = self.evaluate_links(
link_evaluator,
links=page_links,
)

return package_links

def find_all_candidates(self, project_name):
# type: (str) -> List[InstallationCandidate]
"""Find all available InstallationCandidate for project_name
@@ -798,14 +818,11 @@ def find_all_candidates(self, project_name):
)

page_versions = []
for page_url, page_links in collected_links.pages.items():
logger.debug('Analyzing links from page %s', page_url)
with indent_log():
new_versions = self.evaluate_links(
link_evaluator,
links=page_links,
)
page_versions.extend(new_versions)
for project_url in collected_links.project_urls:
package_links = self.process_project_url(
project_url, link_evaluator=link_evaluator,
)
page_versions.extend(package_links)

file_versions = self.evaluate_links(
link_evaluator,
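
The moved logic can now be exercised in isolation, as the new unit test
below does. A condensed restatement of that test (``make_test_finder`` and
the ``data``-derived ``index_url`` are helpers and fixtures from pip's test
suite):

    project_url = Link('{}/simple'.format(index_url))
    finder = make_test_finder(index_urls=[index_url])
    link_evaluator = finder.make_link_evaluator('simple')
    candidates = finder.process_project_url(
        project_url, link_evaluator=link_evaluator,
    )
    # Each entry is an InstallationCandidate with .project and .version.
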
6 changes: 3 additions & 3 deletions tests/functional/test_install_config.py
@@ -91,7 +91,7 @@ def test_command_line_append_flags(script, virtualenv, data):
'test.pypi.org',
)
assert (
"Analyzing links from page https://test.pypi.org"
"Fetching project page and analyzing links: https://test.pypi.org"
in result.stdout
), str(result)
virtualenv.clear()
@@ -100,7 +100,7 @@
'--trusted-host', 'test.pypi.org',
)
assert (
"Analyzing links from page https://test.pypi.org"
"Fetching project page and analyzing links: https://test.pypi.org"
in result.stdout
)
assert (
@@ -124,7 +124,7 @@ def test_command_line_appends_correctly(script, data):
)

assert (
"Analyzing links from page https://test.pypi.org"
"Fetching project page and analyzing links: https://test.pypi.org"
in result.stdout
), result.stdout
assert (
37 changes: 21 additions & 16 deletions tests/unit/test_collector.py
@@ -436,13 +436,28 @@ def check_links_include(links, names):
class TestLinkCollector(object):

@patch('pip._internal.index.collector._get_html_response')
def test_collect_links(self, mock_get_html_response, caplog, data):
caplog.set_level(logging.DEBUG)
def test_fetch_page(self, mock_get_html_response):
url = 'https://pypi.org/simple/twine/'

fake_response = make_fake_html_response(url)
mock_get_html_response.return_value = fake_response

expected_url = 'https://pypi.org/simple/twine/'
location = Link(url)
link_collector = make_test_link_collector()
actual = link_collector.fetch_page(location)

fake_page = make_fake_html_response(expected_url)
mock_get_html_response.return_value = fake_page
assert actual.content == fake_response.content
assert actual.encoding is None
assert actual.url == url

# Also check that the right session object was passed to
# _get_html_response().
mock_get_html_response.assert_called_once_with(
url, session=link_collector.session,
)

def test_collect_links(self, caplog, data):
caplog.set_level(logging.DEBUG)

link_collector = make_test_link_collector(
find_links=[data.find_links],
@@ -452,24 +467,14 @@ def test_collect_links(self, mock_get_html_response, caplog, data):
)
actual = link_collector.collect_links('twine')

mock_get_html_response.assert_called_once_with(
expected_url, session=link_collector.session,
)

# Spot-check the CollectedLinks return value.
assert len(actual.files) > 20
check_links_include(actual.files, names=['simple-1.0.tar.gz'])

assert len(actual.find_links) == 1
check_links_include(actual.find_links, names=['packages'])

actual_pages = actual.pages
assert list(actual_pages) == [expected_url]
actual_page_links = actual_pages[expected_url]
assert len(actual_page_links) == 1
assert actual_page_links[0].url == (
'https://pypi.org/abc-1.0.tar.gz#md5=000000000'
)
assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]

expected_message = dedent("""\
1 location(s) to search for versions of twine:
16 changes: 16 additions & 0 deletions tests/unit/test_finder.py
@@ -473,6 +473,22 @@ def test_evaluate_link__substring_fails(self, url, expected_msg):
assert actual == (False, expected_msg)


def test_process_project_url(data):
project_name = 'simple'
index_url = data.index_url('simple')
project_url = Link('{}/{}'.format(index_url, project_name))
finder = make_test_finder(index_urls=[index_url])
link_evaluator = finder.make_link_evaluator(project_name)
actual = finder.process_project_url(
project_url, link_evaluator=link_evaluator,
)

assert len(actual) == 1
package_link = actual[0]
assert package_link.project == 'simple'
assert str(package_link.version) == '1.0'


def test_find_all_candidates_nothing():
"""Find nothing without anything"""
finder = make_test_finder()