Commit 8dbbe16

Merge pull request #5838 from uranusjr/htmlpage-extract-breakdown-get-page
Refactor _get_html_page() to use exceptions for flow control
2 parents 541ec23 + fc53f71 commit 8dbbe16
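The pattern behind the refactor: instead of `_get_content_type()` returning an empty string as a sentinel that the caller has to interpret, the new helpers raise dedicated exception types (`_NotHTTP`, `_NotHTML`) and `_get_html_page()` branches with explicit `except` clauses. A minimal, self-contained sketch of that flow-control style, with illustrative names only (not pip's actual code):

    class NotHTML(Exception):
        """Raised when a response does not advertise an HTML body."""
        def __init__(self, content_type):
            super(NotHTML, self).__init__(content_type)
            self.content_type = content_type


    def ensure_html(headers):
        """Raise NotHTML unless the Content-Type header looks like HTML."""
        content_type = headers.get("Content-Type", "")
        if not content_type.lower().startswith("text/html"):
            raise NotHTML(content_type)


    def get_page(url, fetch):
        """fetch(url) is any callable returning (headers, body)."""
        headers, body = fetch(url)
        try:
            ensure_html(headers)
        except NotHTML as exc:
            # The caller decides what "skip" means; no sentinel strings involved.
            print('Skipping %s because of Content-Type: %s' % (url, exc.content_type))
            return None
        return body


    # Example: a stub fetcher that returns a non-HTML payload.
    print(get_page('https://example.com/pkg.tar.gz',
                   lambda url: ({'Content-Type': 'application/x-tar'}, b'...')))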

File tree: 3 files changed, +291 -82 lines changed

news/5838.bugfix

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Fix content type detection if a directory named like an archive is used as a package source.
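Context for this news entry, as I read the diff: previously the archive-name heuristic ran on the raw link before `index.html` was tacked onto directory-style `file://` URLs, so a local package directory whose name happens to end in an archive extension was skipped; in the new flow the directory URL is rewritten first, so the heuristic no longer matches. A rough, standalone approximation of that filename heuristic (not importing pip internals; the extension list is a small illustrative subset):

    import posixpath
    from urllib.parse import urlsplit, unquote

    # Illustrative subset of pip's ARCHIVE_EXTENSIONS.
    ARCHIVE_EXTENSIONS = ('.tar.gz', '.tar.bz2', '.zip', '.whl')


    def looks_like_archive(url):
        """Approximate the filename-based 'is this an archive?' check."""
        path = urlsplit(url).path.rstrip('/')
        filename = posixpath.basename(unquote(path))
        return filename.endswith(ARCHIVE_EXTENSIONS)


    # A directory used as a package source, unluckily named like an sdist:
    print(looks_like_archive('file:///srv/wheels-1.0.tar.gz/'))            # True
    # After index.html is appended to the directory URL, the name no longer matches:
    print(looks_like_archive('file:///srv/wheels-1.0.tar.gz/index.html'))  # False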

src/pip/_internal/index.py

Lines changed: 128 additions & 82 deletions
@@ -59,18 +59,113 @@
 logger = logging.getLogger(__name__)
 
 
-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()
 
     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()
 
-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check the Content-Type is HTML, to avoid downloading a large file.
+       Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly returned cached data for
+            # /simple/, because authors generally expecting that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache, is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all, we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the url ends with
+    # something that looks like an archive. However that is not a
+    # requirement of an url. Unless we issue a HEAD request on every
+    # url we cannot know ahead of time for sure if something is HTML
+    # or not. However we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp
 
 
 def _handle_get_page_fail(link, reason, url, meth=None):
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
         "_get_html_page() missing 1 required keyword argument: 'session'"
     )
 
-    url = link.url
-    url = url.split('#', 1)[0]
+    url = link.url.split('#', 1)[0]
 
     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None
 
-    try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)
 
-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly returned cached data for
-                # /simple/, because authors generally expecting that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
+    try:
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP as exc:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
         )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
@@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)
 
 
 class PackageFinder(object):
@@ -679,7 +728,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)
 
-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue
 
@@ -796,9 +845,6 @@ def _link_package_versions(self, link, search):
 
         return InstallationCandidate(search.supplied, version, link)
 
-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-
 
 def egg_info_matches(
         egg_info, search_name, link,
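A quick way to see the new helpers' behaviour is to feed `_ensure_html_header()` a stubbed response object. This sketch assumes the private names `_ensure_html_header` and `_NotHTML` are importable from `pip._internal.index` at this revision; they are internal and may move in later releases:

    from unittest import mock

    from pip._internal.index import _NotHTML, _ensure_html_header

    # Stub only the attributes the helper reads: response.headers and
    # response.request.method.
    resp = mock.Mock(
        headers={"Content-Type": "application/octet-stream"},
        request=mock.Mock(method="HEAD"),
    )

    try:
        _ensure_html_header(resp)
    except _NotHTML as exc:
        # The exception carries what _get_html_page() needs for its log message.
        print(exc.request_desc, exc.content_type)  # HEAD application/octet-stream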
