Commit 8dbbe16

Merge pull request #5838 from uranusjr/htmlpage-extract-breakdown-get-page
Refactor _get_html_page() to use exceptions for flow control
2 parents 541ec23 + fc53f71 commit 8dbbe16
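The pattern behind the refactor: instead of `_get_content_type()` returning an empty string as a sentinel that the caller has to interpret, the new helpers raise dedicated exception types (`_NotHTTP`, `_NotHTML`) and `_get_html_page()` branches with explicit `except` clauses. A minimal, self-contained sketch of that flow-control style, with illustrative names only (not pip's actual code):

    class NotHTML(Exception):
        """Raised when a response does not advertise an HTML body."""
        def __init__(self, content_type):
            super(NotHTML, self).__init__(content_type)
            self.content_type = content_type


    def ensure_html(headers):
        """Raise NotHTML unless the Content-Type header looks like HTML."""
        content_type = headers.get("Content-Type", "")
        if not content_type.lower().startswith("text/html"):
            raise NotHTML(content_type)


    def get_page(url, fetch):
        """fetch(url) is any callable returning (headers, body)."""
        headers, body = fetch(url)
        try:
            ensure_html(headers)
        except NotHTML as exc:
            # The caller decides what "skip" means; no sentinel strings involved.
            print('Skipping %s because of Content-Type: %s' % (url, exc.content_type))
            return None
        return body


    # Example: a stub fetcher that returns a non-HTML payload.
    print(get_page('https://example.com/pkg.tar.gz',
                   lambda url: ({'Content-Type': 'application/x-tar'}, b'...')))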

File tree: 3 files changed, +291 -82 lines changed

news/5838.bugfix

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Fix content type detection if a directory named like an archive is used as a package source.
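Context for this news entry, as I read the diff: previously the archive-name heuristic ran on the raw link before `index.html` was tacked onto directory-style `file://` URLs, so a local package directory whose name happens to end in an archive extension was skipped; in the new flow the directory URL is rewritten first, so the heuristic no longer matches. A rough, standalone approximation of that filename heuristic (not importing pip internals; the extension list is a small illustrative subset):

    import posixpath
    from urllib.parse import urlsplit, unquote

    # Illustrative subset of pip's ARCHIVE_EXTENSIONS.
    ARCHIVE_EXTENSIONS = ('.tar.gz', '.tar.bz2', '.zip', '.whl')


    def looks_like_archive(url):
        """Approximate the filename-based 'is this an archive?' check."""
        path = urlsplit(url).path.rstrip('/')
        filename = posixpath.basename(unquote(path))
        return filename.endswith(ARCHIVE_EXTENSIONS)


    # A directory used as a package source, unluckily named like an sdist:
    print(looks_like_archive('file:///srv/wheels-1.0.tar.gz/'))            # True
    # After index.html is appended to the directory URL, the name no longer matches:
    print(looks_like_archive('file:///srv/wheels-1.0.tar.gz/index.html'))  # False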

src/pip/_internal/index.py

Lines changed: 128 additions & 82 deletions
@@ -59,18 +59,113 @@
 logger = logging.getLogger(__name__)
 
 
-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()
 
     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()
 
-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check the Content-Type is HTML, to avoid downloading a large file.
+       Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly returned cached data for
+            # /simple/, because authors generally expecting that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache, is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all, we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the url ends with
+    # something that looks like an archive. However that is not a
+    # requirement of an url. Unless we issue a HEAD request on every
+    # url we cannot know ahead of time for sure if something is HTML
+    # or not. However we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp
 
 
 def _handle_get_page_fail(link, reason, url, meth=None):
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
         "_get_html_page() missing 1 required keyword argument: 'session'"
     )
 
-    url = link.url
-    url = url.split('#', 1)[0]
+    url = link.url.split('#', 1)[0]
 
     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None
 
-    try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)
 
-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly returned cached data for
-                # /simple/, because authors generally expecting that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
+    try:
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP as exc:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
         )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
@@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)
 
 
 class PackageFinder(object):
@@ -679,7 +728,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)
 
-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue
 
@@ -796,9 +845,6 @@ def _link_package_versions(self, link, search):
 
         return InstallationCandidate(search.supplied, version, link)
 
-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-
 
 def egg_info_matches(
         egg_info, search_name, link,
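A quick way to see the new helpers' behaviour is to feed `_ensure_html_header()` a stubbed response object. This sketch assumes the private names `_ensure_html_header` and `_NotHTML` are importable from `pip._internal.index` at this revision; they are internal and may move in later releases:

    from unittest import mock

    from pip._internal.index import _NotHTML, _ensure_html_header

    # Stub only the attributes the helper reads: response.headers and
    # response.request.method.
    resp = mock.Mock(
        headers={"Content-Type": "application/octet-stream"},
        request=mock.Mock(method="HEAD"),
    )

    try:
        _ensure_html_header(resp)
    except _NotHTML as exc:
        # The exception carries what _get_html_page() needs for its log message.
        print(exc.request_desc, exc.content_type)  # HEAD application/octet-stream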
