
Commit 5544cd8

Merge pull request #984 from pypa/skip-non-html
Do a check prior to returning HTMLPage that it is indeed html
2 parents: 60f8da5 + 75cef55

1 file changed: 15 additions, 0 deletions


pip/index.py

@@ -544,6 +544,21 @@ def get_page(cls, link, req, cache=None, skip_archives=True):
                     contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
                 if encoding == 'deflate':
                     contents = zlib.decompress(contents)
+
+            # The check for archives above only works if the url ends with
+            # something that looks like an archive. However that is not a
+            # requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
+            # redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
+            # Unless we issue a HEAD request on every url we cannot know
+            # ahead of time for sure if something is HTML or not. However we
+            # can check after we've downloaded it.
+            if not headers["Content-Type"].lower().startswith("text/html"):
+                logger.debug('Skipping page %s because of Content-Type: %s' %
+                             (link, headers["Content-Type"]))
+                if cache is not None:
+                    cache.set_is_archive(url)
+                return None
+
             inst = cls(u(contents), real_url, headers)
         except (HTTPError, URLError, socket.timeout, socket.error, OSError, WindowsError):
             e = sys.exc_info()[1]
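
In isolation, the new check amounts to: fetch the URL, look at the response's Content-Type header, and bail out if the server does not say the body is HTML. Below is a minimal standalone sketch of that idea using only the standard library; it is not pip's code, and the function name fetch_html_or_none is hypothetical.

# Minimal sketch of the Content-Type check, independent of pip's
# HTMLPage/cache machinery. Illustrative only.
from urllib.request import urlopen

def fetch_html_or_none(url):
    """Return the response body if the server reports text/html, else None."""
    resp = urlopen(url)
    content_type = resp.headers.get("Content-Type", "")
    if not content_type.lower().startswith("text/html"):
        # e.g. a tarball reached through a redirecting ".../download" URL
        print("Skipping page %s because of Content-Type: %s" % (url, content_type))
        return None
    return resp.read()

In the actual change, pip additionally records the URL as an archive in its page cache (cache.set_is_archive(url)) so the non-HTML response is not fetched and parsed again.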
