From 75cef55df7c86bdc93915522beea0c4118e77fda Mon Sep 17 00:00:00 2001
From: Donald Stufft
Date: Fri, 7 Jun 2013 08:38:30 -0400
Subject: [PATCH] Do a check prior to returning HTMLPage that it is indeed html

Previously pip would check with a HEAD request if the url looked like
it contained an archive and skip the file. However this didn't work if
the url didn't look like an archive and instead just redirected to an
archive. This will therefore, after the url has been fetched, inspect
the headers and look to see if the Content-Type is text/html.
---
 pip/index.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pip/index.py b/pip/index.py
index 00bbdb10eb4..717da182c98 100644
--- a/pip/index.py
+++ b/pip/index.py
@@ -544,6 +544,21 @@ def get_page(cls, link, req, cache=None, skip_archives=True):
                     contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
                 if encoding == 'deflate':
                     contents = zlib.decompress(contents)
+
+            # The check for archives above only works if the url ends with
+            # something that looks like an archive. However that is not a
+            # requirement. For instance http://sourceforge.net/projects/docutils/files/docutils/0.8.1/docutils-0.8.1.tar.gz/download
+            # redirects to http://superb-dca3.dl.sourceforge.net/project/docutils/docutils/0.8.1/docutils-0.8.1.tar.gz
+            # Unless we issue a HEAD request on every url we cannot know
+            # ahead of time for sure if something is HTML or not. However we
+            # can check after we've downloaded it.
+            if not headers["Content-Type"].lower().startswith("text/html"):
+                logger.debug('Skipping page %s because of Content-Type: %s' %
+                             (link, headers["Content-Type"]))
+                if cache is not None:
+                    cache.set_is_archive(url)
+                return None
+
             inst = cls(u(contents), real_url, headers)
         except (HTTPError, URLError, socket.timeout, socket.error, OSError, WindowsError):
             e = sys.exc_info()[1]
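
As an aside, the same post-download check can be sketched standalone, assuming only
the Python 3 standard library rather than pip's internals; the function name below
is an illustrative assumption, not part of the patch.

    # Sketch only: fetch a url and treat the body as an index page only if the
    # server reports an HTML Content-Type, mirroring the check added above.
    from urllib.request import urlopen

    def fetch_index_page(url):
        resp = urlopen(url)
        content_type = resp.headers.get("Content-Type", "")
        if not content_type.lower().startswith("text/html"):
            # The url resolved to something else (e.g. an archive); skip it.
            return None
        return resp.read()

A /download-style url that redirects to a .tar.gz would return None here even though
nothing in the url itself looks like an archive.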