@@ -59,18 +59,113 @@
 logger = logging.getLogger(__name__)


-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
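
A quick sketch of the new helper's behaviour (example URLs are illustrative, not from the commit):

    _match_vcs_scheme('git+https://github.com/pypa/pip.git')  # -> 'git'
    _match_vcs_scheme('https://pypi.org/simple/pip/')         # -> None
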
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
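
For example (hypothetical URLs; .tar.gz is among ARCHIVE_EXTENSIONS):

    _is_url_like_archive('https://files.example.com/pip-18.0.tar.gz')  # -> True
    _is_url_like_archive('https://pypi.org/simple/pip/')               # -> False
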
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
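
A minimal sketch of what this rejects, using PyPI's JSON API as a convenient non-HTML endpoint (illustrative only):

    resp = session.get('https://pypi.org/pypi/pip/json')
    _ensure_html_header(resp)
    # raises _NotHTML('application/json', 'GET'); the Content-Type and the
    # request method are kept on the exception so callers can log both.
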
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()

     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()

-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
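
Roughly, assuming a server that reports an archive Content-Type (URLs are made up):

    _ensure_html_response('ftp://mirror.example/pip.tar.gz', session)
    # raises _NotHTTP: the scheme rules out a meaningful HEAD request
    _ensure_html_response('https://example.com/pip-18.0.tar.gz', session)
    # HEAD succeeds, but Content-Type is application/gzip -> _NotHTML
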
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check the Content-Type is HTML, to avoid downloading a large file.
+       Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly return cached data for
+            # /simple/, because authors generally expect that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data; however, the benefit of
+            # using max-age=0 instead of no-cache is that we still
+            # support conditional requests, so we still minimize
+            # traffic sent in cases where the page hasn't changed
+            # at all; we just always incur the round trip for the
+            # conditional GET now instead of only once per
+            # 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the URL ends with
+    # something that looks like an archive. However, that is not a
+    # requirement of a URL. Unless we issue a HEAD request on every
+    # URL we cannot know ahead of time for sure if something is HTML
+    # or not. However, we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp


 def _handle_get_page_fail(link, reason, url, meth=None):
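
Taken together, a hypothetical trace for an archive-looking URL (invented for illustration):

    resp = _get_html_response('https://example.com/pip-18.0.tar.gz', session)
    # 1. _is_url_like_archive() is True, so a cheap HEAD probe goes out first
    # 2. the HEAD reports, say, application/gzip, so _NotHTML is raised and
    #    the potentially large GET never happens
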
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
             "_get_html_page() missing 1 required keyword argument: 'session'"
         )

-    url = link.url
-    url = url.split('#', 1)[0]
+    url = link.url.split('#', 1)[0]

     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None

-    try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)
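
The trailing-slash comment above matters: urljoin drops the final path segment when the base URL lacks a trailing slash (standard urllib behaviour):

    urljoin('file:///srv/packages', 'index.html')   # -> 'file:///srv/index.html'
    urljoin('file:///srv/packages/', 'index.html')  # -> 'file:///srv/packages/index.html'
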

-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly returned cached data for
-                # /simple/, because authors generally expecting that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
+    try:
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP as exc:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
         )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
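
The exception-based flow replaces the old sentinel values (the empty string from _get_content_type, the bare return above), so each skip reason is logged exactly once with its cause. A rough caller-side sketch:

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTML as exc:
        # exc.request_desc says whether the HEAD probe or the real GET
        # saw the bad Content-Type, e.g. ('HEAD', 'application/gzip')
        logger.debug('%s got Content-Type: %s',
                     exc.request_desc, exc.content_type)
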
@@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)


 class PackageFinder(object):
@@ -679,7 +728,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)

-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue

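
Since _get_html_page is now a module-level function taking an explicit session (the _get_page method is deleted in the next hunk), any caller holding a session can fetch pages without going through a PackageFinder. A hypothetical call, assuming an existing finder:

    page = _get_html_page(Link(url), session=finder.session)
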
@@ -796,9 +845,6 @@ def _link_package_versions(self, link, search):

         return InstallationCandidate(search.supplied, version, link)

-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-

 def egg_info_matches(
         egg_info, search_name, link,