diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index 9db23e6ce04bd5..c604097fc91eca 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -808,12 +808,12 @@ def test_file(self): self.assertEqual(respurl, url) for url in [ - "file://localhost:80%s" % urlpath, + "file://localhost:80/%s" % urlpath, "file:///file_does_not_exist.txt", "file://not-a-local-host.com//dir/file.txt", - "file://%s:80%s/%s" % (socket.gethostbyname('localhost'), + "file://%s:80/%s/%s" % (socket.gethostbyname('localhost'), os.getcwd(), TESTFN), - "file://somerandomhost.ontheinternet.com%s/%s" % + "file://somerandomhost.ontheinternet.com/%s/%s" % (os.getcwd(), TESTFN), ]: try: diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index dff9a8ede9b601..342cd300005135 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -608,9 +608,8 @@ def test_urlsplit_attributes(self): # Verify an illegal port raises ValueError url = b"HTTP://WWW.PYTHON.ORG:65536/doc/#frag" - p = urllib.parse.urlsplit(url) with self.assertRaisesRegex(ValueError, "out of range"): - p.port + urllib.parse.urlsplit(url) def test_urlsplit_remove_unsafe_bytes(self): # Remove ASCII tabs and newlines from input @@ -660,10 +659,8 @@ def test_attributes_bad_port(self): if bytes: netloc = netloc.encode("ascii") url = url.encode("ascii") - p = parse(url) - self.assertEqual(p.netloc, netloc) with self.assertRaises(ValueError): - p.port + parse(url) def test_attributes_without_netloc(self): # This example is straight from RFC 3261. It looks like it @@ -1014,13 +1011,11 @@ def test_issue14072(self): def test_port_casting_failure_message(self): message = "Port could not be cast to integer value as 'oracle'" - p1 = urllib.parse.urlparse('http://Server=sde; Service=sde:oracle') with self.assertRaisesRegex(ValueError, message): - p1.port + urllib.parse.urlparse('http://Server=sde; Service=sde:oracle') - p2 = urllib.parse.urlsplit('http://Server=sde; Service=sde:oracle') with self.assertRaisesRegex(ValueError, message): - p2.port + urllib.parse.urlsplit('http://Server=sde; Service=sde:oracle') def test_telurl_params(self): p1 = urllib.parse.urlparse('tel:123-4;phone-context=+1-650-516') diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index bf16d0f42e5794..4fe73661b6a63e 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -32,6 +32,7 @@ import re import sys import types +import collections import warnings __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", @@ -123,6 +124,18 @@ def _coerce_args(*args): return args + (_noop,) return _decode_args(args) + (_encode_result,) +def _parse_hostinfo(netloc): + _, _, hostinfo = netloc.rpartition('@') + _, have_open_br, bracketed = hostinfo.partition('[') + if have_open_br: + hostname, _, port = bracketed.partition(']') + _, _, port = port.partition(':') + else: + hostname, _, port = hostinfo.partition(':') + if not port: + port = None + return hostname, port + # Result objects are more helpful than simple tuples class _ResultMixinStr(object): """Standard approach to encoding parsed results from str to bytes""" @@ -167,13 +180,7 @@ def hostname(self): def port(self): port = self._hostinfo[1] if port is not None: - try: - port = int(port, 10) - except ValueError: - message = f'Port could not be cast to integer value as {port!r}' - raise ValueError(message) from None - if not ( 0 <= port <= 65535): - raise ValueError("Port out of range 0-65535") + port = _validate_port(port) return port __class_getitem__ = classmethod(types.GenericAlias) @@ -197,16 +204,7 @@ def _userinfo(self): @property def _hostinfo(self): netloc = self.netloc - _, _, hostinfo = netloc.rpartition('@') - _, have_open_br, bracketed = hostinfo.partition('[') - if have_open_br: - hostname, _, port = bracketed.partition(']') - _, _, port = port.partition(':') - else: - hostname, _, port = hostinfo.partition(':') - if not port: - port = None - return hostname, port + return _parse_hostinfo(netloc) class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): @@ -402,6 +400,16 @@ def _splitparams(url): i = url.find(';') return url[:i], url[i+1:] +def _validate_port(port_str): + try: + port = int(port_str, 10) + except ValueError: + message = f'Port could not be cast to integer value as {port_str!r}' + raise ValueError(message) from None + if not (0 <= port <= 65535): + raise ValueError("Port out of range 0-65535: %r" % port_str) + return port + def _splitnetloc(url, start=0): delim = len(url) # position of end of domain part of url, default is end for c in '/?#': # look for delimiters; the order is NOT important @@ -410,9 +418,7 @@ def _splitnetloc(url, start=0): delim = min(delim, wdelim) # use earliest delim position return url[start:delim], url[delim:] # return (domain, rest) -def _checknetloc(netloc): - if not netloc or netloc.isascii(): - return +def _checknetloc_nfkc(netloc): # looking for characters like \u2100 that expand to 'a/c' # IDNA uses NFKC equivalence, so normalize for this check import unicodedata @@ -428,6 +434,14 @@ def _checknetloc(netloc): raise ValueError("netloc '" + netloc + "' contains invalid " + "characters under NFKC normalization") +def _checknetloc(netloc): + if netloc and not netloc.isascii(): + _checknetloc_nfkc(netloc) + + _, port = _parse_hostinfo(netloc) + if port is not None: + _validate_port(port) + # typed=True avoids BytesWarnings being emitted during cache key # comparison since this API supports both bytes and str input. @functools.lru_cache(typed=True) diff --git a/Misc/ACKS b/Misc/ACKS index b023bcb6d72fd3..b208fb74408719 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -227,6 +227,7 @@ Tom Bridgman Anthony Briggs Keith Briggs Tobias Brink +Miguel Brito Dillon Brock Richard Brodie Michael Broghton diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-05-01-10-22-18.bpo-43871.y2Cvah.rst b/Misc/NEWS.d/next/Core and Builtins/2021-05-01-10-22-18.bpo-43871.y2Cvah.rst new file mode 100644 index 00000000000000..811278ea26307c --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-05-01-10-22-18.bpo-43871.y2Cvah.rst @@ -0,0 +1 @@ +Move port validation logic to parsing time. Patch by Miguel Brito.