diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 4eb6f4da..04ac4b71 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -20,14 +20,20 @@ import string import socket from unicodedata import normalize - +try: + from socket import inet_pton +except ImportError: + inet_pton = None # defined below try: from collections.abc import Mapping except ImportError: # Python 2 from collections import Mapping -try: - from socket import inet_pton -except ImportError: + +# Note: IDNAError is a subclass of UnicodeError +from idna import encode as idna_encode, decode as idna_decode, IDNAError + + +if inet_pton is None: # based on https://gist.github.com/nnemkin/4966028 # this code only applies on Windows Python 2.7 import ctypes @@ -573,13 +579,59 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', def _decode_host(host): + """Decode a host from ASCII-encodable text to IDNA-decoded text. If + the host text is not ASCII, it is returned unchanged, as it is + presumed that it is already IDNA-decoded. + + Some technical details: _decode_host is built on top of the "idna" + package, which has some quirks: + + Capital letters are not valid IDNA2008. The idna package will + raise an exception like this on capital letters: + + > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed + + However, if a segment of a host (i.e., something in + url.host.split('.')) is already ASCII, idna doesn't perform its + usual checks. In fact, for capital letters it automatically + lowercases them. + + This check and some other functionality can be bypassed by passing + uts46=True to idna.encode/decode. This allows a more permissive and + convenient interface. So far it seems like the balanced approach. 
+ + Example output (from idna==2.6): + + >> idna.encode(u'mahmöud.io') + 'xn--mahmud-zxa.io' + >> idna.encode(u'Mahmöud.io') + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode + result.append(alabel(label)) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel + check_label(label) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label + raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) + idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed + >> idna.encode(u'Mahmoud.io') + 'Mahmoud.io' + + # Similar behavior for decodes below + >> idna.decode(u'Mahmoud.io') + u'mahmoud.io' + >> idna.decode(u'Méhmoud.io', uts46=True) + u'm\xe9hmoud.io' + """ + if not host: + return u'' try: host_bytes = host.encode("ascii") except UnicodeEncodeError: host_text = host else: try: - host_text = host_bytes.decode("idna") + host_text = idna_decode(host_bytes, uts46=True) except ValueError: # only reached on "narrow" (UCS-2) Python builds <3.4, see #7 # NOTE: not going to raise here, because there's no @@ -1255,8 +1307,8 @@ def to_uri(self): For example:: - >>> URL.from_text(u'https://→example.com/foo⇧bar/').to_uri() - URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/') + >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri() + URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/') Returns: URL: A new instance with its path segments, query parameters, and @@ -1267,9 +1319,10 @@ def to_uri(self): self.userinfo.split(':', 1)]) new_path = _encode_path_parts(self.path, has_scheme=bool(self.scheme), rooted=False, joined=False, maximal=True) + new_host = self.host if not self.host else idna_encode(self.host, 
uts46=True).decode("ascii") return self.replace( userinfo=new_userinfo, - host=self.host.encode("idna").decode("ascii"), + host=new_host, path=new_path, query=tuple([tuple(_encode_query_part(x, maximal=True) if x is not None else None @@ -1285,9 +1338,9 @@ def to_iri(self): Percent-encoded Unicode and IDNA-encoded hostnames are decoded, like so:: - >>> url = URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/') + >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/') >>> print(url.to_iri().to_text()) - https://→example.com/foo⇧bar/ + https://ايران.example.com/foo⇧bar/ .. note:: diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index a1c6d2f1..1e777648 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -1167,3 +1167,17 @@ def test_str(self): else: assert isinstance(str(url), unicode) assert isinstance(bytes(url), bytes) + + def test_idna_corners(self): + text = u'http://abé.com/' + url = URL.from_text(text) + assert url.to_iri().host == u'abé.com' + assert url.to_uri().host == u'xn--ab-cja.com' + + url = URL.from_text("http://ドメイン.テスト.co.jp#test") + assert url.to_iri().host == u'ドメイン.テスト.co.jp' + assert url.to_uri().host == u'xn--eckwd4c7c.xn--zckzah.co.jp' + + assert url.to_uri().get_decoded_url().host == u'ドメイン.テスト.co.jp' + + assert URL.from_text('http://Example.com').to_uri().get_decoded_url().host == 'example.com' diff --git a/requirements-test.txt b/requirements-test.txt index 766d766f..0e9a261b 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,5 @@ +coverage==4.4.1 +idna==2.5 pytest==2.9.2 pytest-cov==2.3.0 tox==2.6.0 diff --git a/setup.py b/setup.py index 60ec97ae..8cc554d4 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ __author__ = 'Mahmoud Hashemi and Glyph Lefkowitz' -__version__ = '17.3.2dev' +__version__ = '18.0.0dev' __contact__ = 'mahmoud@hatnote.com' __url__ = 'https://github.com/python-hyper/hyperlink' __license__ = 'MIT' @@ -19,7 +19,7 @@ 
setup(name='hyperlink', version=__version__, - description="A featureful, correct URL for Python.", + description="A featureful, immutable, and correct URL for Python.", long_description=__doc__, author=__author__, author_email=__contact__, @@ -29,6 +29,7 @@ zip_safe=False, license=__license__, platforms='any', + install_requires=['idna>=2.5'], classifiers=[ 'Topic :: Utilities', 'Intended Audience :: Developers', diff --git a/tox.ini b/tox.ini index ef2ec9c5..ab995a87 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ envlist = py26,py27,py34,py35,py36,pypy,coverage-report,packaging [testenv] changedir = .tox deps = -rrequirements-test.txt -commands = coverage run --parallel --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs} +commands = coverage run --parallel --omit 'flycheck__*' --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs} # Uses default basepython otherwise reporting doesn't work on Travis where # Python 3.6 is only available in 3.6 jobs.