From 18f295ae5ada7b17b390a6347497ed929470d75b Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Thu, 4 Jan 2018 10:58:06 -0800 Subject: [PATCH 1/2] working integration of the idna package, with certain invalid (doc)tests purged and replaced, fixes #19 --- hyperlink/_url.py | 64 ++++++++++++++++++++++++++++++++------ hyperlink/test/test_url.py | 14 +++++++++ requirements-test.txt | 2 ++ setup.py | 1 + 4 files changed, 71 insertions(+), 10 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 4eb6f4da..5b401378 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -20,14 +20,20 @@ import string import socket from unicodedata import normalize - +try: + from socket import inet_pton +except ImportError: + inet_pton = None # defined below try: from collections.abc import Mapping except ImportError: # Python 2 from collections import Mapping -try: - from socket import inet_pton -except ImportError: + +# Note: IDNAError is a subclass of UnicodeError +from idna import encode as idna_encode, decode as idna_decode, IDNAError + + +if inet_pton is None: # based on https://gist.github.com/nnemkin/4966028 # this code only applies on Windows Python 2.7 import ctypes @@ -573,13 +579,15 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', def _decode_host(host): + if not host: + return u'' try: host_bytes = host.encode("ascii") except UnicodeEncodeError: host_text = host else: try: - host_text = host_bytes.decode("idna") + host_text = idna_decode(host_bytes, uts46=True) except ValueError: # only reached on "narrow" (UCS-2) Python builds <3.4, see #7 # NOTE: not going to raise here, because there's no @@ -1255,8 +1263,8 @@ def to_uri(self): For example:: - >>> URL.from_text(u'https://→example.com/foo⇧bar/').to_uri() - URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/') + >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri() + URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/') Returns: URL: A new instance with its 
path segments, query parameters, and @@ -1267,9 +1275,10 @@ def to_uri(self): self.userinfo.split(':', 1)]) new_path = _encode_path_parts(self.path, has_scheme=bool(self.scheme), rooted=False, joined=False, maximal=True) + new_host = self.host if not self.host else idna_encode(self.host, uts46=True).decode("ascii") return self.replace( userinfo=new_userinfo, - host=self.host.encode("idna").decode("ascii"), + host=new_host, path=new_path, query=tuple([tuple(_encode_query_part(x, maximal=True) if x is not None else None @@ -1285,9 +1294,9 @@ def to_iri(self): Percent-encoded Unicode and IDNA-encoded hostnames are decoded, like so:: - >>> url = URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/') + >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/') >>> print(url.to_iri().to_text()) - https://→example.com/foo⇧bar/ + https://ايران.example.com/foo⇧bar/ .. note:: @@ -1793,3 +1802,38 @@ def parse(url, decoded=True, lazy=False): return enc_url dec_url = DecodedURL(enc_url, lazy=lazy) return dec_url + +"""idna package notes: + +* If a segment of a host (i.e., something in url.host.split('.')) is +already ascii, idna doesn't perform its usual checks. For instance, +capital letters are not valid idna2008. The package automatically lowercases. + +You'll get something like: + +> idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed + +This check and some other functionality can be bypassed by passing +uts46=True to encode/decode. This allows a more permission and +convenient interface. So far it seems like the balanced approach. + +However, all of this is bypassed if the string segment contains no +unicode characters. 
+ +Example output: + +>>> idna.encode(u'mahmöud.io') +'xn--mahmud-zxa.io' +>>> idna.encode(u'Mahmöud.io') +Traceback (most recent call last): + File "", line 1, in + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode + result.append(alabel(label)) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel + check_label(label) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label + raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) +idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed +>>> idna.encode(u'Mahmoud.io') +'Mahmoud.io' +""" diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index a1c6d2f1..1e777648 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -1167,3 +1167,17 @@ def test_str(self): else: assert isinstance(str(url), unicode) assert isinstance(bytes(url), bytes) + + def test_idna_corners(self): + text = u'http://abé.com/' + url = URL.from_text(text) + assert url.to_iri().host == u'abé.com' + assert url.to_uri().host == u'xn--ab-cja.com' + + url = URL.from_text("http://ドメイン.テスト.co.jp#test") + assert url.to_iri().host == u'ドメイン.テスト.co.jp' + assert url.to_uri().host == u'xn--eckwd4c7c.xn--zckzah.co.jp' + + assert url.to_uri().get_decoded_url().host == u'ドメイン.テスト.co.jp' + + assert URL.from_text('http://Example.com').to_uri().get_decoded_url().host == 'example.com' diff --git a/requirements-test.txt b/requirements-test.txt index 766d766f..0e9a261b 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,5 @@ +coverage==4.4.1 +idna==2.5 pytest==2.9.2 pytest-cov==2.3.0 tox==2.6.0 diff --git a/setup.py b/setup.py index 60ec97ae..bb51786e 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ zip_safe=False, 
license=__license__, platforms='any', + install_requires=['idna>=2.5,<2.7'], classifiers=[ 'Topic :: Utilities', 'Intended Audience :: Developers', From 99e9c02981b7306c596b37e3d633b749d51df6eb Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 7 Jan 2018 14:28:25 -0800 Subject: [PATCH 2/2] address idna review comments, removing the idna package version cap and improve docs on implementation --- hyperlink/_url.py | 79 ++++++++++++++++++++++++++--------------------- setup.py | 6 ++-- tox.ini | 2 +- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 5b401378..04ac4b71 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -579,6 +579,50 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', def _decode_host(host): + """Decode a host from ASCII-encodable text to IDNA-decoded text. If + the host text is not ASCII, it is returned unchanged, as it is + presumed that it is already IDNA-decoded. + + Some technical details: _decode_host is built on top of the "idna" + package, which has some quirks: + + Capital letters are not valid IDNA2008. The idna package will + raise an exception like this on capital letters: + + > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed + + However, if a segment of a host (i.e., something in + url.host.split('.')) is already ASCII, idna doesn't perform its + usual checks. In fact, for capital letters it automatically + lowercases them. + + This check and some other functionality can be bypassed by passing + uts46=True to idna.encode/decode. This allows a more permissive and + convenient interface. So far it seems like the balanced approach. 
+ + Example output (from idna==2.6): + + >> idna.encode(u'mahmöud.io') + 'xn--mahmud-zxa.io' + >> idna.encode(u'Mahmöud.io') + Traceback (most recent call last): + File "", line 1, in + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode + result.append(alabel(label)) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel + check_label(label) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label + raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) + idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed + >> idna.encode(u'Mahmoud.io') + 'Mahmoud.io' + + # Similar behavior for decodes below + >> idna.decode(u'Mahmoud.io') + u'mahmoud.io' + >> idna.decode(u'Méhmoud.io', uts46=True) + u'm\xe9hmoud.io' + """ if not host: return u'' try: @@ -1802,38 +1846,3 @@ def parse(url, decoded=True, lazy=False): return enc_url dec_url = DecodedURL(enc_url, lazy=lazy) return dec_url - -"""idna package notes: - -* If a segment of a host (i.e., something in url.host.split('.')) is -already ascii, idna doesn't perform its usual checks. For instance, -capital letters are not valid idna2008. The package automatically lowercases. - -You'll get something like: - -> idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed - -This check and some other functionality can be bypassed by passing -uts46=True to encode/decode. This allows a more permission and -convenient interface. So far it seems like the balanced approach. - -However, all of this is bypassed if the string segment contains no -unicode characters. 
- -Example output: - ->>> idna.encode(u'mahmöud.io') -'xn--mahmud-zxa.io' ->>> idna.encode(u'Mahmöud.io') -Traceback (most recent call last): - File "", line 1, in - File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode - result.append(alabel(label)) - File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel - check_label(label) - File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label - raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) -idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed ->>> idna.encode(u'Mahmoud.io') -'Mahmoud.io' -""" diff --git a/setup.py b/setup.py index bb51786e..8cc554d4 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ __author__ = 'Mahmoud Hashemi and Glyph Lefkowitz' -__version__ = '17.3.2dev' +__version__ = '18.0.0dev' __contact__ = 'mahmoud@hatnote.com' __url__ = 'https://github.com/python-hyper/hyperlink' __license__ = 'MIT' @@ -19,7 +19,7 @@ setup(name='hyperlink', version=__version__, - description="A featureful, correct URL for Python.", + description="A featureful, immutable, and correct URL for Python.", long_description=__doc__, author=__author__, author_email=__contact__, @@ -29,7 +29,7 @@ zip_safe=False, license=__license__, platforms='any', - install_requires=['idna>=2.5,<2.7'], + install_requires=['idna>=2.5'], classifiers=[ 'Topic :: Utilities', 'Intended Audience :: Developers', diff --git a/tox.ini b/tox.ini index ef2ec9c5..ab995a87 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ envlist = py26,py27,py34,py35,py36,pypy,coverage-report,packaging [testenv] changedir = .tox deps = -rrequirements-test.txt -commands = coverage run --parallel --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs} 
+commands = coverage run --parallel --omit 'flycheck__*' --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs} # Uses default basepython otherwise reporting doesn't work on Travis where # Python 3.6 is only available in 3.6 jobs.