Skip to content

Commit 05cea28

Browse files
authored
Merge pull request #56 from python-hyper/i19_better_idna
Switch to using idna package
2 parents a23a1a4 + 99e9c02 commit 05cea28

File tree

5 files changed

+83
-13
lines changed

5 files changed

+83
-13
lines changed

hyperlink/_url.py

+63-10
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,20 @@
2020
import string
2121
import socket
2222
from unicodedata import normalize
23-
23+
try:
24+
from socket import inet_pton
25+
except ImportError:
26+
inet_pton = None # defined below
2427
try:
2528
from collections.abc import Mapping
2629
except ImportError: # Python 2
2730
from collections import Mapping
28-
try:
29-
from socket import inet_pton
30-
except ImportError:
31+
32+
# Note: IDNAError is a subclass of UnicodeError
33+
from idna import encode as idna_encode, decode as idna_decode, IDNAError
34+
35+
36+
if inet_pton is None:
3137
# based on https://gist.github.com/nnemkin/4966028
3238
# this code only applies on Windows Python 2.7
3339
import ctypes
@@ -573,13 +579,59 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
573579

574580

575581
def _decode_host(host):
582+
"""Decode a host from ASCII-encodable text to IDNA-decoded text. If
583+
the host text is not ASCII, it is returned unchanged, as it is
584+
presumed that it is already IDNA-decoded.
585+
586+
Some technical details: _decode_host is built on top of the "idna"
587+
package, which has some quirks:
588+
589+
Capital letters are not valid IDNA2008. The idna package will
590+
raise an exception like this on capital letters:
591+
592+
> idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
593+
594+
However, if a segment of a host (i.e., something in
595+
url.host.split('.')) is already ASCII, idna doesn't perform its
596+
usual checks. In fact, for capital letters it automatically
597+
lowercases them.
598+
599+
This check and some other functionality can be bypassed by passing
600+
uts46=True to idna.encode/decode. This allows a more permissive and
601+
convenient interface. So far, this seems like the most balanced approach.
602+
603+
Example output (from idna==2.6):
604+
605+
>> idna.encode(u'mahmöud.io')
606+
'xn--mahmud-zxa.io'
607+
>> idna.encode(u'Mahmöud.io')
608+
Traceback (most recent call last):
609+
File "<stdin>", line 1, in <module>
610+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
611+
result.append(alabel(label))
612+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
613+
check_label(label)
614+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
615+
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
616+
idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
617+
>> idna.encode(u'Mahmoud.io')
618+
'Mahmoud.io'
619+
620+
# Similar behavior for decodes below
621+
>> idna.decode(u'Mahmoud.io')
622+
u'mahmoud.io'
623+
>> idna.decode(u'Méhmoud.io', uts46=True)
624+
u'm\xe9hmoud.io'
625+
"""
626+
if not host:
627+
return u''
576628
try:
577629
host_bytes = host.encode("ascii")
578630
except UnicodeEncodeError:
579631
host_text = host
580632
else:
581633
try:
582-
host_text = host_bytes.decode("idna")
634+
host_text = idna_decode(host_bytes, uts46=True)
583635
except ValueError:
584636
# only reached on "narrow" (UCS-2) Python builds <3.4, see #7
585637
# NOTE: not going to raise here, because there's no
@@ -1255,8 +1307,8 @@ def to_uri(self):
12551307
12561308
For example::
12571309
1258-
>>> URL.from_text(u'https://→example.com/foo⇧bar/').to_uri()
1259-
URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/')
1310+
>>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri()
1311+
URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/')
12601312
12611313
Returns:
12621314
URL: A new instance with its path segments, query parameters, and
@@ -1267,9 +1319,10 @@ def to_uri(self):
12671319
self.userinfo.split(':', 1)])
12681320
new_path = _encode_path_parts(self.path, has_scheme=bool(self.scheme),
12691321
rooted=False, joined=False, maximal=True)
1322+
new_host = self.host if not self.host else idna_encode(self.host, uts46=True).decode("ascii")
12701323
return self.replace(
12711324
userinfo=new_userinfo,
1272-
host=self.host.encode("idna").decode("ascii"),
1325+
host=new_host,
12731326
path=new_path,
12741327
query=tuple([tuple(_encode_query_part(x, maximal=True)
12751328
if x is not None else None
@@ -1285,9 +1338,9 @@ def to_iri(self):
12851338
Percent-encoded Unicode and IDNA-encoded hostnames are
12861339
decoded, like so::
12871340
1288-
>>> url = URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/')
1341+
>>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/')
12891342
>>> print(url.to_iri().to_text())
1290-
https://example.com/foo⇧bar/
1343+
https://ايران.example.com/foo⇧bar/
12911344
12921345
.. note::
12931346

hyperlink/test/test_url.py

+14
Original file line numberDiff line numberDiff line change
@@ -1167,3 +1167,17 @@ def test_str(self):
11671167
else:
11681168
assert isinstance(str(url), unicode)
11691169
assert isinstance(bytes(url), bytes)
1170+
1171+
def test_idna_corners(self):
1172+
text = u'http://abé.com/'
1173+
url = URL.from_text(text)
1174+
assert url.to_iri().host == u'abé.com'
1175+
assert url.to_uri().host == u'xn--ab-cja.com'
1176+
1177+
url = URL.from_text("http://ドメイン.テスト.co.jp#test")
1178+
assert url.to_iri().host == u'ドメイン.テスト.co.jp'
1179+
assert url.to_uri().host == u'xn--eckwd4c7c.xn--zckzah.co.jp'
1180+
1181+
assert url.to_uri().get_decoded_url().host == u'ドメイン.テスト.co.jp'
1182+
1183+
assert URL.from_text('http://Example.com').to_uri().get_decoded_url().host == 'example.com'

requirements-test.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
coverage==4.4.1
2+
idna==2.5
13
pytest==2.9.2
24
pytest-cov==2.3.0
35
tox==2.6.0

setup.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@
1111

1212

1313
__author__ = 'Mahmoud Hashemi and Glyph Lefkowitz'
14-
__version__ = '17.3.2dev'
14+
__version__ = '18.0.0dev'
1515
__contact__ = '[email protected]'
1616
__url__ = 'https://github.com/python-hyper/hyperlink'
1717
__license__ = 'MIT'
1818

1919

2020
setup(name='hyperlink',
2121
version=__version__,
22-
description="A featureful, correct URL for Python.",
22+
description="A featureful, immutable, and correct URL for Python.",
2323
long_description=__doc__,
2424
author=__author__,
2525
author_email=__contact__,
@@ -29,6 +29,7 @@
2929
zip_safe=False,
3030
license=__license__,
3131
platforms='any',
32+
install_requires=['idna>=2.5'],
3233
classifiers=[
3334
'Topic :: Utilities',
3435
'Intended Audience :: Developers',

tox.ini

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ envlist = py26,py27,py34,py35,py36,pypy,coverage-report,packaging
44
[testenv]
55
changedir = .tox
66
deps = -rrequirements-test.txt
7-
commands = coverage run --parallel --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs}
7+
commands = coverage run --parallel --omit 'flycheck__*' --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs}
88

99
# Uses default basepython otherwise reporting doesn't work on Travis where
1010
# Python 3.6 is only available in 3.6 jobs.

0 commit comments

Comments
 (0)