Skip to content

Commit 18f295a

Browse files
committed
working integration of the idna package, with certain invalid (doc)tests purged and replaced, fixes #19
1 parent a23a1a4 commit 18f295a

File tree

4 files changed

+71
-10
lines changed

4 files changed

+71
-10
lines changed

Diff for: hyperlink/_url.py

+54-10
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,20 @@
2020
import string
2121
import socket
2222
from unicodedata import normalize
23-
23+
try:
24+
from socket import inet_pton
25+
except ImportError:
26+
inet_pton = None # defined below
2427
try:
2528
from collections.abc import Mapping
2629
except ImportError: # Python 2
2730
from collections import Mapping
28-
try:
29-
from socket import inet_pton
30-
except ImportError:
31+
32+
# Note: IDNAError is a subclass of UnicodeError
33+
from idna import encode as idna_encode, decode as idna_decode, IDNAError
34+
35+
36+
if inet_pton is None:
3137
# based on https://gist.github.com/nnemkin/4966028
3238
# this code only applies on Windows Python 2.7
3339
import ctypes
@@ -573,13 +579,15 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
573579

574580

575581
def _decode_host(host):
582+
if not host:
583+
return u''
576584
try:
577585
host_bytes = host.encode("ascii")
578586
except UnicodeEncodeError:
579587
host_text = host
580588
else:
581589
try:
582-
host_text = host_bytes.decode("idna")
590+
host_text = idna_decode(host_bytes, uts46=True)
583591
except ValueError:
584592
# only reached on "narrow" (UCS-2) Python builds <3.4, see #7
585593
# NOTE: not going to raise here, because there's no
@@ -1255,8 +1263,8 @@ def to_uri(self):
12551263
12561264
For example::
12571265
1258-
>>> URL.from_text(u'https://→example.com/foo⇧bar/').to_uri()
1259-
URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/')
1266+
>>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri()
1267+
URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/')
12601268
12611269
Returns:
12621270
URL: A new instance with its path segments, query parameters, and
@@ -1267,9 +1275,10 @@ def to_uri(self):
12671275
self.userinfo.split(':', 1)])
12681276
new_path = _encode_path_parts(self.path, has_scheme=bool(self.scheme),
12691277
rooted=False, joined=False, maximal=True)
1278+
new_host = self.host if not self.host else idna_encode(self.host, uts46=True).decode("ascii")
12701279
return self.replace(
12711280
userinfo=new_userinfo,
1272-
host=self.host.encode("idna").decode("ascii"),
1281+
host=new_host,
12731282
path=new_path,
12741283
query=tuple([tuple(_encode_query_part(x, maximal=True)
12751284
if x is not None else None
@@ -1285,9 +1294,9 @@ def to_iri(self):
12851294
Percent-encoded Unicode and IDNA-encoded hostnames are
12861295
decoded, like so::
12871296
1288-
>>> url = URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/')
1297+
>>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/')
12891298
>>> print(url.to_iri().to_text())
1290-
https://example.com/foo⇧bar/
1299+
https://ايران.example.com/foo⇧bar/
12911300
12921301
.. note::
12931302
@@ -1793,3 +1802,38 @@ def parse(url, decoded=True, lazy=False):
17931802
return enc_url
17941803
dec_url = DecodedURL(enc_url, lazy=lazy)
17951804
return dec_url
1805+
1806+
"""idna package notes:
1807+
1808+
* If a segment of a host (i.e., something in url.host.split('.')) is
1809+
already ascii, idna doesn't perform its usual checks. For instance,
1810+
capital letters are not valid idna2008. The package automatically lowercases.
1811+
1812+
You'll get something like:
1813+
1814+
> idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
1815+
1816+
This check and some other functionality can be bypassed by passing
1817+
uts46=True to encode/decode. This allows a more permissive and
1818+
convenient interface. So far it seems like the balanced approach.
1819+
1820+
However, all of this is bypassed if the string segment contains no
1821+
non-ASCII characters.
1822+
1823+
Example output:
1824+
1825+
>>> idna.encode(u'mahmöud.io')
1826+
'xn--mahmud-zxa.io'
1827+
>>> idna.encode(u'Mahmöud.io')
1828+
Traceback (most recent call last):
1829+
File "<stdin>", line 1, in <module>
1830+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
1831+
result.append(alabel(label))
1832+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
1833+
check_label(label)
1834+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
1835+
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
1836+
idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
1837+
>>> idna.encode(u'Mahmoud.io')
1838+
'Mahmoud.io'
1839+
"""

Diff for: hyperlink/test/test_url.py

+14
Original file line numberDiff line numberDiff line change
@@ -1167,3 +1167,17 @@ def test_str(self):
11671167
else:
11681168
assert isinstance(str(url), unicode)
11691169
assert isinstance(bytes(url), bytes)
1170+
1171+
def test_idna_corners(self):
1172+
text = u'http://abé.com/'
1173+
url = URL.from_text(text)
1174+
assert url.to_iri().host == u'abé.com'
1175+
assert url.to_uri().host == u'xn--ab-cja.com'
1176+
1177+
url = URL.from_text("http://ドメイン.テスト.co.jp#test")
1178+
assert url.to_iri().host == u'ドメイン.テスト.co.jp'
1179+
assert url.to_uri().host == u'xn--eckwd4c7c.xn--zckzah.co.jp'
1180+
1181+
assert url.to_uri().get_decoded_url().host == u'ドメイン.テスト.co.jp'
1182+
1183+
assert URL.from_text('http://Example.com').to_uri().get_decoded_url().host == 'example.com'

Diff for: requirements-test.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
coverage==4.4.1
2+
idna==2.5
13
pytest==2.9.2
24
pytest-cov==2.3.0
35
tox==2.6.0

Diff for: setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
zip_safe=False,
3030
license=__license__,
3131
platforms='any',
32+
install_requires=['idna>=2.5,<2.7'],
3233
classifiers=[
3334
'Topic :: Utilities',
3435
'Intended Audience :: Developers',

0 commit comments

Comments
 (0)