Skip to content

Commit 99e9c02

Browse files
committed
address idna review comments, remove the idna package version cap, and improve docs on implementation
1 parent 18f295a commit 99e9c02

File tree

3 files changed

+48
-39
lines changed

3 files changed

+48
-39
lines changed

hyperlink/_url.py

+44-35
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,50 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
579579

580580

581581
def _decode_host(host):
582+
"""Decode a host from ASCII-encodable text to IDNA-decoded text. If
583+
the host text is not ASCII, it is returned unchanged, as it is
584+
presumed that it is already IDNA-decoded.
585+
586+
Some technical details: _decode_host is built on top of the "idna"
587+
package, which has some quirks:
588+
589+
Capital letters are not valid IDNA2008. The idna package will
590+
raise an exception like this on capital letters:
591+
592+
> idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
593+
594+
However, if a segment of a host (i.e., something in
595+
url.host.split('.')) is already ASCII, idna doesn't perform its
596+
usual checks. In fact, for capital letters it automatically
597+
lowercases them.
598+
599+
This check and some other functionality can be bypassed by passing
600+
uts46=True to idna.encode/decode. This allows a more permissive and
601+
convenient interface. So far it seems like the balanced approach.
602+
603+
Example output (from idna==2.6):
604+
605+
>> idna.encode(u'mahmöud.io')
606+
'xn--mahmud-zxa.io'
607+
>> idna.encode(u'Mahmöud.io')
608+
Traceback (most recent call last):
609+
File "<stdin>", line 1, in <module>
610+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
611+
result.append(alabel(label))
612+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
613+
check_label(label)
614+
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
615+
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
616+
idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
617+
>> idna.encode(u'Mahmoud.io')
618+
'Mahmoud.io'
619+
620+
# Similar behavior for decodes below
621+
>> idna.decode(u'Mahmoud.io')
622+
u'mahmoud.io'
623+
>> idna.decode(u'Méhmoud.io', uts46=True)
624+
u'm\xe9hmoud.io'
625+
"""
582626
if not host:
583627
return u''
584628
try:
@@ -1802,38 +1846,3 @@ def parse(url, decoded=True, lazy=False):
18021846
return enc_url
18031847
dec_url = DecodedURL(enc_url, lazy=lazy)
18041848
return dec_url
1805-
1806-
"""idna package notes:
1807-
1808-
* If a segment of a host (i.e., something in url.host.split('.')) is
1809-
already ascii, idna doesn't perform its usual checks. For instance,
1810-
capital letters are not valid idna2008. The package automatically lowercases.
1811-
1812-
You'll get something like:
1813-
1814-
> idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
1815-
1816-
This check and some other functionality can be bypassed by passing
1817-
uts46=True to encode/decode. This allows a more permission and
1818-
convenient interface. So far it seems like the balanced approach.
1819-
1820-
However, all of this is bypassed if the string segment contains no
1821-
unicode characters.
1822-
1823-
Example output:
1824-
1825-
>>> idna.encode(u'mahmöud.io')
1826-
'xn--mahmud-zxa.io'
1827-
>>> idna.encode(u'Mahmöud.io')
1828-
Traceback (most recent call last):
1829-
File "<stdin>", line 1, in <module>
1830-
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
1831-
result.append(alabel(label))
1832-
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
1833-
check_label(label)
1834-
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
1835-
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
1836-
idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
1837-
>>> idna.encode(u'Mahmoud.io')
1838-
'Mahmoud.io'
1839-
"""

setup.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@
1111

1212

1313
__author__ = 'Mahmoud Hashemi and Glyph Lefkowitz'
14-
__version__ = '17.3.2dev'
14+
__version__ = '18.0.0dev'
1515
__contact__ = '[email protected]'
1616
__url__ = 'https://github.com/python-hyper/hyperlink'
1717
__license__ = 'MIT'
1818

1919

2020
setup(name='hyperlink',
2121
version=__version__,
22-
description="A featureful, correct URL for Python.",
22+
description="A featureful, immutable, and correct URL for Python.",
2323
long_description=__doc__,
2424
author=__author__,
2525
author_email=__contact__,
@@ -29,7 +29,7 @@
2929
zip_safe=False,
3030
license=__license__,
3131
platforms='any',
32-
install_requires=['idna>=2.5,<2.7'],
32+
install_requires=['idna>=2.5'],
3333
classifiers=[
3434
'Topic :: Utilities',
3535
'Intended Audience :: Developers',

tox.ini

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ envlist = py26,py27,py34,py35,py36,pypy,coverage-report,packaging
44
[testenv]
55
changedir = .tox
66
deps = -rrequirements-test.txt
7-
commands = coverage run --parallel --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs}
7+
commands = coverage run --parallel --omit 'flycheck__*' --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs}
88

99
# Uses default basepython otherwise reporting doesn't work on Travis where
1010
# Python 3.6 is only available in 3.6 jobs.

0 commit comments

Comments
 (0)