Skip to content

Switch to using idna package #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 8, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 63 additions & 10 deletions hyperlink/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,20 @@
import string
import socket
from unicodedata import normalize

try:
from socket import inet_pton
except ImportError:
inet_pton = None # defined below
try:
from collections.abc import Mapping
except ImportError: # Python 2
from collections import Mapping
try:
from socket import inet_pton
except ImportError:

# Note: IDNAError is a subclass of UnicodeError
from idna import encode as idna_encode, decode as idna_decode, IDNAError


if inet_pton is None:
# based on https://gist.github.com/nnemkin/4966028
# this code only applies on Windows Python 2.7
import ctypes
Expand Down Expand Up @@ -573,13 +579,59 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',


def _decode_host(host):
"""Decode a host from ASCII-encodable text to IDNA-decoded text. If
the host text is not ASCII, it is returned unchanged, as it is
presumed that it is already IDNA-decoded.

Some technical details: _decode_host is built on top of the "idna"
package, which has some quirks:

Capital letters are not valid IDNA2008. The idna package will
raise an exception like this on capital letters:

> idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed

However, if a segment of a host (i.e., something in
url.host.split('.')) is already ASCII, idna doesn't perform its
usual checks. In fact, for capital letters it automatically
lowercases them.

This check and some other functionality can be bypassed by passing
uts46=True to idna.encode/decode. This allows a more permissive and
convenient interface. So far it seems like the balanced approach.

Example output (from idna==2.6):

>> idna.encode(u'mahmöud.io')
'xn--mahmud-zxa.io'
>> idna.encode(u'Mahmöud.io')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
result.append(alabel(label))
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
check_label(label)
File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
>> idna.encode(u'Mahmoud.io')
'Mahmoud.io'

# Similar behavior for decodes below
>> idna.decode(u'Mahmoud.io')
u'mahmoud.io'
>> idna.decode(u'Méhmoud.io', uts46=True)
u'm\xe9hmoud.io'
"""
if not host:
return u''
try:
host_bytes = host.encode("ascii")
except UnicodeEncodeError:
host_text = host
else:
try:
host_text = host_bytes.decode("idna")
host_text = idna_decode(host_bytes, uts46=True)
except ValueError:
# only reached on "narrow" (UCS-2) Python builds <3.4, see #7
# NOTE: not going to raise here, because there's no
Expand Down Expand Up @@ -1255,8 +1307,8 @@ def to_uri(self):

For example::

>>> URL.from_text(u'https://→example.com/foo⇧bar/').to_uri()
URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/')
>>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri()
URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/')

Returns:
URL: A new instance with its path segments, query parameters, and
Expand All @@ -1267,9 +1319,10 @@ def to_uri(self):
self.userinfo.split(':', 1)])
new_path = _encode_path_parts(self.path, has_scheme=bool(self.scheme),
rooted=False, joined=False, maximal=True)
new_host = self.host if not self.host else idna_encode(self.host, uts46=True).decode("ascii")
return self.replace(
userinfo=new_userinfo,
host=self.host.encode("idna").decode("ascii"),
host=new_host,
path=new_path,
query=tuple([tuple(_encode_query_part(x, maximal=True)
if x is not None else None
Expand All @@ -1285,9 +1338,9 @@ def to_iri(self):
Percent-encoded Unicode and IDNA-encoded hostnames are
decoded, like so::

>>> url = URL.from_text(u'https://xn--example-dk9c.com/foo%E2%87%A7bar/')
>>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/')
>>> print(url.to_iri().to_text())
https://example.com/foo⇧bar/
https://ايران.example.com/foo⇧bar/

.. note::

Expand Down
14 changes: 14 additions & 0 deletions hyperlink/test/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -1167,3 +1167,17 @@ def test_str(self):
else:
assert isinstance(str(url), unicode)
assert isinstance(bytes(url), bytes)

def test_idna_corners(self):
    """Check IDNA host handling across to_iri/to_uri conversions.

    Covers a Latin host containing a non-ASCII letter, a multi-label
    Japanese host, decoding an already-encoded URI back to text, and
    an ASCII host written with a capital letter.
    """
    url = URL.from_text(u'http://abé.com/')
    assert url.to_iri().host == u'abé.com'
    assert url.to_uri().host == u'xn--ab-cja.com'

    # Every literal carries the u prefix so the inputs are text (not
    # bytes) on Python 2 as well, independent of unicode_literals.
    url = URL.from_text(u'http://ドメイン.テスト.co.jp#test')
    assert url.to_iri().host == u'ドメイン.テスト.co.jp'
    assert url.to_uri().host == u'xn--eckwd4c7c.xn--zckzah.co.jp'

    # Encoding to a URI and decoding again should restore the host.
    assert url.to_uri().get_decoded_url().host == u'ドメイン.テスト.co.jp'

    # This test expects a capitalised ASCII host to come back lowercased.
    capitalized = URL.from_text(u'http://Example.com')
    assert capitalized.to_uri().get_decoded_url().host == u'example.com'
2 changes: 2 additions & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
coverage==4.4.1
idna==2.5
pytest==2.9.2
pytest-cov==2.3.0
tox==2.6.0
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@


__author__ = 'Mahmoud Hashemi and Glyph Lefkowitz'
__version__ = '17.3.2dev'
__version__ = '18.0.0dev'
__contact__ = '[email protected]'
__url__ = 'https://github.com/python-hyper/hyperlink'
__license__ = 'MIT'


setup(name='hyperlink',
version=__version__,
description="A featureful, correct URL for Python.",
description="A featureful, immutable, and correct URL for Python.",
long_description=__doc__,
author=__author__,
author_email=__contact__,
Expand All @@ -29,6 +29,7 @@
zip_safe=False,
license=__license__,
platforms='any',
install_requires=['idna>=2.5'],
classifiers=[
'Topic :: Utilities',
'Intended Audience :: Developers',
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ envlist = py26,py27,py34,py35,py36,pypy,coverage-report,packaging
[testenv]
changedir = .tox
deps = -rrequirements-test.txt
commands = coverage run --parallel --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs}
commands = coverage run --parallel --omit 'flycheck__*' --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs}

# Uses default basepython otherwise reporting doesn't work on Travis where
# Python 3.6 is only available in 3.6 jobs.
Expand Down