20
20
import string
21
21
import socket
22
22
from unicodedata import normalize
23
-
23
+ try :
24
+ from socket import inet_pton
25
+ except ImportError :
26
+ inet_pton = None # defined below
24
27
try :
25
28
from collections .abc import Mapping
26
29
except ImportError : # Python 2
27
30
from collections import Mapping
28
- try :
29
- from socket import inet_pton
30
- except ImportError :
31
+
32
+ # Note: IDNAError is a subclass of UnicodeError
33
+ from idna import encode as idna_encode , decode as idna_decode , IDNAError
34
+
35
+
36
+ if inet_pton is None :
31
37
# based on https://gist.github.com/nnemkin/4966028
32
38
# this code only applies on Windows Python 2.7
33
39
import ctypes
@@ -573,13 +579,15 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
573
579
574
580
575
581
def _decode_host (host ):
582
+ if not host :
583
+ return u''
576
584
try :
577
585
host_bytes = host .encode ("ascii" )
578
586
except UnicodeEncodeError :
579
587
host_text = host
580
588
else :
581
589
try :
582
- host_text = host_bytes . decode ( "idna" )
590
+ host_text = idna_decode ( host_bytes , uts46 = True )
583
591
except ValueError :
584
592
# only reached on "narrow" (UCS-2) Python builds <3.4, see #7
585
593
# NOTE: not going to raise here, because there's no
@@ -1255,8 +1263,8 @@ def to_uri(self):
1255
1263
1256
1264
For example::
1257
1265
1258
- >>> URL.from_text(u'https://→example .com/foo⇧bar/').to_uri()
1259
- URL.from_text(u'https://xn--example-dk9c .com/foo%E2%87%A7bar/')
1266
+ >>> URL.from_text(u'https://ايران .com/foo⇧bar/').to_uri()
1267
+ URL.from_text(u'https://xn--mgba3a4fra .com/foo%E2%87%A7bar/')
1260
1268
1261
1269
Returns:
1262
1270
URL: A new instance with its path segments, query parameters, and
@@ -1267,9 +1275,10 @@ def to_uri(self):
1267
1275
self .userinfo .split (':' , 1 )])
1268
1276
new_path = _encode_path_parts (self .path , has_scheme = bool (self .scheme ),
1269
1277
rooted = False , joined = False , maximal = True )
1278
+ new_host = self .host if not self .host else idna_encode (self .host , uts46 = True ).decode ("ascii" )
1270
1279
return self .replace (
1271
1280
userinfo = new_userinfo ,
1272
- host = self . host . encode ( "idna" ). decode ( "ascii" ) ,
1281
+ host = new_host ,
1273
1282
path = new_path ,
1274
1283
query = tuple ([tuple (_encode_query_part (x , maximal = True )
1275
1284
if x is not None else None
@@ -1285,9 +1294,9 @@ def to_iri(self):
1285
1294
Percent-encoded Unicode and IDNA-encoded hostnames are
1286
1295
decoded, like so::
1287
1296
1288
- >>> url = URL.from_text(u'https://xn--example-dk9c .com/foo%E2%87%A7bar/')
1297
+ >>> url = URL.from_text(u'https://xn--mgba3a4fra. example.com/foo%E2%87%A7bar/')
1289
1298
>>> print(url.to_iri().to_text())
1290
- https://→ example.com/foo⇧bar/
1299
+ https://ايران. example.com/foo⇧bar/
1291
1300
1292
1301
.. note::
1293
1302
@@ -1793,3 +1802,38 @@ def parse(url, decoded=True, lazy=False):
1793
1802
return enc_url
1794
1803
dec_url = DecodedURL (enc_url , lazy = lazy )
1795
1804
return dec_url
1805
+
1806
+ """idna package notes:
1807
+
1808
+ * If a segment of a host (i.e., something in url.host.split('.')) is
1809
+ already ascii, idna doesn't perform its usual checks. For instance,
1810
+ capital letters are not valid idna2008. The package does not lowercase them by default.
1811
+
1812
+ You'll get something like:
1813
+
1814
+ > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
1815
+
1816
+ This check and some other functionality can be bypassed by passing
1817
+ uts46=True to encode/decode. This allows a more permissive and
1818
+ convenient interface. So far it seems like the balanced approach.
1819
+
1820
+ However, all of this is bypassed if the string segment contains no
1821
+ unicode characters.
1822
+
1823
+ Example output:
1824
+
1825
+ >>> idna.encode(u'mahmöud.io')
1826
+ 'xn--mahmud-zxa.io'
1827
+ >>> idna.encode(u'Mahmöud.io')
1828
+ Traceback (most recent call last):
1829
+ File "<stdin>", line 1, in <module>
1830
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
1831
+ result.append(alabel(label))
1832
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
1833
+ check_label(label)
1834
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
1835
+ raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
1836
+ idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6 ud' not allowed
1837
+ >>> idna.encode(u'Mahmoud.io')
1838
+ 'Mahmoud.io'
1839
+ """
0 commit comments