20
20
import string
21
21
import socket
22
22
from unicodedata import normalize
23
-
23
+ try :
24
+ from socket import inet_pton
25
+ except ImportError :
26
+ inet_pton = None # defined below
24
27
try :
25
28
from collections .abc import Mapping
26
29
except ImportError : # Python 2
27
30
from collections import Mapping
28
- try :
29
- from socket import inet_pton
30
- except ImportError :
31
+
32
+ # Note: IDNAError is a subclass of UnicodeError
33
+ from idna import encode as idna_encode , decode as idna_decode , IDNAError
34
+
35
+
36
+ if inet_pton is None :
31
37
# based on https://gist.github.com/nnemkin/4966028
32
38
# this code only applies on Windows Python 2.7
33
39
import ctypes
@@ -573,13 +579,59 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
573
579
574
580
575
581
def _decode_host (host ):
582
+ """Decode a host from ASCII-encodable text to IDNA-decoded text. If
583
+ the host text is not ASCII, it is returned unchanged, as it is
584
+ presumed that it is already IDNA-decoded.
585
+
586
+ Some technical details: _decode_host is built on top of the "idna"
587
+ package, which has some quirks:
588
+
589
+ Capital letters are not valid IDNA2008. The idna package will
590
+ raise an exception like this on capital letters:
591
+
592
+ > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
593
+
594
+ However, if a segment of a host (i.e., something in
595
+ url.host.split('.')) is already ASCII, idna doesn't perform its
596
+ usual checks. In fact, for capital letters it automatically
597
+ lowercases them.
598
+
599
+ This check and some other functionality can be bypassed by passing
600
+ uts46=True to idna.encode/decode. This allows a more permissive and
601
+ convenient interface. So far it seems like the balanced approach.
602
+
603
+ Example output (from idna==2.6):
604
+
605
+ >> idna.encode(u'mahmöud.io')
606
+ 'xn--mahmud-zxa.io'
607
+ >> idna.encode(u'Mahmöud.io')
608
+ Traceback (most recent call last):
609
+ File "<stdin>", line 1, in <module>
610
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
611
+ result.append(alabel(label))
612
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
613
+ check_label(label)
614
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
615
+ raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
616
+ idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
617
+ >> idna.encode(u'Mahmoud.io')
618
+ 'Mahmoud.io'
619
+
620
+ # Similar behavior for decodes below
621
+ >> idna.decode(u'Mahmoud.io')
622
+ u'mahmoud.io'
623
+ >> idna.decode(u'Méhmoud.io', uts46=True)
624
+ u'm\xe9hmoud.io'
625
+ """
626
+ if not host :
627
+ return u''
576
628
try :
577
629
host_bytes = host .encode ("ascii" )
578
630
except UnicodeEncodeError :
579
631
host_text = host
580
632
else :
581
633
try :
582
- host_text = host_bytes . decode ( "idna" )
634
+ host_text = idna_decode ( host_bytes , uts46 = True )
583
635
except ValueError :
584
636
# only reached on "narrow" (UCS-2) Python builds <3.4, see #7
585
637
# NOTE: not going to raise here, because there's no
@@ -1255,8 +1307,8 @@ def to_uri(self):
1255
1307
1256
1308
For example::
1257
1309
1258
- >>> URL.from_text(u'https://→example .com/foo⇧bar/').to_uri()
1259
- URL.from_text(u'https://xn--example-dk9c .com/foo%E2%87%A7bar/')
1310
+ >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri()
1311
+ URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/')
1260
1312
1261
1313
Returns:
1262
1314
URL: A new instance with its path segments, query parameters, and
@@ -1267,9 +1319,10 @@ def to_uri(self):
1267
1319
self .userinfo .split (':' , 1 )])
1268
1320
new_path = _encode_path_parts (self .path , has_scheme = bool (self .scheme ),
1269
1321
rooted = False , joined = False , maximal = True )
1322
+ new_host = self .host if not self .host else idna_encode (self .host , uts46 = True ).decode ("ascii" )
1270
1323
return self .replace (
1271
1324
userinfo = new_userinfo ,
1272
- host = self . host . encode ( "idna" ). decode ( "ascii" ) ,
1325
+ host = new_host ,
1273
1326
path = new_path ,
1274
1327
query = tuple ([tuple (_encode_query_part (x , maximal = True )
1275
1328
if x is not None else None
@@ -1285,9 +1338,9 @@ def to_iri(self):
1285
1338
Percent-encoded Unicode and IDNA-encoded hostnames are
1286
1339
decoded, like so::
1287
1340
1288
- >>> url = URL.from_text(u'https://xn--example-dk9c .com/foo%E2%87%A7bar/')
1341
+ >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/')
1289
1342
>>> print(url.to_iri().to_text())
1290
- https://→ example.com/foo⇧bar/
1343
+ https://ايران.example.com/foo⇧bar/
1291
1344
1292
1345
.. note::
1293
1346
0 commit comments