@@ -579,6 +579,50 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
579
579
580
580
581
581
def _decode_host (host ):
582
+ """Decode a host from ASCII-encodable text to IDNA-decoded text. If
583
+ the host text is not ASCII, it is returned unchanged, as it is
584
+ presumed that it is already IDNA-decoded.
585
+
586
+ Some technical details: _decode_host is built on top of the "idna"
587
+ package, which has some quirks:
588
+
589
+ Capital letters are not valid IDNA2008. The idna package will
590
+ raise an exception like this on capital letters:
591
+
592
+ > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
593
+
594
+ However, if a segment of a host (i.e., something in
595
+ url.host.split('.')) is already ASCII, idna doesn't perform its
596
+ usual checks. In fact, for capital letters it automatically
597
+ lowercases them.
598
+
599
+ This check and some other functionality can be bypassed by passing
600
+ uts46=True to idna.encode/decode. This allows a more permissive and
601
+ convenient interface. So far it seems like the balanced approach.
602
+
603
+ Example output (from idna==2.6):
604
+
605
+ >> idna.encode(u'mahmöud.io')
606
+ 'xn--mahmud-zxa.io'
607
+ >> idna.encode(u'Mahmöud.io')
608
+ Traceback (most recent call last):
609
+ File "<stdin>", line 1, in <module>
610
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
611
+ result.append(alabel(label))
612
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
613
+ check_label(label)
614
+ File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
615
+ raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
616
+ idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
617
+ >> idna.encode(u'Mahmoud.io')
618
+ 'Mahmoud.io'
619
+
620
+ # Similar behavior for decodes below
621
+ >> idna.decode(u'Mahmoud.io')
622
+ u'mahmoud.io'
623
+ >> idna.decode(u'Méhmoud.io', uts46=True)
624
+ u'm\xe9hmoud.io'
625
+ """
582
626
if not host :
583
627
return u''
584
628
try :
@@ -1802,38 +1846,3 @@ def parse(url, decoded=True, lazy=False):
1802
1846
return enc_url
1803
1847
dec_url = DecodedURL (enc_url , lazy = lazy )
1804
1848
return dec_url
1805
-
1806
- """idna package notes:
1807
-
1808
- * If a segment of a host (i.e., something in url.host.split('.')) is
1809
- already ascii, idna doesn't perform its usual checks. For instance,
1810
- capital letters are not valid idna2008. The package automatically lowercases.
1811
-
1812
- You'll get something like:
1813
-
1814
- > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
1815
-
1816
- This check and some other functionality can be bypassed by passing
1817
- uts46=True to encode/decode. This allows a more permission and
1818
- convenient interface. So far it seems like the balanced approach.
1819
-
1820
- However, all of this is bypassed if the string segment contains no
1821
- unicode characters.
1822
-
1823
- Example output:
1824
-
1825
- >>> idna.encode(u'mahmöud.io')
1826
- 'xn--mahmud-zxa.io'
1827
- >>> idna.encode(u'Mahmöud.io')
1828
- Traceback (most recent call last):
1829
- File "<stdin>", line 1, in <module>
1830
- File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
1831
- result.append(alabel(label))
1832
- File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
1833
- check_label(label)
1834
- File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
1835
- raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
1836
- idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
1837
- >>> idna.encode(u'Mahmoud.io')
1838
- 'Mahmoud.io'
1839
- """
0 commit comments