Skip to content

Commit bf02ab5

Browse files
author
Greg Guthe
committed
sanitizer: update sanitize_uri_value for Python 3.9 urlparse
Use Python 3.9 urlparse scheme parsing behavior for all Python versions.

Changes:

* add utils._parse_uri_scheme to match Python 3.9 urlparse behavior
* add utils._is_valid_netloc_and_port with Django URL validator
* in test_uri_value_allowed_protocols:
  * add test case for implicit http for IP and port with path and fragment
  * add test case for data: scheme
  * add test case for relative path URI
  * test "is not allowed by default" test cases against default ALLOWED_PROTOCOLS
* convert test_invalid_uri_does_not_raise_error into a test case
1 parent be1ef9a commit bf02ab5

File tree

3 files changed

+125
-28
lines changed

3 files changed

+125
-28
lines changed

bleach/sanitizer.py

+39-21
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,12 @@
99
from xml.sax.saxutils import unescape
1010

1111
from bleach import html5lib_shim
12-
from bleach.utils import alphabetize_attributes, force_unicode
12+
from bleach.utils import (
13+
_is_valid_netloc_and_port,
14+
_parse_uri_scheme,
15+
alphabetize_attributes,
16+
force_unicode,
17+
)
1318

1419

1520
#: List of allowed tags
@@ -443,9 +448,20 @@ def sanitize_characters(self, token):
443448
return new_tokens
444449

445450
def sanitize_uri_value(self, value, allowed_protocols):
446-
"""Checks a uri value to see if it's allowed
451+
"""Checks a URI value to see if it's allowed
452+
453+
``urllib.parse.urlparse`` must be able to parse the URI.
454+
455+
The URI scheme must be in ``allowed_protocols`` or not have a
456+
scheme and begin with a ``#`` indicating a relative URI by
457+
fragment.
458+
459+
When ``"http"`` is in ``allowed_protocols`` (the default),
460+
``sanitize_uri_value`` also allows relative URIs matching an
461+
IP address or hostname and port (e.g. ``localhost:8000``) and
462+
relative URIs without a scheme (e.g. ``/path``).
447463
448-
:arg value: the uri value to sanitize
464+
:arg value: the URI value to sanitize
449465
:arg allowed_protocols: list of allowed protocols
450466
451467
:returns: allowed value or None
@@ -469,33 +485,35 @@ def sanitize_uri_value(self, value, allowed_protocols):
469485
new_value = new_value.lower()
470486

471487
try:
472-
# Drop attributes with uri values that have protocols that aren't
473-
# allowed
474-
parsed = urlparse(new_value)
488+
_ = urlparse(new_value)
475489
except ValueError:
476490
# URI is impossible to parse, therefore it's not allowed
477491
return None
478492

479-
if parsed.scheme:
480-
# If urlparse found a scheme, check that
481-
if parsed.scheme in allowed_protocols:
482-
return value
493+
# If there's no protocol/scheme specified, then assume it's "http"
494+
# and see if that's allowed
495+
implicit_http_allowed = "http" in allowed_protocols
483496

497+
# Drop attributes with uri values that have protocols that aren't
498+
# allowed
499+
scheme = _parse_uri_scheme(new_value)
500+
if scheme:
501+
if scheme in allowed_protocols:
502+
return value
503+
elif implicit_http_allowed and _is_valid_netloc_and_port(scheme):
504+
return value
505+
else:
506+
# parsed a disallowed protocol/scheme
507+
# or implicit protocols are allowed and it's an invalid netloc:port
508+
return None
484509
else:
485-
# Allow uris that are just an anchor
486510
if new_value.startswith("#"):
511+
# Allow uris that are just an anchor
487512
return value
488-
489-
# Handle protocols that urlparse doesn't recognize like "myprotocol"
490-
if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
491-
return value
492-
493-
# If there's no protocol/scheme specified, then assume it's "http"
494-
# and see if that's allowed
495-
if "http" in allowed_protocols:
513+
elif implicit_http_allowed:
496514
return value
497-
498-
return None
515+
else:
516+
return None
499517

500518
def allow_token(self, token):
501519
"""Handles the case where we're allowing the tag"""

bleach/utils.py

+52
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
from collections import OrderedDict
2+
import re
23

34
import six
45

6+
from bleach._vendor.django.core.validators import URLValidator
7+
58

69
def _attr_key(attr):
710
"""Returns appropriate key for sorting attribute names
@@ -40,3 +43,52 @@ def force_unicode(text):
4043

4144
# If not, convert it
4245
return six.text_type(text, "utf-8", "strict")
46+
47+
48+
# Case-insensitive pattern that must match the whole string: the vendored
# Django URLValidator netloc fragment followed by its port fragment.
netloc_port_re = re.compile(
    "".join(("^", URLValidator.netloc_re, URLValidator.port_re, "$")),
    re.IGNORECASE,
)
51+
52+
53+
# Every character that may appear in a URI scheme name: ASCII letters,
# digits, and the punctuation "+", "-" and ".".
scheme_chars = (
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "+-."
)
57+
58+
59+
def _is_valid_netloc_and_port(netloc):
    """
    Returns whether the text matches the netloc-and-port pattern

    The text must match ``netloc_port_re`` and the matched text must be
    at most 253 characters long.

    :arg str/unicode netloc: the candidate network location text to check

    :returns: bool

    """
    # The maximum length of a full host name is 253 characters per RFC 1034
    # section 3.1. It's defined to be 255 bytes or less, but this includes
    # one byte for the length of the name and one byte for the trailing dot
    # that's used to indicate absolute names in DNS.
    match = netloc_port_re.match(netloc)
    return match is not None and len(match.group(0)) < 254
74+
75+
76+
def _parse_uri_scheme(uri):
    """
    Returns the scheme for a URI or None when the URI has no valid scheme

    Replicates the scheme detection of Python 3.9's ``urlparse`` so the
    result does not depend on the running Python version: the text before
    the first ``:`` is a scheme only when it is non-empty and every one of
    its characters is in ``scheme_chars``.

    :arg str/unicode uri: the URI to inspect

    :returns: text or None

    """
    i = uri.find(":")
    if i > 0:
        scheme = uri[:i]
        # A single character outside scheme_chars means the prefix is not a
        # scheme at all (e.g. "/path:x"), so fall through and report None
        # just as Python 3.9's urlparse reports an empty scheme.
        if all(c in scheme_chars for c in scheme):
            return scheme

    return None

tests/test_clean.py

+34-7
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from bleach import clean
88
from bleach.html5lib_shim import Filter
9-
from bleach.sanitizer import Cleaner
9+
from bleach.sanitizer import ALLOWED_PROTOCOLS, Cleaner
1010
from bleach._vendor.html5lib.constants import rcdataElements
1111

1212

@@ -58,10 +58,6 @@ def test_html_is_lowercased():
5858
)
5959

6060

61-
def test_invalid_uri_does_not_raise_error():
62-
assert clean('<a href="http://example.com]">text</a>') == "<a>text</a>"
63-
64-
6561
@pytest.mark.parametrize(
6662
"data, should_strip, expected",
6763
[
@@ -471,10 +467,31 @@ def test_attributes_list():
471467
@pytest.mark.parametrize(
472468
"data, kwargs, expected",
473469
[
470+
# invalid URI (urlparse raises a ValueError: Invalid IPv6 URL)
471+
# is not allowed by default
472+
(
473+
'<a href="http://example.com]">text</a>',
474+
{"protocols": ALLOWED_PROTOCOLS},
475+
"<a>text</a>",
476+
),
477+
# data protocol is not allowed by default
478+
(
479+
'<a href="data:text/javascript,prompt(1)">foo</a>',
480+
{"protocols": ALLOWED_PROTOCOLS},
481+
"<a>foo</a>",
482+
),
474483
# javascript: is not allowed by default
475-
("<a href=\"javascript:alert('XSS')\">xss</a>", {}, "<a>xss</a>"),
484+
(
485+
"<a href=\"javascript:alert('XSS')\">xss</a>",
486+
{"protocols": ALLOWED_PROTOCOLS},
487+
"<a>xss</a>",
488+
),
476489
# File protocol is not allowed by default
477-
('<a href="file:///tmp/foo">foo</a>', {}, "<a>foo</a>"),
490+
(
491+
'<a href="file:///tmp/foo">foo</a>',
492+
{"protocols": ALLOWED_PROTOCOLS},
493+
"<a>foo</a>",
494+
),
478495
# Specified protocols are allowed
479496
(
480497
'<a href="myprotocol://more_text">allowed href</a>',
@@ -494,6 +511,11 @@ def test_attributes_list():
494511
'<a href="#example.com">foo</a>',
495512
),
496513
# Allow implicit http if allowed
514+
(
515+
'<a href="/path">valid</a>',
516+
{"protocols": ["http"]},
517+
'<a href="/path">valid</a>',
518+
),
497519
(
498520
'<a href="example.com">valid</a>',
499521
{"protocols": ["http"]},
@@ -524,6 +546,11 @@ def test_attributes_list():
524546
{"protocols": ["http"]},
525547
'<a href="192.168.100.100:8000">valid</a>',
526548
),
549+
(
550+
'<a href="192.168.100.100:8000/foo#bar">valid</a>',
551+
{"protocols": ["http"]},
552+
'<a href="192.168.100.100:8000/foo#bar">valid</a>',
553+
),
527554
# Disallow implicit http if disallowed
528555
('<a href="example.com">foo</a>', {"protocols": []}, "<a>foo</a>"),
529556
('<a href="example.com:8000">foo</a>', {"protocols": []}, "<a>foo</a>"),

0 commit comments

Comments
 (0)