Skip to content

Commit 76c0b01

Browse files
gh-77057: Fix handling of invalid markup declarations in HTMLParser (GH-9295)
Co-authored-by: Serhiy Storchaka <email address obscured in this capture>
1 parent e7741dd commit 76c0b01

File tree

3 files changed

+68
-19
lines changed

Lib/html/parser.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -278,7 +278,7 @@ def parse_html_declaration(self, i):
278278
if rawdata[i:i+4] == '<!--':
279279
# this case is actually already handled in goahead()
280280
return self.parse_comment(i)
281-
elif rawdata[i:i+3] == '<![':
281+
elif rawdata[i:i+9] == '<![CDATA[':
282282
return self.parse_marked_section(i)
283283
elif rawdata[i:i+9].lower() == '<!doctype':
284284
# find the closing >
@@ -295,7 +295,7 @@ def parse_html_declaration(self, i):
295295
def parse_bogus_comment(self, i, report=1):
296296
rawdata = self.rawdata
297297
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
298-
'parse_comment()')
298+
'parse_bogus_comment()')
299299
pos = rawdata.find('>', i+2)
300300
if pos == -1:
301301
return -1

Lib/test/test_htmlparser.py

Lines changed: 64 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -566,52 +566,99 @@ def test_EOF_in_charref(self):
566566
for html, expected in data:
567567
self._run_check(html, expected)
568568

569-
def test_broken_comments(self):
569+
def test_EOF_in_comments_or_decls(self):
570+
data = [
571+
('<!', [('data', '<!')]),
572+
('<!-', [('data', '<!-')]),
573+
('<!--', [('data', '<!--')]),
574+
('<![', [('data', '<![')]),
575+
('<![CDATA[', [('data', '<![CDATA[')]),
576+
('<![CDATA[x', [('data', '<![CDATA[x')]),
577+
('<!DOCTYPE', [('data', '<!DOCTYPE')]),
578+
('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
579+
]
580+
for html, expected in data:
581+
self._run_check(html, expected)
582+
def test_bogus_comments(self):
570583
html = ('<! not really a comment >'
571584
'<! not a comment either -->'
572585
'<! -- close enough -->'
573586
'<!><!<-- this was an empty comment>'
574-
'<!!! another bogus comment !!!>')
587+
'<!!! another bogus comment !!!>'
588+
# see #32876
589+
'<![with square brackets]!>'
590+
'<![\nmultiline\nbogusness\n]!>'
591+
'<![more brackets]-[and a hyphen]!>'
592+
'<![cdata[should be uppercase]]>'
593+
'<![CDATA [whitespaces are not ignored]]>'
594+
'<![CDATA]]>' # required '[' after CDATA
595+
)
575596
expected = [
576597
('comment', ' not really a comment '),
577598
('comment', ' not a comment either --'),
578599
('comment', ' -- close enough --'),
579600
('comment', ''),
580601
('comment', '<-- this was an empty comment'),
581602
('comment', '!! another bogus comment !!!'),
603+
('comment', '[with square brackets]!'),
604+
('comment', '[\nmultiline\nbogusness\n]!'),
605+
('comment', '[more brackets]-[and a hyphen]!'),
606+
('comment', '[cdata[should be uppercase]]'),
607+
('comment', '[CDATA [whitespaces are not ignored]]'),
608+
('comment', '[CDATA]]'),
582609
]
583610
self._run_check(html, expected)
584611

585612
def test_broken_condcoms(self):
586613
# these condcoms are missing the '--' after '<!' and before the '>'
614+
# and they are considered bogus comments according to
615+
# "8.2.4.42. Markup declaration open state"
587616
html = ('<![if !(IE)]>broken condcom<![endif]>'
588617
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
589618
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
590619
'<![if !ie 6]><b>foo</b><![endif]>'
591620
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
592-
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
593-
# and "8.2.4.45 Markup declaration open state", comment tokens should
594-
# be emitted instead of 'unknown decl', but calling unknown_decl
595-
# provides more flexibility.
596-
# See also Lib/_markupbase.py:parse_declaration
597621
expected = [
598-
('unknown decl', 'if !(IE)'),
622+
('comment', '[if !(IE)]'),
599623
('data', 'broken condcom'),
600-
('unknown decl', 'endif'),
601-
('unknown decl', 'if ! IE'),
624+
('comment', '[endif]'),
625+
('comment', '[if ! IE]'),
602626
('startendtag', 'link', [('href', 'favicon.tiff')]),
603-
('unknown decl', 'endif'),
604-
('unknown decl', 'if !IE 6'),
627+
('comment', '[endif]'),
628+
('comment', '[if !IE 6]'),
605629
('startendtag', 'img', [('src', 'firefox.png')]),
606-
('unknown decl', 'endif'),
607-
('unknown decl', 'if !ie 6'),
630+
('comment', '[endif]'),
631+
('comment', '[if !ie 6]'),
608632
('starttag', 'b', []),
609633
('data', 'foo'),
610634
('endtag', 'b'),
611-
('unknown decl', 'endif'),
612-
('unknown decl', 'if (!IE)|(lt IE 9)'),
635+
('comment', '[endif]'),
636+
('comment', '[if (!IE)|(lt IE 9)]'),
613637
('startendtag', 'img', [('src', 'mammoth.bmp')]),
614-
('unknown decl', 'endif')
638+
('comment', '[endif]')
639+
]
640+
self._run_check(html, expected)
641+
642+
def test_cdata_declarations(self):
643+
# More tests should be added. See also "8.2.4.42. Markup
644+
# declaration open state", "8.2.4.69. CDATA section state",
645+
# and issue 32876
646+
html = ('<![CDATA[just some plain text]]>')
647+
expected = [('unknown decl', 'CDATA[just some plain text')]
648+
self._run_check(html, expected)
649+
650+
def test_cdata_declarations_multiline(self):
651+
html = ('<code><![CDATA['
652+
' if (a < b && a > b) {'
653+
' printf("[<marquee>How?</marquee>]");'
654+
' }'
655+
']]></code>')
656+
expected = [
657+
('starttag', 'code', []),
658+
('unknown decl',
659+
'CDATA[ if (a < b && a > b) { '
660+
'printf("[<marquee>How?</marquee>]"); }'),
661+
('endtag', 'code')
615662
]
616663
self._run_check(html, expected)
617664

(file name missing from this capture)

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Fix handling of invalid markup declarations in
2+
:class:`html.parser.HTMLParser`.

0 commit comments

Comments
 (0)