@@ -566,52 +566,99 @@ def test_EOF_in_charref(self):
566
566
for html , expected in data :
567
567
self ._run_check (html , expected )
568
568
569
- def test_broken_comments (self ):
569
def test_EOF_in_comments_or_decls(self):
    # Each input is a comment or declaration opener that is cut off by
    # the end of the input.  The expectation handed to _run_check is a
    # single 'data' event echoing the input verbatim.
    truncated_openers = (
        '<!',
        '<!-',
        '<!--',
        '<![',
        '<![CDATA[',
        '<![CDATA[x',
        '<!DOCTYPE',
        '<!DOCTYPE HTML',
    )
    for markup in truncated_openers:
        self._run_check(markup, [('data', markup)])
582
def test_bogus_comments(self):
    # Table of (markup, text of the 'comment' event expected for it).
    # The markup pieces are concatenated into one document and checked
    # in a single _run_check call, in table order.
    cases = [
        ('<! not really a comment >', ' not really a comment '),
        ('<! not a comment either -->', ' not a comment either --'),
        ('<! -- close enough -->', ' -- close enough --'),
        ('<!>', ''),
        ('<!<-- this was an empty comment>',
         '<-- this was an empty comment'),
        ('<!!! another bogus comment !!!>', '!! another bogus comment !!!'),
        # see #32876
        ('<![with square brackets]!>', '[with square brackets]!'),
        ('<![\n multiline\n bogusness\n ]!>',
         '[\n multiline\n bogusness\n ]!'),
        ('<![more brackets]-[and a hyphen]!>',
         '[more brackets]-[and a hyphen]!'),
        ('<![cdata[should be uppercase]]>', '[cdata[should be uppercase]]'),
        ('<![CDATA [whitespaces are not ignored]]>',
         '[CDATA [whitespaces are not ignored]]'),
        # required '[' after CDATA
        ('<![CDATA]]>', '[CDATA]]'),
    ]
    document = ''.join(markup for markup, _ in cases)
    events = [('comment', text) for _, text in cases]
    self._run_check(document, events)
584
611
585
612
def test_broken_condcoms(self):
    # These condcoms are missing the '--' after '<!' and before the '>'
    # and they are considered bogus comments according to
    # "8.2.4.42. Markup declaration open state".
    html = ('<![if !(IE)]>broken condcom<![endif]>'
            '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
            '<![if !IE 6]><img src="firefox.png" /><![endif]>'
            '<![if !ie 6]><b>foo</b><![endif]>'
            '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
    # Each 'comment' event carries the exact text between '<!' and '>',
    # brackets included and with no added padding.
    expected = [
        ('comment', '[if !(IE)]'),
        ('data', 'broken condcom'),
        ('comment', '[endif]'),
        ('comment', '[if ! IE]'),
        ('startendtag', 'link', [('href', 'favicon.tiff')]),
        ('comment', '[endif]'),
        ('comment', '[if !IE 6]'),
        ('startendtag', 'img', [('src', 'firefox.png')]),
        ('comment', '[endif]'),
        ('comment', '[if !ie 6]'),
        ('starttag', 'b', []),
        ('data', 'foo'),
        ('endtag', 'b'),
        ('comment', '[endif]'),
        ('comment', '[if (!IE)|(lt IE 9)]'),
        ('startendtag', 'img', [('src', 'mammoth.bmp')]),
        ('comment', '[endif]'),
    ]
    self._run_check(html, expected)
641
+
642
def test_cdata_declarations(self):
    # A well-formed CDATA section is expected to be reported as one
    # 'unknown decl' event whose text runs from 'CDATA[' up to, but not
    # including, the closing ']]>'.
    # More tests should be added.  See also "8.2.4.42. Markup
    # declaration open state", "8.2.4.69. CDATA section state",
    # and issue 32876.
    markup = '<![CDATA[just some plain text]]>'
    events = [('unknown decl', 'CDATA[just some plain text')]
    self._run_check(markup, events)
649
+
650
def test_cdata_declarations_multiline(self):
    # The CDATA payload contains '<', '>' and tag-like text.  The
    # expectation is that it comes back untouched inside a single
    # 'unknown decl' event, between the <code> start and end tags.
    payload = (' if (a < b && a > b) {'
               ' printf("[<marquee>How?</marquee>]");'
               ' }')
    markup = '<code><![CDATA[' + payload + ']]></code>'
    events = [
        ('starttag', 'code', []),
        ('unknown decl', 'CDATA[' + payload),
        ('endtag', 'code'),
    ]
    self._run_check(markup, events)
617
664
0 commit comments