Skip to content

Commit 0e39324

Browse files
committed
Add patch from issue 152 by fantasai
--HG-- branch : csswg-testsuite
1 parent 21bf1ad commit 0e39324

File tree

1 file changed

+100
-18
lines changed

1 file changed

+100
-18
lines changed

html5lib/serializer/htmlserializer.py

+100-18
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from html5lib import utils
1313
from xml.sax.saxutils import escape
1414

15+
import re
16+
1517
spaceCharacters = u"".join(spaceCharacters)
1618

1719
try:
@@ -84,6 +86,7 @@ class HTMLSerializer(object):
8486
resolve_entities = True
8587

8688
# miscellaneous options
89+
emit_doctype = 'preserve'
8790
inject_meta_charset = True
8891
strip_whitespace = False
8992
sanitize = False
@@ -92,13 +95,23 @@ class HTMLSerializer(object):
9295
"minimize_boolean_attributes", "use_trailing_solidus",
9396
"space_before_trailing_solidus", "omit_optional_tags",
9497
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
95-
"escape_rcdata", "resolve_entities", "sanitize")
98+
"escape_rcdata", "resolve_entities", "emit_doctype", "sanitize")
9699

97100
def __init__(self, **kwargs):
98101
"""Initialize HTMLSerializer.
99102
100103
Keyword options (default given first unless specified) include:
101104
105+
emit_doctype='html'|'xhtml'|'html5'|'preserve'
106+
Whether and how to output a doctype (default: 'preserve').
107+
* emit_doctype='xhtml' preserves unknown doctypes and valid
108+
XHTML doctypes, converts valid HTML doctypes to their XHTML
109+
counterparts, and drops <!DOCTYPE html>
110+
* emit_doctype='html' preserves unknown doctypes and valid
111+
HTML doctypes, converts valid XHTML doctypes to their HTML
112+
counterparts, and uses <!DOCTYPE html> for missing doctypes
113+
* emit_doctype='html5' Uses <!DOCTYPE html> as the doctype
114+
* emit_doctype='preserve' preserves the doctype, if any, unchanged
102115
inject_meta_charset=True|False
103116
..?
104117
quote_attr_values=True|False
@@ -140,6 +153,86 @@ def __init__(self, **kwargs):
140153
self.errors = []
141154
self.strict = False
142155

156+
def calc_doctype(self, token=None):
    """Return the doctype declaration to emit, according to emit_doctype.

    token is the "Doctype" token taken from the tree walker (a mapping
    with "name", "publicId" and "systemId" entries), or None when the
    document has no doctype.

    Behaviour per self.emit_doctype:

    * 'html5': always returns <!DOCTYPE html>.
    * 'html': returns <!DOCTYPE html> when there is no token; rewrites
      the known XHTML 1.0/1.1 public identifiers to their HTML 4.01
      counterparts; keeps any other doctype as-is.
    * 'xhtml': rewrites the known HTML 3.2/4.0/4.01 public identifiers
      to their XHTML 1.0 counterparts; keeps any other doctype as-is.
    * any other value (e.g. 'preserve'): rebuilds the doctype from the
      token unchanged.

    A doctype synthesized for a document without one is followed by a
    newline.  When there is no token and the mode does not synthesize a
    doctype, u"" is returned so callers can emit the result
    unconditionally.

    NOTE(review): the option documentation says 'xhtml' should drop
    <!DOCTYPE html>; this method passes it through unchanged -- confirm
    which behaviour is intended.
    """
    if self.emit_doctype == 'html5' or \
       (not token and self.emit_doctype == 'html'):
        if token:
            return u'<!DOCTYPE html>'
        return u'<!DOCTYPE html>\n'

    if not token:
        # No doctype in the document and nothing to synthesize
        # ('preserve', 'xhtml', ...).  Previously this fell through to
        # token["name"] and raised TypeError on None.
        return u''

    rootElement = token["name"]
    publicID = token["publicId"]
    systemID = token["systemId"]

    if re.match(u'html', rootElement, re.IGNORECASE):
        if self.emit_doctype == u'html':
            # A system identifier is only rewritten when one was present,
            # so a doctype without one stays without one.
            # XHTML 1.1
            if publicID == u"-//W3C//DTD XHTML 1.1//EN" and (not systemID \
               or systemID == u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"):
                publicID = u"-//W3C//DTD HTML 4.01//EN"
                if systemID:
                    systemID = u"http://www.w3.org/TR/html4/strict.dtd"
            # XHTML 1.0 Strict
            elif publicID == u"-//W3C//DTD XHTML 1.0 Strict//EN" and (not systemID \
                 or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"):
                publicID = u"-//W3C//DTD HTML 4.01//EN"
                if systemID:
                    systemID = u"http://www.w3.org/TR/html4/strict.dtd"
            # XHTML 1.0 Transitional
            elif publicID == u"-//W3C//DTD XHTML 1.0 Transitional//EN" and (not systemID \
                 or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"):
                publicID = u"-//W3C//DTD HTML 4.01 Transitional//EN"
                if systemID:
                    systemID = u"http://www.w3.org/TR/html4/loose.dtd"
            # XHTML 1.0 Frameset
            elif publicID == u"-//W3C//DTD XHTML 1.0 Frameset//EN" and (not systemID \
                 or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"):
                publicID = u"-//W3C//DTD HTML 4.01 Frameset//EN"
                if systemID:
                    systemID = u"http://www.w3.org/TR/html4/frameset.dtd"
        elif self.emit_doctype == u'xhtml' and publicID:
            # Guarded on publicID: re.match() raises TypeError on None,
            # and an empty public identifier cannot match any pattern.
            # HTML 4.01 Strict
            if re.match(u"-//W3C//DTD HTML 4.0(1)?//EN", publicID) and \
               (not systemID or \
                re.match(u"http://www.w3.org/TR/(html4|REC-html40)/strict.dtd", systemID)):
                publicID = u"-//W3C//DTD XHTML 1.0 Strict//EN"
                if systemID:
                    systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
            # HTML4.01 Transitional
            elif re.match(u"-//W3C//DTD HTML 4.0(1)? Transitional//EN", publicID) and \
                 (not systemID or \
                  re.match(u"http://www.w3.org/TR/(html4|REC-html40)/loose.dtd", systemID)):
                publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN"
                if systemID:
                    systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
            # HTML 4.01 Frameset
            elif re.match(u"-//W3C//DTD HTML 4.0(1)? Frameset//EN", publicID) and \
                 (not systemID or \
                  re.match(u"http://www.w3.org/TR/(html4|REC-html40)/frameset.dtd", systemID)):
                publicID = u"-//W3C//DTD XHTML 1.0 Frameset//EN"
                if systemID:
                    systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"
            # HTML 3.2
            elif re.match(u"-//W3C//DTD HTML 3.2( Final)?//EN", publicID) and not systemID:
                publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN"

    doctype = u"<!DOCTYPE %s" % rootElement
    # The rewrites above only ever replace a non-empty public identifier
    # with another non-empty one, so testing the local is equivalent to
    # testing token["publicId"].
    if publicID:
        doctype += u' PUBLIC "%s"' % publicID
    elif systemID:
        doctype += u" SYSTEM"
    if systemID:
        # Quote with whichever character the identifier does not contain.
        if systemID.find(u'"') >= 0:
            if systemID.find(u"'") >= 0:
                self.serializeError(_("System identifer contains both single and double quote characters"))
            quote_char = u"'"
        else:
            quote_char = u'"'
        doctype += u" %s%s%s" % (quote_char, systemID, quote_char)
    doctype += u">"
    return doctype
235+
143236
def serialize(self, treewalker, encoding=None):
144237
in_cdata = False
145238
self.errors = []
@@ -157,26 +250,12 @@ def serialize(self, treewalker, encoding=None):
157250
if self.omit_optional_tags:
158251
from html5lib.filters.optionaltags import Filter
159252
treewalker = Filter(treewalker)
253+
posted_doctype = False
160254
for token in treewalker:
161255
type = token["type"]
162256
if type == "Doctype":
163-
doctype = u"<!DOCTYPE %s" % token["name"]
164-
165-
if token["publicId"]:
166-
doctype += u' PUBLIC "%s"' % token["publicId"]
167-
elif token["systemId"]:
168-
doctype += u" SYSTEM"
169-
if token["systemId"]:
170-
if token["systemId"].find(u'"') >= 0:
171-
if token["systemId"].find(u"'") >= 0:
172-
self.serializeError(_("System identifer contains both single and double quote characters"))
173-
quote_char = u"'"
174-
else:
175-
quote_char = u'"'
176-
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
177-
178-
doctype += u">"
179-
257+
posted_doctype = True
258+
doctype = self.calc_doctype(token)
180259
if encoding:
181260
yield doctype.encode(encoding)
182261
else:
@@ -196,6 +275,9 @@ def serialize(self, treewalker, encoding=None):
196275
yield escape(token["data"])
197276

198277
elif type in ("StartTag", "EmptyTag"):
278+
if not posted_doctype:
279+
posted_doctype = True
280+
yield self.calc_doctype()
199281
name = token["name"]
200282
if name in rcdataElements and not self.escape_rcdata:
201283
in_cdata = True

0 commit comments

Comments
 (0)