12
12
from html5lib import utils
13
13
from xml .sax .saxutils import escape
14
14
15
+ import re
16
+
15
17
spaceCharacters = u"" .join (spaceCharacters )
16
18
17
19
try :
@@ -84,6 +86,7 @@ class HTMLSerializer(object):
84
86
resolve_entities = True
85
87
86
88
# miscellaneous options
89
+ emit_doctype = 'preserve'
87
90
inject_meta_charset = True
88
91
strip_whitespace = False
89
92
sanitize = False
@@ -92,13 +95,23 @@ class HTMLSerializer(object):
92
95
"minimize_boolean_attributes" , "use_trailing_solidus" ,
93
96
"space_before_trailing_solidus" , "omit_optional_tags" ,
94
97
"strip_whitespace" , "inject_meta_charset" , "escape_lt_in_attrs" ,
95
- "escape_rcdata" , "resolve_entities" , "sanitize" )
98
+ "escape_rcdata" , "resolve_entities" , "emit_doctype" , " sanitize" )
96
99
97
100
def __init__ (self , ** kwargs ):
98
101
"""Initialize HTMLSerializer.
99
102
100
103
Keyword options (default given first unless specified) include:
101
104
105
+ emit_doctype='html'|'xhtml'|'html5'|'preserve'
106
+ Whether to output a doctype.
107
+ * emit_doctype='xhtml' preserves unknown doctypes and valid
108
+ XHTML doctypes, converts valid HTML doctypes to their XHTML
109
+ counterparts, and drops <!DOCTYPE html>
110
+ * emit_doctype='html' preserves unknown doctypes and valid
111
+ HTML doctypes, converts valid XHTML doctypes to their HTML
112
+ counterparts, and uses <!DOCTYPE html> for missing doctypes
113
+ * emit_doctype='html5' Uses <!DOCTYPE html> as the doctype
114
+ * emit_doctype='preserve' preserves the doctype, if any, unchanged
102
115
inject_meta_charset=True|False
103
116
..?
104
117
quote_attr_values=True|False
@@ -140,6 +153,86 @@ def __init__(self, **kwargs):
140
153
self .errors = []
141
154
self .strict = False
142
155
156
def calc_doctype(self, token=None):
    """Return the doctype declaration string to emit.

    ``token`` is the Doctype token (a mapping with ``"name"``,
    ``"publicId"`` and ``"systemId"`` keys), or None when the document
    has no doctype.  The result depends on ``self.emit_doctype``:

    * ``'html5'``: always ``<!DOCTYPE html>``.
    * ``'html'``: ``<!DOCTYPE html>`` when there is no token; known
      XHTML 1.x doctypes are converted to their HTML 4.01 counterparts;
      anything else is reproduced unchanged.
    * ``'xhtml'``: known HTML 4.0/4.01/3.2 doctypes are converted to
      their XHTML 1.0 counterparts; anything else is reproduced
      unchanged.
    * any other value (e.g. ``'preserve'``): the doctype is reproduced
      unchanged.

    When there is no token and no doctype has to be synthesised, an
    empty string is returned instead of failing on the missing token.

    A system identifier is only rewritten when the original doctype
    carried one.  If the system identifier contains both quote
    characters, ``self.serializeError`` is called.

    NOTE(review): the option documentation in ``__init__`` says
    ``'xhtml'`` drops ``<!DOCTYPE html>``; that is not implemented
    here (the doctype is passed through) -- confirm intended behaviour.
    """
    mode = self.emit_doctype

    if mode == 'html5' or (not token and mode == 'html'):
        if token:
            return u'<!DOCTYPE html>'
        # Synthesised doctype: followed by a line break so the root
        # element starts on its own line.
        # NOTE(review): literal kept exactly as in the original,
        # including the character after the newline -- confirm.
        return u'<!DOCTYPE html>\n '

    if not token:
        # Nothing to preserve or convert; previously this fell through
        # to token["name"] and raised TypeError on None.
        return u''

    rootElement = token["name"]
    publicID = token["publicId"]
    systemID = token["systemId"]

    # Each entry: (accepted public ids, accepted system ids,
    #              replacement public id, replacement system id).
    # Exact comparison is used so look-alike identifiers are treated
    # as unknown doctypes and preserved.
    conversions = ()
    if mode == u'html':
        conversions = (
            # XHTML 1.1
            ((u"-//W3C//DTD XHTML 1.1//EN",),
             (u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",),
             u"-//W3C//DTD HTML 4.01//EN",
             u"http://www.w3.org/TR/html4/strict.dtd"),
            # XHTML 1.0 Strict
            ((u"-//W3C//DTD XHTML 1.0 Strict//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",),
             u"-//W3C//DTD HTML 4.01//EN",
             u"http://www.w3.org/TR/html4/strict.dtd"),
            # XHTML 1.0 Transitional
            ((u"-//W3C//DTD XHTML 1.0 Transitional//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",),
             u"-//W3C//DTD HTML 4.01 Transitional//EN",
             u"http://www.w3.org/TR/html4/loose.dtd"),
            # XHTML 1.0 Frameset
            ((u"-//W3C//DTD XHTML 1.0 Frameset//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",),
             u"-//W3C//DTD HTML 4.01 Frameset//EN",
             u"http://www.w3.org/TR/html4/frameset.dtd"),
        )
    elif mode == u'xhtml':
        conversions = (
            # HTML 4.0 / 4.01 Strict
            ((u"-//W3C//DTD HTML 4.0//EN",
              u"-//W3C//DTD HTML 4.01//EN"),
             (u"http://www.w3.org/TR/html4/strict.dtd",
              u"http://www.w3.org/TR/REC-html40/strict.dtd"),
             u"-//W3C//DTD XHTML 1.0 Strict//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
            # HTML 4.0 / 4.01 Transitional
            ((u"-//W3C//DTD HTML 4.0 Transitional//EN",
              u"-//W3C//DTD HTML 4.01 Transitional//EN"),
             (u"http://www.w3.org/TR/html4/loose.dtd",
              u"http://www.w3.org/TR/REC-html40/loose.dtd"),
             u"-//W3C//DTD XHTML 1.0 Transitional//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"),
            # HTML 4.0 / 4.01 Frameset
            ((u"-//W3C//DTD HTML 4.0 Frameset//EN",
              u"-//W3C//DTD HTML 4.01 Frameset//EN"),
             (u"http://www.w3.org/TR/html4/frameset.dtd",
              u"http://www.w3.org/TR/REC-html40/frameset.dtd"),
             u"-//W3C//DTD XHTML 1.0 Frameset//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"),
            # HTML 3.2: only converted when no system id is present
            # (empty tuple of accepted system ids).
            ((u"-//W3C//DTD HTML 3.2//EN",
              u"-//W3C//DTD HTML 3.2 Final//EN"),
             (),
             u"-//W3C//DTD XHTML 1.0 Transitional//EN",
             None),
        )

    # Only doctypes whose root element is "html" (any case) are
    # candidates for conversion; a missing name never matches.
    if rootElement and rootElement.lower() == u'html':
        for publics, systems, new_public, new_system in conversions:
            if publicID in publics and \
                    (not systemID or systemID in systems):
                publicID = new_public
                # Keep the "public id only" form if that is what the
                # source document used.
                if systemID:
                    systemID = new_system
                break

    doctype = u"<!DOCTYPE %s" % rootElement
    if publicID:
        doctype += u' PUBLIC "%s"' % publicID
    elif systemID:
        doctype += u" SYSTEM"
    if systemID:
        if systemID.find(u'"') >= 0:
            if systemID.find(u"'") >= 0:
                self.serializeError(_("System identifer contains both single and double quote characters"))
            quote_char = u"'"
        else:
            quote_char = u'"'
        doctype += u" %s%s%s" % (quote_char, systemID, quote_char)
    doctype += u">"
    return doctype
235
+
143
236
def serialize (self , treewalker , encoding = None ):
144
237
in_cdata = False
145
238
self .errors = []
@@ -157,26 +250,12 @@ def serialize(self, treewalker, encoding=None):
157
250
if self .omit_optional_tags :
158
251
from html5lib .filters .optionaltags import Filter
159
252
treewalker = Filter (treewalker )
253
+ posted_doctype = False
160
254
for token in treewalker :
161
255
type = token ["type" ]
162
256
if type == "Doctype" :
163
- doctype = u"<!DOCTYPE %s" % token ["name" ]
164
-
165
- if token ["publicId" ]:
166
- doctype += u' PUBLIC "%s"' % token ["publicId" ]
167
- elif token ["systemId" ]:
168
- doctype += u" SYSTEM"
169
- if token ["systemId" ]:
170
- if token ["systemId" ].find (u'"' ) >= 0 :
171
- if token ["systemId" ].find (u"'" ) >= 0 :
172
- self .serializeError (_ ("System identifer contains both single and double quote characters" ))
173
- quote_char = u"'"
174
- else :
175
- quote_char = u'"'
176
- doctype += u" %s%s%s" % (quote_char , token ["systemId" ], quote_char )
177
-
178
- doctype += u">"
179
-
257
+ posted_doctype = True
258
+ doctype = self .calc_doctype (token )
180
259
if encoding :
181
260
yield doctype .encode (encoding )
182
261
else :
@@ -196,6 +275,9 @@ def serialize(self, treewalker, encoding=None):
196
275
yield escape (token ["data" ])
197
276
198
277
elif type in ("StartTag" , "EmptyTag" ):
278
+ if not posted_doctype :
279
+ posted_doctype = True
280
+ yield self .calc_doctype ()
199
281
name = token ["name" ]
200
282
if name in rcdataElements and not self .escape_rcdata :
201
283
in_cdata = True
0 commit comments