Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: html5lib/html5lib-python
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: master
Choose a base ref
...
head repository: html5lib/html5lib-python
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: csswg-testsuite
Choose a head ref
Can’t automatically merge. Don’t worry, you can still create the pull request.
  • 4 commits
  • 1 file changed
  • 1 contributor

Commits on Jul 20, 2010

  1. Add patch from issue 150 by fantasai

    --HG--
    branch : csswg-testsuite
    gsnedders committed Jul 20, 2010
    Copy the full SHA
    21bf1ad View commit details
  2. Add patch from issue 152 by fantasai

    --HG--
    branch : csswg-testsuite
    gsnedders committed Jul 20, 2010
    Copy the full SHA
    0e39324 View commit details
  3. Add patch from issue 153 by fantasai

    --HG--
    branch : csswg-testsuite
    gsnedders committed Jul 20, 2010
    Copy the full SHA
    f0a8c6e View commit details
  4. Add patch from issue 154 by fantasai

    --HG--
    branch : csswg-testsuite
    gsnedders committed Jul 20, 2010
    Copy the full SHA
    956d9b8 View commit details
Showing with 157 additions and 19 deletions.
  1. +157 −19 html5lib/serializer/htmlserializer.py
176 changes: 157 additions & 19 deletions html5lib/serializer/htmlserializer.py
Original file line number Diff line number Diff line change
@@ -12,6 +12,8 @@
from html5lib import utils
from xml.sax.saxutils import escape

import re

spaceCharacters = u"".join(spaceCharacters)

try:
@@ -84,24 +86,160 @@ class HTMLSerializer(object):
resolve_entities = True

# miscellaneous options
emit_doctype = 'preserve'
inject_meta_charset = True
lang_attr = 'preserve'
strip_whitespace = False
sanitize = False

options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", "resolve_entities", "sanitize")
"escape_rcdata", "resolve_entities", "emit_doctype", "lang_attr",
"sanitize")

def __init__(self, **kwargs):
    """Initialize HTMLSerializer.

    Every name listed in ``self.options`` may be passed as a keyword
    argument; options that are not passed keep the class-level default.
    Passing ``quote_char`` explicitly also switches off
    ``use_best_quote_char`` (unless that option is passed as well).

    Keyword options (default given first unless specified) include:

    emit_doctype='html'|'xhtml'|'html5'|'preserve'
      Whether to output a doctype. The default is 'preserve'.
        * emit_doctype='xhtml' preserves unknown doctypes and valid
          XHTML doctypes, converts valid HTML doctypes to their XHTML
          counterparts, and drops <!DOCTYPE html>
        * emit_doctype='html' preserves unknown doctypes and valid
          HTML doctypes, converts valid XHTML doctypes to their HTML
          counterparts, and uses <!DOCTYPE html> for missing doctypes
        * emit_doctype='html5' Uses <!DOCTYPE html> as the doctype
        * emit_doctype='preserve' preserves the doctype, if any, unchanged
    inject_meta_charset=True|False
      ..?
    lang_attr='preserve'|'xml'|'html'
      Whether to translate 'lang' attributes.
        * lang_attr='preserve' does no translation
        * lang_attr='xml' translates 'lang' to 'xml:lang'
        * lang_attr='html' translates 'xml:lang' to 'lang'
    quote_attr_values=True|False
      Whether to quote attribute values that don't require quoting
      per HTML5 parsing rules.
    quote_char=u'"'|u"'"
      Use given quote character for attribute quoting. Default is to
      use double quote unless attribute value contains a double quote,
      in which case single quotes are used instead.
    escape_lt_in_attrs=False|True
      Whether to escape < in attribute values.
    escape_rcdata=False|True
      ..?
    resolve_entities=True|False
      Whether to resolve named character entities that appear in the
      source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
      are unaffected by this setting.
    strip_whitespace=False|True
      ..?
    minimize_boolean_attributes=True|False
      Shortens boolean attributes to give just the attribute value,
      for example <input disabled="disabled"> becomes <input disabled>.
    use_trailing_solidus
      Includes a close-tag slash at the end of the start tag of void
      elements (empty elements whose end tag is forbidden). E.g. <hr/>.
    space_before_trailing_solidus
      Places a space immediately before the closing slash in a tag
      using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
    sanitize
      Strip all unsafe or unknown constructs from output.
      See `html5lib user documentation`_

    .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
    """
    # ``in`` replaces dict.has_key(), which no longer exists on Python 3
    # and behaves identically on Python 2.
    if 'quote_char' in kwargs:
        # An explicit quote character overrides automatic selection; the
        # loop below can still re-enable it if the caller asked for both.
        self.use_best_quote_char = False
    for attr in self.options:
        # Fall back to the current (class-level or just-set) value.
        setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
    self.errors = []
    self.strict = False

def calc_doctype(self, token=None):
    """Return the doctype declaration to emit for ``token``.

    ``token`` is a Doctype token (a mapping with "name", "publicId" and
    "systemId" keys) or None when the document carried no doctype.
    The result depends on ``self.emit_doctype``:

      * 'html5'    - always u'<!DOCTYPE html>'
      * 'html'     - u'<!DOCTYPE html>' when there is no token; known XHTML
                     1.0/1.1 identifiers are rewritten to their HTML 4.01
                     counterparts; anything else is kept
      * 'xhtml'    - known HTML 3.2/4.0/4.01 identifiers are rewritten to
                     their XHTML 1.0 counterparts; anything else is kept
      * 'preserve' - the token is written out unchanged

    A doctype synthesized for a missing token ends with a newline; one
    produced from a real token does not. When there is no token and
    nothing is to be synthesized ('preserve', 'xhtml') an empty string is
    returned instead of failing on ``None``.

    A system identifier is only rewritten when the source had one; a
    public-only doctype stays public-only.

    NOTE(review): the constructor documentation says 'xhtml' drops
    <!DOCTYPE html>; this method keeps it - confirm which is intended.
    """
    if self.emit_doctype == 'html5' or \
            (not token and self.emit_doctype == 'html'):
        if token:
            return u'<!DOCTYPE html>'
        return u'<!DOCTYPE html>\n'

    if not token:
        # No doctype in the source and none requested: emit nothing
        # rather than subscripting None below.
        return u''

    rootElement = token["name"]
    publicID = token["publicId"]
    systemID = token["systemId"]

    # Each row: (accepted public ids, accepted system ids,
    #            replacement public id, replacement system id).
    # Identifiers are compared exactly, so None/empty values and
    # look-alike strings never match.
    conversions = ()
    if self.emit_doctype == u'html':
        conversions = (
            # XHTML 1.1
            ((u"-//W3C//DTD XHTML 1.1//EN",),
             (u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",),
             u"-//W3C//DTD HTML 4.01//EN",
             u"http://www.w3.org/TR/html4/strict.dtd"),
            # XHTML 1.0 Strict
            ((u"-//W3C//DTD XHTML 1.0 Strict//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",),
             u"-//W3C//DTD HTML 4.01//EN",
             u"http://www.w3.org/TR/html4/strict.dtd"),
            # XHTML 1.0 Transitional
            ((u"-//W3C//DTD XHTML 1.0 Transitional//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",),
             u"-//W3C//DTD HTML 4.01 Transitional//EN",
             u"http://www.w3.org/TR/html4/loose.dtd"),
            # XHTML 1.0 Frameset
            ((u"-//W3C//DTD XHTML 1.0 Frameset//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",),
             u"-//W3C//DTD HTML 4.01 Frameset//EN",
             u"http://www.w3.org/TR/html4/frameset.dtd"),
        )
    elif self.emit_doctype == u'xhtml':
        conversions = (
            # HTML 4.0 / 4.01 Strict
            ((u"-//W3C//DTD HTML 4.0//EN",
              u"-//W3C//DTD HTML 4.01//EN"),
             (u"http://www.w3.org/TR/html4/strict.dtd",
              u"http://www.w3.org/TR/REC-html40/strict.dtd"),
             u"-//W3C//DTD XHTML 1.0 Strict//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
            # HTML 4.0 / 4.01 Transitional
            ((u"-//W3C//DTD HTML 4.0 Transitional//EN",
              u"-//W3C//DTD HTML 4.01 Transitional//EN"),
             (u"http://www.w3.org/TR/html4/loose.dtd",
              u"http://www.w3.org/TR/REC-html40/loose.dtd"),
             u"-//W3C//DTD XHTML 1.0 Transitional//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"),
            # HTML 4.0 / 4.01 Frameset
            ((u"-//W3C//DTD HTML 4.0 Frameset//EN",
              u"-//W3C//DTD HTML 4.01 Frameset//EN"),
             (u"http://www.w3.org/TR/html4/frameset.dtd",
              u"http://www.w3.org/TR/REC-html40/frameset.dtd"),
             u"-//W3C//DTD XHTML 1.0 Frameset//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"),
            # HTML 3.2: only converted when no system id is present, so
            # the empty system-id list and replacement are never used.
            ((u"-//W3C//DTD HTML 3.2//EN",
              u"-//W3C//DTD HTML 3.2 Final//EN"),
             (),
             u"-//W3C//DTD XHTML 1.0 Transitional//EN",
             None),
        )

    # Only doctypes whose root element is exactly "html" (any case) are
    # candidates for conversion; a missing name is left untouched.
    if rootElement and rootElement.lower() == u"html":
        for publicIDs, systemIDs, newPublicID, newSystemID in conversions:
            if publicID in publicIDs and \
                    (not systemID or systemID in systemIDs):
                publicID = newPublicID
                if systemID:
                    systemID = newSystemID
                break

    doctype = u"<!DOCTYPE %s" % rootElement
    if publicID:
        doctype += u' PUBLIC "%s"' % publicID
    elif systemID:
        doctype += u" SYSTEM"
    if systemID:
        if u'"' in systemID:
            # Double quotes inside the identifier force single quoting;
            # if both kinds occur the value cannot be quoted safely.
            if u"'" in systemID:
                self.serializeError(_("System identifer contains both single and double quote characters"))
            quote_char = u"'"
        else:
            quote_char = u'"'
        doctype += u" %s%s%s" % (quote_char, systemID, quote_char)
    doctype += u">"
    return doctype

def serialize(self, treewalker, encoding=None):
in_cdata = False
self.errors = []
@@ -119,26 +257,12 @@ def serialize(self, treewalker, encoding=None):
if self.omit_optional_tags:
from html5lib.filters.optionaltags import Filter
treewalker = Filter(treewalker)
posted_doctype = False
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = u"<!DOCTYPE %s" % token["name"]

if token["publicId"]:
doctype += u' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += u" SYSTEM"
if token["systemId"]:
if token["systemId"].find(u'"') >= 0:
if token["systemId"].find(u"'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters"))
quote_char = u"'"
else:
quote_char = u'"'
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)

doctype += u">"

posted_doctype = True
doctype = self.calc_doctype(token)
if encoding:
yield doctype.encode(encoding)
else:
@@ -158,6 +282,9 @@ def serialize(self, treewalker, encoding=None):
yield escape(token["data"])

elif type in ("StartTag", "EmptyTag"):
if not posted_doctype:
posted_doctype = True
yield self.calc_doctype()
name = token["name"]
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
@@ -166,9 +293,20 @@ def serialize(self, treewalker, encoding=None):
attrs = token["data"]
if hasattr(attrs, "items"):
attrs = attrs.items()
attrs.sort()
attributes = []
for k,v in attrs:

# clean up xml:lang
if k == '{http://www.w3.org/XML/1998/namespace}lang':
k = 'xml:lang'
if self.lang_attr == 'xml':
if k == 'lang' and not ('xml:lang' in attrs or
'{http://www.w3.org/XML/1998/namespace}lang' in attrs):
k = 'xml:lang'
elif self.lang_attr == 'html':
if k == 'xml:lang' and not ('lang' in attrs):
k = 'lang'

if encoding:
k = k.encode(encoding, "strict")
attributes.append(' ')