html5lib · Jul 20, 2010 · Jul 20, 2010 · Jul 20, 2010 · Jul 20, 2010
Showing with 157 additions and 19 deletions.

+157 −19 html5lib/serializer/htmlserializer.py
diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
@@ -12,6 +12,8 @@
 from html5lib import utils
 from xml.sax.saxutils import escape
 
+import re
+
 spaceCharacters = u"".join(spaceCharacters)
 
 try:
@@ -84,24 +86,160 @@ class HTMLSerializer(object):
     resolve_entities = True
 
     # miscellaneous options
+    emit_doctype = 'preserve'
     inject_meta_charset = True
+    lang_attr = 'preserve'
     strip_whitespace = False
     sanitize = False
 
     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
           "minimize_boolean_attributes", "use_trailing_solidus",
           "space_before_trailing_solidus", "omit_optional_tags",
           "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
-          "escape_rcdata", "resolve_entities", "sanitize")
+          "escape_rcdata", "resolve_entities", "emit_doctype", "lang_attr",
+          "sanitize")
 
     def __init__(self, **kwargs):
+        """Initialize HTMLSerializer.
+
+        Keyword options (default given first unless specified) include:
+
+        emit_doctype='preserve'|'html'|'xhtml'|'html5'
+          Whether to output a doctype.
+            * emit_doctype='xhtml' preserves unknown doctypes and valid
+              XHTML doctypes and converts valid HTML doctypes to their XHTML
+              counterparts (TODO: dropping <!DOCTYPE html> is unimplemented)
+            * emit_doctype='html' preserves unknown doctypes and valid
+              HTML doctypes, converts valid XHTML doctypes to their HTML
+              counterparts, and uses <!DOCTYPE html> for missing doctypes
+            * emit_doctype='html5' Uses <!DOCTYPE html> as the doctype
+            * emit_doctype='preserve' preserves the doctype, if any, unchanged
+        inject_meta_charset=True|False
+          ..?
+        lang_attr='preserve'|'xml'|'html'
+          Whether to translate 'lang' attributes.
+            * lang_attr='preserve' does no translation
+            * lang_attr='xml' translates 'lang' to 'xml:lang'
+            * lang_attr='html' translates 'xml:lang' to 'lang'
+        quote_attr_values=True|False
+          Whether to quote attribute values that don't require quoting
+          per HTML5 parsing rules.
+        quote_char=u'"'|u"'"
+          Use given quote character for attribute quoting. Default is to
+          use double quote unless attribute value contains a double quote,
+          in which case single quotes are used instead.
+        escape_lt_in_attrs=False|True
+          Whether to escape < in attribute values.
+        escape_rcdata=False|True
+          Whether RCDATA element content is escaped (TODO confirm).
+        resolve_entities=True|False
+          Whether to resolve named character entities that appear in the
+          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
+          are unaffected by this setting.
+        strip_whitespace=False|True
+          ..?
+        minimize_boolean_attributes=True|False
+          Shortens boolean attributes to give just the attribute name,
+          for example <input disabled="disabled"> becomes <input disabled>.
+        use_trailing_solidus
+          Includes a close-tag slash at the end of the start tag of void
+          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
+        space_before_trailing_solidus
+          Places a space immediately before the closing slash in a tag
+          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
+        sanitize
+          Strip all unsafe or unknown constructs from output.
+          See `html5lib user documentation`_
+
+        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
+        """
         if kwargs.has_key('quote_char'):
             self.use_best_quote_char = False
         for attr in self.options:
             setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
         self.errors = []
         self.strict = False
 
+    def calc_doctype(self, token=None):
+        """Return the doctype declaration to emit, as a unicode string.
+
+        token is the treewalker's "Doctype" token (a mapping with "name",
+        "publicId" and "systemId" keys), or None when the document has no
+        doctype. self.emit_doctype selects the result:
+
+          * 'html5': always <!DOCTYPE html>
+          * 'html': the XHTML 1.x doctypes listed below are converted to
+            their HTML 4.01 counterparts; a missing doctype becomes
+            <!DOCTYPE html>
+          * 'xhtml': the HTML 4.0x/3.2 doctypes listed below are converted
+            to their XHTML 1.0 counterparts
+          * any other value (e.g. 'preserve'): no conversion
+
+        Conversions are only attempted when the root element name starts
+        with "html" (case-insensitively). A doctype that is not converted
+        is re-serialized from the token as it is. A doctype generated for
+        a missing token ends with a newline; u'' is returned when there is
+        no token and nothing to generate. self.serializeError() is called
+        if the system identifier contains both kinds of quote character.
+        """
+        if self.emit_doctype == 'html5' or \
+           not token and self.emit_doctype == 'html':
+            if token:
+                return u'<!DOCTYPE html>'
+            else:
+                return u'<!DOCTYPE html>\n'
+
+        if not token:
+            # No doctype in the source and none to generate (e.g. 'preserve'
+            # or 'xhtml'): emit nothing instead of failing on token["name"].
+            return u''
+
+        rootElement = token["name"]
+        # An absent identifier may be None; normalise it to u"" so that the
+        # re.match() calls below cannot raise TypeError.
+        publicID    = token["publicId"] or u""
+        systemID    = token["systemId"] or u""
+
+        if re.match(u'html', rootElement, re.IGNORECASE):
+            # In every conversion below the system identifier is only
+            # rewritten when the source doctype actually had one.
+            if self.emit_doctype == u'html':
+                # XHTML 1.1
+                if publicID == u"-//W3C//DTD XHTML 1.1//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/strict.dtd"
+                # XHTML 1.0 Strict
+                elif publicID == u"-//W3C//DTD XHTML 1.0 Strict//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/strict.dtd"
+                # XHTML 1.0 Transitional
+                elif publicID == u"-//W3C//DTD XHTML 1.0 Transitional//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01 Transitional//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/loose.dtd"
+                # XHTML 1.0 Frameset
+                elif publicID == u"-//W3C//DTD XHTML 1.0 Frameset//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01 Frameset//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/frameset.dtd"
+            elif self.emit_doctype == u'xhtml':
+                # HTML 4.01 Strict
+                if re.match(u"-//W3C//DTD HTML 4.0(1)?//EN", publicID) and \
+                (not systemID or \
+                re.match(u"http://www.w3.org/TR/(html4|REC-html40)/strict.dtd", systemID)):
+                    publicID = u"-//W3C//DTD XHTML 1.0 Strict//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
+                # HTML4.01 Transitional
+                elif re.match(u"-//W3C//DTD HTML 4.0(1)? Transitional//EN", publicID) and \
+                (not systemID or \
+                 re.match(u"http://www.w3.org/TR/(html4|REC-html40)/loose.dtd", systemID)):
+                    publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
+                # HTML 4.01 Frameset
+                elif re.match(u"-//W3C//DTD HTML 4.0(1)? Frameset//EN", publicID) and \
+                (not systemID or \
+                 re.match(u"http://www.w3.org/TR/(html4|REC-html40)/frameset.dtd", systemID)):
+                    publicID = u"-//W3C//DTD XHTML 1.0 Frameset//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"
+                # HTML 3.2
+                elif re.match(u"-//W3C//DTD HTML 3.2( Final)?//EN", publicID) and not systemID:
+                    publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN"
+
+        doctype = u"<!DOCTYPE %s" % rootElement
+        if publicID:
+            doctype += u' PUBLIC "%s"' % publicID
+        elif systemID:
+            # A system identifier without a public one needs the keyword.
+            doctype += u" SYSTEM"
+        if systemID:
+            # Quote with whichever character the identifier does not use.
+            if systemID.find(u'"') >= 0:
+                if systemID.find(u"'") >= 0:
+                    self.serializeError(_("System identifer contains both single and double quote characters"))
+                quote_char = u"'"
+            else:
+                quote_char = u'"'
+            doctype += u" %s%s%s" % (quote_char, systemID, quote_char)
+        doctype += u">"
+        return doctype
+
     def serialize(self, treewalker, encoding=None):
         in_cdata = False
         self.errors = []
@@ -119,26 +257,12 @@ def serialize(self, treewalker, encoding=None):
         if self.omit_optional_tags:
             from html5lib.filters.optionaltags import Filter
             treewalker = Filter(treewalker)
+        posted_doctype = False
         for token in treewalker:
             type = token["type"]
             if type == "Doctype":
-                doctype = u"<!DOCTYPE %s" % token["name"]
-
-                if token["publicId"]:
-                    doctype += u' PUBLIC "%s"' % token["publicId"]
-                elif token["systemId"]:
-                    doctype += u" SYSTEM"
-                if token["systemId"]:                
-                    if token["systemId"].find(u'"') >= 0:
-                        if token["systemId"].find(u"'") >= 0:
-                            self.serializeError(_("System identifer contains both single and double quote characters"))
-                        quote_char = u"'"
-                    else:
-                        quote_char = u'"'
-                    doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
-
-                doctype += u">"
-
+                posted_doctype = True
+                doctype = self.calc_doctype(token)
                 if encoding:
                     yield doctype.encode(encoding)
                 else:
@@ -158,6 +282,9 @@ def serialize(self, treewalker, encoding=None):
                     yield escape(token["data"])
 
             elif type in ("StartTag", "EmptyTag"):
+                if not posted_doctype:
+                    posted_doctype = True
+                    yield self.calc_doctype()
                 name = token["name"]
                 if name in rcdataElements and not self.escape_rcdata:
                     in_cdata = True
@@ -166,9 +293,20 @@ def serialize(self, treewalker, encoding=None):
                 attrs = token["data"]
                 if hasattr(attrs, "items"):
                     attrs = attrs.items()
-                attrs.sort()
                 attributes = []
                 for k,v in attrs:
+
+                    # clean up xml:lang
+                    if k == '{http://www.w3.org/XML/1998/namespace}lang':
+                        k = 'xml:lang'
+                    if self.lang_attr == 'xml':
+                        if k == 'lang' and not ('xml:lang' in attrs or
+                           '{http://www.w3.org/XML/1998/namespace}lang' in attrs):
+                            k = 'xml:lang'
+                    elif self.lang_attr == 'html':
+                        if k == 'xml:lang' and not ('lang' in attrs):
+                            k = 'lang'
+
                     if encoding:
                         k = k.encode(encoding, "strict")
                     attributes.append(' ')