5
5
6
6
from codecs import register_error , xmlcharrefreplace_errors
7
7
8
- from .. constants import voidElements , booleanAttributes , spaceCharacters
9
- from .. constants import rcdataElements , entities , xmlEntities
10
- from .. import utils
8
+ from .constants import voidElements , booleanAttributes , spaceCharacters
9
+ from .constants import rcdataElements , entities , xmlEntities
10
+ from . import treewalkers , _utils
11
11
from xml .sax .saxutils import escape
12
12
13
- spaceCharacters = "" .join (spaceCharacters )
13
+ _quoteAttributeSpecChars = "" .join (spaceCharacters ) + "\" '=<>`"
14
+ _quoteAttributeSpec = re .compile ("[" + _quoteAttributeSpecChars + "]" )
15
+ _quoteAttributeLegacy = re .compile ("[" + _quoteAttributeSpecChars +
16
+ "\x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \t \n "
17
+ "\x0b \x0c \r \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 "
18
+ "\x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f "
19
+ "\x20 \x2f \x60 \xa0 \u1680 \u180e \u180f \u2000 "
20
+ "\u2001 \u2002 \u2003 \u2004 \u2005 \u2006 \u2007 "
21
+ "\u2008 \u2009 \u200a \u2028 \u2029 \u202f \u205f "
22
+ "\u3000 ]" )
14
23
15
- quoteAttributeSpecChars = spaceCharacters + "\" '=<>`"
16
- quoteAttributeSpec = re .compile ("[" + quoteAttributeSpecChars + "]" )
17
- quoteAttributeLegacy = re .compile ("[" + quoteAttributeSpecChars +
18
- "\x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \t \n "
19
- "\x0b \x0c \r \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 "
20
- "\x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e \x1f "
21
- "\x20 \x2f \x60 \xa0 \u1680 \u180e \u180f \u2000 "
22
- "\u2001 \u2002 \u2003 \u2004 \u2005 \u2006 \u2007 "
23
- "\u2008 \u2009 \u200a \u2028 \u2029 \u202f \u205f "
24
- "\u3000 ]" )
25
24
26
-
27
- encode_entity_map = {}
28
- is_ucs4 = len ("\U0010FFFF " ) == 1
25
+ _encode_entity_map = {}
26
+ _is_ucs4 = len ("\U0010FFFF " ) == 1
29
27
for k , v in list (entities .items ()):
30
28
# skip multi-character entities
31
- if ((is_ucs4 and len (v ) > 1 ) or
32
- (not is_ucs4 and len (v ) > 2 )):
29
+ if ((_is_ucs4 and len (v ) > 1 ) or
30
+ (not _is_ucs4 and len (v ) > 2 )):
33
31
continue
34
32
if v != "&" :
35
33
if len (v ) == 2 :
36
- v = utils .surrogatePairToCodepoint (v )
34
+ v = _utils .surrogatePairToCodepoint (v )
37
35
else :
38
36
v = ord (v )
39
- if v not in encode_entity_map or k .islower ():
37
+ if v not in _encode_entity_map or k .islower ():
40
38
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
41
- encode_entity_map [v ] = k
39
+ _encode_entity_map [v ] = k
42
40
43
41
44
42
def htmlentityreplace_errors (exc ):
@@ -51,14 +49,14 @@ def htmlentityreplace_errors(exc):
51
49
skip = False
52
50
continue
53
51
index = i + exc .start
54
- if utils .isSurrogatePair (exc .object [index :min ([exc .end , index + 2 ])]):
55
- codepoint = utils .surrogatePairToCodepoint (exc .object [index :index + 2 ])
52
+ if _utils .isSurrogatePair (exc .object [index :min ([exc .end , index + 2 ])]):
53
+ codepoint = _utils .surrogatePairToCodepoint (exc .object [index :index + 2 ])
56
54
skip = True
57
55
else :
58
56
codepoint = ord (c )
59
57
codepoints .append (codepoint )
60
58
for cp in codepoints :
61
- e = encode_entity_map .get (cp )
59
+ e = _encode_entity_map .get (cp )
62
60
if e :
63
61
res .append ("&" )
64
62
res .append (e )
@@ -73,6 +71,13 @@ def htmlentityreplace_errors(exc):
73
71
register_error ("htmlentityreplace" , htmlentityreplace_errors )
74
72
75
73
74
+ def serialize (input , tree = "etree" , encoding = None , ** serializer_opts ):
75
+ # XXX: Should we cache this?
76
+ walker = treewalkers .getTreeWalker (tree )
77
+ s = HTMLSerializer (** serializer_opts )
78
+ return s .render (walker (input ), encoding )
79
+
80
+
76
81
class HTMLSerializer (object ):
77
82
78
83
# attribute quoting options
@@ -181,24 +186,24 @@ def serialize(self, treewalker, encoding=None):
181
186
self .errors = []
182
187
183
188
if encoding and self .inject_meta_charset :
184
- from .. filters .inject_meta_charset import Filter
189
+ from .filters .inject_meta_charset import Filter
185
190
treewalker = Filter (treewalker , encoding )
186
191
# Alphabetical attributes is here under the assumption that none of
187
192
# the later filters add or change order of attributes; it needs to be
188
193
# before the sanitizer so escaped elements come out correctly
189
194
if self .alphabetical_attributes :
190
- from .. filters .alphabeticalattributes import Filter
195
+ from .filters .alphabeticalattributes import Filter
191
196
treewalker = Filter (treewalker )
192
197
# WhitespaceFilter should be used before OptionalTagFilter
193
198
# for maximum efficiently of this latter filter
194
199
if self .strip_whitespace :
195
- from .. filters .whitespace import Filter
200
+ from .filters .whitespace import Filter
196
201
treewalker = Filter (treewalker )
197
202
if self .sanitize :
198
- from .. filters .sanitizer import Filter
203
+ from .filters .sanitizer import Filter
199
204
treewalker = Filter (treewalker )
200
205
if self .omit_optional_tags :
201
- from .. filters .optionaltags import Filter
206
+ from .filters .optionaltags import Filter
202
207
treewalker = Filter (treewalker )
203
208
204
209
for token in treewalker :
@@ -251,9 +256,9 @@ def serialize(self, treewalker, encoding=None):
251
256
if self .quote_attr_values == "always" or len (v ) == 0 :
252
257
quote_attr = True
253
258
elif self .quote_attr_values == "spec" :
254
- quote_attr = quoteAttributeSpec .search (v ) is not None
259
+ quote_attr = _quoteAttributeSpec .search (v ) is not None
255
260
elif self .quote_attr_values == "legacy" :
256
- quote_attr = quoteAttributeLegacy .search (v ) is not None
261
+ quote_attr = _quoteAttributeLegacy .search (v ) is not None
257
262
else :
258
263
raise ValueError ("quote_attr_values must be one of: "
259
264
"'always', 'spec', or 'legacy'" )
0 commit comments