Skip to content

Commit a8ba43e

Browse files
authoredJul 14, 2016
Merge pull request #270 from gsnedders/rename_stuff
2 parents 945911b + 8cb144b commit a8ba43e

40 files changed

+219
-275
lines changed
 

‎CHANGES.rst

+14
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,20 @@ Released on XXX
5050
with a set of keyword arguments: override_encoding, transport_encoding,
5151
same_origin_parent_encoding, likely_encoding, and default_encoding.**
5252

53+
* **Move filters._base, treebuilders._base, and treewalkers._base to .base
54+
to clarify their status as public.**
55+
56+
* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
57+
sanitizer.htmlsanitizer module and move that to sanitizer. This means
58+
anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
59+
code changes.**
60+
61+
* **Rename treewalkers.lxmletree to .etree_lxml and
62+
treewalkers.genshistream to .genshi to have a consistent API.**
63+
64+
* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
65+
utils) to be underscore prefixed to clarify their status as private.
66+
5367

5468
0.9999999/1.0b8
5569
~~~~~~~~~~~~~~~

‎doc/html5lib.filters.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
filters Package
22
===============
33

4-
:mod:`_base` Module
4+
:mod:`base` Module
55
-------------------
66

7-
.. automodule:: html5lib.filters._base
7+
.. automodule:: html5lib.filters.base
88
:members:
99
:undoc-members:
1010
:show-inheritance:

‎doc/html5lib.rst

+2-35
Original file line numberDiff line numberDiff line change
@@ -25,42 +25,10 @@ html5lib Package
2525
:undoc-members:
2626
:show-inheritance:
2727

28-
:mod:`ihatexml` Module
28+
:mod:`serializer` Module
2929
------------------------
3030

31-
.. automodule:: html5lib.ihatexml
32-
:members:
33-
:undoc-members:
34-
:show-inheritance:
35-
36-
:mod:`inputstream` Module
37-
-------------------------
38-
39-
.. automodule:: html5lib.inputstream
40-
:members:
41-
:undoc-members:
42-
:show-inheritance:
43-
44-
:mod:`sanitizer` Module
45-
-----------------------
46-
47-
.. automodule:: html5lib.sanitizer
48-
:members:
49-
:undoc-members:
50-
:show-inheritance:
51-
52-
:mod:`tokenizer` Module
53-
-----------------------
54-
55-
.. automodule:: html5lib.tokenizer
56-
:members:
57-
:undoc-members:
58-
:show-inheritance:
59-
60-
:mod:`utils` Module
61-
-------------------
62-
63-
.. automodule:: html5lib.utils
31+
.. automodule:: html5lib.serializer
6432
:members:
6533
:undoc-members:
6634
:show-inheritance:
@@ -71,7 +39,6 @@ Subpackages
7139
.. toctree::
7240

7341
html5lib.filters
74-
html5lib.serializer
7542
html5lib.treebuilders
7643
html5lib.treewalkers
7744

‎doc/html5lib.serializer.rst

-19
This file was deleted.

‎doc/html5lib.treebuilders.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ treebuilders Package
99
:undoc-members:
1010
:show-inheritance:
1111

12-
:mod:`_base` Module
12+
:mod:`base` Module
1313
-------------------
1414

15-
.. automodule:: html5lib.treebuilders._base
15+
.. automodule:: html5lib.treebuilders.base
1616
:members:
1717
:undoc-members:
1818
:show-inheritance:

‎doc/html5lib.treewalkers.rst

+10-9
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ treewalkers Package
99
:undoc-members:
1010
:show-inheritance:
1111

12-
:mod:`_base` Module
12+
:mod:`base` Module
1313
-------------------
1414

15-
.. automodule:: html5lib.treewalkers._base
15+
.. automodule:: html5lib.treewalkers.base
1616
:members:
1717
:undoc-members:
1818
:show-inheritance:
@@ -33,18 +33,19 @@ treewalkers Package
3333
:undoc-members:
3434
:show-inheritance:
3535

36-
:mod:`genshistream` Module
37-
--------------------------
36+
:mod:`etree_lxml` Module
37+
------------------------
3838

39-
.. automodule:: html5lib.treewalkers.genshistream
39+
.. automodule:: html5lib.treewalkers.etree_lxml
4040
:members:
4141
:undoc-members:
4242
:show-inheritance:
4343

44-
:mod:`lxmletree` Module
45-
-----------------------
4644

47-
.. automodule:: html5lib.treewalkers.lxmletree
45+
:mod:`genshi` Module
46+
--------------------------
47+
48+
.. automodule:: html5lib.treewalkers.genshi
4849
:members:
4950
:undoc-members:
50-
:show-inheritance:
51+
:show-inheritance:
File renamed without changes.

‎html5lib/inputstream.py renamed to ‎html5lib/_inputstream.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
1212
from .constants import ReparseException
13-
from . import utils
13+
from . import _utils
1414

1515
from io import StringIO
1616

@@ -28,7 +28,7 @@
2828

2929
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
3030

31-
if utils.supports_lone_surrogates:
31+
if _utils.supports_lone_surrogates:
3232
# Use one extra step of indirection and create surrogates with
3333
# eval. Not using this indirection would introduce an illegal
3434
# unicode literal on platforms not supporting such lone
@@ -176,7 +176,7 @@ def __init__(self, source):
176176
177177
"""
178178

179-
if not utils.supports_lone_surrogates:
179+
if not _utils.supports_lone_surrogates:
180180
# Such platforms will have already checked for such
181181
# surrogate errors, so no need to do this checking.
182182
self.reportCharacterErrors = None
@@ -304,9 +304,9 @@ def characterErrorsUCS2(self, data):
304304
codepoint = ord(match.group())
305305
pos = match.start()
306306
# Pretty sure there should be endianness issues here
307-
if utils.isSurrogatePair(data[pos:pos + 2]):
307+
if _utils.isSurrogatePair(data[pos:pos + 2]):
308308
# We have a surrogate pair!
309-
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
309+
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
310310
if char_val in non_bmp_invalid_codepoints:
311311
self.errors.append("invalid-codepoint")
312312
skip = True

‎html5lib/tokenizer.py renamed to ‎html5lib/_tokenizer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
from .constants import tokenTypes, tagTokenTypes
1212
from .constants import replacementCharacters
1313

14-
from .inputstream import HTMLInputStream
14+
from ._inputstream import HTMLInputStream
1515

16-
from .trie import Trie
16+
from ._trie import Trie
1717

1818
entitiesTrie = Trie(entities)
1919

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

‎html5lib/filters/alphabeticalattributes.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3-
from . import _base
3+
from . import base
44

55
try:
66
from collections import OrderedDict
77
except ImportError:
88
from ordereddict import OrderedDict
99

1010

11-
class Filter(_base.Filter):
11+
class Filter(base.Filter):
1212
def __iter__(self):
13-
for token in _base.Filter.__iter__(self):
13+
for token in base.Filter.__iter__(self):
1414
if token["type"] in ("StartTag", "EmptyTag"):
1515
attrs = OrderedDict()
1616
for name, value in sorted(token["data"].items(),
File renamed without changes.

‎html5lib/filters/inject_meta_charset.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3-
from . import _base
3+
from . import base
44

55

6-
class Filter(_base.Filter):
6+
class Filter(base.Filter):
77
def __init__(self, source, encoding):
8-
_base.Filter.__init__(self, source)
8+
base.Filter.__init__(self, source)
99
self.encoding = encoding
1010

1111
def __iter__(self):
1212
state = "pre_head"
1313
meta_found = (self.encoding is None)
1414
pending = []
1515

16-
for token in _base.Filter.__iter__(self):
16+
for token in base.Filter.__iter__(self):
1717
type = token["type"]
1818
if type == "StartTag":
1919
if token["name"].lower() == "head":

‎html5lib/filters/lint.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,21 @@
22

33
from six import text_type
44

5-
from . import _base
5+
from . import base
66
from ..constants import namespaces, voidElements
77

88
from ..constants import spaceCharacters
99
spaceCharacters = "".join(spaceCharacters)
1010

1111

12-
class Filter(_base.Filter):
12+
class Filter(base.Filter):
1313
def __init__(self, source, require_matching_tags=True):
1414
super(Filter, self).__init__(source)
1515
self.require_matching_tags = require_matching_tags
1616

1717
def __iter__(self):
1818
open_elements = []
19-
for token in _base.Filter.__iter__(self):
19+
for token in base.Filter.__iter__(self):
2020
type = token["type"]
2121
if type in ("StartTag", "EmptyTag"):
2222
namespace = token["namespace"]

‎html5lib/filters/optionaltags.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3-
from . import _base
3+
from . import base
44

55

6-
class Filter(_base.Filter):
6+
class Filter(base.Filter):
77
def slider(self):
88
previous1 = previous2 = None
99
for token in self.source:

‎html5lib/filters/sanitizer.py

+10-18
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55

66
from six.moves import urllib_parse as urlparse
77

8-
from . import _base
8+
from . import base
99
from ..constants import namespaces, prefixes
1010

1111
__all__ = ["Filter"]
1212

1313

14-
acceptable_elements = frozenset((
14+
allowed_elements = frozenset((
1515
(namespaces['html'], 'a'),
1616
(namespaces['html'], 'abbr'),
1717
(namespaces['html'], 'acronym'),
@@ -175,7 +175,7 @@
175175
(namespaces['svg'], 'use'),
176176
))
177177

178-
acceptable_attributes = frozenset((
178+
allowed_attributes = frozenset((
179179
# HTML attributes
180180
(None, 'abbr'),
181181
(None, 'accept'),
@@ -552,7 +552,7 @@
552552
(None, 'use')
553553
))
554554

555-
acceptable_css_properties = frozenset((
555+
allowed_css_properties = frozenset((
556556
'azimuth',
557557
'background-color',
558558
'border-bottom-color',
@@ -601,7 +601,7 @@
601601
'width',
602602
))
603603

604-
acceptable_css_keywords = frozenset((
604+
allowed_css_keywords = frozenset((
605605
'auto',
606606
'aqua',
607607
'black',
@@ -643,7 +643,7 @@
643643
'yellow',
644644
))
645645

646-
acceptable_svg_properties = frozenset((
646+
allowed_svg_properties = frozenset((
647647
'fill',
648648
'fill-opacity',
649649
'fill-rule',
@@ -654,7 +654,7 @@
654654
'stroke-opacity',
655655
))
656656

657-
acceptable_protocols = frozenset((
657+
allowed_protocols = frozenset((
658658
'ed2k',
659659
'ftp',
660660
'http',
@@ -680,7 +680,7 @@
680680
'data',
681681
))
682682

683-
acceptable_content_types = frozenset((
683+
allowed_content_types = frozenset((
684684
'image/png',
685685
'image/jpeg',
686686
'image/gif',
@@ -689,14 +689,6 @@
689689
'text/plain',
690690
))
691691

692-
allowed_elements = acceptable_elements
693-
allowed_attributes = acceptable_attributes
694-
allowed_css_properties = acceptable_css_properties
695-
allowed_css_keywords = acceptable_css_keywords
696-
allowed_svg_properties = acceptable_svg_properties
697-
allowed_protocols = acceptable_protocols
698-
allowed_content_types = acceptable_content_types
699-
700692

701693
data_content_type = re.compile(r'''
702694
^
@@ -712,7 +704,7 @@
712704
re.VERBOSE)
713705

714706

715-
class Filter(_base.Filter):
707+
class Filter(base.Filter):
716708
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
717709
def __init__(self,
718710
source,
@@ -739,7 +731,7 @@ def __init__(self,
739731
self.svg_allow_local_href = svg_allow_local_href
740732

741733
def __iter__(self):
742-
for token in _base.Filter.__iter__(self):
734+
for token in base.Filter.__iter__(self):
743735
token = self.sanitize_token(token)
744736
if token:
745737
yield token

‎html5lib/filters/whitespace.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,20 @@
22

33
import re
44

5-
from . import _base
5+
from . import base
66
from ..constants import rcdataElements, spaceCharacters
77
spaceCharacters = "".join(spaceCharacters)
88

99
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
1010

1111

12-
class Filter(_base.Filter):
12+
class Filter(base.Filter):
1313

1414
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
1515

1616
def __iter__(self):
1717
preserve = 0
18-
for token in _base.Filter.__iter__(self):
18+
for token in base.Filter.__iter__(self):
1919
type = token["type"]
2020
if type == "StartTag" \
2121
and (preserve or token["name"] in self.spacePreserveElements):

‎html5lib/html5parser.py

+46-46
Large diffs are not rendered by default.

‎html5lib/serializer/htmlserializer.py renamed to ‎html5lib/serializer.py

+37-32
Original file line numberDiff line numberDiff line change
@@ -5,40 +5,38 @@
55

66
from codecs import register_error, xmlcharrefreplace_errors
77

8-
from ..constants import voidElements, booleanAttributes, spaceCharacters
9-
from ..constants import rcdataElements, entities, xmlEntities
10-
from .. import utils
8+
from .constants import voidElements, booleanAttributes, spaceCharacters
9+
from .constants import rcdataElements, entities, xmlEntities
10+
from . import treewalkers, _utils
1111
from xml.sax.saxutils import escape
1212

13-
spaceCharacters = "".join(spaceCharacters)
13+
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
14+
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
15+
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
16+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
17+
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
18+
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
19+
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
20+
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
21+
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
22+
"\u3000]")
1423

15-
quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
16-
quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
17-
quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
18-
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
19-
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
20-
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
21-
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
22-
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
23-
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
24-
"\u3000]")
2524

26-
27-
encode_entity_map = {}
28-
is_ucs4 = len("\U0010FFFF") == 1
25+
_encode_entity_map = {}
26+
_is_ucs4 = len("\U0010FFFF") == 1
2927
for k, v in list(entities.items()):
3028
# skip multi-character entities
31-
if ((is_ucs4 and len(v) > 1) or
32-
(not is_ucs4 and len(v) > 2)):
29+
if ((_is_ucs4 and len(v) > 1) or
30+
(not _is_ucs4 and len(v) > 2)):
3331
continue
3432
if v != "&":
3533
if len(v) == 2:
36-
v = utils.surrogatePairToCodepoint(v)
34+
v = _utils.surrogatePairToCodepoint(v)
3735
else:
3836
v = ord(v)
39-
if v not in encode_entity_map or k.islower():
37+
if v not in _encode_entity_map or k.islower():
4038
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
41-
encode_entity_map[v] = k
39+
_encode_entity_map[v] = k
4240

4341

4442
def htmlentityreplace_errors(exc):
@@ -51,14 +49,14 @@ def htmlentityreplace_errors(exc):
5149
skip = False
5250
continue
5351
index = i + exc.start
54-
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
55-
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
52+
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
53+
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
5654
skip = True
5755
else:
5856
codepoint = ord(c)
5957
codepoints.append(codepoint)
6058
for cp in codepoints:
61-
e = encode_entity_map.get(cp)
59+
e = _encode_entity_map.get(cp)
6260
if e:
6361
res.append("&")
6462
res.append(e)
@@ -73,6 +71,13 @@ def htmlentityreplace_errors(exc):
7371
register_error("htmlentityreplace", htmlentityreplace_errors)
7472

7573

74+
def serialize(input, tree="etree", encoding=None, **serializer_opts):
75+
# XXX: Should we cache this?
76+
walker = treewalkers.getTreeWalker(tree)
77+
s = HTMLSerializer(**serializer_opts)
78+
return s.render(walker(input), encoding)
79+
80+
7681
class HTMLSerializer(object):
7782

7883
# attribute quoting options
@@ -181,24 +186,24 @@ def serialize(self, treewalker, encoding=None):
181186
self.errors = []
182187

183188
if encoding and self.inject_meta_charset:
184-
from ..filters.inject_meta_charset import Filter
189+
from .filters.inject_meta_charset import Filter
185190
treewalker = Filter(treewalker, encoding)
186191
# Alphabetical attributes is here under the assumption that none of
187192
# the later filters add or change order of attributes; it needs to be
188193
# before the sanitizer so escaped elements come out correctly
189194
if self.alphabetical_attributes:
190-
from ..filters.alphabeticalattributes import Filter
195+
from .filters.alphabeticalattributes import Filter
191196
treewalker = Filter(treewalker)
192197
# WhitespaceFilter should be used before OptionalTagFilter
193198
# for maximum efficiency of this latter filter
194199
if self.strip_whitespace:
195-
from ..filters.whitespace import Filter
200+
from .filters.whitespace import Filter
196201
treewalker = Filter(treewalker)
197202
if self.sanitize:
198-
from ..filters.sanitizer import Filter
203+
from .filters.sanitizer import Filter
199204
treewalker = Filter(treewalker)
200205
if self.omit_optional_tags:
201-
from ..filters.optionaltags import Filter
206+
from .filters.optionaltags import Filter
202207
treewalker = Filter(treewalker)
203208

204209
for token in treewalker:
@@ -251,9 +256,9 @@ def serialize(self, treewalker, encoding=None):
251256
if self.quote_attr_values == "always" or len(v) == 0:
252257
quote_attr = True
253258
elif self.quote_attr_values == "spec":
254-
quote_attr = quoteAttributeSpec.search(v) is not None
259+
quote_attr = _quoteAttributeSpec.search(v) is not None
255260
elif self.quote_attr_values == "legacy":
256-
quote_attr = quoteAttributeLegacy.search(v) is not None
261+
quote_attr = _quoteAttributeLegacy.search(v) is not None
257262
else:
258263
raise ValueError("quote_attr_values must be one of: "
259264
"'always', 'spec', or 'legacy'")

‎html5lib/serializer/__init__.py

-16
This file was deleted.

‎html5lib/tests/test_encoding.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
import pytest
66

77
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
8-
from html5lib import HTMLParser, inputstream
8+
from html5lib import HTMLParser, _inputstream
99

1010

1111
def test_basic_prescan_length():
1212
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
1313
pad = 1024 - len(data) + 1
1414
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
1515
assert len(data) == 1024 # Sanity
16-
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
16+
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
1717
assert 'utf-8' == stream.charEncoding[0].name
1818

1919

@@ -22,7 +22,7 @@ def test_parser_reparse():
2222
pad = 10240 - len(data) + 1
2323
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
2424
assert len(data) == 10240 # Sanity
25-
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
25+
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
2626
assert 'windows-1252' == stream.charEncoding[0].name
2727
p = HTMLParser(namespaceHTMLElements=False)
2828
doc = p.parse(data, useChardet=False)
@@ -47,7 +47,7 @@ def test_parser_reparse():
4747
("windows-1252", b"", {}),
4848
])
4949
def test_parser_args(expected, data, kwargs):
50-
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
50+
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
5151
assert expected == stream.charEncoding[0].name
5252
p = HTMLParser()
5353
p.parse(data, useChardet=False, **kwargs)
@@ -85,7 +85,7 @@ def runParserEncodingTest(data, encoding):
8585

8686

8787
def runPreScanEncodingTest(data, encoding):
88-
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
88+
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
8989
encoding = encoding.lower().decode("ascii")
9090

9191
# Very crude way to ignore irrelevant tests
@@ -111,6 +111,6 @@ def test_encoding():
111111
else:
112112
def test_chardet():
113113
with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
114-
encoding = inputstream.HTMLInputStream(fp.read()).charEncoding
114+
encoding = _inputstream.HTMLInputStream(fp.read()).charEncoding
115115
assert encoding[0].name == "big5"
116116
# pylint:enable=wrong-import-position

‎html5lib/tests/test_serializer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from html5lib import constants
1111
from html5lib.filters.lint import Filter as Lint
1212
from html5lib.serializer import HTMLSerializer, serialize
13-
from html5lib.treewalkers._base import TreeWalker
13+
from html5lib.treewalkers.base import TreeWalker
1414

1515
# pylint:disable=wrong-import-position
1616
optionals_loaded = []

‎html5lib/tests/test_stream.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
import six
1212
from six.moves import http_client, urllib
1313

14-
from html5lib.inputstream import (BufferedStream, HTMLInputStream,
15-
HTMLUnicodeInputStream, HTMLBinaryInputStream)
16-
from html5lib.utils import supports_lone_surrogates
14+
from html5lib._inputstream import (BufferedStream, HTMLInputStream,
15+
HTMLUnicodeInputStream, HTMLBinaryInputStream)
16+
from html5lib._utils import supports_lone_surrogates
1717

1818

1919
def test_basic():

‎html5lib/tests/tokenizer.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
import pytest
99
from six import unichr
1010

11-
from html5lib.tokenizer import HTMLTokenizer
12-
from html5lib import constants, utils
11+
from html5lib._tokenizer import HTMLTokenizer
12+
from html5lib import constants, _utils
1313

1414

1515
class TokenizerTestParser(object):
@@ -156,7 +156,7 @@ def repl(m):
156156
except ValueError:
157157
# This occurs when unichr throws ValueError, which should
158158
# only be for a lone-surrogate.
159-
if utils.supports_lone_surrogates:
159+
if _utils.supports_lone_surrogates:
160160
raise
161161
return None
162162

‎html5lib/treebuilders/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
from __future__ import absolute_import, division, unicode_literals
3030

31-
from ..utils import default_etree
31+
from .._utils import default_etree
3232

3333
treeBuilderCache = {}
3434

File renamed without changes.

‎html5lib/treebuilders/dom.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
from xml.dom import minidom, Node
66
import weakref
77

8-
from . import _base
8+
from . import base
99
from .. import constants
1010
from ..constants import namespaces
11-
from ..utils import moduleFactoryFactory
11+
from .._utils import moduleFactoryFactory
1212

1313

1414
def getDomBuilder(DomImplementation):
@@ -50,9 +50,9 @@ def __delitem__(self, name):
5050
else:
5151
del self.element.attributes[name]
5252

53-
class NodeBuilder(_base.Node):
53+
class NodeBuilder(base.Node):
5454
def __init__(self, element):
55-
_base.Node.__init__(self, element.nodeName)
55+
base.Node.__init__(self, element.nodeName)
5656
self.element = element
5757

5858
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
@@ -117,7 +117,7 @@ def getNameTuple(self):
117117

118118
nameTuple = property(getNameTuple)
119119

120-
class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable
120+
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
121121
def documentClass(self):
122122
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
123123
return weakref.proxy(self)
@@ -157,12 +157,12 @@ def getDocument(self):
157157
return self.dom
158158

159159
def getFragment(self):
160-
return _base.TreeBuilder.getFragment(self).element
160+
return base.TreeBuilder.getFragment(self).element
161161

162162
def insertText(self, data, parent=None):
163163
data = data
164164
if parent != self:
165-
_base.TreeBuilder.insertText(self, data, parent)
165+
base.TreeBuilder.insertText(self, data, parent)
166166
else:
167167
# HACK: allow text nodes as children of the document node
168168
if hasattr(self.dom, '_child_node_types'):

‎html5lib/treebuilders/etree.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55

66
import re
77

8-
from . import _base
9-
from .. import ihatexml
8+
from . import base
9+
from .. import _ihatexml
1010
from .. import constants
1111
from ..constants import namespaces
12-
from ..utils import moduleFactoryFactory
12+
from .._utils import moduleFactoryFactory
1313

1414
tag_regexp = re.compile("{([^}]*)}(.*)")
1515

@@ -18,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
1818
ElementTree = ElementTreeImplementation
1919
ElementTreeCommentType = ElementTree.Comment("asd").tag
2020

21-
class Element(_base.Node):
21+
class Element(base.Node):
2222
def __init__(self, name, namespace=None):
2323
self._name = name
2424
self._namespace = namespace
@@ -142,7 +142,7 @@ def reparentChildren(self, newParent):
142142
if self._element.text is not None:
143143
newParent._element.text += self._element.text
144144
self._element.text = ""
145-
_base.Node.reparentChildren(self, newParent)
145+
base.Node.reparentChildren(self, newParent)
146146

147147
class Comment(Element):
148148
def __init__(self, data):
@@ -259,7 +259,7 @@ def serializeElement(element, indent=0):
259259
def tostring(element): # pylint:disable=unused-variable
260260
"""Serialize an element and its child nodes to a string"""
261261
rv = []
262-
filter = ihatexml.InfosetFilter()
262+
filter = _ihatexml.InfosetFilter()
263263

264264
def serializeElement(element):
265265
if isinstance(element, ElementTree.ElementTree):
@@ -310,7 +310,7 @@ def serializeElement(element):
310310

311311
return "".join(rv)
312312

313-
class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable
313+
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
314314
documentClass = Document
315315
doctypeClass = DocumentType
316316
elementClass = Element
@@ -332,7 +332,7 @@ def getDocument(self):
332332
return self.document._element.find("html")
333333

334334
def getFragment(self):
335-
return _base.TreeBuilder.getFragment(self)._element
335+
return base.TreeBuilder.getFragment(self)._element
336336

337337
return locals()
338338

‎html5lib/treebuilders/etree_lxml.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616
import re
1717
import sys
1818

19-
from . import _base
19+
from . import base
2020
from ..constants import DataLossWarning
2121
from .. import constants
2222
from . import etree as etree_builders
23-
from .. import ihatexml
23+
from .. import _ihatexml
2424

2525
import lxml.etree as etree
2626

@@ -54,7 +54,7 @@ def _getChildNodes(self):
5454

5555
def testSerializer(element):
5656
rv = []
57-
infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
57+
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
5858

5959
def serializeElement(element, indent=0):
6060
if not hasattr(element, "tag"):
@@ -172,7 +172,7 @@ def serializeElement(element):
172172
return "".join(rv)
173173

174174

175-
class TreeBuilder(_base.TreeBuilder):
175+
class TreeBuilder(base.TreeBuilder):
176176
documentClass = Document
177177
doctypeClass = DocumentType
178178
elementClass = None
@@ -182,7 +182,7 @@ class TreeBuilder(_base.TreeBuilder):
182182

183183
def __init__(self, namespaceHTMLElements, fullTree=False):
184184
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
185-
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
185+
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
186186
self.namespaceHTMLElements = namespaceHTMLElements
187187

188188
class Attributes(dict):
@@ -254,10 +254,10 @@ def _getData(self):
254254
self.elementClass = Element
255255
self.commentClass = Comment
256256
# self.fragmentClass = builder.DocumentFragment
257-
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
257+
base.TreeBuilder.__init__(self, namespaceHTMLElements)
258258

259259
def reset(self):
260-
_base.TreeBuilder.reset(self)
260+
base.TreeBuilder.reset(self)
261261
self.insertComment = self.insertCommentInitial
262262
self.initial_comments = []
263263
self.doctype = None

‎html5lib/treewalkers/__init__.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
from __future__ import absolute_import, division, unicode_literals
1212

1313
from .. import constants
14-
from ..utils import default_etree
14+
from .._utils import default_etree
1515

16-
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
16+
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
1717

1818
treeWalkerCache = {}
1919

@@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
4343
from . import dom
4444
treeWalkerCache[treeType] = dom.TreeWalker
4545
elif treeType == "genshi":
46-
from . import genshistream
47-
treeWalkerCache[treeType] = genshistream.TreeWalker
46+
from . import genshi
47+
treeWalkerCache[treeType] = genshi.TreeWalker
4848
elif treeType == "lxml":
49-
from . import lxmletree
50-
treeWalkerCache[treeType] = lxmletree.TreeWalker
49+
from . import etree_lxml
50+
treeWalkerCache[treeType] = etree_lxml.TreeWalker
5151
elif treeType == "etree":
5252
from . import etree
5353
if implementation is None:
File renamed without changes.

‎html5lib/treewalkers/dom.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22

33
from xml.dom import Node
44

5-
from . import _base
5+
from . import base
66

77

8-
class TreeWalker(_base.NonRecursiveTreeWalker):
8+
class TreeWalker(base.NonRecursiveTreeWalker):
99
def getNodeDetails(self, node):
1010
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
11-
return _base.DOCTYPE, node.name, node.publicId, node.systemId
11+
return base.DOCTYPE, node.name, node.publicId, node.systemId
1212

1313
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
14-
return _base.TEXT, node.nodeValue
14+
return base.TEXT, node.nodeValue
1515

1616
elif node.nodeType == Node.ELEMENT_NODE:
1717
attrs = {}
@@ -21,17 +21,17 @@ def getNodeDetails(self, node):
2121
attrs[(attr.namespaceURI, attr.localName)] = attr.value
2222
else:
2323
attrs[(None, attr.name)] = attr.value
24-
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
24+
return (base.ELEMENT, node.namespaceURI, node.nodeName,
2525
attrs, node.hasChildNodes())
2626

2727
elif node.nodeType == Node.COMMENT_NODE:
28-
return _base.COMMENT, node.nodeValue
28+
return base.COMMENT, node.nodeValue
2929

3030
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
31-
return (_base.DOCUMENT,)
31+
return (base.DOCUMENT,)
3232

3333
else:
34-
return _base.UNKNOWN, node.nodeType
34+
return base.UNKNOWN, node.nodeType
3535

3636
def getFirstChild(self, node):
3737
return node.firstChild

‎html5lib/treewalkers/etree.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212

1313
from six import string_types
1414

15-
from . import _base
16-
from ..utils import moduleFactoryFactory
15+
from . import base
16+
from .._utils import moduleFactoryFactory
1717

1818
tag_regexp = re.compile("{([^}]*)}(.*)")
1919

@@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
2222
ElementTree = ElementTreeImplementation
2323
ElementTreeCommentType = ElementTree.Comment("asd").tag
2424

25-
class TreeWalker(_base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
25+
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
2626
"""Given the particular ElementTree representation, this implementation,
2727
to avoid using recursion, returns "nodes" as tuples with the following
2828
content:
@@ -40,22 +40,22 @@ def getNodeDetails(self, node):
4040
if isinstance(node, tuple): # It might be the root Element
4141
elt, _, _, flag = node
4242
if flag in ("text", "tail"):
43-
return _base.TEXT, getattr(elt, flag)
43+
return base.TEXT, getattr(elt, flag)
4444
else:
4545
node = elt
4646

4747
if not(hasattr(node, "tag")):
4848
node = node.getroot()
4949

5050
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
51-
return (_base.DOCUMENT,)
51+
return (base.DOCUMENT,)
5252

5353
elif node.tag == "<!DOCTYPE>":
54-
return (_base.DOCTYPE, node.text,
54+
return (base.DOCTYPE, node.text,
5555
node.get("publicId"), node.get("systemId"))
5656

5757
elif node.tag == ElementTreeCommentType:
58-
return _base.COMMENT, node.text
58+
return base.COMMENT, node.text
5959

6060
else:
6161
assert isinstance(node.tag, string_types), type(node.tag)
@@ -73,7 +73,7 @@ def getNodeDetails(self, node):
7373
attrs[(match.group(1), match.group(2))] = value
7474
else:
7575
attrs[(None, name)] = value
76-
return (_base.ELEMENT, namespace, tag,
76+
return (base.ELEMENT, namespace, tag,
7777
attrs, len(node) or node.text)
7878

7979
def getFirstChild(self, node):

‎html5lib/treewalkers/lxmletree.py renamed to ‎html5lib/treewalkers/etree_lxml.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
from lxml import etree
55
from ..treebuilders.etree import tag_regexp
66

7-
from . import _base
7+
from . import base
88

9-
from .. import ihatexml
9+
from .. import _ihatexml
1010

1111

1212
def ensure_str(s):
@@ -122,7 +122,7 @@ def __len__(self):
122122
return len(self.obj)
123123

124124

125-
class TreeWalker(_base.NonRecursiveTreeWalker):
125+
class TreeWalker(base.NonRecursiveTreeWalker):
126126
def __init__(self, tree):
127127
# pylint:disable=redefined-variable-type
128128
if isinstance(tree, list):
@@ -131,29 +131,29 @@ def __init__(self, tree):
131131
else:
132132
self.fragmentChildren = set()
133133
tree = Root(tree)
134-
_base.NonRecursiveTreeWalker.__init__(self, tree)
135-
self.filter = ihatexml.InfosetFilter()
134+
base.NonRecursiveTreeWalker.__init__(self, tree)
135+
self.filter = _ihatexml.InfosetFilter()
136136

137137
def getNodeDetails(self, node):
138138
if isinstance(node, tuple): # Text node
139139
node, key = node
140140
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
141-
return _base.TEXT, ensure_str(getattr(node, key))
141+
return base.TEXT, ensure_str(getattr(node, key))
142142

143143
elif isinstance(node, Root):
144-
return (_base.DOCUMENT,)
144+
return (base.DOCUMENT,)
145145

146146
elif isinstance(node, Doctype):
147-
return _base.DOCTYPE, node.name, node.public_id, node.system_id
147+
return base.DOCTYPE, node.name, node.public_id, node.system_id
148148

149149
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
150-
return _base.TEXT, ensure_str(node.obj)
150+
return base.TEXT, ensure_str(node.obj)
151151

152152
elif node.tag == etree.Comment:
153-
return _base.COMMENT, ensure_str(node.text)
153+
return base.COMMENT, ensure_str(node.text)
154154

155155
elif node.tag == etree.Entity:
156-
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
156+
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
157157

158158
else:
159159
# This is assumed to be an ordinary element
@@ -172,7 +172,7 @@ def getNodeDetails(self, node):
172172
attrs[(match.group(1), match.group(2))] = value
173173
else:
174174
attrs[(None, name)] = value
175-
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
175+
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
176176
attrs, len(node) > 0 or node.text)
177177

178178
def getFirstChild(self, node):

‎html5lib/treewalkers/genshistream.py renamed to ‎html5lib/treewalkers/genshi.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
55
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
66

7-
from . import _base
7+
from . import base
88

99
from ..constants import voidElements, namespaces
1010

1111

12-
class TreeWalker(_base.TreeWalker):
12+
class TreeWalker(base.TreeWalker):
1313
def __iter__(self):
1414
# Buffer the events so we can pass in the following one
1515
previous = None

‎parse.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from html5lib import html5parser
1212
from html5lib import treebuilders, serializer, treewalkers
1313
from html5lib import constants
14-
from html5lib import utils
14+
from html5lib import _utils
1515

1616

1717
def parse():
@@ -116,7 +116,7 @@ def printOutput(parser, document, opts):
116116
import lxml.etree
117117
sys.stdout.write(lxml.etree.tostring(document, encoding="unicode"))
118118
elif tb == "etree":
119-
sys.stdout.write(utils.default_etree.tostring(document, encoding="unicode"))
119+
sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode"))
120120
elif opts.tree:
121121
if not hasattr(document, '__getitem__'):
122122
document = [document]

0 commit comments

Comments
 (0)
Please sign in to comment.