Skip to content

Commit f207307

Browse files
committed
Merge pull request #51 from OpenScienceFramework/issue_51
Not all docx files must have a styles.xml file
2 parents 2bb32d6 + b54e80b commit f207307

File tree

4 files changed

+32
-13
lines changed

4 files changed

+32
-13
lines changed

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11

22
Changelog
33
=========
4+
* 0.3.5
5+
* Not all docx files contain a `styles.xml` file. We are no longer assuming
6+
they do.
47
* 0.3.4
58
* It is possible for `w:t` tags to have `text` set to `None`. This no longer causes an error when escaping that text.
69
* 0.3.3

pydocx/DocxParser.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,23 +46,24 @@ class DocxParser:
4646
__metaclass__ = ABCMeta
4747
pre_processor_class = PydocxPrePorcessor
4848

49+
def _extract_xml(self, f, xml_path):
50+
try:
51+
return f.read(xml_path)
52+
except KeyError:
53+
return None
54+
4955
def _build_data(self, path, *args, **kwargs):
5056
with ZipFile(path) as f:
57+
# These must be in the ZIP in order for the docx to be valid.
5158
self.document_text = f.read('word/document.xml')
52-
self.styles_text = f.read('word/styles.xml')
53-
try:
54-
self.fonts = f.read('/word/fontTable.xml')
55-
except KeyError:
56-
self.fonts = None
57-
try: # Only present if there are lists
58-
self.numbering_text = f.read('word/numbering.xml')
59-
except KeyError:
60-
self.numbering_text = None
61-
try: # Only present if there are comments
62-
self.comment_text = f.read('word/comments.xml')
63-
except KeyError:
64-
self.comment_text = None
6559
self.relationship_text = f.read('word/_rels/document.xml.rels')
60+
61+
# These are all optional.
62+
self.styles_text = self._extract_xml(f, 'word/styles.xml')
63+
self.fonts = self._extract_xml(f, 'word/fontTable.xml')
64+
self.numbering_text = self._extract_xml(f, 'word/numbering.xml')
65+
self.comment_text = self._extract_xml(f, 'word/comments.xml')
66+
6667
zipped_image_files = [
6768
e for e in f.infolist()
6869
if e.filename.startswith('word/media/')
@@ -79,6 +80,8 @@ def _build_data(self, path, *args, **kwargs):
7980
self.comment_root = parse_xml_from_string(self.comment_text)
8081

8182
def _parse_styles(self):
83+
if self.styles_text is None:
84+
return {}
8285
tree = parse_xml_from_string(self.styles_text)
8386
result = {}
8487
for style in find_all(tree, 'style'):

pydocx/fixtures/missing_style.docx

2.58 KB
Binary file not shown.

pydocx/tests/test_docx.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,19 @@ def test_justification():
722722
''')
723723

724724

725+
def test_missing_style():
726+
file_path = path.join(
727+
path.abspath(path.dirname(__file__)),
728+
'..',
729+
'fixtures',
730+
'missing_style.docx',
731+
)
732+
actual_html = convert(file_path)
733+
assert_html_equal(actual_html, BASE_HTML % '''
734+
<p>AAA</p>
735+
''')
736+
737+
725738
def _converter(*args, **kwargs):
726739
# Having a converter that does nothing is the same as if abiword fails to
727740
# convert.

0 commit comments

Comments
 (0)