Merge pull request #51 from OpenScienceFramework/issue_51

jlward · jlward · commit f207307179d8 · 2013-07-08T13:49:23.000-07:00
Not all docx files must have a styles.xml file
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,9 @@
 
 Changelog
 =========
+* 0.3.5
+    * Not all docx files contain a `styles.xml` file. We no longer assume they
+      do; a document without one is parsed as having no styles.
 * 0.3.4
     * It is possible for `w:t` tags to have `text` set to `None`. This no longer causes an error when escaping that text.
 * 0.3.3
diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py
@@ -46,23 +46,24 @@ class DocxParser:
     __metaclass__ = ABCMeta
     pre_processor_class = PydocxPrePorcessor
 
+    def _extract_xml(self, f, xml_path):
+        try:
+            return f.read(xml_path)
+        except KeyError:
+            return None
+
     def _build_data(self, path, *args, **kwargs):
         with ZipFile(path) as f:
+            # These must be in the ZIP in order for the docx to be valid.
             self.document_text = f.read('word/document.xml')
-            self.styles_text = f.read('word/styles.xml')
-            try:
-                self.fonts = f.read('/word/fontTable.xml')
-            except KeyError:
-                self.fonts = None
-            try:  # Only present if there are lists
-                self.numbering_text = f.read('word/numbering.xml')
-            except KeyError:
-                self.numbering_text = None
-            try:  # Only present if there are comments
-                self.comment_text = f.read('word/comments.xml')
-            except KeyError:
-                self.comment_text = None
             self.relationship_text = f.read('word/_rels/document.xml.rels')
+
+            # These are all optional; each is None when absent from the ZIP.
+            self.styles_text = self._extract_xml(f, 'word/styles.xml')
+            self.fonts = self._extract_xml(f, 'word/fontTable.xml')
+            self.numbering_text = self._extract_xml(f, 'word/numbering.xml')
+            self.comment_text = self._extract_xml(f, 'word/comments.xml')
+
             zipped_image_files = [
                 e for e in f.infolist()
                 if e.filename.startswith('word/media/')
@@ -79,6 +80,8 @@ def _build_data(self, path, *args, **kwargs):
             self.comment_root = parse_xml_from_string(self.comment_text)
 
     def _parse_styles(self):
+        if self.styles_text is None:
+            return {}
         tree = parse_xml_from_string(self.styles_text)
         result = {}
         for style in find_all(tree, 'style'):
diff --git a/pydocx/fixtures/missing_style.docx b/pydocx/fixtures/missing_style.docx
diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py
@@ -722,6 +722,19 @@ def test_justification():
     ''')
 
 
+def test_missing_style():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'missing_style.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p>AAA</p>
+    ''')
+
+
 def _converter(*args, **kwargs):
     # Having a converter that does nothing is the same as if abiword fails to
     # convert.