@@ -46,23 +46,24 @@ class DocxParser:
46
46
__metaclass__ = ABCMeta
47
47
pre_processor_class = PydocxPrePorcessor
48
48
49
+ def _extract_xml (self , f , xml_path ):
50
+ try :
51
+ return f .read (xml_path )
52
+ except KeyError :
53
+ return None
54
+
49
55
def _build_data (self , path , * args , ** kwargs ):
50
56
with ZipFile (path ) as f :
57
+ # These must be in the ZIP in order for the docx to be valid.
51
58
self .document_text = f .read ('word/document.xml' )
52
- self .styles_text = f .read ('word/styles.xml' )
53
- try :
54
- self .fonts = f .read ('/word/fontTable.xml' )
55
- except KeyError :
56
- self .fonts = None
57
- try : # Only present if there are lists
58
- self .numbering_text = f .read ('word/numbering.xml' )
59
- except KeyError :
60
- self .numbering_text = None
61
- try : # Only present if there are comments
62
- self .comment_text = f .read ('word/comments.xml' )
63
- except KeyError :
64
- self .comment_text = None
65
59
self .relationship_text = f .read ('word/_rels/document.xml.rels' )
60
+
61
+ # These are all optional.
62
+ self .styles_text = self ._extract_xml (f , 'word/styles.xml' )
63
+ self .fonts = self ._extract_xml (f , 'word/fontTable.xml' )
64
+ self .numbering_text = self ._extract_xml (f , 'word/numbering.xml' )
65
+ self .comment_text = self ._extract_xml (f , 'word/comments.xml' )
66
+
66
67
zipped_image_files = [
67
68
e for e in f .infolist ()
68
69
if e .filename .startswith ('word/media/' )
@@ -79,6 +80,8 @@ def _build_data(self, path, *args, **kwargs):
79
80
self .comment_root = parse_xml_from_string (self .comment_text )
80
81
81
82
def _parse_styles (self ):
83
+ if self .styles_text is None :
84
+ return {}
82
85
tree = parse_xml_from_string (self .styles_text )
83
86
result = {}
84
87
for style in find_all (tree , 'style' ):
0 commit comments