-
Notifications
You must be signed in to change notification settings - Fork 117
/
Copy path_extruct.py
169 lines (163 loc) · 6.15 KB
/
_extruct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import logging
import warnings
from extruct.jsonld import JsonLdExtractor
from extruct.rdfa import RDFaExtractor
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.opengraph import OpenGraphExtractor
from extruct.microformat import MicroformatExtractor
from extruct.dublincore import DublinCoreExtractor
from extruct.uniform import _umicrodata_microformat, _uopengraph, _udublincore
from extruct.utils import parse_xmldom_html
# Module-level logger; extract() reports parsing/extraction/uniform failures
# through it when called with errors='log'.
logger = logging.getLogger(__name__)
# Names of every metadata syntax extract() knows how to process. Used both as
# the default value of its `syntaxes` argument and to validate that argument.
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore']
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            return_html_node=False,
            schema_context='http://schema.org',
            with_og_array=False,
            **kwargs):
    """Extract embedded metadata of the requested syntaxes from an HTML document.

    htmlstring: string with valid html document;
    base_url: base url of the html document
    encoding: encoding of the html document
    syntaxes: list of syntaxes to extract, default SYNTAXES
    errors: set to 'log' to log the exceptions, 'ignore' to ignore them
            or 'strict'(default) to raise them
    uniform: if True uniform output format of all syntaxes to a list of dicts.
             Returned dicts structure:
             {'@context': 'http://example.com',
              '@type': 'example_type',
              /* All other the properties in keys here */
              }
             Only microdata, microformat, opengraph and dublincore results
             are passed through the uniform step.
    return_html_node: if True, it includes into the result a HTML node of
                      respective embedded metadata under 'htmlNode' key.
                      The feature is supported only by microdata syntax.
                      Each node is of `lxml.etree.Element` type.
    schema_context: schema's context for current page
    with_og_array: forwarded as-is to the opengraph uniform step; it has no
                   effect unless `uniform` is True and 'opengraph' is requested.
    url: deprecated alias of `base_url` (accepted through **kwargs); emits a
         DeprecationWarning.

    Returns a dict mapping each successfully extracted syntax name to the
    list of its items. If the document cannot be parsed and `errors` is not
    'strict', an empty dict is returned. A syntax whose extraction fails
    (with `errors` set to 'log' or 'ignore') is left out of the result; a
    syntax whose uniform step fails is mapped to an empty list.

    Raises TypeError on unexpected keyword arguments, ValueError on invalid
    `syntaxes` or `errors` values, and, when `errors` is 'strict', whatever
    exception the parsing, extraction or uniform step raised.
    """
    if base_url is None and 'url' in kwargs:
        warnings.warn(
            '"url" argument is deprecated, please use "base_url"',
            DeprecationWarning,
            stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list)
            and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of "
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')

    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'strict':
            raise
        if errors == 'log':
            logger.exception(
                'Failed to parse html, raises {}'.format(e))
        # Both 'log' and 'ignore' give up on the whole document here.
        return {}

    # (syntax name, extractor factory, document handed to the extractor).
    # The order fixes the key order of the returned dict. The microformat
    # extractor is given the raw html string, all others the parsed tree.
    available = (
        ('microdata',
         lambda: MicrodataExtractor(add_html_node=return_html_node),
         tree),
        ('json-ld', JsonLdExtractor, tree),
        ('opengraph', OpenGraphExtractor, tree),
        ('microformat', MicroformatExtractor, htmlstring),
        ('rdfa', RDFaExtractor, tree),
        ('dublincore', DublinCoreExtractor, tree),
    )
    # Extractors are instantiated only for requested syntaxes, and outside of
    # the error-handling below so that construction errors always propagate.
    processors = [
        (syntax, make_extractor().extract_items, document)
        for syntax, make_extractor, document in available
        if syntax in syntaxes
    ]

    output = {}
    for syntax, extract_items, document in processors:
        try:
            output[syntax] = list(extract_items(document, base_url=base_url))
        except Exception as e:
            if errors == 'strict':
                raise
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'
                                 .format(syntax, e)
                                 )
            # 'ignore' (and 'log' after logging): the syntax is simply
            # left out of the output.

    if uniform:
        # (syntax name, uniform function, context passed to the function).
        uniform_steps = (
            ('microdata', _umicrodata_microformat, schema_context),
            ('microformat', _umicrodata_microformat,
             'http://microformats.org/wiki/'),
            ('opengraph', _uopengraph, None),
            ('dublincore', _udublincore, None),
        )
        for syntax, uniform_func, context in uniform_steps:
            # Skip syntaxes that were not requested or whose extraction
            # failed under 'log'/'ignore': there is nothing to uniform, and
            # indexing output[syntax] would raise KeyError.
            if syntax not in output:
                continue
            raw = output[syntax]
            try:
                if syntax == 'opengraph':
                    output[syntax] = uniform_func(
                        raw, with_og_array=with_og_array)
                elif syntax == 'dublincore':
                    output[syntax] = uniform_func(raw)
                else:
                    output[syntax] = uniform_func(raw, context)
            except Exception as e:
                if errors == 'strict':
                    raise
                # The raw items could not be converted; report no items.
                output[syntax] = []
                if errors == 'log':
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'
                        .format(syntax, e)
                    )

    return output