diff --git a/extruct/_extruct.py b/extruct/_extruct.py
index ba35a6fa..2f206383 100644
--- a/extruct/_extruct.py
+++ b/extruct/_extruct.py
@@ -54,6 +54,7 @@ def extract(htmlstring,
if errors not in ['log', 'ignore', 'strict']:
raise ValueError('Invalid error command, valid values are either "log"'
', "ignore" or "strict"')
+
try:
tree = parse_xmldom_html(htmlstring, encoding=encoding)
except Exception as e:
@@ -65,6 +66,7 @@ def extract(htmlstring,
return {}
if errors == 'strict':
raise
+
processors = []
if 'microdata' in syntaxes:
processors.append(
@@ -95,6 +97,7 @@ def extract(htmlstring,
('rdfa', RDFaExtractor().extract_items,
tree,
))
+
output = {}
for syntax, extract, document in processors:
try:
@@ -108,6 +111,7 @@ def extract(htmlstring,
pass
if errors == 'strict':
raise
+
if uniform:
uniform_processors = []
if 'microdata' in syntaxes:
@@ -131,6 +135,7 @@ def extract(htmlstring,
output['opengraph'],
None,
))
+
for syntax, uniform, raw, schema_context in uniform_processors:
try:
if syntax == 'opengraph':
diff --git a/extruct/opengraph.py b/extruct/opengraph.py
index 78e836bf..978d25ab 100644
--- a/extruct/opengraph.py
+++ b/extruct/opengraph.py
@@ -30,8 +30,10 @@ def extract_items(self, document, base_url=None):
namespaces.update(self.get_namespaces(head))
props = []
for el in head.xpath('meta[@property and @content]'):
- prop = el.attrib['property']
- val = el.attrib['content']
+ prop = el.attrib['property'].strip()
+ val = el.attrib['content'].strip()
+ if prop == '' or val == '':
+ continue
ns = prop.partition(':')[0]
if ns in _OG_NAMESPACES:
namespaces[ns] = _OG_NAMESPACES[ns]
diff --git a/requirements.txt b/requirements.txt
index 87a27224..820557a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ requests
rdflib
rdflib-jsonld
mf2py>=1.1.0
-six
+six>=1.11
w3lib
diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html
index 790465b3..4fa2ba0a 100644
--- a/tests/samples/songkick/elysianfields.html
+++ b/tests/samples/songkick/elysianfields.html
@@ -27,7 +27,9 @@
+
+
diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json
index 4b9f3649..ba8e9f56 100644
--- a/tests/samples/songkick/elysianfields.json
+++ b/tests/samples/songkick/elysianfields.json
@@ -232,6 +232,9 @@
"http://ogp.me/ns#description": [
{
"@value": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017."
+ },
+ {
+ "@value": ""
}
],
"http://ogp.me/ns#image": [
@@ -250,6 +253,9 @@
"http://ogp.me/ns#title": [
{
"@value": "Elysian Fields"
+ },
+ {
+ "@value": " "
}
],
"http://ogp.me/ns#type": [
diff --git a/tests/test_extruct.py b/tests/test_extruct.py
index dc08401e..a2ba8003 100644
--- a/tests/test_extruct.py
+++ b/tests/test_extruct.py
@@ -5,7 +5,6 @@
import pytest
import extruct
-from extruct import SYNTAXES
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id
@@ -17,9 +16,13 @@ def test_all(self):
body = get_testdata('songkick', 'elysianfields.html')
expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
- # See test_rdfa_not_preserving_order()
- del data['rdfa'][0]['http://ogp.me/ns#image']
- del expected['rdfa'][0]['http://ogp.me/ns#image']
+ # Sort the values here because the RDFa extractor does not preserve the order of values for duplicated properties.
+ # See https://github.com/scrapinghub/extruct/issues/116
+ # Also see test_rdfa_not_preserving_order()
+ for rdf in data['rdfa']:
+ for key, pairs in rdf.items():
+ if ':' in key and isinstance(pairs, list):
+ rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True)
self.assertEqual(jsonize_dict(data), expected)
@pytest.mark.xfail