diff --git a/extruct/_extruct.py b/extruct/_extruct.py index ba35a6fa..2f206383 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -54,6 +54,7 @@ def extract(htmlstring, if errors not in ['log', 'ignore', 'strict']: raise ValueError('Invalid error command, valid values are either "log"' ', "ignore" or "strict"') + try: tree = parse_xmldom_html(htmlstring, encoding=encoding) except Exception as e: @@ -65,6 +66,7 @@ def extract(htmlstring, return {} if errors == 'strict': raise + processors = [] if 'microdata' in syntaxes: processors.append( @@ -95,6 +97,7 @@ def extract(htmlstring, ('rdfa', RDFaExtractor().extract_items, tree, )) + output = {} for syntax, extract, document in processors: try: @@ -108,6 +111,7 @@ def extract(htmlstring, pass if errors == 'strict': raise + if uniform: uniform_processors = [] if 'microdata' in syntaxes: @@ -131,6 +135,7 @@ def extract(htmlstring, output['opengraph'], None, )) + for syntax, uniform, raw, schema_context in uniform_processors: try: if syntax == 'opengraph': diff --git a/extruct/opengraph.py b/extruct/opengraph.py index 78e836bf..978d25ab 100644 --- a/extruct/opengraph.py +++ b/extruct/opengraph.py @@ -30,8 +30,10 @@ def extract_items(self, document, base_url=None): namespaces.update(self.get_namespaces(head)) props = [] for el in head.xpath('meta[@property and @content]'): - prop = el.attrib['property'] - val = el.attrib['content'] + prop = el.attrib['property'].strip() + val = el.attrib['content'].strip() + if prop == '' or val == '': + continue ns = prop.partition(':')[0] if ns in _OG_NAMESPACES: namespaces[ns] = _OG_NAMESPACES[ns] diff --git a/requirements.txt b/requirements.txt index 87a27224..820557a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ requests rdflib rdflib-jsonld mf2py>=1.1.0 -six +six>=1.11 w3lib diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html index 790465b3..4fa2ba0a 100644 --- a/tests/samples/songkick/elysianfields.html +++ b/tests/samples/songkick/elysianfields.html @@ -27,7 +27,9 @@ + + diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json index 4b9f3649..ba8e9f56 100644 --- a/tests/samples/songkick/elysianfields.json +++ b/tests/samples/songkick/elysianfields.json @@ -232,6 +232,9 @@ "http://ogp.me/ns#description": [ { "@value": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017." + }, + { + "@value": "" } ], "http://ogp.me/ns#image": [ @@ -250,6 +253,9 @@ "http://ogp.me/ns#title": [ { "@value": "Elysian Fields" + }, + { + "@value": " " } ], "http://ogp.me/ns#type": [ diff --git a/tests/test_extruct.py b/tests/test_extruct.py index dc08401e..a2ba8003 100644 --- a/tests/test_extruct.py +++ b/tests/test_extruct.py @@ -5,7 +5,6 @@ import pytest import extruct -from extruct import SYNTAXES from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id @@ -17,9 +16,13 @@ def test_all(self): body = get_testdata('songkick', 'elysianfields.html') expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8')) data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields') - # See test_rdfa_not_preserving_order() - del data['rdfa'][0]['http://ogp.me/ns#image'] - del expected['rdfa'][0]['http://ogp.me/ns#image'] + # Sorting the values here because RDFa is not preserving ordering on duplicated properties. + # See https://github.com/scrapinghub/extruct/issues/116 + # Also see test_rdfa_not_preserving_order() + for rdf in data['rdfa']: + for key, pairs in rdf.items(): + if ':' in key and isinstance(pairs, list): + rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True) self.assertEqual(jsonize_dict(data), expected) @pytest.mark.xfail