diff --git a/extruct/jsonld.py b/extruct/jsonld.py
index b0cecefb..a66ef99c 100644
--- a/extruct/jsonld.py
+++ b/extruct/jsonld.py
@@ -3,7 +3,7 @@
 JSON-LD extractor
 """
 
-import json
+from extruct import utils
 import re
 
 import lxml.etree
@@ -29,10 +29,10 @@ def extract_items(self, document, *args, **kwargs):
     def _extract_items(self, node):
         script = node.xpath('string()')
         try:
-            data = json.loads(script)
-        except ValueError:
+            data = utils.json_loads(script)
+        except utils.native_json_exc:
             # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
-            data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script))
+            data = utils.json_loads(HTML_OR_JS_COMMENTLINE.sub('', script))
         if isinstance(data, list):
             return data
         elif isinstance(data, dict):
diff --git a/extruct/rdfa.py b/extruct/rdfa.py
index 8d93feb2..d2a05e42 100644
--- a/extruct/rdfa.py
+++ b/extruct/rdfa.py
@@ -4,7 +4,7 @@
 
 Based on pyrdfa3 and rdflib
 """
-import json
+from extruct import utils
 import logging
 rdflib_logger = logging.getLogger('rdflib')
 rdflib_logger.setLevel(logging.ERROR)
@@ -48,4 +48,4 @@ def extract_items(self, document, url, expanded=True, *args, **kwargs):
         g = PyRdfa(options, base=url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
         jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
 
-        return json.loads(jsonld_string)
+        return utils.json_loads(jsonld_string)
diff --git a/extruct/utils.py b/extruct/utils.py
new file mode 100644
index 00000000..48b385f8
--- /dev/null
+++ b/extruct/utils.py
@@ -0,0 +1,60 @@
+import json
+import sys
+
+
+# Python 2 is a vengeful fossil
+native_json_exc = getattr(json, 'JSONDecodeError', ValueError)
+
+
+_json_decoder = json.loads
+_json_decoder_raises = tuple()  # Better not to catch built-in errors at all!
+def set_json_decoder(loader_func=_json_decoder,
+                     loader_raises=_json_decoder_raises):
+    """
+    Sets extruct's preferred JSON decoder function.
+
+    You should provide a function that accepts strings, and returns decoded
+    native objects, as loader_func.
+
+    When your preferred decoder encounters non-JSON strings, or malformed JSON,
+    typically it will raise Exceptions. Extruct expects json.JSONDecodeError
+    in such cases. If your preferred decoder does something else (such as the
+    ValueErrors raised by ujson), provide a tuple of all Exception classes
+    raised on bad JSON or non-JSON.
+
+    Calling this with no arguments restores the stdlib decoder (json.loads)
+    and an empty tuple of exceptions to convert.
+    """
+    global _json_decoder
+    global _json_decoder_raises
+    _json_decoder = loader_func
+    _json_decoder_raises = loader_raises
+
+
+def json_loads(json_string):
+    """
+    Uses the preferred JSON decoder (default is stdlib json) to decode a string,
+    converting any idiosyncratic exceptions to json.JSONDecodeError.
+
+    Using this utility function allows one to swap in different decoders with
+    utils.set_json_decoder, for example to use `ujson`, without requiring
+    extruct to directly handle and support the weirdnesses of each third-party
+    json library.
+    """
+    # No `global` needed here: names are only read, and module globals resolve at call time.
+    try:
+        data = _json_decoder(json_string)
+    except _json_decoder_raises as E:
+        # TODO: Deprecate with Python 2. Reason: Prefer exception chaining with `raise from`
+        if isinstance(E, native_json_exc):
+            raise
+        _, _, traceback = sys.exc_info()
+        if sys.version_info < (3,):
+            raise ValueError("Error decoding document: {}".format(E))
+        else:
+            raise json.JSONDecodeError(
+                msg="Error decoding document (error index unknown, see preceding traceback)",
+                doc=json_string,
+                pos=0,
+            ).with_traceback(traceback)
+    return data
diff --git a/setup.py b/setup.py
index 38ded3b0..d0431fe8 100644
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,7 @@ def get_version():
     packages=find_packages(exclude=['tests',]),
     package_data={'extruct': ['VERSION']},
     install_requires=['lxml', 'rdflib', 'rdflib-jsonld'],
+    tests_require=['ujson'],
     extras_require={
         'service': [
             'bottle',
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..e7c087f8
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+import unittest
+import json
+
+import ujson
+from extruct import utils
+
+
+class NotAJSONDecodeError(ValueError):
+    pass
+
+
+class _json_shimmer:
+    def __init__(self):
+        self.invoked = False
+
+    def decode(self, json_string):
+        if json_string == 'FAIL WITH VALUEERROR':
+            raise ValueError("Yes sir")
+        try:
+            return json.loads(json_string)
+        except utils.native_json_exc:
+            raise NotAJSONDecodeError("This operation totally failed")
+        finally:
+            self.invoked = True
+
+
+class TestJson(unittest.TestCase):
+
+    def tearDown(self):
+        # The decoder is module-global state; restore the stdlib default so
+        # a decoder installed by one test cannot leak into later tests.
+        utils.set_json_decoder()
+
+    def test_json_abstraction(self):
+        # Use default decoder
+        self.assertEqual(utils._json_decoder, json.loads)
+        self.assertEqual(utils._json_decoder_raises, tuple())
+        self.assertEqual(utils.json_loads('{}'), {})
+        with self.assertRaises(utils.native_json_exc):  # ugh, Python 2
+            utils.json_loads('{')
+        # Set decoder, try again
+        shimmer = _json_shimmer()
+        utils.set_json_decoder(shimmer.decode, (NotAJSONDecodeError,))
+        self.assertEqual(utils._json_decoder, shimmer.decode)
+        self.assertEqual(utils._json_decoder_raises, (NotAJSONDecodeError,))
+        self.assertEqual(utils.json_loads('{}'), {})
+        # ensure utils.json_loads didn't call a stale reference to json.loads
+        self.assertTrue(shimmer.invoked)
+        # Specified exceptions should be converted to JSONDecodeErrors.
+        with self.assertRaises(utils.native_json_exc):
+            utils.json_loads('{')
+        # Others should not.
+        with self.assertRaises(ValueError) as ctx:
+            utils.json_loads('FAIL WITH VALUEERROR')
+        self.assertIs(type(ctx.exception), ValueError)
+
+    def test_ujson(self):
+        utils.set_json_decoder(ujson.loads, (ValueError,))
+        self.assertEqual(utils._json_decoder, ujson.loads)
+        self.assertEqual(utils._json_decoder_raises, (ValueError,))
+        self.assertEqual(utils.json_loads('{"foo": "bar"}'), {'foo': 'bar'})
diff --git a/tox.ini b/tox.ini
index e2bc2898..36941785 100644
--- a/tox.ini
+++ b/tox.ini
@@ -7,5 +7,6 @@ deps =
     pytest
     pytest-cov
     mock
+    ujson
 
 commands = py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests}