diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4a6cf117fd196..ca706e2c7cdbb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -35,6 +35,7 @@ Other enhancements - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) +- :func:`json_normalize` now supports parsing JSON strings and bytes directly, eliminating the need for an intermediate ``apply(json.loads)`` step (:issue:`61006`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) - :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`) - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 642408b35ba24..0669b5c661c2d 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -11,6 +11,7 @@ TYPE_CHECKING, Any, DefaultDict, + Union, overload, ) @@ -267,7 +268,7 @@ def _simple_json_normalize( def json_normalize( - data: dict | list[dict] | Series, + data: Union[dict[Any, Any], list[dict[Any, Any]], Series, str, bytes], record_path: str | list | None = None, meta: str | list[str | list[str]] | None = None, meta_prefix: str | None = None, @@ -285,8 +286,8 @@ def json_normalize( Parameters ---------- - data : dict, list of dicts, or Series of dicts - Unserialized JSON objects. + data : dict, list of dicts, Series of dicts/JSON str/bytes, or JSON str/bytes + Unserialized JSON objects or JSON strings/bytes. record_path : str or list of str, default None Path in each object to list of records. 
If not passed, data will be assumed to be an array of records. @@ -434,7 +435,30 @@ def json_normalize( 1 2 Returns normalized data with columns prefixed with the given string. + + >>> # JSON string input + >>> json_str = '{"id": 1, "name": {"first": "John", "last": "Doe"}}' + >>> pd.json_normalize(json_str) + id name.first name.last + 0 1 John Doe """ + if isinstance(data, (str, bytes)): + import json + + data = json.loads(data) + + if isinstance(data, Series): + if data.empty: + return DataFrame() + + sample = data.iloc[0] + if isinstance(sample, (str, bytes)): + import json + + data = data.apply(json.loads) + index = data.index + else: + index = None def _pull_field( js: dict[str, Any], spec: list | str, extract_record: bool = False @@ -485,11 +509,6 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: ) return result - if isinstance(data, Series): - index = data.index - else: - index = None - if isinstance(data, list) and not data: return DataFrame() elif isinstance(data, dict): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index fdbfbd004617e..cedccc0af1a12 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -162,21 +162,20 @@ def test_empty_array(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "data, record_path, exception_type", + "data, record_path, exception_type, expected", [ - ([{"a": 0}, {"a": 1}], None, None), - ({"a": [{"a": 0}, {"a": 1}]}, "a", None), - ('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError), - (None, None, NotImplementedError), + ([{"a": 0}, {"a": 1}], None, None, DataFrame([0, 1], columns=["a"])), + ({"a": [{"a": 0}, {"a": 1}]}, "a", None, DataFrame([0, 1], columns=["a"])), + ('[{"a": 0}, {"a": 1}]', None, None, DataFrame([0, 1], columns=["a"])), + (None, None, NotImplementedError, None), ], ) - def test_accepted_input(self, data, record_path, exception_type): + def test_accepted_input(self, 
data, record_path, exception_type, expected): if exception_type is not None: with pytest.raises(exception_type, match=""): json_normalize(data, record_path=record_path) else: result = json_normalize(data, record_path=record_path) - expected = DataFrame([0, 1], columns=["a"]) tm.assert_frame_equal(result, expected) def test_simple_normalize_with_separator(self, deep_nested): @@ -569,6 +568,61 @@ def test_series_index(self, state_data): result = json_normalize(series, "counties") tm.assert_index_equal(result.index, idx.repeat([3, 2])) + def test_json_string_input(self): + # GH61006: Accept JSON as str input + json_str = '{"id": 1, "name": {"first": "John", "last": "Doe"}}' + result = json_normalize(json_str) + expected = DataFrame({"id": [1], "name.first": ["John"], "name.last": ["Doe"]}) + tm.assert_frame_equal(result, expected) + + json_array_str = """[ + {"id": 1, "name": {"first": "John", "last": "Doe"}}, + {"id": 2, "name": {"first": "Jane", "last": "Smith"}} + ]""" + result = json_normalize(json_array_str) + expected = DataFrame( + { + "id": [1, 2], + "name.first": ["John", "Jane"], + "name.last": ["Doe", "Smith"], + } + ) + tm.assert_frame_equal(result, expected) + + def test_json_bytes_input(self): + # GH61006: Accept JSON as bytes input + json_bytes = b'{"id": 1, "name": {"first": "John", "last": "Doe"}}' + result = json_normalize(json_bytes) + expected = DataFrame({"id": [1], "name.first": ["John"], "name.last": ["Doe"]}) + tm.assert_frame_equal(result, expected) + + def test_series_json_string(self): + # GH61006: + s = Series(['{"value": 0.0}', '{"value": 0.5}', '{"value": 1.0}']) + result = json_normalize(s) + expected = DataFrame({"value": [0.0, 0.5, 1.0]}) + tm.assert_frame_equal(result, expected) + + def test_series_json_string_with_index(self): + # GH61006: + s = Series(['{"value": 0.0}', '{"value": 0.5}'], index=["a", "b"]) + result = json_normalize(s) + expected = DataFrame({"value": [0.0, 0.5]}, index=["a", "b"]) + tm.assert_frame_equal(result, 
expected) + + def test_invalid_json_string(self): + incomplete_json = '{"id": 1, "name": {"first": "John", "last": "Doe"' + with pytest.raises(json.JSONDecodeError): + json_normalize(incomplete_json) + + non_json = "Hello World" + with pytest.raises(json.JSONDecodeError): + json_normalize(non_json) + + malformed_json = '{"a": 1,}' + with pytest.raises(json.JSONDecodeError): + json_normalize(malformed_json) + class TestNestedToRecord: def test_flat_stays_flat(self):