Skip to content

ENH: json_normalize accepts JSON with str and bytes input #61056

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Other enhancements
- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`json_normalize` now supports parsing JSON strings and bytes directly, eliminating the need for an intermediate ``apply(json.loads)`` step (:issue:`61006`)
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
Expand Down
35 changes: 27 additions & 8 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
TYPE_CHECKING,
Any,
DefaultDict,
Union,
overload,
)

Expand Down Expand Up @@ -267,7 +268,7 @@ def _simple_json_normalize(


def json_normalize(
data: dict | list[dict] | Series,
data: Union[dict[Any, Any], list[dict[Any, Any]], Series, str, bytes],
record_path: str | list | None = None,
meta: str | list[str | list[str]] | None = None,
meta_prefix: str | None = None,
Expand All @@ -285,8 +286,8 @@ def json_normalize(

Parameters
----------
data : dict, list of dicts, or Series of dicts
Unserialized JSON objects.
data : dict, list of dicts, Series of dicts/JSON str/bytes, or JSON str/bytes
Unserialized JSON objects or JSON strings/bytes.
record_path : str or list of str, default None
Path in each object to list of records. If not passed, data will be
assumed to be an array of records.
Expand Down Expand Up @@ -434,7 +435,30 @@ def json_normalize(
1 2

Returns normalized data with columns prefixed with the given string.

>>> # JSON string input
>>> json_str = '{"id": 1, "name": {"first": "John", "last": "Doe"}}'
>>> pd.json_normalize(json_str)
id name.first name.last
0 1 John Doe
"""
if isinstance(data, (str, bytes)):
import json

data = json.loads(data)

if isinstance(data, Series):
if data.empty:
return DataFrame()

sample = data.iloc[0]
if isinstance(sample, (str, bytes)):
import json

data = data.apply(json.loads)
index = data.index
else:
index = None

def _pull_field(
js: dict[str, Any], spec: list | str, extract_record: bool = False
Expand Down Expand Up @@ -485,11 +509,6 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
)
return result

if isinstance(data, Series):
index = data.index
else:
index = None

if isinstance(data, list) and not data:
return DataFrame()
elif isinstance(data, dict):
Expand Down
68 changes: 61 additions & 7 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,21 +162,20 @@ def test_empty_array(self):
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"data, record_path, exception_type",
"data, record_path, exception_type, expected",
[
([{"a": 0}, {"a": 1}], None, None),
({"a": [{"a": 0}, {"a": 1}]}, "a", None),
('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
(None, None, NotImplementedError),
([{"a": 0}, {"a": 1}], None, None, DataFrame([0, 1], columns=["a"])),
({"a": [{"a": 0}, {"a": 1}]}, "a", None, DataFrame([0, 1], columns=["a"])),
('[{"a": 0}, {"a": 1}]', None, None, DataFrame([0, 1], columns=["a"])),
(None, None, NotImplementedError, None),
],
)
def test_accepted_input(self, data, record_path, exception_type):
def test_accepted_input(self, data, record_path, exception_type, expected):
if exception_type is not None:
with pytest.raises(exception_type, match=""):
json_normalize(data, record_path=record_path)
else:
result = json_normalize(data, record_path=record_path)
expected = DataFrame([0, 1], columns=["a"])
tm.assert_frame_equal(result, expected)

def test_simple_normalize_with_separator(self, deep_nested):
Expand Down Expand Up @@ -569,6 +568,61 @@ def test_series_index(self, state_data):
result = json_normalize(series, "counties")
tm.assert_index_equal(result.index, idx.repeat([3, 2]))

def test_json_string_input(self):
# GH61006: Accept JSON as str input
# Case 1: a single JSON object gives one row, with nested keys flattened
# into dot-separated column names ("name.first", "name.last").
json_str = '{"id": 1, "name": {"first": "John", "last": "Doe"}}'
result = json_normalize(json_str)
expected = DataFrame({"id": [1], "name.first": ["John"], "name.last": ["Doe"]})
tm.assert_frame_equal(result, expected)

# Case 2: a JSON array of objects gives one row per array element.
json_array_str = """[
{"id": 1, "name": {"first": "John", "last": "Doe"}},
{"id": 2, "name": {"first": "Jane", "last": "Smith"}}
]"""
result = json_normalize(json_array_str)
expected = DataFrame(
{
"id": [1, 2],
"name.first": ["John", "Jane"],
"name.last": ["Doe", "Smith"],
}
)
tm.assert_frame_equal(result, expected)

def test_json_bytes_input(self):
# GH61006: Accept JSON as bytes input
# Same payload as the single-object str case, passed as a bytes literal;
# the expected frame is identical.
json_bytes = b'{"id": 1, "name": {"first": "John", "last": "Doe"}}'
result = json_normalize(json_bytes)
expected = DataFrame({"id": [1], "name.first": ["John"], "name.last": ["Doe"]})
tm.assert_frame_equal(result, expected)

def test_series_json_string(self):
# GH61006: Accept a Series whose elements are JSON strings
# Each element is a JSON object with a single "value" key; the result
# should have one row per element under the default index.
s = Series(['{"value": 0.0}', '{"value": 0.5}', '{"value": 1.0}'])
result = json_normalize(s)
expected = DataFrame({"value": [0.0, 0.5, 1.0]})
tm.assert_frame_equal(result, expected)

def test_series_json_string_with_index(self):
# GH61006: A Series of JSON strings keeps its index in the result
# The input Series carries a custom index ["a", "b"]; the normalized
# frame is expected to carry the same labels.
s = Series(['{"value": 0.0}', '{"value": 0.5}'], index=["a", "b"])
result = json_normalize(s)
expected = DataFrame({"value": [0.0, 0.5]}, index=["a", "b"])
tm.assert_frame_equal(result, expected)

def test_invalid_json_string(self):
# Invalid JSON text passed as str is expected to raise
# json.JSONDecodeError; three malformed inputs are checked.
# NOTE(review): relies on `json` being imported at module level in this
# test file -- not visible in this excerpt, confirm.

# Truncated document: the closing braces are missing.
incomplete_json = '{"id": 1, "name": {"first": "John", "last": "Doe"'
with pytest.raises(json.JSONDecodeError):
json_normalize(incomplete_json)

# Plain text that is not JSON at all.
non_json = "Hello World"
with pytest.raises(json.JSONDecodeError):
json_normalize(non_json)

# Trailing comma inside an object.
malformed_json = '{"a": 1,}'
with pytest.raises(json.JSONDecodeError):
json_normalize(malformed_json)


class TestNestedToRecord:
def test_flat_stays_flat(self):
Expand Down
Loading