Fix json_normalize stripping a leading separator from top-level column names (#49920)

WillAyd · web-flow · commit cd58f3bc2562 · 2022-11-28T12:04:05.000-08:00
* added failing test

* fix + whatsnew

* Refactor for readability

* Better compat
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -717,6 +717,7 @@ I/O
 - Bug in :func:`read_csv` for a single-line csv with fewer columns than ``names`` raised :class:`.errors.ParserError` with ``engine="c"`` (:issue:`47566`)
 - Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
 - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
+- Fixed issue where :func:`json_normalize` would incorrectly strip leading characters matching the ``sep`` argument from column names (:issue:`49861`)
 -
 
 Period
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
@@ -7,6 +7,7 @@
     defaultdict,
 )
 import copy
+import sys
 from typing import (
     Any,
     DefaultDict,
@@ -148,13 +149,18 @@ def _normalise_json(
     if isinstance(data, dict):
         for key, value in data.items():
             new_key = f"{key_string}{separator}{key}"
+
+            if not key_string:
+                if sys.version_info < (3, 9):
+                    from pandas.util._str_methods import removeprefix
+
+                    new_key = removeprefix(new_key, separator)
+                else:
+                    new_key = new_key.removeprefix(separator)
+
             _normalise_json(
                 data=value,
-                # to avoid adding the separator to the start of every key
-                # GH#43831 avoid adding key if key_string blank
-                key_string=new_key
-                if new_key[: len(separator)] != separator
-                else new_key[len(separator) :],
+                key_string=new_key,
                 normalized_dict=normalized_dict,
                 separator=separator,
             )
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
@@ -561,6 +561,14 @@ def generator_data():
 
         tm.assert_frame_equal(result, expected)
 
+    def test_top_column_with_leading_underscore(self):
+        # 49861
+        data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
+        result = json_normalize(data, sep="_")
+        expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])
+
+        tm.assert_frame_equal(result, expected)
+
 
 class TestNestedToRecord:
     def test_flat_stays_flat(self):

Original file line number	Diff line number	Diff line change
`@@ -717,6 +717,7 @@ I/O`
`717`	`717`	- Bug in :func:`read_csv` for a single-line csv with fewer columns than ``names`` raised :class:`.errors.ParserError` with ``engine="c"`` (:issue:`47566`)
`718`	`718`	- Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
`719`	`719`	- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
	`720`	+- Fixed issue where :func:`json_normalize` would incorrectly strip leading characters matching the ``sep`` argument from column names (:issue:`49861`)
`720`	`721`	`-`
`721`	`722`
`722`	`723`	`Period`