REF: Use PyUnicode_AsUTF8AndSize instead of get_c_string_buf_and_size (#58227)

mroeschke · web-flow · commit 4fe49b160eac · 2024-04-11T20:51:06.000-04:00
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -3,7 +3,7 @@ Template for each `dtype` helper function for hashtable
 
 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """
-
+from cpython.unicode cimport PyUnicode_AsUTF8
 
 {{py:
 
@@ -98,7 +98,6 @@ from pandas._libs.khash cimport (
 # VectorData
 # ----------------------------------------------------------------------
 
-from pandas._libs.tslibs.util cimport get_c_string
 from pandas._libs.missing cimport C_NA
 
 
@@ -998,7 +997,7 @@ cdef class StringHashTable(HashTable):
         cdef:
             khiter_t k
             const char *v
-        v = get_c_string(val)
+        v = PyUnicode_AsUTF8(val)
 
         k = kh_get_str(self.table, v)
         if k != self.table.n_buckets:
@@ -1012,7 +1011,7 @@ cdef class StringHashTable(HashTable):
             int ret = 0
             const char *v
 
-        v = get_c_string(key)
+        v = PyUnicode_AsUTF8(key)
 
         k = kh_put_str(self.table, v, &ret)
         if kh_exist_str(self.table, k):
@@ -1037,7 +1036,7 @@ cdef class StringHashTable(HashTable):
             raise MemoryError()
         for i in range(n):
             val = values[i]
-            v = get_c_string(val)
+            v = PyUnicode_AsUTF8(val)
             vecs[i] = v
 
         with nogil:
@@ -1071,11 +1070,11 @@ cdef class StringHashTable(HashTable):
             val = values[i]
 
             if isinstance(val, str):
-                # GH#31499 if we have a np.str_ get_c_string won't recognize
+                # GH#31499 if we have a np.str_, PyUnicode_AsUTF8 won't recognize
                 #  it as a str, even though isinstance does.
-                v = get_c_string(<str>val)
+                v = PyUnicode_AsUTF8(<str>val)
             else:
-                v = get_c_string(self.na_string_sentinel)
+                v = PyUnicode_AsUTF8(self.na_string_sentinel)
             vecs[i] = v
 
         with nogil:
@@ -1109,11 +1108,11 @@ cdef class StringHashTable(HashTable):
             val = values[i]
 
             if isinstance(val, str):
-                # GH#31499 if we have a np.str_ get_c_string won't recognize
+                # GH#31499 if we have a np.str_, PyUnicode_AsUTF8 won't recognize
                 #  it as a str, even though isinstance does.
-                v = get_c_string(<str>val)
+                v = PyUnicode_AsUTF8(<str>val)
             else:
-                v = get_c_string(self.na_string_sentinel)
+                v = PyUnicode_AsUTF8(self.na_string_sentinel)
             vecs[i] = v
 
         with nogil:
@@ -1195,9 +1194,9 @@ cdef class StringHashTable(HashTable):
             else:
                 # if ignore_na is False, we also stringify NaN/None/etc.
                 try:
-                    v = get_c_string(<str>val)
+                    v = PyUnicode_AsUTF8(<str>val)
                 except UnicodeEncodeError:
-                    v = get_c_string(<str>repr(val))
+                    v = PyUnicode_AsUTF8(<str>repr(val))
                 vecs[i] = v
 
         # compute
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx
@@ -18,6 +18,7 @@ from cpython.object cimport (
     Py_LT,
     Py_NE,
 )
+from cpython.unicode cimport PyUnicode_AsUTF8AndSize
 from libc.stdint cimport INT64_MAX
 
 import_datetime()
@@ -44,7 +45,6 @@ from pandas._libs.tslibs.dtypes cimport (
     npy_unit_to_abbrev,
     npy_unit_to_attrname,
 )
-from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
 
 
 cdef extern from "pandas/datetime/pd_datetime.h":
@@ -341,13 +341,13 @@ cdef int string_to_dts(
         const char* format_buf
         FormatRequirement format_requirement
 
-    buf = get_c_string_buf_and_size(val, &length)
+    buf = PyUnicode_AsUTF8AndSize(val, &length)
     if format is None:
         format_buf = b""
         format_length = 0
         format_requirement = INFER_FORMAT
     else:
-        format_buf = get_c_string_buf_and_size(format, &format_length)
+        format_buf = PyUnicode_AsUTF8AndSize(format, &format_length)
         format_requirement = <FormatRequirement>exact
     return parse_iso_8601_datetime(buf, length, want_exc,
                                    dts, out_bestunit, out_local, out_tzoffset,
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -19,6 +19,7 @@ from cpython.datetime cimport (
 from datetime import timezone
 
 from cpython.object cimport PyObject_Str
+from cpython.unicode cimport PyUnicode_AsUTF8AndSize
 from cython cimport Py_ssize_t
 from libc.string cimport strchr
 
@@ -74,10 +75,7 @@ import_pandas_datetime()
 
 from pandas._libs.tslibs.strptime import array_strptime
 
-from pandas._libs.tslibs.util cimport (
-    get_c_string_buf_and_size,
-    is_array,
-)
+from pandas._libs.tslibs.util cimport is_array
 
 
 cdef extern from "pandas/portable.h":
@@ -175,7 +173,7 @@ cdef datetime _parse_delimited_date(
         int day = 1, month = 1, year
         bint can_swap = 0
 
-    buf = get_c_string_buf_and_size(date_string, &length)
+    buf = PyUnicode_AsUTF8AndSize(date_string, &length)
     if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
         # parsing MM?DD?YYYY and DD?MM?YYYY dates
         month = _parse_2digit(buf)
@@ -251,7 +249,7 @@ cdef bint _does_string_look_like_time(str parse_string):
         Py_ssize_t length
         int hour = -1, minute = -1
 
-    buf = get_c_string_buf_and_size(parse_string, &length)
+    buf = PyUnicode_AsUTF8AndSize(parse_string, &length)
     if length >= 4:
         if buf[1] == b":":
             # h:MM format
@@ -467,7 +465,7 @@ cpdef bint _does_string_look_like_datetime(str py_string):
         char first
         int error = 0
 
-    buf = get_c_string_buf_and_size(py_string, &length)
+    buf = PyUnicode_AsUTF8AndSize(py_string, &length)
     if length >= 1:
         first = buf[0]
         if first == b"0":
@@ -521,7 +519,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default,
             pass
 
     if 4 <= date_len <= 7:
-        buf = get_c_string_buf_and_size(date_string, &date_len)
+        buf = PyUnicode_AsUTF8AndSize(date_string, &date_len)
         try:
             i = date_string.index("Q", 1, 6)
             if i == 1:
diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
@@ -1,6 +1,5 @@
 
 from cpython.object cimport PyTypeObject
-from cpython.unicode cimport PyUnicode_AsUTF8AndSize
 
 
 cdef extern from "Python.h":
@@ -155,36 +154,6 @@ cdef inline bint is_nan(object val):
     return is_complex_object(val) and val != val
 
 
-cdef inline const char* get_c_string_buf_and_size(str py_string,
-                                                  Py_ssize_t *length) except NULL:
-    """
-    Extract internal char* buffer of unicode or bytes object `py_string` with
-    getting length of this internal buffer saved in `length`.
-
-    Notes
-    -----
-    Python object owns memory, thus returned char* must not be freed.
-    `length` can be NULL if getting buffer length is not needed.
-
-    Parameters
-    ----------
-    py_string : str
-    length : Py_ssize_t*
-
-    Returns
-    -------
-    buf : const char*
-    """
-    # Note PyUnicode_AsUTF8AndSize() can
-    #  potentially allocate memory inside in unlikely case of when underlying
-    #  unicode object was stored as non-utf8 and utf8 wasn't requested before.
-    return PyUnicode_AsUTF8AndSize(py_string, length)
-
-
-cdef inline const char* get_c_string(str py_string) except NULL:
-    return get_c_string_buf_and_size(py_string, NULL)
-
-
 cdef inline bytes string_encode_locale(str py_string):
     """As opposed to PyUnicode_Encode, use current system locale to encode."""
     return PyUnicode_EncodeLocale(py_string, NULL)