Skip to content

Commit 4fe49b1

Browse files
authored
REF: Use PyUnicode_AsUTF8AndSize instead of get_c_string_buf_and_size (#58227)
1 parent b4493b6 commit 4fe49b1

File tree

4 files changed

+21 -55 lines changed

Diff for: pandas/_libs/hashtable_class_helper.pxi.in

+12 -13
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Template for each `dtype` helper function for hashtable
33

44
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
55
"""
6-
6+
from cpython.unicode cimport PyUnicode_AsUTF8
77

88
{{py:
99

@@ -98,7 +98,6 @@ from pandas._libs.khash cimport (
9898
# VectorData
9999
# ----------------------------------------------------------------------
100100

101-
from pandas._libs.tslibs.util cimport get_c_string
102101
from pandas._libs.missing cimport C_NA
103102

104103

@@ -998,7 +997,7 @@ cdef class StringHashTable(HashTable):
998997
cdef:
999998
khiter_t k
1000999
const char *v
1001-
v = get_c_string(val)
1000+
v = PyUnicode_AsUTF8(val)
10021001

10031002
k = kh_get_str(self.table, v)
10041003
if k != self.table.n_buckets:
@@ -1012,7 +1011,7 @@ cdef class StringHashTable(HashTable):
10121011
int ret = 0
10131012
const char *v
10141013

1015-
v = get_c_string(key)
1014+
v = PyUnicode_AsUTF8(key)
10161015

10171016
k = kh_put_str(self.table, v, &ret)
10181017
if kh_exist_str(self.table, k):
@@ -1037,7 +1036,7 @@ cdef class StringHashTable(HashTable):
10371036
raise MemoryError()
10381037
for i in range(n):
10391038
val = values[i]
1040-
v = get_c_string(val)
1039+
v = PyUnicode_AsUTF8(val)
10411040
vecs[i] = v
10421041

10431042
with nogil:
@@ -1071,11 +1070,11 @@ cdef class StringHashTable(HashTable):
10711070
val = values[i]
10721071

10731072
if isinstance(val, str):
1074-
# GH#31499 if we have a np.str_ get_c_string won't recognize
1073+
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
10751074
# it as a str, even though isinstance does.
1076-
v = get_c_string(<str>val)
1075+
v = PyUnicode_AsUTF8(<str>val)
10771076
else:
1078-
v = get_c_string(self.na_string_sentinel)
1077+
v = PyUnicode_AsUTF8(self.na_string_sentinel)
10791078
vecs[i] = v
10801079

10811080
with nogil:
@@ -1109,11 +1108,11 @@ cdef class StringHashTable(HashTable):
11091108
val = values[i]
11101109

11111110
if isinstance(val, str):
1112-
# GH#31499 if we have a np.str_ get_c_string won't recognize
1111+
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
11131112
# it as a str, even though isinstance does.
1114-
v = get_c_string(<str>val)
1113+
v = PyUnicode_AsUTF8(<str>val)
11151114
else:
1116-
v = get_c_string(self.na_string_sentinel)
1115+
v = PyUnicode_AsUTF8(self.na_string_sentinel)
11171116
vecs[i] = v
11181117

11191118
with nogil:
@@ -1195,9 +1194,9 @@ cdef class StringHashTable(HashTable):
11951194
else:
11961195
# if ignore_na is False, we also stringify NaN/None/etc.
11971196
try:
1198-
v = get_c_string(<str>val)
1197+
v = PyUnicode_AsUTF8(<str>val)
11991198
except UnicodeEncodeError:
1200-
v = get_c_string(<str>repr(val))
1199+
v = PyUnicode_AsUTF8(<str>repr(val))
12011200
vecs[i] = v
12021201

12031202
# compute

Diff for: pandas/_libs/tslibs/np_datetime.pyx

+3 -3
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ from cpython.object cimport (
1818
Py_LT,
1919
Py_NE,
2020
)
21+
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
2122
from libc.stdint cimport INT64_MAX
2223

2324
import_datetime()
@@ -44,7 +45,6 @@ from pandas._libs.tslibs.dtypes cimport (
4445
npy_unit_to_abbrev,
4546
npy_unit_to_attrname,
4647
)
47-
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
4848

4949

5050
cdef extern from "pandas/datetime/pd_datetime.h":
@@ -341,13 +341,13 @@ cdef int string_to_dts(
341341
const char* format_buf
342342
FormatRequirement format_requirement
343343

344-
buf = get_c_string_buf_and_size(val, &length)
344+
buf = PyUnicode_AsUTF8AndSize(val, &length)
345345
if format is None:
346346
format_buf = b""
347347
format_length = 0
348348
format_requirement = INFER_FORMAT
349349
else:
350-
format_buf = get_c_string_buf_and_size(format, &format_length)
350+
format_buf = PyUnicode_AsUTF8AndSize(format, &format_length)
351351
format_requirement = <FormatRequirement>exact
352352
return parse_iso_8601_datetime(buf, length, want_exc,
353353
dts, out_bestunit, out_local, out_tzoffset,

Diff for: pandas/_libs/tslibs/parsing.pyx

+6 -8
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ from cpython.datetime cimport (
1919
from datetime import timezone
2020

2121
from cpython.object cimport PyObject_Str
22+
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
2223
from cython cimport Py_ssize_t
2324
from libc.string cimport strchr
2425

@@ -74,10 +75,7 @@ import_pandas_datetime()
7475

7576
from pandas._libs.tslibs.strptime import array_strptime
7677

77-
from pandas._libs.tslibs.util cimport (
78-
get_c_string_buf_and_size,
79-
is_array,
80-
)
78+
from pandas._libs.tslibs.util cimport is_array
8179

8280

8381
cdef extern from "pandas/portable.h":
@@ -175,7 +173,7 @@ cdef datetime _parse_delimited_date(
175173
int day = 1, month = 1, year
176174
bint can_swap = 0
177175

178-
buf = get_c_string_buf_and_size(date_string, &length)
176+
buf = PyUnicode_AsUTF8AndSize(date_string, &length)
179177
if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
180178
# parsing MM?DD?YYYY and DD?MM?YYYY dates
181179
month = _parse_2digit(buf)
@@ -251,7 +249,7 @@ cdef bint _does_string_look_like_time(str parse_string):
251249
Py_ssize_t length
252250
int hour = -1, minute = -1
253251

254-
buf = get_c_string_buf_and_size(parse_string, &length)
252+
buf = PyUnicode_AsUTF8AndSize(parse_string, &length)
255253
if length >= 4:
256254
if buf[1] == b":":
257255
# h:MM format
@@ -467,7 +465,7 @@ cpdef bint _does_string_look_like_datetime(str py_string):
467465
char first
468466
int error = 0
469467

470-
buf = get_c_string_buf_and_size(py_string, &length)
468+
buf = PyUnicode_AsUTF8AndSize(py_string, &length)
471469
if length >= 1:
472470
first = buf[0]
473471
if first == b"0":
@@ -521,7 +519,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default,
521519
pass
522520

523521
if 4 <= date_len <= 7:
524-
buf = get_c_string_buf_and_size(date_string, &date_len)
522+
buf = PyUnicode_AsUTF8AndSize(date_string, &date_len)
525523
try:
526524
i = date_string.index("Q", 1, 6)
527525
if i == 1:

Diff for: pandas/_libs/tslibs/util.pxd

-31
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11

22
from cpython.object cimport PyTypeObject
3-
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
43

54

65
cdef extern from "Python.h":
@@ -155,36 +154,6 @@ cdef inline bint is_nan(object val):
155154
return is_complex_object(val) and val != val
156155

157156

158-
cdef inline const char* get_c_string_buf_and_size(str py_string,
159-
Py_ssize_t *length) except NULL:
160-
"""
161-
Extract internal char* buffer of unicode or bytes object `py_string` with
162-
getting length of this internal buffer saved in `length`.
163-
164-
Notes
165-
-----
166-
Python object owns memory, thus returned char* must not be freed.
167-
`length` can be NULL if getting buffer length is not needed.
168-
169-
Parameters
170-
----------
171-
py_string : str
172-
length : Py_ssize_t*
173-
174-
Returns
175-
-------
176-
buf : const char*
177-
"""
178-
# Note PyUnicode_AsUTF8AndSize() can
179-
# potentially allocate memory inside in unlikely case of when underlying
180-
# unicode object was stored as non-utf8 and utf8 wasn't requested before.
181-
return PyUnicode_AsUTF8AndSize(py_string, length)
182-
183-
184-
cdef inline const char* get_c_string(str py_string) except NULL:
185-
return get_c_string_buf_and_size(py_string, NULL)
186-
187-
188157
cdef inline bytes string_encode_locale(str py_string):
189158
"""As opposed to PyUnicode_Encode, use current system locale to encode."""
190159
return PyUnicode_EncodeLocale(py_string, NULL)

0 commit comments

Comments
 (0)