Skip to content

Commit 1775091

Browse files
authored
gh-129173: Use _PyUnicodeError_GetParams in PyCodec_SurrogatePassErrors (GH-129134)
1 parent 303043f commit 1775091

File tree

1 file changed

+162
-120
lines changed

1 file changed

+162
-120
lines changed

Python/codecs.c

Lines changed: 162 additions & 120 deletions
Original file line number | Diff line number | Diff line change
@@ -1095,7 +1095,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
10951095
#define ENC_UTF32LE 4
10961096

10971097
static int
1098-
get_standard_encoding(const char *encoding, int *bytelength)
1098+
get_standard_encoding_impl(const char *encoding, int *bytelength)
10991099
{
11001100
if (Py_TOLOWER(encoding[0]) == 'u' &&
11011101
Py_TOLOWER(encoding[1]) == 't' &&
@@ -1153,172 +1153,212 @@ get_standard_encoding(const char *encoding, int *bytelength)
11531153
return ENC_UNKNOWN;
11541154
}
11551155

1156-
/* This handler is declared static until someone demonstrates
1157-
a need to call it directly. */
1156+
1157+
static int
1158+
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
1159+
{
1160+
const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
1161+
if (encoding_cstr == NULL) {
1162+
return -1;
1163+
}
1164+
*code = get_standard_encoding_impl(encoding_cstr, bytelength);
1165+
return 0;
1166+
}
1167+
1168+
1169+
// --- handler: 'surrogatepass' -----------------------------------------------
1170+
11581171
static PyObject *
1159-
PyCodec_SurrogatePassErrors(PyObject *exc)
1172+
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
11601173
{
1161-
PyObject *restuple;
1162-
PyObject *object;
1163-
PyObject *encode;
1164-
const char *encoding;
1165-
int code;
1166-
int bytelength;
1167-
Py_ssize_t i;
1168-
Py_ssize_t start;
1169-
Py_ssize_t end;
1170-
PyObject *res;
1174+
PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
1175+
if (encoding == NULL) {
1176+
return NULL;
1177+
}
1178+
int code, bytelength;
1179+
int rc = get_standard_encoding(encoding, &code, &bytelength);
1180+
Py_DECREF(encoding);
1181+
if (rc < 0) {
1182+
return NULL;
1183+
}
1184+
if (code == ENC_UNKNOWN) {
1185+
goto bail;
1186+
}
11711187

1172-
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1173-
unsigned char *outp;
1174-
if (PyUnicodeEncodeError_GetStart(exc, &start))
1175-
return NULL;
1176-
if (PyUnicodeEncodeError_GetEnd(exc, &end))
1177-
return NULL;
1178-
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1179-
return NULL;
1180-
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1181-
Py_DECREF(object);
1182-
return NULL;
1183-
}
1184-
if (!(encoding = PyUnicode_AsUTF8(encode))) {
1185-
Py_DECREF(object);
1186-
Py_DECREF(encode);
1187-
return NULL;
1188-
}
1189-
code = get_standard_encoding(encoding, &bytelength);
1190-
Py_DECREF(encode);
1191-
if (code == ENC_UNKNOWN) {
1192-
/* Not supported, fail with original exception */
1193-
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1194-
Py_DECREF(object);
1195-
return NULL;
1196-
}
1188+
PyObject *obj;
1189+
Py_ssize_t objlen, start, end, slen;
1190+
if (_PyUnicodeError_GetParams(exc,
1191+
&obj, &objlen,
1192+
&start, &end, &slen, false) < 0)
1193+
{
1194+
return NULL;
1195+
}
11971196

1198-
if (end - start > PY_SSIZE_T_MAX / bytelength)
1199-
end = start + PY_SSIZE_T_MAX / bytelength;
1200-
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1201-
if (!res) {
1202-
Py_DECREF(object);
1203-
return NULL;
1197+
if (slen > PY_SSIZE_T_MAX / bytelength) {
1198+
end = start + PY_SSIZE_T_MAX / bytelength;
1199+
end = Py_MIN(end, objlen);
1200+
slen = Py_MAX(0, end - start);
1201+
}
1202+
1203+
PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
1204+
if (res == NULL) {
1205+
Py_DECREF(obj);
1206+
return NULL;
1207+
}
1208+
1209+
unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
1210+
for (Py_ssize_t i = start; i < end; i++) {
1211+
/* object is guaranteed to be "ready" */
1212+
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
1213+
if (!Py_UNICODE_IS_SURROGATE(ch)) {
1214+
/* Not a surrogate, fail with original exception */
1215+
Py_DECREF(obj);
1216+
Py_DECREF(res);
1217+
goto bail;
12041218
}
1205-
outp = (unsigned char*)PyBytes_AsString(res);
1206-
for (i = start; i < end; i++) {
1207-
/* object is guaranteed to be "ready" */
1208-
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1209-
if (!Py_UNICODE_IS_SURROGATE(ch)) {
1210-
/* Not a surrogate, fail with original exception */
1211-
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1212-
Py_DECREF(res);
1213-
Py_DECREF(object);
1214-
return NULL;
1215-
}
1216-
switch (code) {
1217-
case ENC_UTF8:
1219+
switch (code) {
1220+
case ENC_UTF8: {
12181221
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
12191222
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
12201223
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
12211224
break;
1222-
case ENC_UTF16LE:
1223-
*outp++ = (unsigned char) ch;
1225+
}
1226+
case ENC_UTF16LE: {
1227+
*outp++ = (unsigned char)ch;
12241228
*outp++ = (unsigned char)(ch >> 8);
12251229
break;
1226-
case ENC_UTF16BE:
1230+
}
1231+
case ENC_UTF16BE: {
12271232
*outp++ = (unsigned char)(ch >> 8);
1228-
*outp++ = (unsigned char) ch;
1233+
*outp++ = (unsigned char)ch;
12291234
break;
1230-
case ENC_UTF32LE:
1231-
*outp++ = (unsigned char) ch;
1235+
}
1236+
case ENC_UTF32LE: {
1237+
*outp++ = (unsigned char)ch;
12321238
*outp++ = (unsigned char)(ch >> 8);
12331239
*outp++ = (unsigned char)(ch >> 16);
12341240
*outp++ = (unsigned char)(ch >> 24);
12351241
break;
1236-
case ENC_UTF32BE:
1242+
}
1243+
case ENC_UTF32BE: {
12371244
*outp++ = (unsigned char)(ch >> 24);
12381245
*outp++ = (unsigned char)(ch >> 16);
12391246
*outp++ = (unsigned char)(ch >> 8);
1240-
*outp++ = (unsigned char) ch;
1247+
*outp++ = (unsigned char)ch;
12411248
break;
12421249
}
12431250
}
1244-
restuple = Py_BuildValue("(On)", res, end);
1245-
Py_DECREF(res);
1246-
Py_DECREF(object);
1247-
return restuple;
12481251
}
1249-
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1250-
const unsigned char *p;
1251-
Py_UCS4 ch = 0;
1252-
if (PyUnicodeDecodeError_GetStart(exc, &start))
1253-
return NULL;
1254-
if (PyUnicodeDecodeError_GetEnd(exc, &end))
1255-
return NULL;
1256-
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1257-
return NULL;
1258-
p = (const unsigned char*)PyBytes_AS_STRING(object);
1259-
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1260-
Py_DECREF(object);
1261-
return NULL;
1262-
}
1263-
if (!(encoding = PyUnicode_AsUTF8(encode))) {
1264-
Py_DECREF(object);
1265-
Py_DECREF(encode);
1266-
return NULL;
1267-
}
1268-
code = get_standard_encoding(encoding, &bytelength);
1269-
Py_DECREF(encode);
1270-
if (code == ENC_UNKNOWN) {
1271-
/* Not supported, fail with original exception */
1272-
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1273-
Py_DECREF(object);
1274-
return NULL;
1275-
}
12761252

1277-
/* Try decoding a single surrogate character. If
1278-
there are more, let the codec call us again. */
1279-
p += start;
1280-
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1281-
switch (code) {
1282-
case ENC_UTF8:
1253+
Py_DECREF(obj);
1254+
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1255+
return restuple;
1256+
1257+
bail:
1258+
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1259+
return NULL;
1260+
}
1261+
1262+
1263+
static PyObject *
1264+
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
1265+
{
1266+
PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
1267+
if (encoding == NULL) {
1268+
return NULL;
1269+
}
1270+
int code, bytelength;
1271+
int rc = get_standard_encoding(encoding, &code, &bytelength);
1272+
Py_DECREF(encoding);
1273+
if (rc < 0) {
1274+
return NULL;
1275+
}
1276+
if (code == ENC_UNKNOWN) {
1277+
goto bail;
1278+
}
1279+
1280+
PyObject *obj;
1281+
Py_ssize_t objlen, start, end, slen;
1282+
if (_PyUnicodeError_GetParams(exc,
1283+
&obj, &objlen,
1284+
&start, &end, &slen, true) < 0)
1285+
{
1286+
return NULL;
1287+
}
1288+
1289+
/* Try decoding a single surrogate character. If
1290+
there are more, let the codec call us again. */
1291+
Py_UCS4 ch = 0;
1292+
const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
1293+
p += start;
1294+
1295+
if (objlen - start >= bytelength) {
1296+
switch (code) {
1297+
case ENC_UTF8: {
12831298
if ((p[0] & 0xf0) == 0xe0 &&
12841299
(p[1] & 0xc0) == 0x80 &&
1285-
(p[2] & 0xc0) == 0x80) {
1300+
(p[2] & 0xc0) == 0x80)
1301+
{
12861302
/* it's a three-byte code */
1287-
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1303+
ch = ((p[0] & 0x0f) << 12) +
1304+
((p[1] & 0x3f) << 6) +
1305+
(p[2] & 0x3f);
12881306
}
12891307
break;
1290-
case ENC_UTF16LE:
1308+
}
1309+
case ENC_UTF16LE: {
12911310
ch = p[1] << 8 | p[0];
12921311
break;
1293-
case ENC_UTF16BE:
1312+
}
1313+
case ENC_UTF16BE: {
12941314
ch = p[0] << 8 | p[1];
12951315
break;
1296-
case ENC_UTF32LE:
1316+
}
1317+
case ENC_UTF32LE: {
12971318
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
12981319
break;
1299-
case ENC_UTF32BE:
1320+
}
1321+
case ENC_UTF32BE: {
13001322
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
13011323
break;
13021324
}
13031325
}
1326+
}
1327+
Py_DECREF(obj);
1328+
if (!Py_UNICODE_IS_SURROGATE(ch)) {
1329+
goto bail;
1330+
}
13041331

1305-
Py_DECREF(object);
1306-
if (!Py_UNICODE_IS_SURROGATE(ch)) {
1307-
/* it's not a surrogate - fail */
1308-
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1309-
return NULL;
1310-
}
1311-
res = PyUnicode_FromOrdinal(ch);
1312-
if (res == NULL)
1313-
return NULL;
1314-
return Py_BuildValue("(Nn)", res, start + bytelength);
1332+
PyObject *res = PyUnicode_FromOrdinal(ch);
1333+
if (res == NULL) {
1334+
return NULL;
1335+
}
1336+
return Py_BuildValue("(Nn)", res, start + bytelength);
1337+
1338+
bail:
1339+
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1340+
return NULL;
1341+
}
1342+
1343+
1344+
/* This handler is declared static until someone demonstrates
1345+
a need to call it directly. */
1346+
static PyObject *
1347+
PyCodec_SurrogatePassErrors(PyObject *exc)
1348+
{
1349+
if (_PyIsUnicodeEncodeError(exc)) {
1350+
return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
1351+
}
1352+
else if (_PyIsUnicodeDecodeError(exc)) {
1353+
return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
13151354
}
13161355
else {
13171356
wrong_exception_type(exc);
13181357
return NULL;
13191358
}
13201359
}
13211360

1361+
13221362
static PyObject *
13231363
PyCodec_SurrogateEscapeErrors(PyObject *exc)
13241364
{
@@ -1438,11 +1478,13 @@ namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
14381478
}
14391479

14401480

1441-
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
1481+
static inline PyObject *
1482+
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
14421483
{
14431484
return PyCodec_SurrogatePassErrors(exc);
14441485
}
14451486

1487+
14461488
static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
14471489
{
14481490
return PyCodec_SurrogateEscapeErrors(exc);

0 commit comments

Comments
 (0)