gh-124064: Fix -Wconversion warnings in Parser/string_parser.c (#124204)

vstinner · web-flow · commit f9fa6ba4f8d9 · 2024-09-18T19:10:56.000+02:00
Fix integer overflow check in decode_unicode_with_escapes(): use
PY_SSIZE_T_MAX instead of SIZE_MAX.
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
@@ -18,7 +18,7 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
         // to avoid showing the warning twice.
         return 0;
     }
-    unsigned char c = *first_invalid_escape;
+    unsigned char c = (unsigned char)*first_invalid_escape;
     if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {
         // in this case the tokenizer has already emitted a warning,
         // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
@@ -90,12 +90,12 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
     const char *end;
 
     /* check for integer overflow */
-    if (len > SIZE_MAX / 6) {
+    if (len > (size_t)PY_SSIZE_T_MAX / 6) {
         return NULL;
     }
     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
-    u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
+    u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6);
     if (u == NULL) {
         return NULL;
     }
@@ -142,11 +142,11 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
             *p++ = *s++;
         }
     }
-    len = p - buf;
+    len = (size_t)(p - buf);
     s = buf;
 
     const char *first_invalid_escape;
-    v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
+    v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);
 
     // HACK: later we can simply pass the line no, since we don't preserve the tokens
     // when we are decoding the string but we preserve the line numbers.
@@ -185,7 +185,7 @@ PyObject *
 _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
 {
     if (raw) {
-        return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
+        return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL);
     }
     return decode_unicode_with_escapes(p, s, len, t);
 }
@@ -274,9 +274,9 @@ _PyPegen_parse_string(Parser *p, Token *t)
             }
         }
         if (rawmode) {
-            return PyBytes_FromStringAndSize(s, len);
+            return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
         }
-        return decode_bytes_with_escapes(p, s, len, t);
+        return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
     }
     return _PyPegen_decode_string(p, rawmode, s, len, t);
 }

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token`
`18`	`18`	`// to avoid showing the warning twice.`
`19`	`19`	`return 0;`
`20`	`20`	`}`
`21`		`- unsigned char c = *first_invalid_escape;`
	`21`	`+ unsigned char c = (unsigned char)*first_invalid_escape;`
`22`	`22`	`if ((t->type == FSTRING_MIDDLE \|\| t->type == FSTRING_END) && (c == '{' \|\| c == '}')) {`
`23`	`23`	`// in this case the tokenizer has already emitted a warning,`
`24`	`24`	`// see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence`
`@@ -90,12 +90,12 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)`
`90`	`90`	`const char *end;`
`91`	`91`
`92`	`92`	`/* check for integer overflow */`
`93`		`- if (len > SIZE_MAX / 6) {`
	`93`	`+ if (len > (size_t)PY_SSIZE_T_MAX / 6) {`
`94`	`94`	`return NULL;`
`95`	`95`	`}`
`96`	`96`	`/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5`
`97`	`97`	`"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */`
`98`		`- u = PyBytes_FromStringAndSize((char *)NULL, len * 6);`
	`98`	`+ u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6);`
`99`	`99`	`if (u == NULL) {`
`100`	`100`	`return NULL;`
`101`	`101`	`}`
`@@ -142,11 +142,11 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)`
`142`	`142`	`*p++ = *s++;`
`143`	`143`	`}`
`144`	`144`	`}`
`145`		`- len = p - buf;`
	`145`	`+ len = (size_t)(p - buf);`
`146`	`146`	`s = buf;`
`147`	`147`
`148`	`148`	`const char *first_invalid_escape;`
`149`		`- v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);`
	`149`	`+ v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);`
`150`	`150`
`151`	`151`	`// HACK: later we can simply pass the line no, since we don't preserve the tokens`
`152`	`152`	`// when we are decoding the string but we preserve the line numbers.`
`@@ -185,7 +185,7 @@ PyObject *`
`185`	`185`	`_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)`
`186`	`186`	`{`
`187`	`187`	`if (raw) {`
`188`		`- return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);`
	`188`	`+ return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL);`
`189`	`189`	`}`
`190`	`190`	`return decode_unicode_with_escapes(p, s, len, t);`
`191`	`191`	`}`
`@@ -274,9 +274,9 @@ _PyPegen_parse_string(Parser *p, Token *t)`
`274`	`274`	`}`
`275`	`275`	`}`
`276`	`276`	`if (rawmode) {`
`277`		`- return PyBytes_FromStringAndSize(s, len);`
	`277`	`+ return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);`
`278`	`278`	`}`
`279`		`- return decode_bytes_with_escapes(p, s, len, t);`
	`279`	`+ return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);`
`280`	`280`	`}`
`281`	`281`	`return _PyPegen_decode_string(p, rawmode, s, len, t);`
`282`	`282`	`}`