Skip to content

Commit 9032cf5

Browse files
authored
bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (pythonGH-25096)
1 parent 4908fae commit 9032cf5

File tree

6 files changed

+194
-0
lines changed

6 files changed

+194
-0
lines changed

Diff for: Include/internal/pycore_fileutils.h

+12
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,18 @@ PyAPI_FUNC(void) _Py_closerange(int first, int last);
5353
PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(void);
5454
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
5555

56+
/* Helpers for platforms where wchar_t values in non-Unicode locales are not
   Unicode code points (e.g. Oracle Solaris). They are only declared when
   configure defines HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION. */
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
57+
/* Return non-zero when the current locale codeset is neither "UTF-8" nor
   "646" (ASCII), i.e. when wide strings need conversion. */
extern int _Py_LocaleUsesNonUnicodeWchar(void);
58+
59+
/* Convert `size` native wchar_t units to UCS-4. Return a newly allocated
   buffer to be released with PyMem_Free(), or NULL with an exception set. */
extern wchar_t* _Py_DecodeNonUnicodeWchar(
60+
const wchar_t* native,
61+
Py_ssize_t size);
62+
63+
/* Convert `size` UCS-4 units in `unicode` to the native wchar_t form,
   overwriting the buffer. Return 0 on success, -1 with an exception set
   on failure. */
extern int _Py_EncodeNonUnicodeWchar_InPlace(
64+
wchar_t* unicode,
65+
Py_ssize_t size);
66+
#endif
67+
5668
#ifdef __cplusplus
5769
}
5870
#endif

Diff for: Objects/unicodeobject.c

+40
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
5757
#include <windows.h>
5858
#endif
5959

60+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61+
#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
62+
#endif
63+
6064
/* Uncomment to display statistics on interned strings at exit
6165
in _PyUnicode_ClearInterned(). */
6266
/* #define INTERNED_STATS 1 */
@@ -2217,6 +2221,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
22172221
if (size == 0)
22182222
_Py_RETURN_UNICODE_EMPTY();
22192223

2224+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2225+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
2226+
non-Unicode locales and hence needs conversion to UCS-4 first. */
2227+
if (_Py_LocaleUsesNonUnicodeWchar()) {
2228+
wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2229+
if (!converted) {
2230+
return NULL;
2231+
}
2232+
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2233+
PyMem_Free(converted);
2234+
return unicode;
2235+
}
2236+
#endif
2237+
22202238
/* Single character Unicode objects in the Latin-1 range are
22212239
shared when using this constructor */
22222240
if (size == 1 && (Py_UCS4)*u < 256)
@@ -3295,6 +3313,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
32953313
res = size;
32963314
}
32973315
unicode_copy_as_widechar(unicode, w, size);
3316+
3317+
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
3319+
non-Unicode locales and hence needs conversion first. */
3320+
if (_Py_LocaleUsesNonUnicodeWchar()) {
3321+
if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3322+
return -1;
3323+
}
3324+
}
3325+
#endif
3326+
32983327
return res;
32993328
}
33003329

@@ -3321,6 +3350,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
33213350
return NULL;
33223351
}
33233352
unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3353+
3354+
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3355+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
3356+
non-Unicode locales and hence needs conversion first. */
3357+
if (_Py_LocaleUsesNonUnicodeWchar()) {
3358+
if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3359+
return NULL;
3360+
}
3361+
}
3362+
#endif
3363+
33243364
if (size != NULL) {
33253365
*size = buflen;
33263366
}

Diff for: Python/fileutils.c

+106
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ extern int winerror_to_errno(int);
1818
#include <sys/ioctl.h>
1919
#endif
2020

21+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
22+
#include <iconv.h>
23+
#endif
24+
2125
#ifdef HAVE_FCNTL_H
2226
#include <fcntl.h>
2327
#endif /* HAVE_FCNTL_H */
@@ -93,6 +97,12 @@ _Py_device_encoding(int fd)
9397
static size_t
9498
is_valid_wide_char(wchar_t ch)
9599
{
100+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
101+
/* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
102+
for non-Unicode locales, which makes values higher than MAX_UNICODE
103+
possibly valid. */
104+
return 1;
105+
#endif
96106
if (Py_UNICODE_IS_SURROGATE(ch)) {
97107
// Reject lone surrogate characters
98108
return 0;
@@ -922,6 +932,102 @@ _Py_GetLocaleEncodingObject(void)
922932
return str;
923933
}
924934

935+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
936+
937+
/* Tell whether the current locale stores wchar_t values in a form other
   than Unicode code points.

   Oracle Solaris uses a non-Unicode internal wchar_t form for non-Unicode
   locales, so wide strings obtained there must be converted before they can
   be treated as Unicode. The locale counts as Unicode-compatible when its
   codeset is "UTF-8" or "646" (ISO/IEC 646, which corresponds to ASCII).

   Return 1 when conversion is needed, 0 otherwise (including when the
   codeset name is unavailable). */
int
_Py_LocaleUsesNonUnicodeWchar(void)
{
    const char *name = nl_langinfo(CODESET);

    if (name == NULL) {
        return 0;
    }
    if (strcmp(name, "UTF-8") == 0) {
        return 0;
    }
    if (strcmp(name, "646") == 0) {
        return 0;
    }
    return 1;
}
950+
951+
static wchar_t *
952+
_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
953+
const char *tocode, const char *fromcode)
954+
{
955+
Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
956+
957+
/* Ensure we won't overflow the size. */
958+
if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
959+
PyErr_NoMemory();
960+
return NULL;
961+
}
962+
963+
/* the string doesn't have to be NULL terminated */
964+
wchar_t* target = PyMem_Malloc(size * sizeof(wchar_t));
965+
if (target == NULL) {
966+
PyErr_NoMemory();
967+
return NULL;
968+
}
969+
970+
iconv_t cd = iconv_open(tocode, fromcode);
971+
if (cd == (iconv_t)-1) {
972+
PyErr_Format(PyExc_ValueError, "iconv_open() failed");
973+
PyMem_Free(target);
974+
return NULL;
975+
}
976+
977+
char *inbuf = (char *) source;
978+
char *outbuf = (char *) target;
979+
size_t inbytesleft = sizeof(wchar_t) * size;
980+
size_t outbytesleft = inbytesleft;
981+
982+
size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
983+
if (ret == DECODE_ERROR) {
984+
PyErr_Format(PyExc_ValueError, "iconv() failed");
985+
PyMem_Free(target);
986+
iconv_close(cd);
987+
return NULL;
988+
}
989+
990+
iconv_close(cd);
991+
return target;
992+
}
993+
994+
/* Convert a wide character string to the UCS-4 encoded string. This
995+
is necessary on systems where internal form of wchar_t are not Unicode
996+
code points (e.g. Oracle Solaris).
997+
998+
Return a pointer to a newly allocated string, use PyMem_Free() to free
999+
the memory. Return NULL and raise exception on conversion or memory
1000+
allocation error. */
1001+
wchar_t *
1002+
_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
1003+
{
1004+
return _Py_ConvertWCharForm(native, size, "UCS-4-INTERNAL", "wchar_t");
1005+
}
1006+
1007+
/* Convert a UCS-4 encoded string to native wide character string. This
1008+
is necessary on systems where internal form of wchar_t are not Unicode
1009+
code points (e.g. Oracle Solaris).
1010+
1011+
The conversion is done in place. This can be done because both wchar_t
1012+
and UCS-4 use 4-byte encoding, and one wchar_t symbol always correspond
1013+
to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
1014+
which is currently the only system using these functions; it doesn't have
1015+
to be for other systems).
1016+
1017+
Return 0 on success. Return -1 and raise exception on conversion
1018+
or memory allocation error. */
1019+
int
1020+
_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
1021+
{
1022+
wchar_t* result = _Py_ConvertWCharForm(unicode, size, "wchar_t", "UCS-4-INTERNAL");
1023+
if (!result) {
1024+
return -1;
1025+
}
1026+
memcpy(unicode, result, size * sizeof(wchar_t));
1027+
PyMem_Free(result);
1028+
return 0;
1029+
}
1030+
#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
9251031

9261032
#ifdef MS_WINDOWS
9271033
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */

Diff for: configure

+16
Original file line numberDiff line numberDiff line change
@@ -15264,6 +15264,22 @@ else
1526415264
$as_echo "no" >&6; }
1526515265
fi
1526615266

15267+
case $ac_sys_system/$ac_sys_release in
15268+
SunOS/*)
15269+
if test -f /etc/os-release; then
15270+
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
15271+
if test "x$OS_NAME" = "xOracle Solaris"; then
15272+
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
15273+
# non-Unicode locales is not Unicode and hence cannot be used directly.
15274+
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
15275+
15276+
$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
15277+
15278+
fi
15279+
fi
15280+
;;
15281+
esac
15282+
1526715283
# check for endianness
1526815284
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
1526915285
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }

Diff for: configure.ac

+16
Original file line numberDiff line numberDiff line change
@@ -4765,6 +4765,22 @@ else
47654765
AC_MSG_RESULT(no)
47664766
fi
47674767

4768+
case $ac_sys_system/$ac_sys_release in
4769+
SunOS/*)
4770+
if test -f /etc/os-release; then
4771+
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
4772+
if test "x$OS_NAME" = "xOracle Solaris"; then
4773+
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
4774+
# non-Unicode locales is not Unicode and hence cannot be used directly.
4775+
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
4776+
AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
4777+
[Define if the internal form of wchar_t in non-Unicode locales
4778+
is not Unicode.])
4779+
fi
4780+
fi
4781+
;;
4782+
esac
4783+
47684784
# check for endianness
47694785
AC_C_BIGENDIAN
47704786

Diff for: pyconfig.h.in

+4
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,10 @@
748748
/* Define to 1 if you have the `nice' function. */
749749
#undef HAVE_NICE
750750

751+
/* Define if the internal form of wchar_t in non-Unicode locales is not
752+
Unicode. */
753+
#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
754+
751755
/* Define to 1 if you have the `openat' function. */
752756
#undef HAVE_OPENAT
753757

0 commit comments

Comments
 (0)