pythongh-106242: Fix path truncation in os.path.normpath (pythonGH-106816)

finnagin · zooba · commit ae5e111e283f · 2023-08-15T16:38:44.000+01:00
diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
@@ -244,7 +244,8 @@ extern int _Py_add_relfile(wchar_t *dirname,
                            const wchar_t *relfile,
                            size_t bufsize);
 extern size_t _Py_find_basename(const wchar_t *filename);
-PyAPI_FUNC(wchar_t *) _Py_normpath(wchar_t *path, Py_ssize_t size);
+PyAPI_FUNC(wchar_t*) _Py_normpath(wchar_t *path, Py_ssize_t size);
+extern wchar_t *_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *length);
 
 
 // Macros to protect CRT calls against instant termination when passed an
diff --git a/Lib/test/test_genericpath.py b/Lib/test/test_genericpath.py
@@ -460,6 +460,10 @@ def test_normpath_issue5827(self):
         for path in ('', '.', '/', '\\', '///foo/.//bar//'):
             self.assertIsInstance(self.pathmodule.normpath(path), str)
 
+    def test_normpath_issue106242(self):  # embedded NULs must survive
+        for path in ('\x00', 'foo\x00bar', '\x00\x00', '\x00foo', 'foo\x00'):
+            self.assertEqual(self.pathmodule.normpath(path), path)
+
     def test_abspath_issue3426(self):
         # Check that abspath returns unicode when the arg is unicode
         # with both ASCII and non-ASCII cwds.
diff --git a/Misc/NEWS.d/next/Library/2023-08-14-23-11-11.gh-issue-106242.71HMym.rst b/Misc/NEWS.d/next/Library/2023-08-14-23-11-11.gh-issue-106242.71HMym.rst
@@ -0,0 +1 @@
+Fixes :func:`os.path.normpath` to handle embedded null characters without truncating the path.
diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c
@@ -4552,7 +4552,9 @@ os__path_normpath_impl(PyObject *module, PyObject *path)
     if (!buffer) {
         return NULL;
     }
-    PyObject *result = PyUnicode_FromWideChar(_Py_normpath(buffer, len), -1);
+    Py_ssize_t norm_len;
+    wchar_t *norm_path = _Py_normpath_and_size(buffer, len, &norm_len);
+    PyObject *result = PyUnicode_FromWideChar(norm_path, norm_len);
     PyMem_Free(buffer);
     return result;
 }
diff --git a/Python/fileutils.c b/Python/fileutils.c
@@ -2179,12 +2179,14 @@ _Py_find_basename(const wchar_t *filename)
    path, which will be within the original buffer. Guaranteed to not
    make the path longer, and will not fail. 'size' is the length of
    the path, if known. If -1, the first null character will be assumed
-   to be the end of the path. */
+   to be the end of the path. 'normsize' will be set to contain the
+   length of the resulting normalized path. */
 wchar_t *
-_Py_normpath(wchar_t *path, Py_ssize_t size)
+_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *normsize)
 {
     assert(path != NULL);
-    if (!path[0] || size == 0) {
+    if (!path[0] && size < 0 || size == 0) {
+        *normsize = 0;
         return path;
     }
     wchar_t *pEnd = size >= 0 ? &path[size] : NULL;
@@ -2233,11 +2235,7 @@ _Py_normpath(wchar_t *path, Py_ssize_t size)
                 *p2++ = lastC = *p1;
             }
         }
-        if (sepCount) {
-            minP2 = p2;      // Invalid path
-        } else {
-            minP2 = p2 - 1;  // Absolute path has SEP at minP2
-        }
+        minP2 = p2 - 1;
     }
 #else
     // Skip past two leading SEPs
@@ -2297,13 +2295,28 @@ _Py_normpath(wchar_t *path, Py_ssize_t size)
         while (--p2 != minP2 && *p2 == SEP) {
             *p2 = L'\0';
         }
+    } else {
+        --p2;
     }
+    *normsize = p2 - path + 1;
 #undef SEP_OR_END
 #undef IS_SEP
 #undef IS_END
     return path;
 }
 
+/* In-place path normalisation: wraps _Py_normpath_and_size() and
+   discards the computed length. Returns the start of the normalized
+   path, which lies within the original buffer; the path never grows
+   and the call cannot fail. 'size' is the path length if known; if -1,
+   the first null character is taken as the end of the path. */
+wchar_t *
+_Py_normpath(wchar_t *path, Py_ssize_t size)
+{
+    Py_ssize_t norm_length;
+    return _Py_normpath_and_size(path, size, &norm_length);
+}
+
 
 /* Get the current directory. buflen is the buffer size in wide characters
    including the null character. Decode the path from the locale encoding.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Fixes :func:`os.path.normpath` to handle embedded null characters without truncating the path.
Original file line number	Diff line number	Diff line change
`@@ -4552,7 +4552,9 @@ os__path_normpath_impl(PyObject *module, PyObject *path)`
`4552`	`4552`	`if (!buffer) {`
`4553`	`4553`	`return NULL;`
`4554`	`4554`	`}`
`4555`		`- PyObject *result = PyUnicode_FromWideChar(_Py_normpath(buffer, len), -1);`
	`4555`	`+ Py_ssize_t norm_len;`
	`4556`	`+ wchar_t *norm_path = _Py_normpath_and_size(buffer, len, &norm_len);`
	`4557`	`+ PyObject *result = PyUnicode_FromWideChar(norm_path, norm_len);`
`4556`	`4558`	`PyMem_Free(buffer);`
`4557`	`4559`	`return result;`
`4558`	`4560`	`}`