Add OsStr::to_str_split() and OsString::into_string_split().

jmillikin · jmillikin · commit 0f93dae98b37 · 2023-05-01T22:01:12.000+09:00
diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs
@@ -178,6 +178,34 @@ impl OsString {
         self.inner.into_string().map_err(|buf| OsString { inner: buf })
     }
 
+    /// Splits the `OsString` into its longest Unicode prefix and the rest.
+    ///
+    /// The returned `String` is the longest prefix of the `OsString` that
+    /// is valid Unicode. The returned `OsString` holds whatever follows it
+    /// and is empty when the whole value is valid Unicode.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(osstr_str_prefix_ops)]
+    ///
+    /// use std::ffi::OsString;
+    ///
+    /// let os_string = OsString::from("foo");
+    /// let (prefix, suffix) = os_string.clone().into_string_split();
+    ///
+    /// let mut rejoined = OsString::from(prefix);
+    /// rejoined.push(suffix);
+    /// assert_eq!(rejoined, os_string);
+    /// ```
+    #[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
+    #[must_use]
+    #[inline]
+    pub fn into_string_split(self) -> (String, OsString) {
+        let (valid, rest) = self.inner.into_string_split();
+        (valid, OsString { inner: rest })
+    }
+
     /// Extends the string with the given <code>&[OsStr]</code> slice.
     ///
     /// # Examples
@@ -703,6 +731,34 @@ impl OsStr {
         self.inner.to_str()
     }
 
+    /// Splits the `OsStr` into its longest Unicode prefix and the rest.
+    ///
+    /// The returned `str` is the longest prefix of the `OsStr` that is
+    /// valid Unicode. The returned `OsStr` borrows whatever follows it and
+    /// is empty when the whole value is valid Unicode.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(osstr_str_prefix_ops)]
+    ///
+    /// use std::ffi::{OsStr, OsString};
+    ///
+    /// let os_str = OsStr::new("foo");
+    /// let (prefix, suffix) = os_str.to_str_split();
+    ///
+    /// let mut rejoined = OsString::from(prefix);
+    /// rejoined.push(suffix);
+    /// assert_eq!(rejoined, os_str);
+    /// ```
+    #[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
+    #[must_use]
+    #[inline]
+    pub fn to_str_split(&self) -> (&str, &OsStr) {
+        let (valid, rest) = self.inner.to_str_split();
+        (valid, Self::from_inner(rest))
+    }
+
     /// Converts an `OsStr` to a <code>[Cow]<[str]></code>.
     ///
     /// Any non-Unicode sequences are replaced with
diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs
@@ -164,6 +164,27 @@ impl Buf {
         String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
     }
 
+    /// Splits the buffer into its longest valid UTF-8 prefix and the rest.
+    pub fn into_string_split(self) -> (String, Buf) {
+        // `String::from_utf8` validates once and hands the bytes back on
+        // failure, so the all-valid case needs no `unsafe` conversion.
+        let err = match String::from_utf8(self.inner) {
+            Ok(prefix) => return (prefix, Buf { inner: Vec::new() }),
+            Err(err) => err,
+        };
+        let utf8_len = err.utf8_error().valid_up_to();
+        let mut utf8_bytes = err.into_bytes();
+        if utf8_len == 0 {
+            // Nothing is valid: hand the buffer back without reallocating.
+            return (String::new(), Buf { inner: utf8_bytes });
+        }
+        let rem_bytes = utf8_bytes.split_off(utf8_len);
+        // SAFETY: `Utf8Error::valid_up_to()` returns an index up to which
+        // valid UTF-8 has been verified.
+        let prefix = unsafe { String::from_utf8_unchecked(utf8_bytes) };
+        (prefix, Buf { inner: rem_bytes })
+    }
+
     pub fn push_slice(&mut self, s: &Slice) {
         self.inner.extend_from_slice(&s.inner)
     }
@@ -205,6 +226,21 @@ impl Slice {
         str::from_utf8(&self.inner).ok()
     }
 
+    /// Borrows the longest valid UTF-8 prefix and the remaining bytes.
+    pub fn to_str_split(&self) -> (&str, &Slice) {
+        let valid_len = match str::from_utf8(&self.inner) {
+            Ok(whole) => return (whole, Slice::from_u8_slice(b"")),
+            Err(err) => err.valid_up_to(),
+        };
+        if valid_len == 0 {
+            return ("", self);
+        }
+        let (head, tail) = self.inner.split_at(valid_len);
+        // SAFETY: `valid_up_to()` bytes have been verified as valid UTF-8.
+        let prefix = unsafe { str::from_utf8_unchecked(head) };
+        (prefix, Slice::from_u8_slice(tail))
+    }
+
     pub fn to_string_lossy(&self) -> Cow<'_, str> {
         String::from_utf8_lossy(&self.inner)
     }
diff --git a/library/std/src/sys/unix/os_str/tests.rs b/library/std/src/sys/unix/os_str/tests.rs
@@ -16,3 +16,37 @@ fn display() {
         Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
     );
 }
+
+#[test]
+fn buf_into_string_split() {
+    let text = "héllô wørld";
+    let mut buf = Buf::from_string(String::from(text));
+
+    // Entirely valid UTF-8: everything lands in the prefix.
+    let (prefix, suffix) = buf.clone().into_string_split();
+    assert_eq!(prefix, String::from(text));
+    assert_eq!(suffix.into_inner(), Vec::new());
+
+    // A trailing invalid byte is split off into the suffix.
+    buf.push_slice(Slice::from_u8_slice(b"\xFF"));
+    let (prefix, suffix) = buf.clone().into_string_split();
+    assert_eq!(prefix, String::from(text));
+    assert_eq!(suffix.into_inner(), vec![0xFF]);
+}
+
+#[test]
+fn slice_to_str_split() {
+    let text = "héllô wørld";
+    let mut buf = Buf::from_string(String::from(text));
+
+    // Entirely valid UTF-8: the suffix is empty.
+    let (prefix, suffix) = buf.as_slice().to_str_split();
+    assert_eq!(prefix, text);
+    assert_eq!(&suffix.inner, b"");
+
+    // A trailing invalid byte ends up in the suffix.
+    buf.push_slice(Slice::from_u8_slice(b"\xFF"));
+    let (prefix, suffix) = buf.as_slice().to_str_split();
+    assert_eq!(prefix, text);
+    assert_eq!(&suffix.inner, b"\xFF");
+}
diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs
@@ -98,6 +98,11 @@ impl Buf {
         self.inner.into_string().map_err(|buf| Buf { inner: buf })
     }
 
+    pub fn into_string_split(self) -> (String, Buf) {
+        let (utf8, rest) = self.inner.into_string_split();
+        (utf8, Buf { inner: rest })
+    }
+
     pub fn push_slice(&mut self, s: &Slice) {
         self.inner.push_wtf8(&s.inner)
     }
@@ -159,6 +164,15 @@ impl Slice {
         self.inner.as_str()
     }
 
+    /// Borrows the longest valid UTF-8 prefix and the remaining WTF-8.
+    pub fn to_str_split(&self) -> (&str, &Slice) {
+        let (prefix, suffix) = self.inner.to_str_split();
+        // SAFETY: relies on `Slice` being a transparent wrapper around its
+        // single `Wtf8` field, so `&Wtf8` and `&Slice` share a layout.
+        // NOTE(review): the `Slice` definition is outside this diff — confirm.
+        (prefix, unsafe { crate::mem::transmute(suffix) })
+    }
+
     pub fn to_string_lossy(&self) -> Cow<'_, str> {
         self.inner.to_string_lossy()
     }
diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
@@ -441,6 +441,42 @@ impl Wtf8Buf {
         }
     }
 
+    /// Consumes the WTF-8 string and converts it to a (UTF-8, WTF-8) pair.
+    ///
+    /// The first element of the return value is the longest prefix of valid
+    /// UTF-8, with the second element being the remainder.
+    ///
+    /// No data is copied unless both halves are non-empty, in which case
+    /// the prefix keeps the original allocation and the remainder is copied
+    /// into a new one.
+    pub fn into_string_split(self) -> (String, Wtf8Buf) {
+        // Known UTF-8 cannot contain surrogates, so skip the scan entirely.
+        let surrogate_pos = if self.is_known_utf8 {
+            None
+        } else {
+            self.next_surrogate(0).map(|(pos, _)| pos)
+        };
+
+        let split_at = match surrogate_pos {
+            None => {
+                // SAFETY: Well-formed WTF-8 that contains no surrogates is
+                // also well-formed UTF-8.
+                let utf8 = unsafe { String::from_utf8_unchecked(self.bytes) };
+                return (utf8, Wtf8Buf::new());
+            }
+            Some(0) => return (String::new(), self),
+            Some(pos) => pos,
+        };
+
+        let mut utf8_bytes = self.bytes;
+        let wtf8_bytes = utf8_bytes.split_off(split_at);
+        // SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
+        // surrogates, and well-formed WTF-8 that contains no surrogates is
+        // also well-formed UTF-8.
+        let utf8 = unsafe { String::from_utf8_unchecked(utf8_bytes) };
+        (utf8, Wtf8Buf { bytes: wtf8_bytes, is_known_utf8: false })
+    }
+
     /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
     #[inline]
     pub fn into_box(self) -> Box<Wtf8> {
@@ -664,6 +700,38 @@ impl Wtf8 {
         }
     }
 
+    /// Losslessly splits a WTF-8 string into a (UTF-8, WTF-8) pair.
+    ///
+    /// This does not copy the data.
+    ///
+    /// The first element of the return value is the longest prefix of valid
+    /// UTF-8, with the second element being the remainder.
+    pub fn to_str_split(&self) -> (&str, &Wtf8) {
+        match self.next_surrogate(0) {
+            // No surrogate anywhere: the whole string is the prefix.
+            None => {
+                // SAFETY: Well-formed WTF-8 that contains no surrogates is
+                // also well-formed UTF-8.
+                let whole = unsafe { str::from_utf8_unchecked(&self.bytes) };
+                (whole, Wtf8::from_str(""))
+            }
+            // A leading surrogate leaves nothing for the UTF-8 prefix.
+            Some((0, _)) => ("", self),
+            // Otherwise split immediately before the first surrogate.
+            Some((pos, _)) => {
+                let (head, tail) = self.bytes.split_at(pos);
+                // SAFETY: `head` is a prefix of a WTF-8 value that contains
+                // no surrogates, and well-formed WTF-8 that contains no
+                // surrogates is also well-formed UTF-8.
+                unsafe {
+                    let utf8 = str::from_utf8_unchecked(head);
+                    let wtf8 = Wtf8::from_bytes_unchecked(tail);
+                    (utf8, wtf8)
+                }
+            }
+        }
+    }
+
     /// Converts the WTF-8 string to potentially ill-formed UTF-16
     /// and return an iterator of 16-bit code units.
     ///
diff --git a/library/std/src/sys_common/wtf8/tests.rs b/library/std/src/sys_common/wtf8/tests.rs
@@ -352,6 +352,26 @@ fn wtf8buf_into_string_lossy() {
     assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
 }
 
+#[test]
+fn wtf8buf_into_string_split() {
+    // is_known_utf8
+    let mut buf = Wtf8Buf::from_str("aé");
+    let split = buf.clone().into_string_split();
+    assert_eq!(split, (String::from("aé"), Wtf8Buf::new()));
+
+    // !is_known_utf8, next_surrogate(0).is_none()
+    buf.push_char(' ');
+    buf.push(CodePoint::from_u32(0xD83D).unwrap());
+    buf.push(CodePoint::from_u32(0xDCA9).unwrap());
+    let split = buf.clone().into_string_split();
+    assert_eq!(split, (String::from("aé 💩"), Wtf8Buf::new()));
+
+    // !is_known_utf8, next_surrogate(0).is_some()
+    buf.push(CodePoint::from_u32(0xD800).unwrap());
+    let split = buf.clone().into_string_split();
+    assert_eq!(split, (String::from("aé 💩"), Wtf8Buf::from_wide(&[0xD800])));
+}
+
 #[test]
 fn wtf8buf_from_iterator() {
     fn f(values: &[u32]) -> Wtf8Buf {
@@ -538,6 +558,20 @@ fn wtf8_to_string_lossy() {
     assert_eq!(string.to_string_lossy(), expected);
 }
 
+#[test]
+fn wtf8_to_str_split() {
+    // next_surrogate(0).is_none()
+    let mut buf = Wtf8Buf::from_str("aé 💩");
+    let split = buf.as_slice().to_str_split();
+    assert_eq!(split, ("aé 💩", Wtf8::from_str("")));
+
+    // next_surrogate(0).is_some()
+    buf.push(CodePoint::from_u32(0xD800).unwrap());
+    let lone_surrogate = Wtf8Buf::from_wide(&[0xD800]);
+    let split = buf.as_slice().to_str_split();
+    assert_eq!(split, ("aé 💩", lone_surrogate.as_slice()));
+}
+
 #[test]
 fn wtf8_display() {
     fn d(b: &[u8]) -> String {

Original file line number	Diff line number	Diff line change
`@@ -98,6 +98,11 @@ impl Buf {`
`98`	`98`	`self.inner.into_string().map_err(\|buf\| Buf { inner: buf })`
`99`	`99`	`}`
`100`	`100`
	`101`	`+ pub fn into_string_split(self) -> (String, Buf) {`
	`102`	`+ let (prefix, suffix) = self.inner.into_string_split();`
	`103`	`+ (prefix, Buf { inner: suffix })`
	`104`	`+ }`
	`105`	`+`
`101`	`106`	`pub fn push_slice(&mut self, s: &Slice) {`
`102`	`107`	`self.inner.push_wtf8(&s.inner)`
`103`	`108`	`}`
`@@ -159,6 +164,11 @@ impl Slice {`
`159`	`164`	`self.inner.as_str()`
`160`	`165`	`}`
`161`	`166`
	`167`	`+ pub fn to_str_split(&self) -> (&str, &Slice) {`
	`168`	`+ let (prefix, suffix) = self.inner.to_str_split();`
	`169`	`+ (prefix, Slice { inner: suffix })`
	`170`	`+ }`
	`171`	`+`
`162`	`172`	`pub fn to_string_lossy(&self) -> Cow<'_, str> {`
`163`	`173`	`self.inner.to_string_lossy()`
`164`	`174`	`}`