Skip to content

Commit 0f93dae

Browse files
committed
Add OsStr::to_str_split() and OsString::into_string_split().
1 parent 9ecda8d commit 0f93dae

File tree

6 files changed

+238
-0
lines changed

6 files changed

+238
-0
lines changed

library/std/src/ffi/os_str.rs

+56
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,34 @@ impl OsString {
178178
self.inner.into_string().map_err(|buf| OsString { inner: buf })
179179
}
180180

181+
/// Splits the `OsString` into a Unicode prefix and non-Unicode suffix.
182+
///
183+
/// The returned `String` is the longest prefix of the `OsString` that
184+
/// contained valid Unicode. The returned `OsString` is the rest of the
185+
/// original value.
186+
///
187+
/// # Examples
188+
///
189+
/// ```
190+
/// #![feature(osstr_str_prefix_ops)]
191+
///
192+
/// use std::ffi::OsString;
193+
///
194+
/// let os_string = OsString::from("foo");
195+
/// let (prefix, suffix) = os_string.clone().into_string_split();
196+
///
197+
/// let mut rejoined = OsString::from(prefix);
198+
/// rejoined.push(suffix);
199+
/// assert_eq!(rejoined, os_string);
200+
/// ```
201+
#[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
202+
#[must_use]
203+
#[inline]
204+
pub fn into_string_split(self) -> (String, OsString) {
205+
let (prefix, suffix) = self.inner.into_string_split();
206+
(prefix, OsString { inner: suffix })
207+
}
208+
181209
/// Extends the string with the given <code>&[OsStr]</code> slice.
182210
///
183211
/// # Examples
@@ -703,6 +731,34 @@ impl OsStr {
703731
self.inner.to_str()
704732
}
705733

734+
/// Splits the `OsStr` into a Unicode prefix and non-Unicode suffix.
735+
///
736+
/// The returned `str` is the longest prefix of the `OsStr` that
737+
/// contained valid Unicode. The returned `OsStr` is the rest of the
738+
/// original value.
739+
///
740+
/// # Examples
741+
///
742+
/// ```
743+
/// #![feature(osstr_str_prefix_ops)]
744+
///
745+
/// use std::ffi::{OsStr, OsString};
746+
///
747+
/// let os_str = OsStr::new("foo");
748+
/// let (prefix, suffix) = os_str.to_str_split();
749+
///
750+
/// let mut rejoined = OsString::from(prefix);
751+
/// rejoined.push(suffix);
752+
/// assert_eq!(rejoined, os_str);
753+
/// ```
754+
#[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
755+
#[must_use]
756+
#[inline]
757+
pub fn to_str_split(&self) -> (&str, &OsStr) {
758+
let (prefix, suffix) = self.inner.to_str_split();
759+
(prefix, Self::from_inner(suffix))
760+
}
761+
706762
/// Converts an `OsStr` to a <code>[Cow]<[str]></code>.
707763
///
708764
/// Any non-Unicode sequences are replaced with

library/std/src/sys/unix/os_str.rs

+36
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,27 @@ impl Buf {
164164
String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
165165
}
166166

167+
pub fn into_string_split(self) -> (String, Buf) {
168+
let utf8_err = match str::from_utf8(&self.inner) {
169+
Ok(_) => {
170+
// SAFETY: If `str::from_utf8()` succeeds then the input is UTF-8.
171+
let prefix = unsafe { String::from_utf8_unchecked(self.inner) };
172+
return (prefix, Buf { inner: Vec::new() });
173+
}
174+
Err(err) => err,
175+
};
176+
let utf8_len = utf8_err.valid_up_to();
177+
if utf8_len == 0 {
178+
return (String::new(), self);
179+
}
180+
let mut utf8_bytes = self.inner;
181+
let rem_bytes = utf8_bytes.split_off(utf8_len);
182+
// SAFETY: `Utf8Error::valid_up_to()` returns an index up to which
183+
// valid UTF-8 has been verified.
184+
let prefix = unsafe { String::from_utf8_unchecked(utf8_bytes) };
185+
(prefix, Buf { inner: rem_bytes })
186+
}
187+
167188
pub fn push_slice(&mut self, s: &Slice) {
168189
self.inner.extend_from_slice(&s.inner)
169190
}
@@ -205,6 +226,21 @@ impl Slice {
205226
str::from_utf8(&self.inner).ok()
206227
}
207228

229+
/// Borrows the slice as its longest valid UTF-8 prefix plus the bytes that
/// follow that prefix.
///
/// Returns `(prefix, rest)`. `rest` is an empty slice when every byte is
/// valid UTF-8, and is `self` when no valid UTF-8 precedes the first error.
pub fn to_str_split(&self) -> (&str, &Slice) {
    let valid_len = match str::from_utf8(&self.inner) {
        Ok(whole) => return (whole, Slice::from_u8_slice(b"")),
        Err(err) => err.valid_up_to(),
    };
    if valid_len == 0 {
        return ("", self);
    }
    let (head, tail) = self.inner.split_at(valid_len);
    // SAFETY: `Utf8Error::valid_up_to()` returns an index up to which
    // valid UTF-8 has been verified, and `head` ends exactly at that index.
    let head = unsafe { str::from_utf8_unchecked(head) };
    (head, Slice::from_u8_slice(tail))
}
243+
208244
pub fn to_string_lossy(&self) -> Cow<'_, str> {
209245
String::from_utf8_lossy(&self.inner)
210246
}

library/std/src/sys/unix/os_str/tests.rs

+34
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,37 @@ fn display() {
1616
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
1717
);
1818
}
19+
20+
#[test]
fn buf_into_string_split() {
    // Entirely valid UTF-8: everything ends up in the prefix.
    let mut string = Buf::from_string(String::from("héllô wørld"));
    {
        let (prefix, suffix) = string.clone().into_string_split();
        assert_eq!(prefix, String::from("héllô wørld"));
        assert_eq!(suffix.into_inner(), Vec::<u8>::new());
    }

    // Valid text followed by a single invalid byte.
    string.push_slice(Slice::from_u8_slice(b"\xFF"));
    {
        let (prefix, suffix) = string.clone().into_string_split();
        assert_eq!(prefix, String::from("héllô wørld"));
        assert_eq!(suffix.into_inner(), vec![0xFF]);
    }

    // Invalid byte in the middle: the split happens at the first invalid
    // byte, so valid text after it stays in the suffix.
    string.push_slice(Slice::from_u8_slice(b"abc"));
    {
        let (prefix, suffix) = string.clone().into_string_split();
        assert_eq!(prefix, String::from("héllô wørld"));
        assert_eq!(suffix.into_inner(), b"\xFFabc".to_vec());
    }

    // Invalid first byte: exercises the empty-prefix early return.
    let mut invalid_first = Buf::from_string(String::new());
    invalid_first.push_slice(Slice::from_u8_slice(b"\xFFabc"));
    {
        let (prefix, suffix) = invalid_first.into_string_split();
        assert_eq!(prefix, String::new());
        assert_eq!(suffix.into_inner(), b"\xFFabc".to_vec());
    }

    // Empty input: both halves are empty.
    {
        let (prefix, suffix) = Buf::from_string(String::new()).into_string_split();
        assert_eq!(prefix, String::new());
        assert_eq!(suffix.into_inner(), Vec::<u8>::new());
    }
}
36+
37+
#[test]
fn slice_to_str_split() {
    // Entirely valid UTF-8: everything ends up in the prefix.
    let mut string = Buf::from_string(String::from("héllô wørld"));
    {
        let (prefix, suffix) = string.as_slice().to_str_split();
        assert_eq!(prefix, "héllô wørld");
        assert_eq!(&suffix.inner, b"");
    }

    // Valid text followed by a single invalid byte.
    string.push_slice(Slice::from_u8_slice(b"\xFF"));
    {
        let (prefix, suffix) = string.as_slice().to_str_split();
        // `prefix` is a `&str`, so compare against a literal rather than
        // allocating a `String` just for the assertion.
        assert_eq!(prefix, "héllô wørld");
        assert_eq!(&suffix.inner, b"\xFF");
    }

    // Invalid byte in the middle: the split happens at the first invalid
    // byte, so valid text after it stays in the suffix.
    string.push_slice(Slice::from_u8_slice(b"abc"));
    {
        let (prefix, suffix) = string.as_slice().to_str_split();
        assert_eq!(prefix, "héllô wørld");
        assert_eq!(&suffix.inner, b"\xFFabc");
    }

    // Invalid first byte: exercises the empty-prefix early return.
    {
        let (prefix, suffix) = Slice::from_u8_slice(b"\xFFabc").to_str_split();
        assert_eq!(prefix, "");
        assert_eq!(&suffix.inner, b"\xFFabc");
    }

    // Empty input: both halves are empty.
    {
        let (prefix, suffix) = Slice::from_u8_slice(b"").to_str_split();
        assert_eq!(prefix, "");
        assert_eq!(&suffix.inner, b"");
    }
}

library/std/src/sys/windows/os_str.rs

+10
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ impl Buf {
9898
self.inner.into_string().map_err(|buf| Buf { inner: buf })
9999
}
100100

101+
pub fn into_string_split(self) -> (String, Buf) {
102+
let (prefix, suffix) = self.inner.into_string_split();
103+
(prefix, Buf { inner: suffix })
104+
}
105+
101106
pub fn push_slice(&mut self, s: &Slice) {
102107
self.inner.push_wtf8(&s.inner)
103108
}
@@ -159,6 +164,11 @@ impl Slice {
159164
self.inner.as_str()
160165
}
161166

167+
pub fn to_str_split(&self) -> (&str, &Slice) {
168+
let (prefix, suffix) = self.inner.to_str_split();
169+
(prefix, Slice { inner: suffix })
170+
}
171+
162172
pub fn to_string_lossy(&self) -> Cow<'_, str> {
163173
self.inner.to_string_lossy()
164174
}

library/std/src/sys_common/wtf8.rs

+68
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,42 @@ impl Wtf8Buf {
441441
}
442442
}
443443

444+
/// Consumes the WTF-8 string and converts it to a (UTF-8, WTF-8) pair.
445+
///
446+
/// This does not copy the data.
447+
///
448+
/// The first element of the return value is the longest prefix of valid
449+
/// UTF-8, with the second element being the remainder.
450+
pub fn into_string_split(self) -> (String, Wtf8Buf) {
451+
if self.is_known_utf8 {
452+
// SAFETY: The inner value is known to be UTF-8.
453+
let utf8 = unsafe { String::from_utf8_unchecked(self.bytes) };
454+
return (utf8, Wtf8Buf::new());
455+
}
456+
457+
let surrogate_pos = match self.next_surrogate(0) {
458+
None => {
459+
// SAFETY: Well-formed WTF-8 that contains no surrogates is
460+
// also well-formed UTF-8.
461+
let utf8 = unsafe { String::from_utf8_unchecked(self.bytes) };
462+
return (utf8, Wtf8Buf::new());
463+
}
464+
Some((surrogate_pos, _)) => surrogate_pos,
465+
};
466+
467+
if surrogate_pos == 0 {
468+
return (String::new(), self);
469+
}
470+
471+
let mut utf8_bytes = self.bytes;
472+
let wtf8_bytes = utf8_bytes.split_off(surrogate_pos);
473+
// SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
474+
// surrogates, and well-formed WTF-8 that contains no surrogates is
475+
// also well-formed UTF-8.
476+
let utf8 = unsafe { String::from_utf8_unchecked(utf8_bytes) };
477+
(utf8, Wtf8Buf { bytes: wtf8_bytes, is_known_utf8: false })
478+
}
479+
444480
/// Converts this `Wtf8Buf` into a boxed `Wtf8`.
445481
#[inline]
446482
pub fn into_box(self) -> Box<Wtf8> {
@@ -664,6 +700,38 @@ impl Wtf8 {
664700
}
665701
}
666702

703+
/// Losslessly split a WTF-8 string into to a (UTF-8, WTF-8) pair.
704+
///
705+
/// This does not copy the data.
706+
///
707+
/// The first element of the return value is the longest prefix of valid
708+
/// UTF-8, with the second element being the remainder.
709+
pub fn to_str_split(&self) -> (&str, &Wtf8) {
710+
let surrogate_pos = match self.next_surrogate(0) {
711+
None => {
712+
// SAFETY: Well-formed WTF-8 that contains no surrogates is
713+
// also well-formed UTF-8.
714+
let utf8 = unsafe { str::from_utf8_unchecked(&self.bytes) };
715+
return (utf8, Wtf8::from_str(""));
716+
}
717+
Some((surrogate_pos, _)) => surrogate_pos,
718+
};
719+
720+
if surrogate_pos == 0 {
721+
return ("", self);
722+
}
723+
724+
let (utf8_bytes, wtf8_bytes) = self.bytes.split_at(surrogate_pos);
725+
// SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
726+
// surrogates, and well-formed WTF-8 that contains no surrogates is
727+
// also well-formed UTF-8.
728+
unsafe {
729+
let utf8 = str::from_utf8_unchecked(utf8_bytes);
730+
let wtf8 = Wtf8::from_bytes_unchecked(wtf8_bytes);
731+
(utf8, wtf8)
732+
}
733+
}
734+
667735
/// Converts the WTF-8 string to potentially ill-formed UTF-16
668736
/// and return an iterator of 16-bit code units.
669737
///

library/std/src/sys_common/wtf8/tests.rs

+34
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,26 @@ fn wtf8buf_into_string_lossy() {
352352
assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
353353
}
354354

355+
#[test]
fn wtf8buf_into_string_split() {
    // is_known_utf8
    let mut string = Wtf8Buf::from_str("aé");
    assert_eq!(string.clone().into_string_split(), (String::from("aé"), Wtf8Buf::new()));

    // !is_known_utf8, next_surrogate(0).is_none()
    // (the two pushed surrogates pair up into a single supplementary code point)
    string.push_char(' ');
    string.push(CodePoint::from_u32(0xD83D).unwrap());
    string.push(CodePoint::from_u32(0xDCA9).unwrap());
    assert_eq!(string.clone().into_string_split(), (String::from("aé 💩"), Wtf8Buf::new()));

    // !is_known_utf8, next_surrogate(0).is_some()
    string.push(CodePoint::from_u32(0xD800).unwrap());
    assert_eq!(
        string.clone().into_string_split(),
        (String::from("aé 💩"), Wtf8Buf::from_wide(&[0xD800])),
    );

    // Valid text after the surrogate stays in the remainder.
    string.push_char('b');
    assert_eq!(
        string.clone().into_string_split(),
        (String::from("aé 💩"), Wtf8Buf::from_wide(&[0xD800, 0x62])),
    );

    // Surrogate at position 0: empty prefix, whole value returned as the remainder.
    let leading_surrogate = Wtf8Buf::from_wide(&[0xD800, 0x61]);
    assert_eq!(
        leading_surrogate.clone().into_string_split(),
        (String::new(), Wtf8Buf::from_wide(&[0xD800, 0x61])),
    );
}
374+
355375
#[test]
356376
fn wtf8buf_from_iterator() {
357377
fn f(values: &[u32]) -> Wtf8Buf {
@@ -538,6 +558,20 @@ fn wtf8_to_string_lossy() {
538558
assert_eq!(string.to_string_lossy(), expected);
539559
}
540560

561+
#[test]
fn wtf8_to_str_split() {
    // next_surrogate(0).is_none()
    let mut string = Wtf8Buf::from_str("aé 💩");
    assert_eq!(string.as_slice().to_str_split(), ("aé 💩", Wtf8::from_str("")));

    // next_surrogate(0).is_some()
    string.push(CodePoint::from_u32(0xD800).unwrap());
    assert_eq!(
        string.as_slice().to_str_split(),
        ("aé 💩", Wtf8Buf::from_wide(&[0xD800]).as_slice()),
    );

    // Valid text after the surrogate stays in the remainder.
    string.push_char('b');
    let expected_rest = Wtf8Buf::from_wide(&[0xD800, 0x62]);
    assert_eq!(string.as_slice().to_str_split(), ("aé 💩", expected_rest.as_slice()));

    // Surrogate at position 0: empty prefix, whole value returned as the remainder.
    let leading_surrogate = Wtf8Buf::from_wide(&[0xD800, 0x61]);
    assert_eq!(
        leading_surrogate.as_slice().to_str_split(),
        ("", leading_surrogate.as_slice()),
    );
}
574+
541575
#[test]
542576
fn wtf8_display() {
543577
fn d(b: &[u8]) -> String {

0 commit comments

Comments
 (0)