Skip to content

Commit 38654ad

Browse files
authored
Rollup merge of #95967 - CAD97:from-utf16, r=dtolnay
Add explicit-endian String::from_utf16 variants

This adds the following APIs under `feature(str_from_utf16_endian)`:

```rust
impl String {
    pub fn from_utf16le(v: &[u8]) -> Result<String, FromUtf16Error>;
    pub fn from_utf16le_lossy(v: &[u8]) -> String;
    pub fn from_utf16be(v: &[u8]) -> Result<String, FromUtf16Error>;
    pub fn from_utf16be_lossy(v: &[u8]) -> String;
}
```

These are versions of `String::from_utf16` that explicitly take [UTF-16LE and UTF-16BE](https://unicode.org/faq/utf_bom.html#gen7).

Notably, we can do better than just the obvious `decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes)).collect()` in that:

- We handle the case where the byte slice is not an even number of bytes, and
- In the case that the UTF-16 is native endian and the slice is aligned, we can forward to `String::from_utf16`.

If the Unicode Consortium actively defines how to handle character replacement when decoding a UTF-16 bytestream with a trailing odd byte, I was unable to find a reference. However, the behavior implemented here is fairly self-evidently correct: replace the single errant byte with the replacement character.
2 parents d627cf0 + 5facc32 commit 38654ad

File tree

1 file changed

+150
-0
lines changed

1 file changed

+150
-0
lines changed

Diff for: library/alloc/src/string.rs

+150
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,156 @@ impl String {
714714
.collect()
715715
}
716716

717+
/// Decode a UTF-16LE–encoded vector `v` into a `String`, returning [`Err`]
718+
/// if `v` contains any invalid data.
719+
///
720+
/// # Examples
721+
///
722+
/// Basic usage:
723+
///
724+
/// ```
725+
/// #![feature(str_from_utf16_endian)]
726+
/// // 𝄞music
727+
/// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00,
728+
/// 0x73, 0x00, 0x69, 0x00, 0x63, 0x00];
729+
/// assert_eq!(String::from("𝄞music"),
730+
/// String::from_utf16le(v).unwrap());
731+
///
732+
/// // 𝄞mu<invalid>ic
733+
/// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00,
734+
/// 0x00, 0xD8, 0x69, 0x00, 0x63, 0x00];
735+
/// assert!(String::from_utf16le(v).is_err());
736+
/// ```
737+
#[cfg(not(no_global_oom_handling))]
738+
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
739+
pub fn from_utf16le(v: &[u8]) -> Result<String, FromUtf16Error> {
740+
if v.len() % 2 != 0 {
741+
return Err(FromUtf16Error(()));
742+
}
743+
match (cfg!(target_endian = "little"), unsafe { v.align_to::<u16>() }) {
744+
(true, ([], v, [])) => Self::from_utf16(v),
745+
_ => char::decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes))
746+
.collect::<Result<_, _>>()
747+
.map_err(|_| FromUtf16Error(())),
748+
}
749+
}
750+
751+
/// Decode a UTF-16LE–encoded slice `v` into a `String`, replacing
752+
/// invalid data with [the replacement character (`U+FFFD`)][U+FFFD].
753+
///
754+
/// Unlike [`from_utf8_lossy`] which returns a [`Cow<'a, str>`],
755+
/// `from_utf16le_lossy` returns a `String` since the UTF-16 to UTF-8
756+
/// conversion requires a memory allocation.
757+
///
758+
/// [`from_utf8_lossy`]: String::from_utf8_lossy
759+
/// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow"
760+
/// [U+FFFD]: core::char::REPLACEMENT_CHARACTER
761+
///
762+
/// # Examples
763+
///
764+
/// Basic usage:
765+
///
766+
/// ```
767+
/// #![feature(str_from_utf16_endian)]
768+
/// // 𝄞mus<invalid>ic<invalid>
769+
/// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00,
770+
/// 0x73, 0x00, 0x1E, 0xDD, 0x69, 0x00, 0x63, 0x00,
771+
/// 0x34, 0xD8];
772+
///
773+
/// assert_eq!(String::from("𝄞mus\u{FFFD}ic\u{FFFD}"),
774+
/// String::from_utf16le_lossy(v));
775+
/// ```
776+
#[cfg(not(no_global_oom_handling))]
777+
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
778+
pub fn from_utf16le_lossy(v: &[u8]) -> String {
779+
match (cfg!(target_endian = "little"), unsafe { v.align_to::<u16>() }) {
780+
(true, ([], v, [])) => Self::from_utf16_lossy(v),
781+
(true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}",
782+
_ => {
783+
let mut iter = v.array_chunks::<2>();
784+
let string = char::decode_utf16(iter.by_ref().copied().map(u16::from_le_bytes))
785+
.map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER))
786+
.collect();
787+
if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" }
788+
}
789+
}
790+
}
791+
792+
/// Decode a UTF-16BE–encoded vector `v` into a `String`, returning [`Err`]
793+
/// if `v` contains any invalid data.
794+
///
795+
/// # Examples
796+
///
797+
/// Basic usage:
798+
///
799+
/// ```
800+
/// #![feature(str_from_utf16_endian)]
801+
/// // 𝄞music
802+
/// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75,
803+
/// 0x00, 0x73, 0x00, 0x69, 0x00, 0x63];
804+
/// assert_eq!(String::from("𝄞music"),
805+
/// String::from_utf16be(v).unwrap());
806+
///
807+
/// // 𝄞mu<invalid>ic
808+
/// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75,
809+
/// 0xD8, 0x00, 0x00, 0x69, 0x00, 0x63];
810+
/// assert!(String::from_utf16be(v).is_err());
811+
/// ```
812+
#[cfg(not(no_global_oom_handling))]
813+
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
814+
pub fn from_utf16be(v: &[u8]) -> Result<String, FromUtf16Error> {
815+
if v.len() % 2 != 0 {
816+
return Err(FromUtf16Error(()));
817+
}
818+
match (cfg!(target_endian = "big"), unsafe { v.align_to::<u16>() }) {
819+
(true, ([], v, [])) => Self::from_utf16(v),
820+
_ => char::decode_utf16(v.array_chunks::<2>().copied().map(u16::from_be_bytes))
821+
.collect::<Result<_, _>>()
822+
.map_err(|_| FromUtf16Error(())),
823+
}
824+
}
825+
826+
/// Decode a UTF-16BE–encoded slice `v` into a `String`, replacing
827+
/// invalid data with [the replacement character (`U+FFFD`)][U+FFFD].
828+
///
829+
/// Unlike [`from_utf8_lossy`] which returns a [`Cow<'a, str>`],
830+
/// `from_utf16le_lossy` returns a `String` since the UTF-16 to UTF-8
831+
/// conversion requires a memory allocation.
832+
///
833+
/// [`from_utf8_lossy`]: String::from_utf8_lossy
834+
/// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow"
835+
/// [U+FFFD]: core::char::REPLACEMENT_CHARACTER
836+
///
837+
/// # Examples
838+
///
839+
/// Basic usage:
840+
///
841+
/// ```
842+
/// #![feature(str_from_utf16_endian)]
843+
/// // 𝄞mus<invalid>ic<invalid>
844+
/// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75,
845+
/// 0x00, 0x73, 0xDD, 0x1E, 0x00, 0x69, 0x00, 0x63,
846+
/// 0xD8, 0x34];
847+
///
848+
/// assert_eq!(String::from("𝄞mus\u{FFFD}ic\u{FFFD}"),
849+
/// String::from_utf16be_lossy(v));
850+
/// ```
851+
#[cfg(not(no_global_oom_handling))]
852+
#[unstable(feature = "str_from_utf16_endian", issue = "116258")]
853+
pub fn from_utf16be_lossy(v: &[u8]) -> String {
854+
match (cfg!(target_endian = "big"), unsafe { v.align_to::<u16>() }) {
855+
(true, ([], v, [])) => Self::from_utf16_lossy(v),
856+
(true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}",
857+
_ => {
858+
let mut iter = v.array_chunks::<2>();
859+
let string = char::decode_utf16(iter.by_ref().copied().map(u16::from_be_bytes))
860+
.map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER))
861+
.collect();
862+
if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" }
863+
}
864+
}
865+
}
866+
717867
/// Decomposes a `String` into its raw components.
718868
///
719869
/// Returns the raw pointer to the underlying data, the length of

0 commit comments

Comments
 (0)