@@ -441,6 +441,42 @@ impl Wtf8Buf {
441
441
}
442
442
}
443
443
444
+ /// Consumes the WTF-8 string and converts it to a (UTF-8, WTF-8) pair.
445
+ ///
446
+ /// This does not copy the data.
447
+ ///
448
+ /// The first element of the return value is the longest prefix of valid
449
+ /// UTF-8, with the second element being the remainder.
450
+ pub fn into_string_split ( self ) -> ( String , Wtf8Buf ) {
451
+ if self . is_known_utf8 {
452
+ // SAFETY: The inner value is known to be UTF-8.
453
+ let utf8 = unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
454
+ return ( utf8, Wtf8Buf :: new ( ) ) ;
455
+ }
456
+
457
+ let surrogate_pos = match self . next_surrogate ( 0 ) {
458
+ None => {
459
+ // SAFETY: Well-formed WTF-8 that contains no surrogates is
460
+ // also well-formed UTF-8.
461
+ let utf8 = unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
462
+ return ( utf8, Wtf8Buf :: new ( ) ) ;
463
+ }
464
+ Some ( ( surrogate_pos, _) ) => surrogate_pos,
465
+ } ;
466
+
467
+ if surrogate_pos == 0 {
468
+ return ( String :: new ( ) , self ) ;
469
+ }
470
+
471
+ let mut utf8_bytes = self . bytes ;
472
+ let wtf8_bytes = utf8_bytes. split_off ( surrogate_pos) ;
473
+ // SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
474
+ // surrogates, and well-formed WTF-8 that contains no surrogates is
475
+ // also well-formed UTF-8.
476
+ let utf8 = unsafe { String :: from_utf8_unchecked ( utf8_bytes) } ;
477
+ ( utf8, Wtf8Buf { bytes : wtf8_bytes, is_known_utf8 : false } )
478
+ }
479
+
444
480
/// Converts this `Wtf8Buf` into a boxed `Wtf8`.
445
481
#[ inline]
446
482
pub fn into_box ( self ) -> Box < Wtf8 > {
@@ -664,6 +700,38 @@ impl Wtf8 {
664
700
}
665
701
}
666
702
703
+ /// Losslessly split a WTF-8 string into to a (UTF-8, WTF-8) pair.
704
+ ///
705
+ /// This does not copy the data.
706
+ ///
707
+ /// The first element of the return value is the longest prefix of valid
708
+ /// UTF-8, with the second element being the remainder.
709
+ pub fn to_str_split ( & self ) -> ( & str , & Wtf8 ) {
710
+ let surrogate_pos = match self . next_surrogate ( 0 ) {
711
+ None => {
712
+ // SAFETY: Well-formed WTF-8 that contains no surrogates is
713
+ // also well-formed UTF-8.
714
+ let utf8 = unsafe { str:: from_utf8_unchecked ( & self . bytes ) } ;
715
+ return ( utf8, Wtf8 :: from_str ( "" ) ) ;
716
+ }
717
+ Some ( ( surrogate_pos, _) ) => surrogate_pos,
718
+ } ;
719
+
720
+ if surrogate_pos == 0 {
721
+ return ( "" , self ) ;
722
+ }
723
+
724
+ let ( utf8_bytes, wtf8_bytes) = self . bytes . split_at ( surrogate_pos) ;
725
+ // SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
726
+ // surrogates, and well-formed WTF-8 that contains no surrogates is
727
+ // also well-formed UTF-8.
728
+ unsafe {
729
+ let utf8 = str:: from_utf8_unchecked ( utf8_bytes) ;
730
+ let wtf8 = Wtf8 :: from_bytes_unchecked ( wtf8_bytes) ;
731
+ ( utf8, wtf8)
732
+ }
733
+ }
734
+
667
735
/// Converts the WTF-8 string to potentially ill-formed UTF-16
668
736
/// and return an iterator of 16-bit code units.
669
737
///
0 commit comments