@@ -15,7 +15,7 @@ WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
15
15
16
16
*/
17
17
18
- #![ feature( globs, default_type_params, phase) ]
18
+ #![ feature( globs, default_type_params, phase, macro_rules ) ]
19
19
20
20
#![ no_std]
21
21
@@ -41,8 +41,8 @@ use collections::string::String;
41
41
use collections:: vec:: Vec ;
42
42
use core:: fmt;
43
43
use core:: mem:: transmute;
44
+ use core:: num:: Saturating ;
44
45
use core:: slice;
45
- use core:: str:: Utf16CodeUnits ;
46
46
47
47
// Compensate for #[no_std]
48
48
#[ cfg( not( test) ) ]
@@ -53,6 +53,8 @@ mod std {
53
53
pub use core:: cmp; // deriving(Eq, Ord, etc.)
54
54
}
55
55
56
+ mod not_quite_std;
57
+
56
58
57
59
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER: exactly the three bytes EF BF BD.
static UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD";
58
60
@@ -127,10 +129,7 @@ impl CodePoint {
127
129
/// if the code point is a surrogate (from U+D800 to U+DFFF).
128
130
#[ inline]
129
131
pub fn to_char_lossy ( & self ) -> char {
130
- match self . value {
131
- 0xD800 ... 0xDFFF => '\uFFFD' ,
132
- _ => unsafe { transmute ( self . value ) }
133
- }
132
+ self . to_char ( ) . unwrap_or ( '\uFFFD' )
134
133
}
135
134
}
136
135
@@ -203,10 +202,13 @@ impl Wtf8String {
203
202
for item in str:: utf16_items ( v) {
204
203
match item {
205
204
str:: ScalarValue ( c) => string. push_char ( c) ,
206
- // We’re violating some of the invariants of char here
207
- // in order to skip the surrogate pair check,
208
- // but such a pair would be a str::ScalarValue anyway.
209
- str:: LoneSurrogate ( s) => string. push_char ( unsafe { transmute ( s as u32 ) } )
205
+ str:: LoneSurrogate ( s) => {
206
+ // Surrogates are known to be in the code point range.
207
+ let code_point = unsafe { CodePoint :: from_u32_unchecked ( s as u32 ) } ;
208
+ // Skip the WTF-8 concatenation check:
209
+ // surrogate pairs are already decoded by str::utf16_items
210
+ not_quite_std:: push_code_point ( & mut string, code_point)
211
+ }
210
212
}
211
213
}
212
214
string
@@ -217,6 +219,39 @@ impl Wtf8String {
217
219
unsafe { transmute ( self . bytes . as_slice ( ) ) }
218
220
}
219
221
222
+ /// Reserves capacity for at least `additional` more bytes to be inserted
223
+ /// in the given `Wtf8String`.
224
+ /// The collection may reserve more space to avoid frequent reallocations.
225
+ ///
226
+ /// # Panics
227
+ ///
228
+ /// Panics if the new capacity overflows `uint`.
229
+ ///
230
+ /// # Example
231
+ ///
232
+ /// ```
233
+ /// let mut s = Wtf8String::new();
234
+ /// s.reserve(10);
235
+ /// assert!(s.capacity() >= 10);
236
+ /// ```
237
+ #[ inline]
238
+ pub fn reserve ( & mut self , additional : uint ) {
239
+ self . bytes . reserve ( additional)
240
+ }
241
+
242
+ /// Returns the number of bytes that this string buffer can hold without reallocating.
243
+ ///
244
+ /// # Example
245
+ ///
246
+ /// ```
247
+ /// let s = Wtf8String::with_capacity(10);
248
+ /// assert!(s.capacity() >= 10);
249
+ /// ```
250
+ #[ inline]
251
+ pub fn capacity ( & self ) -> uint {
252
+ self . bytes . capacity ( )
253
+ }
254
+
220
255
/// Append a UTF-8 slice at the end of the string.
221
256
#[ inline]
222
257
pub fn push_str ( & mut self , other : & str ) {
@@ -248,13 +283,7 @@ impl Wtf8String {
248
283
/// Append a Unicode scalar value at the end of the string.
249
284
#[ inline]
250
285
pub fn push_char ( & mut self , c : char ) {
251
- unsafe {
252
- // We’re violating some of the invariants of String here,
253
- // but String::push only assumes a subset of these invariants
254
- // that still hold for Wtf8String.
255
- let not_really_a_string: & mut String = transmute ( self ) ;
256
- not_really_a_string. push ( c)
257
- }
286
+ not_quite_std:: push_code_point ( self , CodePoint :: from_char ( c) )
258
287
}
259
288
260
289
/// Append a code point at the end of the string.
@@ -279,14 +308,8 @@ impl Wtf8String {
279
308
_ => { }
280
309
}
281
310
282
- unsafe {
283
- // We’re violating some of the invariants of String and char here,
284
- // but String::push only assumes a subset of these invariants
285
- // that still hold for Wtf8String and CodePoint.
286
- let not_really_a_string: & mut String = transmute ( self ) ;
287
- let not_really_a_char: char = transmute ( code_point. to_u32 ( ) ) ;
288
- not_really_a_string. push ( not_really_a_char)
289
- }
311
+ // No newly paired surrogates at the boundary.
312
+ not_quite_std:: push_code_point ( self , code_point)
290
313
}
291
314
292
315
/// Shortens a string to the specified length.
@@ -297,13 +320,8 @@ impl Wtf8String {
297
320
/// or if `new_len` is not a code point boundary.
298
321
#[ inline]
299
322
pub fn truncate ( & mut self , new_len : uint ) {
300
- unsafe {
301
- // We’re violating some of the invariants of String here,
302
- // but String::truncate only assumes a subset of these invariants
303
- // that still hold for Wtf8String.
304
- let not_really_a_string: & mut String = transmute ( self ) ;
305
- not_really_a_string. truncate ( new_len)
306
- }
323
+ assert ! ( not_quite_std:: is_code_point_boundary( self . as_slice( ) , new_len) ) ;
324
+ self . bytes . truncate ( new_len)
307
325
}
308
326
309
327
/// Consume the WTF-8 string and try to convert it to UTF-8.
@@ -456,12 +474,13 @@ impl Wtf8Slice {
456
474
/// or point beyond the end of the string.
457
475
#[ inline]
458
476
pub fn slice ( & self , begin : uint , end : uint ) -> & Wtf8Slice {
459
- unsafe {
460
- // We’re violating some of the invariants of &str here,
461
- // but &str::slice only assumes a subset of these invariants
462
- // that still hold for Wtf8Slice.
463
- let not_really_a_str = str:: raw:: from_utf8 ( & self . bytes ) ;
464
- Wtf8Slice :: from_str ( not_really_a_str. slice ( begin, end) )
477
+ // is_code_point_boundary checks that the index is in [0, .len()]
478
+ if begin <= end &&
479
+ not_quite_std:: is_code_point_boundary ( self , begin) &&
480
+ not_quite_std:: is_code_point_boundary ( self , end) {
481
+ unsafe { not_quite_std:: slice_unchecked ( self , begin, end) }
482
+ } else {
483
+ not_quite_std:: slice_error_fail ( self , begin, end)
465
484
}
466
485
}
467
486
@@ -473,12 +492,11 @@ impl Wtf8Slice {
473
492
/// or is beyond the end of the string.
474
493
#[ inline]
475
494
pub fn slice_from ( & self , begin : uint ) -> & Wtf8Slice {
476
- unsafe {
477
- // We’re violating some of the invariants of &str here,
478
- // but &str::slice only assumes a subset of these invariants
479
- // that still hold for Wtf8Slice.
480
- let not_really_a_str = str:: raw:: from_utf8 ( & self . bytes ) ;
481
- Wtf8Slice :: from_str ( not_really_a_str. slice_from ( begin) )
495
+ // is_code_point_boundary checks that the index is in [0, .len()]
496
+ if not_quite_std:: is_code_point_boundary ( self , begin) {
497
+ unsafe { not_quite_std:: slice_unchecked ( self , begin, self . len ( ) ) }
498
+ } else {
499
+ not_quite_std:: slice_error_fail ( self , begin, self . len ( ) )
482
500
}
483
501
}
484
502
@@ -490,12 +508,11 @@ impl Wtf8Slice {
490
508
/// or is beyond the end of the string.
491
509
#[ inline]
492
510
pub fn slice_to ( & self , end : uint ) -> & Wtf8Slice {
493
- unsafe {
494
- // We’re violating some of the invariants of &str here,
495
- // but &str::slice only assumes a subset of these invariants
496
- // that still hold for Wtf8Slice.
497
- let not_really_a_str = str:: raw:: from_utf8 ( & self . bytes ) ;
498
- Wtf8Slice :: from_str ( not_really_a_str. slice_to ( end) )
511
+ // is_code_point_boundary checks that the index is in [0, .len()]
512
+ if not_quite_std:: is_code_point_boundary ( self , end) {
513
+ unsafe { not_quite_std:: slice_unchecked ( self , 0 , end) }
514
+ } else {
515
+ not_quite_std:: slice_error_fail ( self , 0 , end)
499
516
}
500
517
}
501
518
@@ -534,26 +551,13 @@ impl Wtf8Slice {
534
551
/// or is beyond the end of the string.
535
552
#[ inline]
536
553
pub fn code_point_range_at ( & self , position : uint ) -> ( CodePoint , uint ) {
537
- unsafe {
538
- // We’re violating some of the invariants of &str here,
539
- // but &str::slice only assumes a subset of these invariants
540
- // that still hold for Wtf8Slice.
541
- let not_really_a_str = str:: raw:: from_utf8 ( & self . bytes ) ;
542
- let range = not_really_a_str. char_range_at ( position) ;
543
- ( CodePoint :: from_char ( range. ch ) , range. next )
544
- }
554
+ not_quite_std:: code_point_range_at ( self , position)
545
555
}
546
556
547
557
/// Return an iterator for the string’s code points.
548
558
#[ inline]
549
559
pub fn code_points ( & self ) -> Wtf8CodePoints {
550
- unsafe {
551
- // We’re violating some of the invariants of &str here,
552
- // but &str::chars only assumes a subset of these invariants
553
- // that still hold for Wtf8Slice.
554
- let not_really_a_str = str:: raw:: from_utf8 ( & self . bytes ) ;
555
- Wtf8CodePoints { not_really_chars : not_really_a_str. chars ( ) }
556
- }
560
+ Wtf8CodePoints { bytes : self . bytes . iter ( ) }
557
561
}
558
562
559
563
/// Try to convert the string to UTF-8 and return a `&str` slice.
@@ -609,14 +613,8 @@ impl Wtf8Slice {
609
613
/// calling `Wtf8String::from_ill_formed_utf16` on the resulting code units
610
614
/// would always return the original WTF-8 string.
611
615
#[ inline]
612
- pub fn to_ill_formed_utf16 ( & self ) -> Utf16CodeUnits {
613
- unsafe {
614
- // We’re violating some of the invariants of &str here,
615
- // but &str::to_utf16 only assumes a subset of these invariants
616
- // that still hold for Wtf8Slice.
617
- let not_really_a_str = str:: raw:: from_utf8 ( & self . bytes ) ;
618
- not_really_a_str. utf16_units ( )
619
- }
616
+ pub fn to_ill_formed_utf16 ( & self ) -> IllFormedUtf16CodeUnits {
617
+ IllFormedUtf16CodeUnits { code_points : self . code_points ( ) , extra : 0 }
620
618
}
621
619
622
620
#[ inline]
@@ -696,16 +694,41 @@ fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
696
694
/// Created with the method `.code_points()`.
697
695
#[ deriving( Clone ) ]
698
696
pub struct Wtf8CodePoints < ' a > {
699
- not_really_chars : str :: Chars < ' a >
697
+ bytes : slice :: Items < ' a , u8 >
700
698
}
701
699
702
700
impl < ' a > Iterator < CodePoint > for Wtf8CodePoints < ' a > {
703
701
#[ inline]
704
702
fn next ( & mut self ) -> Option < CodePoint > {
705
- match self . not_really_chars . next ( ) {
706
- Some ( not_really_char) => Some ( CodePoint :: from_char ( not_really_char) ) ,
707
- None => None
708
- }
703
+ not_quite_std:: next_code_point ( & mut self . bytes )
704
+ }
705
+
706
+ #[ inline]
707
+ fn size_hint ( & self ) -> ( uint , Option < uint > ) {
708
+ let ( len, _) = self . bytes . size_hint ( ) ;
709
+ ( len. saturating_add ( 3 ) / 4 , Some ( len) )
710
+ }
711
+ }
712
+
713
+ #[ deriving( Clone ) ]
714
+ pub struct IllFormedUtf16CodeUnits < ' a > {
715
+ code_points : Wtf8CodePoints < ' a > ,
716
+ extra : u16
717
+ }
718
+
719
+ impl < ' a > Iterator < u16 > for IllFormedUtf16CodeUnits < ' a > {
720
+ #[ inline]
721
+ fn next ( & mut self ) -> Option < u16 > {
722
+ not_quite_std:: next_utf16_code_unit ( self )
723
+ }
724
+
725
+ #[ inline]
726
+ fn size_hint ( & self ) -> ( uint , Option < uint > ) {
727
+ let ( low, high) = self . code_points . size_hint ( ) ;
728
+ // every code point gets either one u16 or two u16,
729
+ // so this iterator is between 1 and 2 times as
730
+ // long as the underlying iterator.
731
+ ( low, high. and_then ( |n| n. checked_mul ( & 2 ) ) )
709
732
}
710
733
}
711
734
0 commit comments