Skip to content

Commit 8a42f9e

Browse files
committed
Remove dubious transmutes, duplicate code from std instead.
1 parent 0842e8b commit 8a42f9e

File tree

2 files changed

+347
-78
lines changed

2 files changed

+347
-78
lines changed

src/lib.rs

+101-78
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
1515
1616
*/
1717

18-
#![feature(globs, default_type_params, phase)]
18+
#![feature(globs, default_type_params, phase, macro_rules)]
1919

2020
#![no_std]
2121

@@ -41,8 +41,8 @@ use collections::string::String;
4141
use collections::vec::Vec;
4242
use core::fmt;
4343
use core::mem::transmute;
44+
use core::num::Saturating;
4445
use core::slice;
45-
use core::str::Utf16CodeUnits;
4646

4747
// Compensate for #[no_std]
4848
#[cfg(not(test))]
@@ -53,6 +53,8 @@ mod std {
5353
pub use core::cmp; // deriving(Eq, Ord, etc.)
5454
}
5555

56+
mod not_quite_std;
57+
5658

5759
static UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD";
5860

@@ -127,10 +129,7 @@ impl CodePoint {
127129
/// if the code point is a surrogate (from U+D800 to U+DFFF).
128130
#[inline]
129131
pub fn to_char_lossy(&self) -> char {
130-
match self.value {
131-
0xD800 ... 0xDFFF => '\uFFFD',
132-
_ => unsafe { transmute(self.value) }
133-
}
132+
self.to_char().unwrap_or('\uFFFD')
134133
}
135134
}
136135

@@ -203,10 +202,13 @@ impl Wtf8String {
203202
for item in str::utf16_items(v) {
204203
match item {
205204
str::ScalarValue(c) => string.push_char(c),
206-
// We’re violating some of the invariants of char here
207-
// in order to skip the surrogate pair check,
208-
// but such a pair would be a str::ScalarValue anyway.
209-
str::LoneSurrogate(s) => string.push_char(unsafe { transmute(s as u32) })
205+
str::LoneSurrogate(s) => {
206+
// Surrogates are known to be in the code point range.
207+
let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
208+
// Skip the WTF-8 concatenation check,
209+
// surrogate pairs are already decoded by str::utf16_items
210+
not_quite_std::push_code_point(&mut string, code_point)
211+
}
210212
}
211213
}
212214
string
@@ -217,6 +219,39 @@ impl Wtf8String {
217219
unsafe { transmute(self.bytes.as_slice()) }
218220
}
219221

222+
/// Reserves capacity for at least `additional` more bytes to be inserted
223+
/// in the given `Wtf8String`.
224+
/// The collection may reserve more space to avoid frequent reallocations.
225+
///
226+
/// # Panics
227+
///
228+
/// Panics if the new capacity overflows `uint`.
229+
///
230+
/// # Example
231+
///
232+
/// ```
233+
/// let mut s = Wtf8String::new();
234+
/// s.reserve(10);
235+
/// assert!(s.capacity() >= 10);
236+
/// ```
237+
#[inline]
238+
pub fn reserve(&mut self, additional: uint) {
239+
self.bytes.reserve(additional)
240+
}
241+
242+
/// Returns the number of bytes that this string buffer can hold without reallocating.
243+
///
244+
/// # Example
245+
///
246+
/// ```
247+
/// let s = Wtf8String::with_capacity(10);
248+
/// assert!(s.capacity() >= 10);
249+
/// ```
250+
#[inline]
251+
pub fn capacity(&self) -> uint {
252+
self.bytes.capacity()
253+
}
254+
220255
/// Append a UTF-8 slice at the end of the string.
221256
#[inline]
222257
pub fn push_str(&mut self, other: &str) {
@@ -248,13 +283,7 @@ impl Wtf8String {
248283
/// Append a Unicode scalar value at the end of the string.
249284
#[inline]
250285
pub fn push_char(&mut self, c: char) {
251-
unsafe {
252-
// We’re violating some of the invariants of String here,
253-
// but String::push only assumes a subset of these invariants
254-
// that still hold for Wtf8String.
255-
let not_really_a_string: &mut String = transmute(self);
256-
not_really_a_string.push(c)
257-
}
286+
not_quite_std::push_code_point(self, CodePoint::from_char(c))
258287
}
259288

260289
/// Append a code point at the end of the string.
@@ -279,14 +308,8 @@ impl Wtf8String {
279308
_ => {}
280309
}
281310

282-
unsafe {
283-
// We’re violating some of the invariants of String and char here,
284-
// but String::push only assumes a subset of these invariants
285-
// that still hold for Wtf8String and CodePoint.
286-
let not_really_a_string: &mut String = transmute(self);
287-
let not_really_a_char: char = transmute(code_point.to_u32());
288-
not_really_a_string.push(not_really_a_char)
289-
}
311+
// No newly paired surrogates at the boundary.
312+
not_quite_std::push_code_point(self, code_point)
290313
}
291314

292315
/// Shortens a string to the specified length.
@@ -297,13 +320,8 @@ impl Wtf8String {
297320
/// or if `new_len` is not a code point boundary.
298321
#[inline]
299322
pub fn truncate(&mut self, new_len: uint) {
300-
unsafe {
301-
// We’re violating some of the invariants of String here,
302-
// but String::truncate only assumes a subset of these invariants
303-
// that still hold for Wtf8String.
304-
let not_really_a_string: &mut String = transmute(self);
305-
not_really_a_string.truncate(new_len)
306-
}
323+
assert!(not_quite_std::is_code_point_boundary(self.as_slice(), new_len));
324+
self.bytes.truncate(new_len)
307325
}
308326

309327
/// Consume the WTF-8 string and try to convert it to UTF-8.
@@ -456,12 +474,13 @@ impl Wtf8Slice {
456474
/// or point beyond the end of the string.
457475
#[inline]
458476
pub fn slice(&self, begin: uint, end: uint) -> &Wtf8Slice {
459-
unsafe {
460-
// We’re violating some of the invariants of &str here,
461-
// but &str::slice only assumes a subset of these invariants
462-
// that still hold for Wtf8Slice.
463-
let not_really_a_str = str::raw::from_utf8(&self.bytes);
464-
Wtf8Slice::from_str(not_really_a_str.slice(begin, end))
477+
// is_code_point_boundary checks that the index is in [0, .len()]
478+
if begin <= end &&
479+
not_quite_std::is_code_point_boundary(self, begin) &&
480+
not_quite_std::is_code_point_boundary(self, end) {
481+
unsafe { not_quite_std::slice_unchecked(self, begin, end) }
482+
} else {
483+
not_quite_std::slice_error_fail(self, begin, end)
465484
}
466485
}
467486

@@ -473,12 +492,11 @@ impl Wtf8Slice {
473492
/// or is beyond the end of the string.
474493
#[inline]
475494
pub fn slice_from(&self, begin: uint) -> &Wtf8Slice {
476-
unsafe {
477-
// We’re violating some of the invariants of &str here,
478-
// but &str::slice only assumes a subset of these invariants
479-
// that still hold for Wtf8Slice.
480-
let not_really_a_str = str::raw::from_utf8(&self.bytes);
481-
Wtf8Slice::from_str(not_really_a_str.slice_from(begin))
495+
// is_code_point_boundary checks that the index is in [0, .len()]
496+
if not_quite_std::is_code_point_boundary(self, begin) {
497+
unsafe { not_quite_std::slice_unchecked(self, begin, self.len()) }
498+
} else {
499+
not_quite_std::slice_error_fail(self, begin, self.len())
482500
}
483501
}
484502

@@ -490,12 +508,11 @@ impl Wtf8Slice {
490508
/// or is beyond the end of the string.
491509
#[inline]
492510
pub fn slice_to(&self, end: uint) -> &Wtf8Slice {
493-
unsafe {
494-
// We’re violating some of the invariants of &str here,
495-
// but &str::slice only assumes a subset of these invariants
496-
// that still hold for Wtf8Slice.
497-
let not_really_a_str = str::raw::from_utf8(&self.bytes);
498-
Wtf8Slice::from_str(not_really_a_str.slice_to(end))
511+
// is_code_point_boundary checks that the index is in [0, .len()]
512+
if not_quite_std::is_code_point_boundary(self, end) {
513+
unsafe { not_quite_std::slice_unchecked(self, 0, end) }
514+
} else {
515+
not_quite_std::slice_error_fail(self, 0, end)
499516
}
500517
}
501518

@@ -534,26 +551,13 @@ impl Wtf8Slice {
534551
/// or is beyond the end of the string.
535552
#[inline]
536553
pub fn code_point_range_at(&self, position: uint) -> (CodePoint, uint) {
537-
unsafe {
538-
// We’re violating some of the invariants of &str here,
539-
// but &str::slice only assumes a subset of these invariants
540-
// that still hold for Wtf8Slice.
541-
let not_really_a_str = str::raw::from_utf8(&self.bytes);
542-
let range = not_really_a_str.char_range_at(position);
543-
(CodePoint::from_char(range.ch), range.next)
544-
}
554+
not_quite_std::code_point_range_at(self, position)
545555
}
546556

547557
/// Return an iterator for the string’s code points.
548558
#[inline]
549559
pub fn code_points(&self) -> Wtf8CodePoints {
550-
unsafe {
551-
// We’re violating some of the invariants of &str here,
552-
// but &str::chars only assumes a subset of these invariants
553-
// that still hold for Wtf8Slice.
554-
let not_really_a_str = str::raw::from_utf8(&self.bytes);
555-
Wtf8CodePoints { not_really_chars: not_really_a_str.chars() }
556-
}
560+
Wtf8CodePoints { bytes: self.bytes.iter() }
557561
}
558562

559563
/// Try to convert the string to UTF-8 and return a `&str` slice.
@@ -609,14 +613,8 @@ impl Wtf8Slice {
609613
/// calling `Wtf8String::from_ill_formed_utf16` on the resulting code units
610614
/// would always return the original WTF-8 string.
611615
#[inline]
612-
pub fn to_ill_formed_utf16(&self) -> Utf16CodeUnits {
613-
unsafe {
614-
// We’re violating some of the invariants of &str here,
615-
// but &str::to_utf16 only assumes a subset of these invariants
616-
// that still hold for Wtf8Slice.
617-
let not_really_a_str = str::raw::from_utf8(&self.bytes);
618-
not_really_a_str.utf16_units()
619-
}
616+
pub fn to_ill_formed_utf16(&self) -> IllFormedUtf16CodeUnits {
617+
IllFormedUtf16CodeUnits { code_points: self.code_points(), extra: 0 }
620618
}
621619

622620
#[inline]
@@ -696,16 +694,41 @@ fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
696694
/// Created with the method `.code_points()`.
697695
#[deriving(Clone)]
698696
pub struct Wtf8CodePoints<'a> {
699-
not_really_chars: str::Chars<'a>
697+
bytes: slice::Items<'a, u8>
700698
}
701699

702700
impl<'a> Iterator<CodePoint> for Wtf8CodePoints<'a> {
703701
#[inline]
704702
fn next(&mut self) -> Option<CodePoint> {
705-
match self.not_really_chars.next() {
706-
Some(not_really_char) => Some(CodePoint::from_char(not_really_char)),
707-
None => None
708-
}
703+
not_quite_std::next_code_point(&mut self.bytes)
704+
}
705+
706+
#[inline]
707+
fn size_hint(&self) -> (uint, Option<uint>) {
708+
let (len, _) = self.bytes.size_hint();
709+
(len.saturating_add(3) / 4, Some(len))
710+
}
711+
}
712+
713+
#[deriving(Clone)]
714+
pub struct IllFormedUtf16CodeUnits<'a> {
715+
code_points: Wtf8CodePoints<'a>,
716+
extra: u16
717+
}
718+
719+
impl<'a> Iterator<u16> for IllFormedUtf16CodeUnits<'a> {
720+
#[inline]
721+
fn next(&mut self) -> Option<u16> {
722+
not_quite_std::next_utf16_code_unit(self)
723+
}
724+
725+
#[inline]
726+
fn size_hint(&self) -> (uint, Option<uint>) {
727+
let (low, high) = self.code_points.size_hint();
728+
// every code point gets either one u16 or two u16,
729+
// so this iterator is between 1 and 2 times as
730+
// long as the underlying iterator.
731+
(low, high.and_then(|n| n.checked_mul(&2)))
709732
}
710733
}
711734

0 commit comments

Comments
 (0)