Skip to content

Commit e1b4dd0

Browse files
committed
Auto merge of #32204 - alexcrichton:redesign-char-encoding-types, r=aturon
std: Change `encode_utf{8,16}` to return iterators. Currently these have non-traditional APIs which take a buffer and report how much was filled in, but they're not necessarily ergonomic to use. Returning an iterator which *also* exposes an underlying slice shouldn't result in any performance loss as it's just a lazy version of the same implementation, and it's also much more ergonomic! cc #27784
2 parents e3f2dfd + 48d5fe9 commit e1b4dd0

File tree

10 files changed

+195
-201
lines changed

10 files changed

+195
-201
lines changed

Diff for: src/libcollections/string.rs

+5-20
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ use core::iter::FromIterator;
6161
use core::mem;
6262
use core::ops::{self, Add, Index, IndexMut};
6363
use core::ptr;
64-
use core::slice;
6564
use core::str::pattern::Pattern;
6665
use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
6766
use rustc_unicode::str as unicode_str;
@@ -970,22 +969,7 @@ impl String {
970969
pub fn push(&mut self, ch: char) {
971970
match ch.len_utf8() {
972971
1 => self.vec.push(ch as u8),
973-
ch_len => {
974-
let cur_len = self.len();
975-
// This may use up to 4 bytes.
976-
self.vec.reserve(ch_len);
977-
978-
unsafe {
979-
// Attempt to not use an intermediate buffer by just pushing bytes
980-
// directly onto this string.
981-
let slice = slice::from_raw_parts_mut(self.vec
982-
.as_mut_ptr()
983-
.offset(cur_len as isize),
984-
ch_len);
985-
let used = ch.encode_utf8(slice).unwrap_or(0);
986-
self.vec.set_len(cur_len + used);
987-
}
988-
}
972+
_ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()),
989973
}
990974
}
991975

@@ -1136,9 +1120,10 @@ impl String {
11361120
let len = self.len();
11371121
assert!(idx <= len);
11381122
assert!(self.is_char_boundary(idx));
1139-
self.vec.reserve(4);
1140-
let mut bits = [0; 4];
1141-
let amt = ch.encode_utf8(&mut bits).unwrap();
1123+
let bits = ch.encode_utf8();
1124+
let bits = bits.as_slice();
1125+
let amt = bits.len();
1126+
self.vec.reserve(amt);
11421127

11431128
unsafe {
11441129
ptr::copy(self.vec.as_ptr().offset(idx as isize),

Diff for: src/libcollectionstest/str.rs

+4-6
Original file line numberDiff line numberDiff line change
@@ -794,10 +794,9 @@ fn test_rev_iterator() {
794794

795795
#[test]
796796
fn test_chars_decoding() {
797-
let mut bytes = [0; 4];
798797
for c in (0..0x110000).filter_map(::std::char::from_u32) {
799-
let len = c.encode_utf8(&mut bytes).unwrap_or(0);
800-
let s = ::std::str::from_utf8(&bytes[..len]).unwrap();
798+
let bytes = c.encode_utf8();
799+
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
801800
if Some(c) != s.chars().next() {
802801
panic!("character {:x}={} does not decode correctly", c as u32, c);
803802
}
@@ -806,10 +805,9 @@ fn test_chars_decoding() {
806805

807806
#[test]
808807
fn test_chars_rev_decoding() {
809-
let mut bytes = [0; 4];
810808
for c in (0..0x110000).filter_map(::std::char::from_u32) {
811-
let len = c.encode_utf8(&mut bytes).unwrap_or(0);
812-
let s = ::std::str::from_utf8(&bytes[..len]).unwrap();
809+
let bytes = c.encode_utf8();
810+
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
813811
if Some(c) != s.chars().rev().next() {
814812
panic!("character {:x}={} does not decode correctly", c as u32, c);
815813
}

Diff for: src/libcore/char.rs

+119-70
Original file line numberDiff line numberDiff line change
@@ -269,10 +269,10 @@ pub trait CharExt {
269269
fn len_utf8(self) -> usize;
270270
#[stable(feature = "core", since = "1.6.0")]
271271
fn len_utf16(self) -> usize;
272-
#[stable(feature = "core", since = "1.6.0")]
273-
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
274-
#[stable(feature = "core", since = "1.6.0")]
275-
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
272+
#[unstable(feature = "unicode", issue = "27784")]
273+
fn encode_utf8(self) -> EncodeUtf8;
274+
#[unstable(feature = "unicode", issue = "27784")]
275+
fn encode_utf16(self) -> EncodeUtf16;
276276
}
277277

278278
#[stable(feature = "core", since = "1.6.0")]
@@ -336,75 +336,47 @@ impl CharExt for char {
336336
}
337337

338338
#[inline]
339-
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
340-
encode_utf8_raw(self as u32, dst)
339+
fn encode_utf8(self) -> EncodeUtf8 {
340+
let code = self as u32;
341+
let mut buf = [0; 4];
342+
let pos = if code < MAX_ONE_B {
343+
buf[3] = code as u8;
344+
3
345+
} else if code < MAX_TWO_B {
346+
buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
347+
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
348+
2
349+
} else if code < MAX_THREE_B {
350+
buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
351+
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
352+
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
353+
1
354+
} else {
355+
buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
356+
buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
357+
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
358+
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
359+
0
360+
};
361+
EncodeUtf8 { buf: buf, pos: pos }
341362
}
342363

343364
#[inline]
344-
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
345-
encode_utf16_raw(self as u32, dst)
346-
}
347-
}
348-
349-
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
350-
/// and then returns the number of bytes written.
351-
///
352-
/// If the buffer is not large enough, nothing will be written into it
353-
/// and a `None` will be returned.
354-
#[inline]
355-
#[unstable(feature = "char_internals",
356-
reason = "this function should not be exposed publicly",
357-
issue = "0")]
358-
#[doc(hidden)]
359-
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
360-
// Marked #[inline] to allow llvm optimizing it away
361-
if code < MAX_ONE_B && !dst.is_empty() {
362-
dst[0] = code as u8;
363-
Some(1)
364-
} else if code < MAX_TWO_B && dst.len() >= 2 {
365-
dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
366-
dst[1] = (code & 0x3F) as u8 | TAG_CONT;
367-
Some(2)
368-
} else if code < MAX_THREE_B && dst.len() >= 3 {
369-
dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
370-
dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
371-
dst[2] = (code & 0x3F) as u8 | TAG_CONT;
372-
Some(3)
373-
} else if dst.len() >= 4 {
374-
dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
375-
dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
376-
dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
377-
dst[3] = (code & 0x3F) as u8 | TAG_CONT;
378-
Some(4)
379-
} else {
380-
None
381-
}
382-
}
383-
384-
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
385-
/// and then returns the number of `u16`s written.
386-
///
387-
/// If the buffer is not large enough, nothing will be written into it
388-
/// and a `None` will be returned.
389-
#[inline]
390-
#[unstable(feature = "char_internals",
391-
reason = "this function should not be exposed publicly",
392-
issue = "0")]
393-
#[doc(hidden)]
394-
pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
395-
// Marked #[inline] to allow llvm optimizing it away
396-
if (ch & 0xFFFF) == ch && !dst.is_empty() {
397-
// The BMP falls through (assuming non-surrogate, as it should)
398-
dst[0] = ch as u16;
399-
Some(1)
400-
} else if dst.len() >= 2 {
401-
// Supplementary planes break into surrogates.
402-
ch -= 0x1_0000;
403-
dst[0] = 0xD800 | ((ch >> 10) as u16);
404-
dst[1] = 0xDC00 | ((ch as u16) & 0x3FF);
405-
Some(2)
406-
} else {
407-
None
365+
fn encode_utf16(self) -> EncodeUtf16 {
366+
let mut buf = [0; 2];
367+
let mut code = self as u32;
368+
let pos = if (code & 0xFFFF) == code {
369+
// The BMP falls through (assuming non-surrogate, as it should)
370+
buf[1] = code as u16;
371+
1
372+
} else {
373+
// Supplementary planes break into surrogates.
374+
code -= 0x1_0000;
375+
buf[0] = 0xD800 | ((code >> 10) as u16);
376+
buf[1] = 0xDC00 | ((code as u16) & 0x3FF);
377+
0
378+
};
379+
EncodeUtf16 { buf: buf, pos: pos }
408380
}
409381
}
410382

@@ -583,3 +555,80 @@ impl Iterator for EscapeDefault {
583555
}
584556
}
585557
}
558+
559+
/// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
560+
/// value.
561+
///
562+
/// Constructed via the `.encode_utf8()` method on `char`.
563+
#[unstable(feature = "unicode", issue = "27784")]
564+
#[derive(Debug)]
565+
pub struct EncodeUtf8 {
566+
buf: [u8; 4],
567+
pos: usize,
568+
}
569+
570+
impl EncodeUtf8 {
571+
/// Returns the remaining bytes of this iterator as a slice.
572+
#[unstable(feature = "unicode", issue = "27784")]
573+
pub fn as_slice(&self) -> &[u8] {
574+
&self.buf[self.pos..]
575+
}
576+
}
577+
578+
#[unstable(feature = "unicode", issue = "27784")]
579+
impl Iterator for EncodeUtf8 {
580+
type Item = u8;
581+
582+
fn next(&mut self) -> Option<u8> {
583+
if self.pos == self.buf.len() {
584+
None
585+
} else {
586+
let ret = Some(self.buf[self.pos]);
587+
self.pos += 1;
588+
ret
589+
}
590+
}
591+
592+
fn size_hint(&self) -> (usize, Option<usize>) {
593+
self.as_slice().iter().size_hint()
594+
}
595+
}
596+
597+
/// An iterator over `u16` entries representing the UTF-16 encoding of a `char`
598+
/// value.
599+
///
600+
/// Constructed via the `.encode_utf16()` method on `char`.
601+
#[unstable(feature = "unicode", issue = "27784")]
602+
#[derive(Debug)]
603+
pub struct EncodeUtf16 {
604+
buf: [u16; 2],
605+
pos: usize,
606+
}
607+
608+
impl EncodeUtf16 {
609+
/// Returns the remaining `u16` values of this iterator as a slice.
610+
#[unstable(feature = "unicode", issue = "27784")]
611+
pub fn as_slice(&self) -> &[u16] {
612+
&self.buf[self.pos..]
613+
}
614+
}
615+
616+
617+
#[unstable(feature = "unicode", issue = "27784")]
618+
impl Iterator for EncodeUtf16 {
619+
type Item = u16;
620+
621+
fn next(&mut self) -> Option<u16> {
622+
if self.pos == self.buf.len() {
623+
None
624+
} else {
625+
let ret = Some(self.buf[self.pos]);
626+
self.pos += 1;
627+
ret
628+
}
629+
}
630+
631+
fn size_hint(&self) -> (usize, Option<usize>) {
632+
self.as_slice().iter().size_hint()
633+
}
634+
}

Diff for: src/libcore/fmt/mod.rs

+13-14
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,9 @@ pub trait Write {
9999
/// This function will return an instance of `Error` on error.
100100
#[stable(feature = "fmt_write_char", since = "1.1.0")]
101101
fn write_char(&mut self, c: char) -> Result {
102-
let mut utf_8 = [0u8; 4];
103-
let bytes_written = c.encode_utf8(&mut utf_8).unwrap_or(0);
104-
self.write_str(unsafe { str::from_utf8_unchecked(&utf_8[..bytes_written]) })
102+
self.write_str(unsafe {
103+
str::from_utf8_unchecked(c.encode_utf8().as_slice())
104+
})
105105
}
106106

107107
/// Glue for usage of the `write!` macro with implementors of this trait.
@@ -897,10 +897,9 @@ impl<'a> Formatter<'a> {
897897
// Writes the sign if it exists, and then the prefix if it was requested
898898
let write_prefix = |f: &mut Formatter| {
899899
if let Some(c) = sign {
900-
let mut b = [0; 4];
901-
let n = c.encode_utf8(&mut b).unwrap_or(0);
902-
let b = unsafe { str::from_utf8_unchecked(&b[..n]) };
903-
try!(f.buf.write_str(b));
900+
try!(f.buf.write_str(unsafe {
901+
str::from_utf8_unchecked(c.encode_utf8().as_slice())
902+
}));
904903
}
905904
if prefixed { f.buf.write_str(prefix) }
906905
else { Ok(()) }
@@ -1003,9 +1002,10 @@ impl<'a> Formatter<'a> {
10031002
rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2),
10041003
};
10051004

1006-
let mut fill = [0; 4];
1007-
let len = self.fill.encode_utf8(&mut fill).unwrap_or(0);
1008-
let fill = unsafe { str::from_utf8_unchecked(&fill[..len]) };
1005+
let fill = self.fill.encode_utf8();
1006+
let fill = unsafe {
1007+
str::from_utf8_unchecked(fill.as_slice())
1008+
};
10091009

10101010
for _ in 0..pre_pad {
10111011
try!(self.buf.write_str(fill));
@@ -1391,10 +1391,9 @@ impl Display for char {
13911391
if f.width.is_none() && f.precision.is_none() {
13921392
f.write_char(*self)
13931393
} else {
1394-
let mut utf8 = [0; 4];
1395-
let amt = self.encode_utf8(&mut utf8).unwrap_or(0);
1396-
let s: &str = unsafe { str::from_utf8_unchecked(&utf8[..amt]) };
1397-
f.pad(s)
1394+
f.pad(unsafe {
1395+
str::from_utf8_unchecked(self.encode_utf8().as_slice())
1396+
})
13981397
}
13991398
}
14001399
}

Diff for: src/libcoretest/char.rs

+8-6
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,10 @@ fn test_escape_unicode() {
175175
#[test]
176176
fn test_encode_utf8() {
177177
fn check(input: char, expect: &[u8]) {
178-
let mut buf = [0; 4];
179-
let n = input.encode_utf8(&mut buf).unwrap_or(0);
180-
assert_eq!(&buf[..n], expect);
178+
assert_eq!(input.encode_utf8().as_slice(), expect);
179+
for (a, b) in input.encode_utf8().zip(expect) {
180+
assert_eq!(a, *b);
181+
}
181182
}
182183

183184
check('x', &[0x78]);
@@ -189,9 +190,10 @@ fn test_encode_utf8() {
189190
#[test]
190191
fn test_encode_utf16() {
191192
fn check(input: char, expect: &[u16]) {
192-
let mut buf = [0; 2];
193-
let n = input.encode_utf16(&mut buf).unwrap_or(0);
194-
assert_eq!(&buf[..n], expect);
193+
assert_eq!(input.encode_utf16().as_slice(), expect);
194+
for (a, b) in input.encode_utf16().zip(expect) {
195+
assert_eq!(a, *b);
196+
}
195197
}
196198

197199
check('x', &[0x0078]);

0 commit comments

Comments
 (0)