Skip to content

Commit 9604cc0

Browse files
committed
unicode: remove implementations of encode_utf8
This commit removes our explicit implementations of encode_utf8 and replaces them with uses of `char::encode_utf8`, which was added to the standard library in Rust 1.15.
1 parent fc06d1a commit 9604cc0

File tree

3 files changed: +9 -78 lines changed

Diff for: regex-syntax/src/hir/literal/mod.rs

+3 -5
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ use std::mem;
1919
use std::ops;
2020

2121
use hir::{self, Hir, HirKind};
22-
use unicode;
2322

2423
/// A set of literal byte strings extracted from a regular expression.
2524
///
@@ -603,9 +602,8 @@ impl Literals {
603602
fn prefixes(expr: &Hir, lits: &mut Literals) {
604603
match *expr.kind() {
605604
HirKind::Literal(hir::Literal::Unicode(c)) => {
606-
let mut buf = [0u8; 4];
607-
let i = unicode::encode_utf8(c, &mut buf).unwrap();
608-
lits.cross_add(&buf[..i]);
605+
let mut buf = [0; 4];
606+
lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
609607
}
610608
HirKind::Literal(hir::Literal::Byte(b)) => {
611609
lits.cross_add(&[b]);
@@ -685,7 +683,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
685683
match *expr.kind() {
686684
HirKind::Literal(hir::Literal::Unicode(c)) => {
687685
let mut buf = [0u8; 4];
688-
let i = unicode::encode_utf8(c, &mut buf).unwrap();
686+
let i = c.encode_utf8(&mut buf).len();
689687
let mut buf = &mut buf[..i];
690688
buf.reverse();
691689
lits.cross_add(buf);

Diff for: regex-syntax/src/unicode.rs

-36
Original file line numberDiff line numberDiff line change
@@ -25,42 +25,6 @@ pub enum Error {
2525
PropertyValueNotFound,
2626
}
2727

28-
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
29-
///
30-
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
31-
/// of bytes written is returned.
32-
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
33-
// TODO: Remove this function once we move to at least Rust 1.15, which
34-
// provides char::encode_utf8 for us.
35-
const TAG_CONT: u8 = 0b1000_0000;
36-
const TAG_TWO: u8 = 0b1100_0000;
37-
const TAG_THREE: u8 = 0b1110_0000;
38-
const TAG_FOUR: u8 = 0b1111_0000;
39-
40-
let code = character as u32;
41-
if code <= 0x7F && !dst.is_empty() {
42-
dst[0] = code as u8;
43-
Some(1)
44-
} else if code <= 0x7FF && dst.len() >= 2 {
45-
dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
46-
dst[1] = (code & 0x3F) as u8 | TAG_CONT;
47-
Some(2)
48-
} else if code <= 0xFFFF && dst.len() >= 3 {
49-
dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
50-
dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
51-
dst[2] = (code & 0x3F) as u8 | TAG_CONT;
52-
Some(3)
53-
} else if dst.len() >= 4 {
54-
dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
55-
dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
56-
dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
57-
dst[3] = (code & 0x3F) as u8 | TAG_CONT;
58-
Some(4)
59-
} else {
60-
None
61-
}
62-
}
63-
6428
/// An iterator over a codepoint's simple case equivalence class.
6529
#[derive(Debug)]
6630
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);

Diff for: src/utf8.rs

+6 -37
Original file line numberDiff line numberDiff line change
@@ -38,37 +38,6 @@ pub fn next_utf8(text: &[u8], i: usize) -> usize {
3838
i + inc
3939
}
4040

41-
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
42-
///
43-
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
44-
/// of bytes written is returned.
45-
#[allow(dead_code)]
46-
#[inline]
47-
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
48-
let code = character as u32;
49-
if code <= 0x7F && !dst.is_empty() {
50-
dst[0] = code as u8;
51-
Some(1)
52-
} else if code <= 0x7FF && dst.len() >= 2 {
53-
dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
54-
dst[1] = (code & 0x3F) as u8 | TAG_CONT;
55-
Some(2)
56-
} else if code <= 0xFFFF && dst.len() >= 3 {
57-
dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
58-
dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
59-
dst[2] = (code & 0x3F) as u8 | TAG_CONT;
60-
Some(3)
61-
} else if dst.len() >= 4 {
62-
dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
63-
dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
64-
dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
65-
dst[3] = (code & 0x3F) as u8 | TAG_CONT;
66-
Some(4)
67-
} else {
68-
None
69-
}
70-
}
71-
7241
/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
7342
///
7443
/// If no valid UTF-8 sequence could be found, then `None` is returned.
@@ -184,14 +153,14 @@ mod tests {
184153

185154
use super::{
186155
TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
187-
decode_utf8, decode_last_utf8, encode_utf8,
156+
decode_utf8, decode_last_utf8,
188157
};
189158

190159
#[test]
191160
fn prop_roundtrip() {
192161
fn p(given_cp: char) -> bool {
193162
let mut tmp = [0; 4];
194-
let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
163+
let encoded_len = given_cp.encode_utf8(&mut tmp).len();
195164
let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
196165
encoded_len == got_len && given_cp == got_cp
197166
}
@@ -202,7 +171,7 @@ mod tests {
202171
fn prop_roundtrip_last() {
203172
fn p(given_cp: char) -> bool {
204173
let mut tmp = [0; 4];
205-
let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
174+
let encoded_len = given_cp.encode_utf8(&mut tmp).len();
206175
let (got_cp, got_len) =
207176
decode_last_utf8(&tmp[..encoded_len]).unwrap();
208177
encoded_len == got_len && given_cp == got_cp
@@ -214,7 +183,7 @@ mod tests {
214183
fn prop_encode_matches_std() {
215184
fn p(cp: char) -> bool {
216185
let mut got = [0; 4];
217-
let n = encode_utf8(cp, &mut got).unwrap();
186+
let n = cp.encode_utf8(&mut got).len();
218187
let expected = cp.to_string();
219188
&got[..n] == expected.as_bytes()
220189
}
@@ -225,7 +194,7 @@ mod tests {
225194
fn prop_decode_matches_std() {
226195
fn p(given_cp: char) -> bool {
227196
let mut tmp = [0; 4];
228-
let n = encode_utf8(given_cp, &mut tmp).unwrap();
197+
let n = given_cp.encode_utf8(&mut tmp).len();
229198
let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
230199
let expected_cp =
231200
str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
@@ -238,7 +207,7 @@ mod tests {
238207
fn prop_decode_last_matches_std() {
239208
fn p(given_cp: char) -> bool {
240209
let mut tmp = [0; 4];
241-
let n = encode_utf8(given_cp, &mut tmp).unwrap();
210+
let n = given_cp.encode_utf8(&mut tmp).len();
242211
let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
243212
let expected_cp =
244213
str::from_utf8(&tmp[..n]).unwrap()

0 commit comments

Comments
 (0)