unicode: remove implementations of encode_utf8

BurntSushi · BurntSushi · commit 9604cc07ed36 · 2018-05-01T16:48:46.000-04:00
This commit removes our hand-written `encode_utf8` implementations from
`regex-syntax/src/unicode.rs` and `src/utf8.rs` and replaces their call
sites with `char::encode_utf8`, which was added to the standard library
in Rust 1.15.
diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs
@@ -19,7 +19,6 @@ use std::mem;
 use std::ops;
 
 use hir::{self, Hir, HirKind};
-use unicode;
 
 /// A set of literal byte strings extracted from a regular expression.
 ///
@@ -603,9 +602,8 @@ impl Literals {
 fn prefixes(expr: &Hir, lits: &mut Literals) {
     match *expr.kind() {
         HirKind::Literal(hir::Literal::Unicode(c)) => {
-            let mut buf = [0u8; 4];
-            let i = unicode::encode_utf8(c, &mut buf).unwrap();
-            lits.cross_add(&buf[..i]);
+            let mut buf = [0; 4];
+            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
         }
         HirKind::Literal(hir::Literal::Byte(b)) => {
             lits.cross_add(&[b]);
@@ -685,7 +683,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
     match *expr.kind() {
         HirKind::Literal(hir::Literal::Unicode(c)) => {
             let mut buf = [0u8; 4];
-            let i = unicode::encode_utf8(c, &mut buf).unwrap();
+            let i = c.encode_utf8(&mut buf).len();
             let mut buf = &mut buf[..i];
             buf.reverse();
             lits.cross_add(buf);
diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs
@@ -25,42 +25,6 @@ pub enum Error {
     PropertyValueNotFound,
 }
 
-/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
-///
-/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
-/// of bytes written is returned.
-pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
-    // TODO: Remove this function once we move to at least Rust 1.15, which
-    // provides char::encode_utf8 for us.
-    const TAG_CONT: u8 = 0b1000_0000;
-    const TAG_TWO: u8 = 0b1100_0000;
-    const TAG_THREE: u8 = 0b1110_0000;
-    const TAG_FOUR: u8 = 0b1111_0000;
-
-    let code = character as u32;
-    if code <= 0x7F && !dst.is_empty() {
-        dst[0] = code as u8;
-        Some(1)
-    } else if code <= 0x7FF && dst.len() >= 2 {
-        dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
-        dst[1] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(2)
-    } else if code <= 0xFFFF && dst.len() >= 3  {
-        dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
-        dst[1] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(3)
-    } else if dst.len() >= 4 {
-        dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
-        dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[3] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(4)
-    } else {
-        None
-    }
-}
-
 /// An iterator over a codepoint's simple case equivalence class.
 #[derive(Debug)]
 pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);
diff --git a/src/utf8.rs b/src/utf8.rs
@@ -38,37 +38,6 @@ pub fn next_utf8(text: &[u8], i: usize) -> usize {
     i + inc
 }
 
-/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
-///
-/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
-/// of bytes written is returned.
-#[allow(dead_code)]
-#[inline]
-pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
-    let code = character as u32;
-    if code <= 0x7F && !dst.is_empty() {
-        dst[0] = code as u8;
-        Some(1)
-    } else if code <= 0x7FF && dst.len() >= 2 {
-        dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
-        dst[1] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(2)
-    } else if code <= 0xFFFF && dst.len() >= 3  {
-        dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
-        dst[1] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(3)
-    } else if dst.len() >= 4 {
-        dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
-        dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[3] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(4)
-    } else {
-        None
-    }
-}
-
 /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
 ///
 /// If no valid UTF-8 sequence could be found, then `None` is returned.
@@ -184,14 +153,14 @@ mod tests {
 
     use super::{
         TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
-        decode_utf8, decode_last_utf8, encode_utf8,
+        decode_utf8, decode_last_utf8,
     };
 
     #[test]
     fn prop_roundtrip() {
         fn p(given_cp: char) -> bool {
             let mut tmp = [0; 4];
-            let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
+            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
             let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
             encoded_len == got_len && given_cp == got_cp
         }
@@ -202,7 +171,7 @@ mod tests {
     fn prop_roundtrip_last() {
         fn p(given_cp: char) -> bool {
             let mut tmp = [0; 4];
-            let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
+            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
             let (got_cp, got_len) =
                 decode_last_utf8(&tmp[..encoded_len]).unwrap();
             encoded_len == got_len && given_cp == got_cp
@@ -214,7 +183,7 @@ mod tests {
     fn prop_encode_matches_std() {
         fn p(cp: char) -> bool {
             let mut got = [0; 4];
-            let n = encode_utf8(cp, &mut got).unwrap();
+            let n = cp.encode_utf8(&mut got).len();
             let expected = cp.to_string();
             &got[..n] == expected.as_bytes()
         }
@@ -225,7 +194,7 @@ mod tests {
     fn prop_decode_matches_std() {
         fn p(given_cp: char) -> bool {
             let mut tmp = [0; 4];
-            let n = encode_utf8(given_cp, &mut tmp).unwrap();
+            let n = given_cp.encode_utf8(&mut tmp).len();
             let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
             let expected_cp =
                 str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
@@ -238,7 +207,7 @@ mod tests {
     fn prop_decode_last_matches_std() {
         fn p(given_cp: char) -> bool {
             let mut tmp = [0; 4];
-            let n = encode_utf8(given_cp, &mut tmp).unwrap();
+            let n = given_cp.encode_utf8(&mut tmp).len();
             let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
             let expected_cp =
                 str::from_utf8(&tmp[..n]).unwrap()