Fix a bug in UTF-8 decoding.

BurntSushi · BurntSushi · commit 1e3410ac0938 · 2017-02-18T19:46:23.000-05:00
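It was possible for an invalid continuation byte to sneak through, because the decoder did not check that each trailing byte of a multi-byte sequence carried the continuation tag (`TAG_CONT`) before folding its bits into the code point. This resulted in incorrect UTF-8 decoding results. Fixes rust-lang#321.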
diff --git a/src/utf8.rs b/src/utf8.rs
@@ -92,6 +92,9 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
                 return None;
             }
             let b1 = src[1];
+            if 0b11_000000 & b1 != TAG_CONT {
+                return None;
+            }
             let cp = ((b0 & !TAG_TWO) as u32) << 6
                      | ((b1 & !TAG_CONT) as u32);
             match cp {
@@ -104,6 +107,12 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
                 return None;
             }
             let (b1, b2) = (src[1], src[2]);
+            if 0b11_000000 & b1 != TAG_CONT {
+                return None;
+            }
+            if 0b11_000000 & b2 != TAG_CONT {
+                return None;
+            }
             let cp = ((b0 & !TAG_THREE) as u32) << 12
                      | ((b1 & !TAG_CONT) as u32) << 6
                      | ((b2 & !TAG_CONT) as u32);
@@ -118,6 +127,15 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
                 return None;
             }
             let (b1, b2, b3) = (src[1], src[2], src[3]);
+            if 0b11_000000 & b1 != TAG_CONT {
+                return None;
+            }
+            if 0b11_000000 & b2 != TAG_CONT {
+                return None;
+            }
+            if 0b11_000000 & b3 != TAG_CONT {
+                return None;
+            }
             let cp = ((b0 & !TAG_FOUR) as u32) << 18
                      | ((b1 & !TAG_CONT) as u32) << 12
                      | ((b2 & !TAG_CONT) as u32) << 6
@@ -236,6 +254,8 @@ mod tests {
         assert_eq!(decode_utf8(&[0xFF]), None);
         // Surrogate pair
         assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
+        // Invalid continuation byte.
+        assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
         // Bad lengths
         assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
         assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
diff --git a/tests/macros.rs b/tests/macros.rs
@@ -13,9 +13,8 @@ macro_rules! ismatch {
     ($name:ident, $re:expr, $text:expr, $ismatch:expr) => {
         #[test]
         fn $name() {
-            let text = text!($text);
             let re = regex!($re);
-            assert!($ismatch == re.is_match(text));
+            assert!($ismatch == re.is_match(text!($text)));
         }
     };
 }
diff --git a/tests/test_default_bytes.rs b/tests/test_default_bytes.rs
@@ -41,6 +41,21 @@ macro_rules! regex_set {
 include!("macros_bytes.rs");
 include!("macros.rs");
 
+// A silly wrapper to make it possible to write and match raw bytes.
+struct R<'a>(&'a [u8]);
+impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } }
+
+// See: https://github.com/rust-lang/regex/issues/321
+//
+// These tests are here because they do not have the same behavior in every
+// regex engine.
+mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3)));
+mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None);
+mat!(invalid_utf8_nfa3, r".", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
+     Some((1, 3)));
+mat!(invalid_utf8_nfa4, r"${2}ä", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
+     None);
+
 mod api;
 mod bytes;
 mod crazy;

Original file line number	Diff line number	Diff line change
`@@ -13,9 +13,8 @@ macro_rules! ismatch {`
`13`	`13`	`($name:ident, $re:expr, $text:expr, $ismatch:expr) => {`
`14`	`14`	`#[test]`
`15`	`15`	`fn $name() {`
`16`		`- let text = text!($text);`
`17`	`16`	`let re = regex!($re);`
`18`		`- assert!($ismatch == re.is_match(text));`
	`17`	`+ assert!($ismatch == re.is_match(text!($text)));`
`19`	`18`	`}`
`20`	`19`	`};`
`21`	`20`	`}`