Skip to content

Commit 1e3410a

Browse files
committed
Fix a bug in UTF-8 decoding.
The decoder did not check that the bytes following the lead byte carry the continuation tag, so an invalid continuation byte could sneak through and be decoded into an incorrect code point. Fixes rust-lang#321
1 parent cffd451 commit 1e3410a

File tree

3 files changed

+36
-2
lines changed

3 files changed

+36
-2
lines changed

src/utf8.rs

+20
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
9292
return None;
9393
}
9494
let b1 = src[1];
95+
if 0b11_000000 & b1 != TAG_CONT {
96+
return None;
97+
}
9598
let cp = ((b0 & !TAG_TWO) as u32) << 6
9699
| ((b1 & !TAG_CONT) as u32);
97100
match cp {
@@ -104,6 +107,12 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
104107
return None;
105108
}
106109
let (b1, b2) = (src[1], src[2]);
110+
if 0b11_000000 & b1 != TAG_CONT {
111+
return None;
112+
}
113+
if 0b11_000000 & b2 != TAG_CONT {
114+
return None;
115+
}
107116
let cp = ((b0 & !TAG_THREE) as u32) << 12
108117
| ((b1 & !TAG_CONT) as u32) << 6
109118
| ((b2 & !TAG_CONT) as u32);
@@ -118,6 +127,15 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
118127
return None;
119128
}
120129
let (b1, b2, b3) = (src[1], src[2], src[3]);
130+
if 0b11_000000 & b1 != TAG_CONT {
131+
return None;
132+
}
133+
if 0b11_000000 & b2 != TAG_CONT {
134+
return None;
135+
}
136+
if 0b11_000000 & b3 != TAG_CONT {
137+
return None;
138+
}
121139
let cp = ((b0 & !TAG_FOUR) as u32) << 18
122140
| ((b1 & !TAG_CONT) as u32) << 12
123141
| ((b2 & !TAG_CONT) as u32) << 6
@@ -236,6 +254,8 @@ mod tests {
236254
assert_eq!(decode_utf8(&[0xFF]), None);
237255
// Lone surrogate code point (U+D801), which is not a valid Unicode scalar value
238256
assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
257+
// Invalid continuation byte.
258+
assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
239259
// Bad lengths
240260
assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
241261
assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes

tests/macros.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,8 @@ macro_rules! ismatch {
1313
($name:ident, $re:expr, $text:expr, $ismatch:expr) => {
1414
#[test]
1515
fn $name() {
16-
let text = text!($text);
1716
let re = regex!($re);
18-
assert!($ismatch == re.is_match(text));
17+
assert!($ismatch == re.is_match(text!($text)));
1918
}
2019
};
2120
}

tests/test_default_bytes.rs

+15
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,21 @@ macro_rules! regex_set {
4141
include!("macros_bytes.rs");
4242
include!("macros.rs");
4343

44+
// A silly wrapper to make it possible to write and match raw bytes.
45+
struct R<'a>(&'a [u8]);
46+
impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } }
47+
48+
// See: https://github.com/rust-lang/regex/issues/321
49+
//
50+
// These tests are here because they do not have the same behavior in every
51+
// regex engine.
52+
mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3)));
53+
mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None);
54+
mat!(invalid_utf8_nfa3, r".", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
55+
Some((1, 3)));
56+
mat!(invalid_utf8_nfa4, r"${2}ä", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
57+
None);
58+
4459
mod api;
4560
mod bytes;
4661
mod crazy;

0 commit comments

Comments
 (0)