rust-lang · sonwow · Mar 15, 2013 · huonw · Mar 15, 2013
diff --git a/src/libcore/io.rs b/src/libcore/io.rs
@@ -207,6 +207,25 @@ pub trait ReaderUtil {
     fn read_i8(&self) -> i8;
 }
 
+/// Reader helpers that repair invalid UTF-8 while reading text.
+pub trait ReaderUtilEx {
+
+    /// Reads bytes until the delimiter `c` (consumed, not returned) or EOF,
+    /// then repairs invalid UTF-8 in what was read according to `mode`.
+    /// Modes are the `str` constants: `strict` (1) raises `is_not_utf8`
+    /// without trapping it, `replacement` (2) substitutes U+FFFD for each
+    /// invalid sequence, `ignore` (3) drops each invalid sequence.
+    fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str;
+
+    /// Reads up to the first '\n' (consumed, not returned) or EOF,
+    /// repairing invalid UTF-8 according to `mode`.
+    fn read_fixed_utf8_line(&self, mode: uint) -> ~str;
+
+    /// Calls `it` with each line until it returns false or EOF is reached,
+    /// repairing invalid UTF-8 in every line according to `mode`.
+    fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool);
+}
+
 impl<T:Reader> ReaderUtil for T {
 
     fn read_bytes(&self,len: uint) -> ~[u8] {
@@ -463,6 +482,31 @@ impl<T:Reader> ReaderUtil for T {
     }
 }
 
+impl<T:Reader> ReaderUtilEx for T {
+
+    // Gathers raw bytes up to the first one equal to `c`, or until read_byte gives -1.
+    fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str {
+        let mut buf = ~[];
+        loop {
+            let b = self.read_byte();
+            if b == -1 || b == c as int { break; }
+            buf.push(b as u8);
+        }
+        str::from_fixed_utf8_bytes(buf, mode)
+    }
+
+    fn read_fixed_utf8_line(&self, mode: uint) -> ~str {
+        self.read_and_fix_utf8_until('\n', mode)
+    }
+
+    fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool) {
+        while !self.eof() {
+            let line = self.read_fixed_utf8_line(mode);
+            if !it(line) { break; }
+        }
+    }
+}
+
 fn extend_sign(val: u64, nbytes: uint) -> i64 {
     let shift = (8 - nbytes) * 8;
     (val << shift) as i64 >> shift

diff --git a/src/libcore/str.rs b/src/libcore/str.rs
@@ -21,6 +21,7 @@ use at_vec;
 use cast;
 use char;
 use cmp::{Equiv, TotalOrd, Ordering, Less, Equal, Greater};
+use condition;
 use libc;
 use option::{None, Option, Some};
 use ptr;
@@ -47,6 +48,35 @@ pub pure fn from_bytes(vv: &[const u8]) -> ~str {
     return unsafe { raw::from_bytes(vv) };
 }
 
+// Raised by `fix_utf8`, carrying the offending bytes, for each invalid UTF-8 sequence
+condition! {
+    is_not_utf8: ~[u8] -> ();
+}
+
+/**
+ * Modes selecting how `from_fixed_utf8_bytes` handles invalid UTF-8
+ *
+ * strict       leave `is_not_utf8` untrapped (also used for any other value)
+ * replacement  replace each invalid sequence with U+FFFD
+ * ignore       drop each invalid sequence
+ */
+pub const strict: uint = 1;
+pub const replacement: uint = 2;
+pub const ignore: uint = 3;
+
+/**
+ * Convert a vector of bytes to a string, repairing invalid UTF-8
+ *
+ * `replacement` and `ignore` trap `is_not_utf8`; other modes leave it untrapped
+ */
+pub fn from_fixed_utf8_bytes(v: &[const u8], mode: uint) -> ~str {
+    let fixed = match mode {
+        replacement | ignore => do is_not_utf8::cond.trap(|_| {()}).in { fix_utf8(v, mode) },
+        _ => fix_utf8(v, mode)
+    };
+    unsafe { raw::from_bytes(fixed) }
+}
+
 /// Copy a slice into a new unique str
 pub pure fn from_slice(s: &str) -> ~str {
     unsafe { raw::slice_bytes(s, 0, len(s)) }
@@ -1548,6 +1578,39 @@ pub pure fn is_utf8(v: &[const u8]) -> bool {
     return true;
 }
 
+/// Copies `v`, keeping valid UTF-8 sequences and handling invalid ones per `mode`.
+///
+/// Each invalid sequence raises `is_not_utf8` with the bytes consumed for it;
+/// in `replacement` mode U+FFFD is appended to the result first.
+pub fn fix_utf8(v: &[const u8], mode: uint) -> ~[u8] {
+    let mut i = 0u;
+    let total = vec::len::<u8>(v);
+    let mut result = ~[];
+    while i < total {
+        // `chend` is where the sequence led by `v[i]` should end, going by its first byte.
+        let chend = i + utf8_char_width(v[i]);
+        let mut j = i + 1u;
+        while j < total && j < chend && v[j] & 192u8 == tag_cont_u8 {
+            j += 1u;
+        }
+        if j == chend {
+            fail_unless!(i != chend);
+            result = vec::append(result, v.view(i, j));
+        } else {
+            if mode == replacement {
+                let replacement_char: ~[u8] = ~[0xef, 0xbf, 0xbd];
+                result = vec::append(result, replacement_char);
+            }
+            // Report only `i..j`, the bytes actually consumed. `chend` can lie past
+            // the end of `v` when the input stops mid-sequence, so slicing up to it
+            // would go out of bounds; bytes from `j` on are re-scanned next iteration.
+            is_not_utf8::cond.raise(v.slice(i, j));
+        }
+        i = j;
+    }
+    result
+}
+
 /// Determines if a vector of `u16` contains valid UTF-16
 pub pure fn is_utf16(v: &[u16]) -> bool {
     let len = vec::len(v);