Skip to content

Provide a condition in string-based IO routines #5399

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/libcore/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,25 @@ pub trait ReaderUtil {
fn read_i8(&self) -> i8;
}

/// Reader utilities that read delimited text and repair invalid UTF-8.
///
/// Every method takes a `mode` that selects how invalid UTF-8 is handled.
/// The values are declared as constants in `str`:
/// strict (1) raises an error, replacement (2) substitutes the Unicode
/// replacement character for the invalid bytes, and ignore (3) drops them.
pub trait ReaderUtilEx {

/// Reads bytes until the delimiter `c` or EOF is reached and returns them
/// as a string.
///
/// The delimiter is consumed from the reader but is not part of the result.
/// Invalid UTF-8 in the bytes read is handled according to `mode`
/// (strict: raise an error; replacement: substitute the Unicode replacement
/// character; ignore: drop the invalid bytes).
fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str;

/// Reads up to the first '\n' (which is not returned) or EOF.
///
/// Invalid UTF-8 in the line is handled according to `mode`.
fn read_fixed_utf8_line(&self, mode: uint) -> ~str;

/// Calls `it` with each line until `it` returns false or EOF is reached.
///
/// Invalid UTF-8 in each line is handled according to `mode`.
fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool);
}

impl<T:Reader> ReaderUtil for T {

fn read_bytes(&self,len: uint) -> ~[u8] {
Expand Down Expand Up @@ -463,6 +482,31 @@ impl<T:Reader> ReaderUtil for T {
}
}

impl<T:Reader> ReaderUtilEx for T {

    /// Collects raw bytes up to the delimiter `c` (consumed, not returned)
    /// or EOF, then hands them to `str::from_fixed_utf8_bytes` with `mode`.
    fn read_and_fix_utf8_until(&self, c: char, mode: uint) -> ~str {
        // `read_byte` yields an int, with -1 signalling EOF, so the
        // delimiter is compared in the same integer domain.
        let delim = c as int;
        let mut buf = ~[];
        loop {
            let b = self.read_byte();
            let at_eof = b == -1;
            if at_eof || b == delim {
                break;
            }
            buf.push(b as u8);
        }
        str::from_fixed_utf8_bytes(buf, mode)
    }

    /// Reads one '\n'-terminated line; the newline itself is not returned.
    fn read_fixed_utf8_line(&self, mode: uint) -> ~str {
        let newline = '\n';
        self.read_and_fix_utf8_until(newline, mode)
    }

    /// Feeds successive lines to `it` until it returns false or the reader
    /// reports EOF.
    fn each_fixed_utf8_line(&self, mode: uint, it: fn(&str) -> bool) {
        loop {
            if self.eof() { break; }
            let keep_going = it(self.read_fixed_utf8_line(mode));
            if !keep_going { break; }
        }
    }
}

fn extend_sign(val: u64, nbytes: uint) -> i64 {
let shift = (8 - nbytes) * 8;
(val << shift) as i64 >> shift
Expand Down
63 changes: 63 additions & 0 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use at_vec;
use cast;
use char;
use cmp::{Equiv, TotalOrd, Ordering, Less, Equal, Greater};
use condition;
use libc;
use option::{None, Option, Some};
use ptr;
Expand All @@ -47,6 +48,35 @@ pub pure fn from_bytes(vv: &[const u8]) -> ~str {
return unsafe { raw::from_bytes(vv) };
}

// Condition raised when a byte sequence is not valid UTF-8.
// The raised value is the offending bytes (`~[u8]`); a handler returns `()`,
// so it can observe the bytes but cannot supply a substitute.
condition! {
is_not_utf8: ~[u8] -> ();
}

// Modes for handling invalid UTF-8, accepted as the `mode` argument of
// `from_fixed_utf8_bytes` and `fix_utf8`. They are public so that callers
// outside this module can name them instead of passing bare numbers.

/// Strict mode (the default): invalid UTF-8 raises an error. Any value that
/// is neither `replacement` nor `ignore` is treated the same way.
pub const strict: uint = 1;

/// Replacement mode: the invalid bytes are replaced with the Unicode
/// replacement character (U+FFFD).
pub const replacement: uint = 2;

/// Ignore mode: the invalid bytes are dropped from the output.
pub const ignore: uint = 3;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably better as an enum (enum UTF8Mode { Strict, Replacement, Ignore }), unless there was a particular reason for using the constants?


/**
 * Convert a vector of bytes to a UTF-8 string, repairing invalid input
 *
 * The bytes are passed through `fix_utf8`, which raises the `is_not_utf8`
 * condition for each invalid sequence it meets. For the `replacement` and
 * `ignore` modes a do-nothing handler is installed around that call, so the
 * repair runs to completion. For any other mode (including `strict`) no
 * handler is installed here, and the condition is left to whatever handler
 * the caller has set up.
 */
pub fn from_fixed_utf8_bytes(v: &[const u8], mode: uint) -> ~str {
    let lenient = mode == replacement || mode == ignore;
    let fixed = if lenient {
        do is_not_utf8::cond.trap(|_| {()}).in { fix_utf8(v, mode) }
    } else {
        fix_utf8(v, mode)
    };
    // NOTE(review): relies on `fix_utf8` having removed or replaced every
    // invalid sequence, so the unchecked conversion sees valid UTF-8.
    unsafe { raw::from_bytes(fixed) }
}

/// Copy a slice into a new unique str
pub pure fn from_slice(s: &str) -> ~str {
unsafe { raw::slice_bytes(s, 0, len(s)) }
Expand Down Expand Up @@ -1548,6 +1578,39 @@ pub pure fn is_utf8(v: &[const u8]) -> bool {
return true;
}

/**
 * Copy a vector of bytes, repairing any invalid UTF-8 it contains
 *
 * Valid sequences are copied to the result unchanged. For each invalid
 * sequence the `is_not_utf8` condition is raised with exactly the bytes
 * that are being skipped. In `replacement` mode the UTF-8 encoding of
 * U+FFFD (0xEF 0xBF 0xBD) is appended in their place first; in every
 * other mode nothing is appended.
 *
 * # Arguments
 *
 * * v - the bytes to examine
 * * mode - one of `strict`, `replacement` or `ignore`
 *
 * # Return value
 *
 * The repaired bytes. Whether the function returns at all after an invalid
 * sequence depends on the `is_not_utf8` handler in effect when it is raised.
 */
pub fn fix_utf8(v: &[const u8], mode: uint) -> ~[u8] {
    let mut i = 0u;
    let total = vec::len::<u8>(v);
    let mut result = ~[];
    while i < total {
        // Expected end of the sequence starting at `i`. It can equal `i`
        // (width 0 for a byte that cannot start a sequence) and it can lie
        // past `total` when the input is cut off mid-sequence.
        let chend = i + utf8_char_width(v[i]);
        // Consume the lead byte, then as many continuation bytes as the
        // sequence is entitled to. `j` never exceeds `total`.
        let mut j = i + 1u;
        while j < total && j < chend && v[j] & 192u8 == tag_cont_u8 {
            j += 1u;
        }
        if j == chend {
            // Complete sequence. `j > i` always holds, so `chend != i` here.
            result = vec::append(result, v.view(i, j));
        } else {
            if mode == replacement {
                let replacement_char: ~[u8] = ~[0xef, 0xbf, 0xbd];
                result = vec::append(result, replacement_char);
            }
            // Report only the bytes actually skipped (`i..j`). Slicing to
            // `chend` would run past the end of `v` for a truncated trailing
            // sequence, and would otherwise include bytes that are scanned
            // again on the next iteration.
            is_not_utf8::cond.raise(v.slice(i, j));
        }
        i = j;
    }
    result
}

/// Determines if a vector of `u16` contains valid UTF-16
pub pure fn is_utf16(v: &[u16]) -> bool {
let len = vec::len(v);
Expand Down