auto merge of #5398 : dbaupp/rust/core-readlines, r=graydon

bors · bors · commit 1616ffd0c262 · 2013-03-22T09:24:53.000-07:00
The `each_line` function in `ReaderUtil` acts very differently to equivalent functions in Python, Ruby, Clojure etc. E.g. given a file `t` with contents `trailing\nnew line\n` and `n` containing `no trailing\nnew line`:

Rust:
```Rust
t: ~[~"trailing", ~"new line", ~""]
n: ~[~"no trailing", ~"new line"]
```

Python:
```Python
>>> open('t').readlines()
['trailing\n', 'new line\n']
>>> open('n').readlines()
['no trailing\n', 'new line']
```

Ruby:
```Ruby
irb(main):001:0> File.readlines('t')
=> ["trailing\n", "new line\n"]
irb(main):002:0> File.readlines('n')
=> ["no trailing\n", "new line"]
```

Clojure:
```Clojure
user=> (read-lines "t")
("trailing" "new line")
user=> (read-lines "n")
("no trailing" "new line")
```

The extra string that Rust includes at the end is inconsistent, and means that it is impossible to distinguish between the "real" empty line at the end of a file that ends `...\n\n`, and the "fake" one after the last `\n`.

The code attached makes Rust's `each_line` act like Clojure (and PHP, i.e. not including the `\n`), as well as adjusting `str::lines` to fix the trailing empty line problem.

Also, add a convenience `read_lines` method to read all the lines in a file into a vector.
diff --git a/src/compiletest/runtest.rs b/src/compiletest/runtest.rs
@@ -267,7 +267,7 @@ fn run_debuginfo_test(config: config, props: TestProps, testfile: &Path) {
         // check if each line in props.check_lines appears in the
         // output (in order)
         let mut i = 0u;
-        for str::lines(ProcRes.stdout).each |line| {
+        for str::lines_each(ProcRes.stdout) |line| {
             if props.check_lines[i].trim() == line.trim() {
                 i += 1u;
             }
@@ -297,8 +297,8 @@ fn check_error_patterns(props: TestProps,
     let mut next_err_idx = 0u;
     let mut next_err_pat = props.error_patterns[next_err_idx];
     let mut done = false;
-    for str::split_char(ProcRes.stderr, '\n').each |line| {
-        if str::contains(*line, next_err_pat) {
+    for str::lines_each(ProcRes.stderr) |line| {
+        if str::contains(line, next_err_pat) {
             debug!("found error pattern %s", next_err_pat);
             next_err_idx += 1u;
             if next_err_idx == vec::len(props.error_patterns) {
@@ -347,15 +347,15 @@ fn check_expected_errors(expected_errors: ~[errors::ExpectedError],
     //    filename:line1:col1: line2:col2: *warning:* msg
     // where line1:col1: is the starting point, line2:col2:
     // is the ending point, and * represents ANSI color codes.
-    for str::split_char(ProcRes.stderr, '\n').each |line| {
+    for str::lines_each(ProcRes.stderr) |line| {
         let mut was_expected = false;
         for vec::eachi(expected_errors) |i, ee| {
             if !found_flags[i] {
                 debug!("prefix=%s ee.kind=%s ee.msg=%s line=%s",
-                       prefixes[i], ee.kind, ee.msg, *line);
-                if (str::starts_with(*line, prefixes[i]) &&
-                    str::contains(*line, ee.kind) &&
-                    str::contains(*line, ee.msg)) {
+                       prefixes[i], ee.kind, ee.msg, line);
+                if (str::starts_with(line, prefixes[i]) &&
+                    str::contains(line, ee.kind) &&
+                    str::contains(line, ee.msg)) {
                     found_flags[i] = true;
                     was_expected = true;
                     break;
@@ -364,13 +364,13 @@ fn check_expected_errors(expected_errors: ~[errors::ExpectedError],
         }
 
         // ignore this msg which gets printed at the end
-        if str::contains(*line, ~"aborting due to") {
+        if str::contains(line, ~"aborting due to") {
             was_expected = true;
         }
 
-        if !was_expected && is_compiler_error_or_warning(*line) {
+        if !was_expected && is_compiler_error_or_warning(str::from_slice(line)) {
             fatal_ProcRes(fmt!("unexpected compiler error or warning: '%s'",
-                               *line),
+                               line),
                           ProcRes);
         }
     }
diff --git a/src/libcore/io.rs b/src/libcore/io.rs
@@ -99,8 +99,8 @@ pub trait ReaderUtil {
     /// Read len bytes into a new vec.
     fn read_bytes(&self, len: uint) -> ~[u8];
 
-    /// Read up until a specified character (which is not returned) or EOF.
-    fn read_until(&self, c: char) -> ~str;
+    /// Read up until a specified character (which is optionally included) or EOF.
+    fn read_until(&self, c: char, include: bool) -> ~str;
 
     /// Read up until the first '\n' char (which is not returned), or EOF.
     fn read_line(&self) -> ~str;
@@ -126,6 +126,9 @@ pub trait ReaderUtil {
     /// Iterate over every line until the iterator breaks or EOF.
     fn each_line(&self, it: &fn(&str) -> bool);
 
+    /// Read all the lines of the file into a vector.
+    fn read_lines(&self) -> ~[~str];
+
     /// Read n (between 1 and 8) little-endian unsigned integer bytes.
     fn read_le_uint_n(&self, nbytes: uint) -> u64;
 
@@ -219,11 +222,14 @@ impl<T:Reader> ReaderUtil for T {
         bytes
     }
 
-    fn read_until(&self, c: char) -> ~str {
+    fn read_until(&self, c: char, include: bool) -> ~str {
         let mut bytes = ~[];
         loop {
             let ch = self.read_byte();
             if ch == -1 || ch == c as int {
+                if include && ch == c as int {
+                    bytes.push(ch as u8);
+                }
                 break;
             }
             bytes.push(ch as u8);
@@ -232,7 +238,7 @@ impl<T:Reader> ReaderUtil for T {
     }
 
     fn read_line(&self) -> ~str {
-        self.read_until('\n')
+        self.read_until('\n', false)
     }
 
     fn read_chars(&self, n: uint) -> ~[char] {
@@ -306,7 +312,7 @@ impl<T:Reader> ReaderUtil for T {
     }
 
     fn read_c_str(&self) -> ~str {
-        self.read_until(0 as char)
+        self.read_until(0 as char, false)
     }
 
     fn read_whole_stream(&self) -> ~[u8] {
@@ -329,7 +335,29 @@ impl<T:Reader> ReaderUtil for T {
 
     fn each_line(&self, it: &fn(s: &str) -> bool) {
         while !self.eof() {
-            if !it(self.read_line()) { break; }
+            // include the \n, so that we can distinguish an entirely empty
+            // line read after "...\n", and the trailing empty line in
+            // "...\n\n".
+            let mut line = self.read_until('\n', true);
+
+            // blank line at the end of the reader is ignored
+            if self.eof() && line.is_empty() { break; }
+
+            // trim the \n, so that each_line is consistent with read_line
+            let n = str::len(line);
+            if line[n-1] == '\n' as u8 {
+                unsafe { str::raw::set_len(&mut line, n-1); }
+            }
+
+            if !it(line) { break; }
+        }
+    }
+
+    fn read_lines(&self) -> ~[~str] {
+        do vec::build |push| {
+            for self.each_line |line| {
+                push(str::from_slice(line));
+            }
         }
     }
 
@@ -1335,6 +1363,21 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_read_lines() {
+        do io::with_str_reader(~"a\nb\nc\n") |inp| {
+            fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]);
+        }
+
+        do io::with_str_reader(~"a\nb\nc") |inp| {
+            fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]);
+        }
+
+        do io::with_str_reader(~"") |inp| {
+            fail_unless!(inp.read_lines().is_empty());
+        }
+    }
+
     #[test]
     fn test_readchars_wide() {
         let wide_test = ~"生锈的汤匙切肉汤hello生锈的汤匙切肉汤";
diff --git a/src/libcore/str.rs b/src/libcore/str.rs
@@ -437,28 +437,37 @@ pub pure fn slice(s: &'a str, begin: uint, end: uint) -> &'a str {
     unsafe { raw::slice_bytes(s, begin, end) }
 }
 
-/// Splits a string into substrings at each occurrence of a given character
+/// Splits a string into substrings at each occurrence of a given
+/// character.
 pub pure fn split_char(s: &str, sep: char) -> ~[~str] {
-    split_char_inner(s, sep, len(s), true)
+    split_char_inner(s, sep, len(s), true, true)
 }
 
 /**
  * Splits a string into substrings at each occurrence of a given
- * character up to 'count' times
+ * character up to 'count' times.
  *
  * The byte must be a valid UTF-8/ASCII byte
  */
 pub pure fn splitn_char(s: &str, sep: char, count: uint) -> ~[~str] {
-    split_char_inner(s, sep, count, true)
+    split_char_inner(s, sep, count, true, true)
 }
 
 /// Like `split_char`, but omits empty strings from the returned vector
 pub pure fn split_char_nonempty(s: &str, sep: char) -> ~[~str] {
-    split_char_inner(s, sep, len(s), false)
+    split_char_inner(s, sep, len(s), false, false)
 }
 
-pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool)
-    -> ~[~str] {
+/**
+ * Like `split_char`, but a trailing empty string is omitted
+ * (e.g. `split_char_no_trailing("A B ",' ') == ~[~"A",~"B"]`)
+ */
+pub pure fn split_char_no_trailing(s: &str, sep: char) -> ~[~str] {
+    split_char_inner(s, sep, len(s), true, false)
+}
+
+pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool,
+                         allow_trailing_empty: bool) -> ~[~str] {
     if sep < 128u as char {
         let b = sep as u8, l = len(s);
         let mut result = ~[], done = 0u;
@@ -475,19 +484,20 @@ pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool)
             }
             i += 1u;
         }
-        if allow_empty || start < l {
+        // skip an empty trailing substring unless explicitly allowed
+        if allow_trailing_empty || start < l {
             unsafe { result.push(raw::slice_bytes_unique(s, start, l) ) };
         }
         result
     } else {
-        splitn(s, |cur| cur == sep, count)
+        split_inner(s, |cur| cur == sep, count, allow_empty, allow_trailing_empty)
     }
 }
 
 
 /// Splits a string into substrings using a character function
 pub pure fn split(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
-    split_inner(s, sepfn, len(s), true)
+    split_inner(s, sepfn, len(s), true, true)
 }
 
 /**
@@ -498,16 +508,25 @@ pub pure fn splitn(s: &str,
                    sepfn: &fn(char) -> bool,
                    count: uint)
                 -> ~[~str] {
-    split_inner(s, sepfn, count, true)
+    split_inner(s, sepfn, count, true, true)
 }
 
 /// Like `split`, but omits empty strings from the returned vector
 pub pure fn split_nonempty(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
-    split_inner(s, sepfn, len(s), false)
+    split_inner(s, sepfn, len(s), false, false)
+}
+
+
+/**
+ * Like `split`, but a trailing empty string is omitted
+ * (e.g. `split_no_trailing("A B ", |c| c == ' ') == ~[~"A",~"B"]`)
+ */
+pub pure fn split_no_trailing(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
+    split_inner(s, sepfn, len(s), true, false)
 }
 
 pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint,
-               allow_empty: bool) -> ~[~str] {
+               allow_empty: bool, allow_trailing_empty: bool) -> ~[~str] {
     let l = len(s);
     let mut result = ~[], i = 0u, start = 0u, done = 0u;
     while i < l && done < count {
@@ -523,7 +542,7 @@ pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint,
         }
         i = next;
     }
-    if allow_empty || start < l {
+    if allow_trailing_empty || start < l {
         unsafe {
             result.push(raw::slice_bytes_unique(s, start, l));
         }
@@ -630,9 +649,11 @@ pub fn levdistance(s: &str, t: &str) -> uint {
 }
 
 /**
- * Splits a string into a vector of the substrings separated by LF ('\n')
+ * Splits a string into a vector of the substrings separated by LF ('\n').
  */
-pub pure fn lines(s: &str) -> ~[~str] { split_char(s, '\n') }
+pub pure fn lines(s: &str) -> ~[~str] {
+    split_char_no_trailing(s, '\n')
+}
 
 /**
  * Splits a string into a vector of the substrings separated by LF ('\n')
@@ -651,7 +672,7 @@ pub pure fn lines_any(s: &str) -> ~[~str] {
 
 /// Splits a string into a vector of the substrings separated by whitespace
 pub pure fn words(s: &str) -> ~[~str] {
-    split_nonempty(s, |c| char::is_whitespace(c))
+    split_nonempty(s, char::is_whitespace)
 }
 
 /** Split a string into a vector of substrings,
@@ -2669,6 +2690,35 @@ mod tests {
 
     }
 
+    #[test]
+    fn test_split_char_no_trailing() {
+     fn t(s: &str, c: char, u: &[~str]) {
+            debug!(~"split_byte: " + s);
+            let v = split_char_no_trailing(s, c);
+            debug!("split_byte to: %?", v);
+            fail_unless!(vec::all2(v, u, |a,b| a == b));
+        }
+        t(~"abc.hello.there", '.', ~[~"abc", ~"hello", ~"there"]);
+        t(~".hello.there", '.', ~[~"", ~"hello", ~"there"]);
+        t(~"...hello.there.", '.', ~[~"", ~"", ~"", ~"hello", ~"there"]);
+
+        fail_unless!(~[~"", ~"", ~"", ~"hello", ~"there"]
+                     == split_char_no_trailing(~"...hello.there.", '.'));
+
+        fail_unless!(~[] == split_char_no_trailing(~"", 'z'));
+        fail_unless!(~[~""] == split_char_no_trailing(~"z", 'z'));
+        fail_unless!(~[~"ok"] == split_char_no_trailing(~"ok", 'z'));
+    }
+
+    #[test]
+    fn test_split_char_no_trailing_2() {
+        let data = ~"ประเทศไทย中华Việt Nam";
+        fail_unless!(~[~"ประเทศไทย中华", ~"iệt Nam"]
+                     == split_char_no_trailing(data, 'V'));
+        fail_unless!(~[~"ประเ", ~"ศไ", ~"ย中华Việt Nam"]
+                     == split_char_no_trailing(data, 'ท'));
+    }
+
     #[test]
     fn test_split_str() {
         fn t(s: &str, sep: &'a str, i: int, k: &str) {
@@ -2722,28 +2772,45 @@ mod tests {
         fail_unless!(~[~"ok"] == split(~"ok", |cc| cc == 'z'));
     }
 
+    #[test]
+    fn test_split_no_trailing() {
+        let data = ~"ประเทศไทย中华Việt Nam";
+        fail_unless!(~[~"ประเทศไทย中", ~"Việt Nam"]
+                     == split_no_trailing (data, |cc| cc == '华'));
+
+        fail_unless!(~[~"", ~"", ~"XXX", ~"YYY"]
+                     == split_no_trailing(~"zzXXXzYYYz", char::is_lowercase));
+
+        fail_unless!(~[~"zz", ~"", ~"", ~"z", ~"", ~"", ~"z"]
+                     == split_no_trailing(~"zzXXXzYYYz", char::is_uppercase));
+
+        fail_unless!(~[~""] == split_no_trailing(~"z", |cc| cc == 'z'));
+        fail_unless!(~[] == split_no_trailing(~"", |cc| cc == 'z'));
+        fail_unless!(~[~"ok"] == split_no_trailing(~"ok", |cc| cc == 'z'));
+    }
+
     #[test]
     fn test_lines() {
         let lf = ~"\nMary had a little lamb\nLittle lamb\n";
         let crlf = ~"\r\nMary had a little lamb\r\nLittle lamb\r\n";
 
-        fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
+        fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
                      == lines(lf));
 
-        fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
+        fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
                      == lines_any(lf));
 
         fail_unless!(~[~"\r", ~"Mary had a little lamb\r",
-                       ~"Little lamb\r", ~""]
+                       ~"Little lamb\r"]
             == lines(crlf));
 
-        fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
+        fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
             == lines_any(crlf));
 
-        fail_unless!(~[~""] == lines    (~""));
-        fail_unless!(~[~""] == lines_any(~""));
-        fail_unless!(~[~"",~""] == lines    (~"\n"));
-        fail_unless!(~[~"",~""] == lines_any(~"\n"));
+        fail_unless!(~[] == lines    (~""));
+        fail_unless!(~[] == lines_any(~""));
+        fail_unless!(~[~""] == lines    (~"\n"));
+        fail_unless!(~[~""] == lines_any(~"\n"));
         fail_unless!(~[~"banana"] == lines    (~"banana"));
         fail_unless!(~[~"banana"] == lines_any(~"banana"));
     }
@@ -3359,7 +3426,6 @@ mod tests {
                 0 => fail_unless!("" == x),
                 1 => fail_unless!("Mary had a little lamb" == x),
                 2 => fail_unless!("Little lamb" == x),
-                3 => fail_unless!("" == x),
                 _ => ()
             }
             ii += 1;