Skip to content

Commit 1616ffd

Browse files
committed
auto merge of #5398 : dbaupp/rust/core-readlines, r=graydon
The `each_line` function in `ReaderUtil` acts very differently from the equivalent functions in Python, Ruby, Clojure etc. E.g. given a file `t` with contents `trailing\nnew line\n` and a file `n` with contents `no trailing\nnew line`:

Rust:
```Rust
t: ~[~"trailing", ~"new line", ~""]
n: ~[~"no trailing", ~"new line"]
```

Python:
```Python
>>> open('t').readlines()
['trailing\n', 'new line\n']
>>> open('n').readlines()
['no trailing\n', 'new line']
```

Ruby:
```Ruby
irb(main):001:0> File.readlines('t')
=> ["trailing\n", "new line\n"]
irb(main):002:0> File.readlines('n')
=> ["no trailing\n", "new line"]
```

Clojure:
```Clojure
user=> (read-lines "t")
("trailing" "new line")
user=> (read-lines "n")
("no trailing" "new line")
```

The extra string that Rust includes at the end is inconsistent, and means that it is impossible to distinguish between the "real" empty line in a file that ends `...\n\n`, and the "fake" one after the last `\n`.

The attached code makes Rust's `each_line` act like Clojure's (and PHP's, i.e. not including the `\n`), and also adjusts `str::lines` to fix the trailing empty line problem.

Also, add a convenience `read_lines` method to read all the lines in a file into a vector.
2 parents d700500 + f832339 commit 1616ffd

File tree

3 files changed

+152
-43
lines changed

3 files changed

+152
-43
lines changed

src/compiletest/runtest.rs

+11-11
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ fn run_debuginfo_test(config: config, props: TestProps, testfile: &Path) {
267267
// check if each line in props.check_lines appears in the
268268
// output (in order)
269269
let mut i = 0u;
270-
for str::lines(ProcRes.stdout).each |line| {
270+
for str::lines_each(ProcRes.stdout) |line| {
271271
if props.check_lines[i].trim() == line.trim() {
272272
i += 1u;
273273
}
@@ -297,8 +297,8 @@ fn check_error_patterns(props: TestProps,
297297
let mut next_err_idx = 0u;
298298
let mut next_err_pat = props.error_patterns[next_err_idx];
299299
let mut done = false;
300-
for str::split_char(ProcRes.stderr, '\n').each |line| {
301-
if str::contains(*line, next_err_pat) {
300+
for str::lines_each(ProcRes.stderr) |line| {
301+
if str::contains(line, next_err_pat) {
302302
debug!("found error pattern %s", next_err_pat);
303303
next_err_idx += 1u;
304304
if next_err_idx == vec::len(props.error_patterns) {
@@ -347,15 +347,15 @@ fn check_expected_errors(expected_errors: ~[errors::ExpectedError],
347347
// filename:line1:col1: line2:col2: *warning:* msg
348348
// where line1:col1: is the starting point, line2:col2:
349349
// is the ending point, and * represents ANSI color codes.
350-
for str::split_char(ProcRes.stderr, '\n').each |line| {
350+
for str::lines_each(ProcRes.stderr) |line| {
351351
let mut was_expected = false;
352352
for vec::eachi(expected_errors) |i, ee| {
353353
if !found_flags[i] {
354354
debug!("prefix=%s ee.kind=%s ee.msg=%s line=%s",
355-
prefixes[i], ee.kind, ee.msg, *line);
356-
if (str::starts_with(*line, prefixes[i]) &&
357-
str::contains(*line, ee.kind) &&
358-
str::contains(*line, ee.msg)) {
355+
prefixes[i], ee.kind, ee.msg, line);
356+
if (str::starts_with(line, prefixes[i]) &&
357+
str::contains(line, ee.kind) &&
358+
str::contains(line, ee.msg)) {
359359
found_flags[i] = true;
360360
was_expected = true;
361361
break;
@@ -364,13 +364,13 @@ fn check_expected_errors(expected_errors: ~[errors::ExpectedError],
364364
}
365365

366366
// ignore this msg which gets printed at the end
367-
if str::contains(*line, ~"aborting due to") {
367+
if str::contains(line, ~"aborting due to") {
368368
was_expected = true;
369369
}
370370

371-
if !was_expected && is_compiler_error_or_warning(*line) {
371+
if !was_expected && is_compiler_error_or_warning(str::from_slice(line)) {
372372
fatal_ProcRes(fmt!("unexpected compiler error or warning: '%s'",
373-
*line),
373+
line),
374374
ProcRes);
375375
}
376376
}

src/libcore/io.rs

+49-6
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@ pub trait ReaderUtil {
9999
/// Read len bytes into a new vec.
100100
fn read_bytes(&self, len: uint) -> ~[u8];
101101

102-
/// Read up until a specified character (which is not returned) or EOF.
103-
fn read_until(&self, c: char) -> ~str;
102+
/// Read up until a specified character (which is optionally included) or EOF.
103+
fn read_until(&self, c: char, include: bool) -> ~str;
104104

105105
/// Read up until the first '\n' char (which is not returned), or EOF.
106106
fn read_line(&self) -> ~str;
@@ -126,6 +126,9 @@ pub trait ReaderUtil {
126126
/// Iterate over every line until the iterator breaks or EOF.
127127
fn each_line(&self, it: &fn(&str) -> bool);
128128

129+
/// Read all the lines of the file into a vector.
130+
fn read_lines(&self) -> ~[~str];
131+
129132
/// Read n (between 1 and 8) little-endian unsigned integer bytes.
130133
fn read_le_uint_n(&self, nbytes: uint) -> u64;
131134

@@ -219,11 +222,14 @@ impl<T:Reader> ReaderUtil for T {
219222
bytes
220223
}
221224

222-
fn read_until(&self, c: char) -> ~str {
225+
fn read_until(&self, c: char, include: bool) -> ~str {
223226
let mut bytes = ~[];
224227
loop {
225228
let ch = self.read_byte();
226229
if ch == -1 || ch == c as int {
230+
if include && ch == c as int {
231+
bytes.push(ch as u8);
232+
}
227233
break;
228234
}
229235
bytes.push(ch as u8);
@@ -232,7 +238,7 @@ impl<T:Reader> ReaderUtil for T {
232238
}
233239

234240
fn read_line(&self) -> ~str {
235-
self.read_until('\n')
241+
self.read_until('\n', false)
236242
}
237243

238244
fn read_chars(&self, n: uint) -> ~[char] {
@@ -306,7 +312,7 @@ impl<T:Reader> ReaderUtil for T {
306312
}
307313

308314
fn read_c_str(&self) -> ~str {
309-
self.read_until(0 as char)
315+
self.read_until(0 as char, false)
310316
}
311317

312318
fn read_whole_stream(&self) -> ~[u8] {
@@ -329,7 +335,29 @@ impl<T:Reader> ReaderUtil for T {
329335

330336
fn each_line(&self, it: &fn(s: &str) -> bool) {
331337
while !self.eof() {
332-
if !it(self.read_line()) { break; }
338+
// include the \n, so that we can distinguish an entirely empty
339+
// line read after "...\n", and the trailing empty line in
340+
// "...\n\n".
341+
let mut line = self.read_until('\n', true);
342+
343+
// blank line at the end of the reader is ignored
344+
if self.eof() && line.is_empty() { break; }
345+
346+
// trim the \n, so that each_line is consistent with read_line
347+
let n = str::len(line);
348+
if line[n-1] == '\n' as u8 {
349+
unsafe { str::raw::set_len(&mut line, n-1); }
350+
}
351+
352+
if !it(line) { break; }
353+
}
354+
}
355+
356+
fn read_lines(&self) -> ~[~str] {
357+
do vec::build |push| {
358+
for self.each_line |line| {
359+
push(str::from_slice(line));
360+
}
333361
}
334362
}
335363

@@ -1335,6 +1363,21 @@ mod tests {
13351363
}
13361364
}
13371365
1366+
#[test]
1367+
fn test_read_lines() {
1368+
do io::with_str_reader(~"a\nb\nc\n") |inp| {
1369+
fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]);
1370+
}
1371+
1372+
do io::with_str_reader(~"a\nb\nc") |inp| {
1373+
fail_unless!(inp.read_lines() == ~[~"a", ~"b", ~"c"]);
1374+
}
1375+
1376+
do io::with_str_reader(~"") |inp| {
1377+
fail_unless!(inp.read_lines().is_empty());
1378+
}
1379+
}
1380+
13381381
#[test]
13391382
fn test_readchars_wide() {
13401383
let wide_test = ~"生锈的汤匙切肉汤hello生锈的汤匙切肉汤";

src/libcore/str.rs

+92-26
Original file line numberDiff line numberDiff line change
@@ -437,28 +437,37 @@ pub pure fn slice(s: &'a str, begin: uint, end: uint) -> &'a str {
437437
unsafe { raw::slice_bytes(s, begin, end) }
438438
}
439439

440-
/// Splits a string into substrings at each occurrence of a given character
440+
/// Splits a string into substrings at each occurrence of a given
441+
/// character.
441442
pub pure fn split_char(s: &str, sep: char) -> ~[~str] {
442-
split_char_inner(s, sep, len(s), true)
443+
split_char_inner(s, sep, len(s), true, true)
443444
}
444445

445446
/**
446447
* Splits a string into substrings at each occurrence of a given
447-
* character up to 'count' times
448+
* character up to 'count' times.
448449
*
449450
* The byte must be a valid UTF-8/ASCII byte
450451
*/
451452
pub pure fn splitn_char(s: &str, sep: char, count: uint) -> ~[~str] {
452-
split_char_inner(s, sep, count, true)
453+
split_char_inner(s, sep, count, true, true)
453454
}
454455

455456
/// Like `split_char`, but omits empty strings from the returned vector
456457
pub pure fn split_char_nonempty(s: &str, sep: char) -> ~[~str] {
457-
split_char_inner(s, sep, len(s), false)
458+
split_char_inner(s, sep, len(s), false, false)
458459
}
459460

460-
pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool)
461-
-> ~[~str] {
461+
/**
462+
* Like `split_char`, but a trailing empty string is omitted
463+
* (e.g. `split_char_no_trailing("A B ",' ') == ~[~"A",~"B"]`)
464+
*/
465+
pub pure fn split_char_no_trailing(s: &str, sep: char) -> ~[~str] {
466+
split_char_inner(s, sep, len(s), true, false)
467+
}
468+
469+
pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool,
470+
allow_trailing_empty: bool) -> ~[~str] {
462471
if sep < 128u as char {
463472
let b = sep as u8, l = len(s);
464473
let mut result = ~[], done = 0u;
@@ -475,19 +484,20 @@ pure fn split_char_inner(s: &str, sep: char, count: uint, allow_empty: bool)
475484
}
476485
i += 1u;
477486
}
478-
if allow_empty || start < l {
487+
// push the trailing substring, unless it is empty and trailing empty strings are disallowed
488+
if allow_trailing_empty || start < l {
479489
unsafe { result.push(raw::slice_bytes_unique(s, start, l) ) };
480490
}
481491
result
482492
} else {
483-
splitn(s, |cur| cur == sep, count)
493+
split_inner(s, |cur| cur == sep, count, allow_empty, allow_trailing_empty)
484494
}
485495
}
486496

487497

488498
/// Splits a string into substrings using a character function
489499
pub pure fn split(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
490-
split_inner(s, sepfn, len(s), true)
500+
split_inner(s, sepfn, len(s), true, true)
491501
}
492502

493503
/**
@@ -498,16 +508,25 @@ pub pure fn splitn(s: &str,
498508
sepfn: &fn(char) -> bool,
499509
count: uint)
500510
-> ~[~str] {
501-
split_inner(s, sepfn, count, true)
511+
split_inner(s, sepfn, count, true, true)
502512
}
503513

504514
/// Like `split`, but omits empty strings from the returned vector
505515
pub pure fn split_nonempty(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
506-
split_inner(s, sepfn, len(s), false)
516+
split_inner(s, sepfn, len(s), false, false)
517+
}
518+
519+
520+
/**
521+
* Like `split`, but a trailing empty string is omitted
522+
* (e.g. `split_no_trailing("A B ", |c| c == ' ') == ~[~"A",~"B"]`)
523+
*/
524+
pub pure fn split_no_trailing(s: &str, sepfn: &fn(char) -> bool) -> ~[~str] {
525+
split_inner(s, sepfn, len(s), true, false)
507526
}
508527

509528
pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint,
510-
allow_empty: bool) -> ~[~str] {
529+
allow_empty: bool, allow_trailing_empty: bool) -> ~[~str] {
511530
let l = len(s);
512531
let mut result = ~[], i = 0u, start = 0u, done = 0u;
513532
while i < l && done < count {
@@ -523,7 +542,7 @@ pure fn split_inner(s: &str, sepfn: &fn(cc: char) -> bool, count: uint,
523542
}
524543
i = next;
525544
}
526-
if allow_empty || start < l {
545+
if allow_trailing_empty || start < l {
527546
unsafe {
528547
result.push(raw::slice_bytes_unique(s, start, l));
529548
}
@@ -630,9 +649,11 @@ pub fn levdistance(s: &str, t: &str) -> uint {
630649
}
631650

632651
/**
633-
* Splits a string into a vector of the substrings separated by LF ('\n')
652+
* Splits a string into a vector of the substrings separated by LF ('\n').
634653
*/
635-
pub pure fn lines(s: &str) -> ~[~str] { split_char(s, '\n') }
654+
pub pure fn lines(s: &str) -> ~[~str] {
655+
split_char_no_trailing(s, '\n')
656+
}
636657

637658
/**
638659
* Splits a string into a vector of the substrings separated by LF ('\n')
@@ -651,7 +672,7 @@ pub pure fn lines_any(s: &str) -> ~[~str] {
651672

652673
/// Splits a string into a vector of the substrings separated by whitespace
653674
pub pure fn words(s: &str) -> ~[~str] {
654-
split_nonempty(s, |c| char::is_whitespace(c))
675+
split_nonempty(s, char::is_whitespace)
655676
}
656677

657678
/** Split a string into a vector of substrings,
@@ -2669,6 +2690,35 @@ mod tests {
26692690

26702691
}
26712692

2693+
#[test]
2694+
fn test_split_char_no_trailing() {
2695+
fn t(s: &str, c: char, u: &[~str]) {
2696+
debug!(~"split_byte: " + s);
2697+
let v = split_char_no_trailing(s, c);
2698+
debug!("split_byte to: %?", v);
2699+
fail_unless!(vec::all2(v, u, |a,b| a == b));
2700+
}
2701+
t(~"abc.hello.there", '.', ~[~"abc", ~"hello", ~"there"]);
2702+
t(~".hello.there", '.', ~[~"", ~"hello", ~"there"]);
2703+
t(~"...hello.there.", '.', ~[~"", ~"", ~"", ~"hello", ~"there"]);
2704+
2705+
fail_unless!(~[~"", ~"", ~"", ~"hello", ~"there"]
2706+
== split_char_no_trailing(~"...hello.there.", '.'));
2707+
2708+
fail_unless!(~[] == split_char_no_trailing(~"", 'z'));
2709+
fail_unless!(~[~""] == split_char_no_trailing(~"z", 'z'));
2710+
fail_unless!(~[~"ok"] == split_char_no_trailing(~"ok", 'z'));
2711+
}
2712+
2713+
#[test]
2714+
fn test_split_char_no_trailing_2() {
2715+
let data = ~"ประเทศไทย中华Việt Nam";
2716+
fail_unless!(~[~"ประเทศไทย中华", ~"iệt Nam"]
2717+
== split_char_no_trailing(data, 'V'));
2718+
fail_unless!(~[~"ประเ", ~"ศไ", ~"ย中华Việt Nam"]
2719+
== split_char_no_trailing(data, 'ท'));
2720+
}
2721+
26722722
#[test]
26732723
fn test_split_str() {
26742724
fn t(s: &str, sep: &'a str, i: int, k: &str) {
@@ -2722,28 +2772,45 @@ mod tests {
27222772
fail_unless!(~[~"ok"] == split(~"ok", |cc| cc == 'z'));
27232773
}
27242774

2775+
#[test]
2776+
fn test_split_no_trailing() {
2777+
let data = ~"ประเทศไทย中华Việt Nam";
2778+
fail_unless!(~[~"ประเทศไทย中", ~"Việt Nam"]
2779+
== split_no_trailing (data, |cc| cc == '华'));
2780+
2781+
fail_unless!(~[~"", ~"", ~"XXX", ~"YYY"]
2782+
== split_no_trailing(~"zzXXXzYYYz", char::is_lowercase));
2783+
2784+
fail_unless!(~[~"zz", ~"", ~"", ~"z", ~"", ~"", ~"z"]
2785+
== split_no_trailing(~"zzXXXzYYYz", char::is_uppercase));
2786+
2787+
fail_unless!(~[~""] == split_no_trailing(~"z", |cc| cc == 'z'));
2788+
fail_unless!(~[] == split_no_trailing(~"", |cc| cc == 'z'));
2789+
fail_unless!(~[~"ok"] == split_no_trailing(~"ok", |cc| cc == 'z'));
2790+
}
2791+
27252792
#[test]
27262793
fn test_lines() {
27272794
let lf = ~"\nMary had a little lamb\nLittle lamb\n";
27282795
let crlf = ~"\r\nMary had a little lamb\r\nLittle lamb\r\n";
27292796

2730-
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
2797+
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
27312798
== lines(lf));
27322799

2733-
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
2800+
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
27342801
== lines_any(lf));
27352802

27362803
fail_unless!(~[~"\r", ~"Mary had a little lamb\r",
2737-
~"Little lamb\r", ~""]
2804+
~"Little lamb\r"]
27382805
== lines(crlf));
27392806

2740-
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb", ~""]
2807+
fail_unless!(~[~"", ~"Mary had a little lamb", ~"Little lamb"]
27412808
== lines_any(crlf));
27422809

2743-
fail_unless!(~[~""] == lines (~""));
2744-
fail_unless!(~[~""] == lines_any(~""));
2745-
fail_unless!(~[~"",~""] == lines (~"\n"));
2746-
fail_unless!(~[~"",~""] == lines_any(~"\n"));
2810+
fail_unless!(~[] == lines (~""));
2811+
fail_unless!(~[] == lines_any(~""));
2812+
fail_unless!(~[~""] == lines (~"\n"));
2813+
fail_unless!(~[~""] == lines_any(~"\n"));
27472814
fail_unless!(~[~"banana"] == lines (~"banana"));
27482815
fail_unless!(~[~"banana"] == lines_any(~"banana"));
27492816
}
@@ -3359,7 +3426,6 @@ mod tests {
33593426
0 => fail_unless!("" == x),
33603427
1 => fail_unless!("Mary had a little lamb" == x),
33613428
2 => fail_unless!("Little lamb" == x),
3362-
3 => fail_unless!("" == x),
33633429
_ => ()
33643430
}
33653431
ii += 1;

0 commit comments

Comments
 (0)